def get_args()

in build_obelics/05_filtering_web_docs.py [0:0]


def get_args(argv=None):
    """Parse the command-line arguments of the web-document filtering job.

    Args:
        argv: Optional sequence of argument strings to parse instead of the
            process command line. When ``None`` (the default), argparse reads
            ``sys.argv[1:]``, which is the historical behavior of this
            function. Passing an explicit list makes the parser usable from
            other code and from tests without touching global state.

    Returns:
        The ``argparse.Namespace`` with the attributes ``idx_job``,
        ``path_web_document_dataset``,
        ``path_save_web_document_dataset_filtered``, ``num_proc``,
        ``path_config_filter_web_documents``, ``path_common_words``,
        ``path_lang_id_model``, ``path_sentencepiece_model`` and
        ``path_kenlm_model``.

    Note:
        ``idx_job`` is the only required (positional) argument. Its help text
        mentions the range 0-199, but that range is not enforced here.
        argparse exits the process (``SystemExit``) on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Filtering of WebDocs.")

    # Required positional argument: which job this process runs.
    parser.add_argument(
        "idx_job",
        type=int,
        help="Index of the job (between 0 and 199).",
    )

    # Input / output dataset locations.
    parser.add_argument(
        "--path_web_document_dataset",
        type=str,
        default="s3://m4-datasets/webdocs/web_document_dataset/",
        help="Path of the dataset containing the web documents.",
    )
    parser.add_argument(
        "--path_save_web_document_dataset_filtered",
        type=str,
        default="s3://m4-datasets/webdocs/web_document_dataset_filtered/",
        help="The path to save the filtered web document dataset.",
    )

    # Parallelism: the default is evaluated once, when the parser is built.
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for the multiprocessing.",
    )

    # Filtering configuration and the model / resource files it relies on.
    parser.add_argument(
        "--path_config_filter_web_documents",
        type=str,
        default="./m4/sourcing/data_collection/configs/config_filter_web_documents.yaml",
        help="The path of the config file containing the filtering parameters.",
    )
    parser.add_argument(
        "--path_common_words",
        type=str,
        default="/fsx/hugo/filtering_web_docs/models/common_words.json",
        help="The path of the dictionary containing the common words.",
    )
    parser.add_argument(
        "--path_lang_id_model",
        type=str,
        default="/fsx/hugo/filtering_web_docs/models/lid.176.bin",
        help="The path of the lang id model (FastText).",
    )
    parser.add_argument(
        "--path_sentencepiece_model",
        type=str,
        default="/fsx/hugo/filtering_web_docs/models/en.sp.model",
        help="The path of the SentencePiece model.",
    )
    parser.add_argument(
        "--path_kenlm_model",
        type=str,
        default="/fsx/hugo/filtering_web_docs/models/en.arpa.bin",
        help="The path of the KenLM model.",
    )

    # argv=None makes argparse fall back to sys.argv[1:], as before.
    return parser.parse_args(argv)