in build_obelics/05_filtering_web_docs.py [0:0]
def get_args():
    """Parse the command-line arguments of the WebDocs filtering script.

    One positional integer, ``idx_job``, is required. Every other argument is
    an optional ``--flag`` with a default value. The help text describes
    ``idx_job`` as lying between 0 and 199, but that range is not checked
    here.

    Returns:
        argparse.Namespace: The values parsed from ``sys.argv``, with one
        attribute per argument (``idx_job``, ``path_web_document_dataset``,
        ``path_save_web_document_dataset_filtered``, ``num_proc``,
        ``path_config_filter_web_documents``, ``path_common_words``,
        ``path_lang_id_model``, ``path_sentencepiece_model``,
        ``path_kenlm_model``).
    """
    cli = argparse.ArgumentParser(description="Filtering of WebDocs.")

    # The only positional argument; it has no default and must be supplied.
    cli.add_argument("idx_job", type=int, help="Index of the job (between 0 and 199).")

    # (flag, type, default, help) for each optional argument. The order is the
    # registration order, which is also the order shown by ``--help``.
    option_specs = (
        (
            "--path_web_document_dataset",
            str,
            "s3://m4-datasets/webdocs/web_document_dataset/",
            "Path of the dataset containing the web documents.",
        ),
        (
            "--path_save_web_document_dataset_filtered",
            str,
            "s3://m4-datasets/webdocs/web_document_dataset_filtered/",
            "The path to save the filtered web document dataset.",
        ),
        (
            "--num_proc",
            int,
            cpu_count(),  # evaluated each time get_args() is called
            "Number of processes to use for the multiprocessing.",
        ),
        (
            "--path_config_filter_web_documents",
            str,
            "./m4/sourcing/data_collection/configs/config_filter_web_documents.yaml",
            "The path of the config file containing the filtering parameters.",
        ),
        (
            "--path_common_words",
            str,
            "/fsx/hugo/filtering_web_docs/models/common_words.json",
            "The path of the dictionary containing the common words.",
        ),
        (
            "--path_lang_id_model",
            str,
            "/fsx/hugo/filtering_web_docs/models/lid.176.bin",
            "The path of the lang id model (FastText).",
        ),
        (
            "--path_sentencepiece_model",
            str,
            "/fsx/hugo/filtering_web_docs/models/en.sp.model",
            "The path of the SentencePiece model.",
        ),
        (
            "--path_kenlm_model",
            str,
            "/fsx/hugo/filtering_web_docs/models/en.arpa.bin",
            "The path of the KenLM model.",
        ),
    )
    for flag, value_type, fallback, description in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)

    return cli.parse_args()