# obelics/callers/filter_web_documents.py
import argparse
from multiprocessing import cpu_count


def get_args():
    parser = argparse.ArgumentParser(description="Web document filtering.")
    parser.add_argument(
        "--path_web_document_dataset",
        type=str,
        default="./large_files/web_document_dataset_100",
        help="The path of the dataset containing the web documents.",
    )
    parser.add_argument(
        "--path_save_dir_web_document_dataset_filtered",
        type=str,
        default="./large_files/web_document_dataset_100_filtered",
        help="The directory to save the filtered web document dataset.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for multiprocessing.",
    )
    parser.add_argument(
        "--path_config_filter_web_documents",
        type=str,
        default="./obelics/configs/config_filter_web_documents.yaml",
        help="The path of the config file containing the filtering parameters.",
    )
    parser.add_argument(
        "--path_common_words",
        type=str,
        default="./large_files/common_words.json",  # Find it at https://drive.google.com/file/d/1TeydSroOOmlEuxIcwgsJQ2YF4kPJR6N4/view?usp=sharing
        help="The path of the dictionary containing the common words.",
    )
    parser.add_argument(
        "--path_lang_id_model",
        type=str,
        default="./large_files/lid.176.bin",  # Find it at https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
        help="The path of the language identification model (fastText).",
    )
    parser.add_argument(
        "--path_sentencepiece_model",
        type=str,
        default="./large_files/en.sp.model",  # Find it at https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.sp.model
        help="The path of the SentencePiece model.",
    )
    parser.add_argument(
        "--path_kenlm_model",
        type=str,
        default="./large_files/en.arpa.bin",  # Find it at https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.arpa.bin
        help="The path of the KenLM model.",
    )
    args = parser.parse_args()
    return args
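

# --- Usage sketch (not part of the original file) ---
# A minimal, hypothetical example of how the parsed arguments could feed a
# filtering step: it loads the resources each path points to with their
# standard libraries (yaml, json, fasttext, sentencepiece, kenlm, datasets).
# The variable names and the placeholder filter predicate are illustrative
# assumptions, not the repository's actual filtering logic.
import json

import fasttext
import kenlm
import sentencepiece
import yaml
from datasets import load_from_disk


if __name__ == "__main__":
    args = get_args()

    # Filtering thresholds and flags from the YAML config.
    with open(args.path_config_filter_web_documents) as f:
        filtering_params = yaml.safe_load(f)

    # Common-word dictionary used by the text filters.
    with open(args.path_common_words) as f:
        common_words = json.load(f)

    # Language identification, tokenization, and perplexity models.
    lang_id_model = fasttext.load_model(args.path_lang_id_model)
    sentencepiece_model = sentencepiece.SentencePieceProcessor(model_file=args.path_sentencepiece_model)
    kenlm_model = kenlm.Model(args.path_kenlm_model)

    # Load the web document dataset, filter it with num_proc workers,
    # and save the result (the real filtering predicate is omitted here).
    web_document_dataset = load_from_disk(args.path_web_document_dataset)
    web_document_dataset_filtered = web_document_dataset.filter(
        lambda example: True,  # placeholder predicate
        num_proc=args.num_proc,
    )
    web_document_dataset_filtered.save_to_disk(args.path_save_dir_web_document_dataset_filtered)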