in obelics/callers/extract_web_documents.py
import argparse
from multiprocessing import cpu_count


def get_args():
    parser = argparse.ArgumentParser(description="Extract web documents.")
    parser.add_argument(
        "--path_config_extract_web_documents",
        type=str,
        default="./obelics/configs/config_extract_web_documents.yaml",
        help="The path of the config file containing the extraction parameters.",
    )
    parser.add_argument(
        "--path_html_dataset",
        type=str,
        default="./large_files/html_documents_10000",
        help="The path of the dataset containing the HTML documents.",
    )
    parser.add_argument(
        "--path_save_dir_dataset",
        type=str,
        default="./large_files/output_extraction/web_documents_10000",
        help="The directory to save the dataset.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for multiprocessing.",
    )
    parser.add_argument(
        "--path_save_file_image_urls",
        type=str,
        default="./large_files/output_extraction/image_urls.txt",
        help="The file to save the URLs of all images.",
    )
    parser.add_argument(
        "--path_save_dir_downloaded_images",
        type=str,
        default="./large_files/output_extraction/downloaded_images",
        help="The directory to save all images.",
    )
    parser.add_argument(
        "--thread_count",
        type=int,
        default=256,
        help="The number of threads used for downloading the images.",
    )
    parser.add_argument(
        "--number_sample_per_shard",
        type=int,
        default=10_000,
        help="The number of images to download in one shard.",
    )
    parser.add_argument(
        "--path_save_dir_tmp_datasets_images",
        type=str,
        default="./large_files/output_extraction/tmp_datasets_images",
        help=(
            "The directory to save the temporary datasets containing all images (needed during processing but"
            " safe to delete afterwards)."
        ),
    )
    parser.add_argument(
        "--path_save_dir_dataset_images",
        type=str,
        default="./large_files/output_extraction/dataset_images",
        help="The directory to save the dataset containing all images.",
    )
    parser.add_argument(
        "--path_save_file_map_url_idx",
        type=str,
        default="./large_files/output_extraction/map_url_idx.json",
        help="The file to save the mapping from URLs to indices in the dataset containing all images.",
    )
    parser.add_argument(
        "--num_proc_urls_to_images",
        type=int,
        default=15,
        help="Number of processes to use for multiprocessing in the `urls_to_images` step. Reduce it if you hit OOM errors.",
    )
    parser.add_argument(
        "--path_save_dir_sharded_dataset",
        type=str,
        default="./large_files/output_extraction/web_documents_10000_sharded",
        help="The directory to save the sharded dataset.",
    )
    parser.add_argument(
        "--shard_size",
        type=int,
        default=20_000,  # 500 shards for 10M web documents
        help="The number of web documents per shard of the sharded dataset.",
    )
    args = parser.parse_args()
    return args
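

# A minimal usage sketch, for illustration only. The `main` function below is
# hypothetical (not shown in this file), and the img2dataset call is an
# assumption inferred from the `thread_count` and `number_sample_per_shard`
# arguments, whose names mirror img2dataset's API; the real pipeline may wire
# these steps together differently.
from img2dataset import download


def main():
    args = get_args()
    # Download every image URL gathered during extraction, writing shards of
    # `number_sample_per_shard` images into the chosen output directory.
    download(
        url_list=args.path_save_file_image_urls,
        output_folder=args.path_save_dir_downloaded_images,
        input_format="txt",
        thread_count=args.thread_count,
        number_sample_per_shard=args.number_sample_per_shard,
    )


if __name__ == "__main__":
    main()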