def get_args()

in vision/m4/sourcing/data_collection/callers/extract_web_documents.py [0:0]


import argparse
from multiprocessing import cpu_count  # assumed import; cpu_count() is used as the --num_proc default


def get_args():
    parser = argparse.ArgumentParser(description="Extract web documents.")
    parser.add_argument(
        "--path_config_extract_web_documents",
        type=str,
        default="./m4/sourcing/data_collection/configs/config_extract_web_documents.yaml",
        help="The path of the config file containing the extraction parameters.",
    )
    parser.add_argument(
        "--path_html_dataset",
        type=str,
        default="./large_files/html_documents_10000",
        help="Path of the dataset containing the HTML documents.",
    )
    parser.add_argument(
        "--path_save_dir_dataset",
        type=str,
        default="./large_files/output_extraction/web_documents_10000",
        help="The directory to save the dataset.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for the multiprocessing.",
    )
    parser.add_argument(
        "--path_save_file_image_urls",
        type=str,
        default="./large_files/output_extraction/image_urls.txt",
        help="The file to save the urls of all images.",
    )
    parser.add_argument(
        "--path_save_dir_downloaded_images",
        type=str,
        default="./large_files/output_extraction/downloaded_images",
        help="The directory to save all images.",
    )
    parser.add_argument(
        "--thread_count",
        type=int,
        default=256,
        help="The number of threads used for downloading the pictures.",
    )
    parser.add_argument(
        "--number_sample_per_shard",
        type=int,
        default=10_000,
        help="The number of images that will be downloaded in one shard.",
    )
    parser.add_argument(
        "--path_save_dir_tmp_datasets_images",
        type=str,
        default="./large_files/output_extraction/tmp_datasets_images",
        help=(
            "The directory to save the temporary datasets containing all images (needed during the extraction"
            " but safe to delete afterwards)."
        ),
    )
    parser.add_argument(
        "--path_save_dir_dataset_images",
        type=str,
        default="./large_files/output_extraction/dataset_images",
        help="The directory to save the dataset containing all images.",
    )
    parser.add_argument(
        "--path_save_file_map_url_idx",
        type=str,
        default="./large_files/output_extraction/map_url_idx.json",
        help="The file to save the map to go from urls to indices of the dataset containing all images.",
    )
    parser.add_argument(
        "--num_proc_urls_to_images",
        type=int,
        default=15,
        help="Number of processes to use for the multiprocessing for the step `urls_to_images`. Reduce if OOM errors.",
    )
    parser.add_argument(
        "--path_save_dir_sharded_dataset",
        type=str,
        default="./large_files/output_extraction/web_documents_10000_sharded",
        help="The directory to save the sharded dataset.",
    )
    parser.add_argument(
        "--shard_size",
        type=int,
        default=20_000,  # 500 shards for 10M web documents
        help="The size of a shard for the sharded dataset.",
    )
    args = parser.parse_args()
    return args
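
All arguments have defaults, so the caller script can run without any flags, and individual paths or process counts can be overridden on the command line. Below is a minimal sketch of how the parsed arguments might be consumed; the `__main__` wiring is an assumption, since only `get_args()` appears in this excerpt.

if __name__ == "__main__":
    args = get_args()
    # Defaults can be overridden on the command line, e.g.:
    #   python extract_web_documents.py --num_proc 8 --thread_count 128
    print(f"Reading HTML documents from: {args.path_html_dataset}")
    print(f"Writing extracted web documents to: {args.path_save_dir_dataset}")
    print(f"Using {args.num_proc} processes and {args.thread_count} download threads")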