def get_args()

in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/03_dl_images_create_dataset.py [0:0]


def get_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the image download / dataset creation script.

    Args:
        argv: Optional list of argument strings to parse. When left at ``None``
            the arguments are taken from the process command line
            (``sys.argv[1:]``), exactly as before this parameter existed.
            Passing an explicit list lets tests and other Python code build the
            options without touching ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with the positional ``idx_job`` and every
        option declared below, defaults filled in for options not supplied.
    """
    parser = argparse.ArgumentParser(description="Download images and create a dataset containing them.")
    # Only mandatory argument. The help text states a 0-199 range, but it is
    # not enforced here.
    parser.add_argument(
        "idx_job",
        type=int,
        help="Index of the job (between 0 and 199).",
    )
    # NOTE(review): `--U` and `--download_only` are integers defaulting to 0;
    # presumably they are used as 0/1 switches by the caller -- confirm.
    # The name `--U` is kept as-is for compatibility with existing invocations.
    parser.add_argument(
        "--U",
        type=int,
        default=0,
        help="Indicate if the download of the images is already done.",
    )
    parser.add_argument(
        "--download_only",
        type=int,
        default=0,
        help="Indicate if we only want to download the images, and not create the image dataset.",
    )
    parser.add_argument(
        "--path_image_urls",
        type=str,
        default="s3://m4-datasets/webdocs/image_urls_2/",
        help="The path of the file containing the urls of all images.",
    )
    parser.add_argument(
        "--path_save_dir_downloaded_images",
        type=str,
        default="/scratch/storage_hugo/downloaded_images",
        help="The directory to save all images.",
    )
    parser.add_argument(
        "--thread_count",
        type=int,
        default=128,
        help="The number of threads used for downloading the pictures.",
    )
    parser.add_argument(
        "--number_sample_per_shard",
        type=int,
        default=10_000,
        help="The number of images that will be downloaded in one shard.",
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=256,
        help="The size to resize image to. Not used if --resize_mode=no.",
    )
    parser.add_argument(
        "--resize_mode",
        type=str,
        default="no",
        help="The way to resize pictures, can be no, border or keep_ratio.",
    )
    # The default is computed when this function runs, so it reflects the
    # machine the script is launched on.
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for the multiprocessing.",
    )
    parser.add_argument(
        "--path_save_dir_tmp_datasets_images",
        type=str,
        default="/scratch/storage_hugo/tmp_datasets_images",
        help=(
            "The directory to save the temporary datasets containing all images (useful for the code but can be"
            " forgotten after)."
        ),
    )
    parser.add_argument(
        "--path_save_dir_dataset_images",
        type=str,
        default="s3://m4-datasets/webdocs/image_dataset_2/",
        help="The directory to save the dataset containing all images.",
    )
    parser.add_argument(
        "--path_save_file_map_url_idx",
        type=str,
        default="s3://m4-datasets/webdocs/map_url_idx_2/",
        help="The file to save the map to go from urls to indices of the dataset containing all images.",
    )
    # `parse_args(None)` reads sys.argv[1:], so omitting `argv` keeps the
    # original behavior.
    return parser.parse_args(argv)