def get_args()

in build_obelics/06_03_remove_image_duplicates.py [0:0]


def get_args():
    """Parse the command-line arguments of the image-deduplication job.

    The arguments are read from the process command line by
    ``ArgumentParser.parse_args()``.

    Returns:
        argparse.Namespace: the parsed values, exposing ``idx_job`` (required
        positional integer) plus the optional
        ``path_web_document_dataset_filtered``,
        ``path_tot_image_urls_in_web_document_dataset_filtered_too_duplicated``,
        ``path_save_web_document_dataset_filtered_imgurldedup`` and
        ``num_proc`` settings, each falling back to its default when omitted.
    """
    cli = argparse.ArgumentParser(description="Remove the images that are too duplicated.")

    # One (name, add_argument keyword options) entry per argument. The order
    # is kept because it determines how the arguments are listed in --help.
    argument_specs = (
        (
            "idx_job",
            {
                "type": int,
                "help": "Index of the job (between 0 and 199).",
            },
        ),
        (
            "--path_web_document_dataset_filtered",
            {
                "type": str,
                "default": "s3://m4-datasets/webdocs/web_document_dataset_filtered/",
                "help": "Path of the web document dataset filtered.",
            },
        ),
        (
            "--path_tot_image_urls_in_web_document_dataset_filtered_too_duplicated",
            {
                "type": str,
                "default": (
                    "s3://m4-datasets/webdocs/tot_image_urls_in_web_document_dataset_filtered_too_duplicated.pickle"
                ),
                "help": "Path of the file containing the image urls to remove.",
            },
        ),
        (
            "--path_save_web_document_dataset_filtered_imgurldedup",
            {
                "type": str,
                "default": "s3://m4-datasets/webdocs/web_document_dataset_filtered_imgurldedup/",
                "help": "Path to save the web document dataset filtered with the deduplication of image urls.",
            },
        ),
        (
            "--num_proc",
            {
                "type": int,
                "default": 48,
                "help": "Number of processes to use for the multiprocessing.",
            },
        ),
    )
    for arg_name, arg_options in argument_specs:
        cli.add_argument(arg_name, **arg_options)

    return cli.parse_args()