def get_args()

in vision/m4/sourcing/data_collection/callers/deduplicate_images_web_documents.py


import argparse
from multiprocessing import cpu_count


def get_args():
    parser = argparse.ArgumentParser(description="Deduplicate images in web documents.")
    parser.add_argument(
        "--path_web_document_dataset_train",
        type=str,
        default="./large_files/output_extraction/web_document_dataset_train_100",
        help="Path of the dataset containing the web documents (train split).",
    )
    parser.add_argument(
        "--path_web_document_dataset_valid",
        type=str,
        default="./large_files/output_extraction/web_document_dataset_valid_100",
        help="Path of the dataset containing the web documents (valid split).",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for the multiprocessing.",
    )
    parser.add_argument(
        "--path_save_file_map_image_url_to_pos",
        type=str,
        default="./large_files/output_deduplication/map_image_url_to_pos.json",
        help="The path to save the map to go from image urls to their positions in the web document dataset.",
    )
    parser.add_argument(
        "--path_images_web_document_dataset_extraction",
        type=str,
        default="./large_files/output_extraction/dataset_images",
        help="The dataset containing all the images created during the web document extraction.",
    )
    parser.add_argument(
        "--path_save_dir_images_web_document_dataset_train",
        type=str,
        default="./large_files/output_deduplication/images_web_document_dataset_train",
        help=(
            "The path of the directory to save the dataset containing all the images of the web document dataset"
            " (after a potential filtering)."
        ),
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Seed to generate random numbers. -1 for no seed.",
    )
    parser.add_argument(
        "--path_save_file_to_be_deduplicated",
        type=str,
        default="./large_files/output_deduplication/to_be_deduplicated.json",
        help="The path to save the json containing the positions of the images to be deduplicated.",
    )
    parser.add_argument(
        "--path_save_dir_images_evaluation_tasks_dataset",
        type=str,
        default="./large_files/output_deduplication/images_evaluation_tasks_dataset",
        help="The path of the directory to save the dataset containing all the images of the evaluation tasks.",
    )
    parser.add_argument(
        "--hamming_distance_threshold",
        type=int,
        default=3,
        help="The hamming distance threshold to consider two images as near duplicates.",
    )
    parser.add_argument(
        "--type_dedup",
        type=str,
        choices=["remove_all_doc", "remove_image"],
        default="remove_image",
        help="The type of deduplication to perform: 'remove_all_doc' or 'remove_image'.",
    )
    parser.add_argument(
        "--path_save_dir_web_document_dataset_train_deduplicated",
        type=str,
        default="./large_files/output_deduplication/web_document_dataset_train_deduplicated",
        help="The directory to save the deduplicated web document dataset.",
    )
    args = parser.parse_args()
    return args
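
For context, a minimal sketch of how a caller might wire these arguments together; the `__main__` guard and the seeding logic shown here are assumptions for illustration, not the actual entry point of the file:

if __name__ == "__main__":
    import random

    args = get_args()

    # Honor the "-1 for no seed" convention documented in the --seed help
    # string (assumption: the real script seeds its RNGs the same way).
    if args.seed != -1:
        random.seed(args.seed)

    print(f"Deduplication strategy: {args.type_dedup}")
    print(f"Hamming distance threshold: {args.hamming_distance_threshold}")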
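
The `--hamming_distance_threshold` argument suggests near-duplicate detection over perceptual image hashes. Below is a minimal sketch of that comparison, assuming hashes are plain 64-bit integers; the helper names and the hashing step itself are hypothetical and do not appear in this file:

def hamming_distance(hash_a: int, hash_b: int) -> int:
    """Number of bit positions in which two perceptual hashes differ."""
    return bin(hash_a ^ hash_b).count("1")


def are_near_duplicates(hash_a: int, hash_b: int, threshold: int) -> bool:
    """Treat two images as near duplicates when their hashes differ in at
    most `threshold` bit positions."""
    return hamming_distance(hash_a, hash_b) <= threshold

With the default threshold of 3, hashes 0b1011 and 0b1000 differ in two bit positions, so they would count as near duplicates.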