def get_args()

in obelics/callers/line_deduplicate_web_documents.py [0:0]


def get_args():
    parser = argparse.ArgumentParser(description="Line deduplicate web documents.")
    parser.add_argument(
        "--path_sharded_dataset",
        type=str,
        default="/gpfsscratch/rech/cnw/commun/local_datasets/web_document_dataset_45M_sharded_filtered_2/train",
        help="Path to the folder containing the shards of the web document dataset.",
    )
    parser.add_argument(
        "--path_save_domain_to_positions",
        type=str,
        default="/gpfswork/rech/cnw/urd43gx/line_dedup/domain_to_positions.json",
        help=(
            "Path of the file to save the dictionary to go from a domain name to positions in the web document"
            " dataset."
        ),
    )
    parser.add_argument(
        "--path_save_domain_to_duplicated_texts",
        type=str,
        default="/gpfswork/rech/cnw/urd43gx/line_dedup/domain_to_duplicated_texts.json",
        help="Path of the file to save the dictionary containing the deduplicated texts for each domain.",
    )
    parser.add_argument(
        "--id_shard_to_line_deduplicate",
        type=int,
        default=2,
        help="Id of the shard to perform line deduplication on.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=1,
        help="Number of processes for the map operation in the line deduplication.",
    )
    parser.add_argument(
        "--path_save_line_deduplicated_sharded_dataset",
        type=str,
        default="/gpfsscratch/rech/cnw/commun/local_datasets/web_document_dataset_45M_sharded_filtered_2_line_deduplicated/train",
        help="Path to the folder to save the shards of the line deduplicated web document dataset.",
    )
    args = parser.parse_args()
    return args