in build_obelics/04_merge_web_docs_with_images.py [0:0]
def get_args() -> argparse.Namespace:
    """Parse the command-line arguments of the merge script.

    The parser accepts one positional argument and five optional ones:

    * ``idx_job`` (int, positional): index of the job. The help text
      documents the expected range as 0 to 199; the range is not enforced
      here.
    * ``--path_web_document_dataset_without_images`` (str): location of the
      web document dataset that does not yet contain the images.
    * ``--path_image_dataset_1`` (str): location of the image dataset.
    * ``--path_image_dataset_2`` (str): location of the second image dataset.
    * ``--path_save_dir_web_document_dataset`` (str): destination of the web
      document dataset merged with the images.
    * ``--num_proc`` (int): number of processes used for multiprocessing;
      defaults to ``cpu_count()`` evaluated when the parser is built.

    Returns:
        The ``argparse.Namespace`` produced by ``parser.parse_args()``, which
        reads the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="Merge the web document dataset without images with the dataset of images."
    )
    parser.add_argument(
        "idx_job",
        type=int,
        help="Index of the job (between 0 and 199).",
    )
    parser.add_argument(
        "--path_web_document_dataset_without_images",
        type=str,
        default="s3://m4-datasets/webdocs/web_document_dataset_without_images/",
        help="Path of the web document dataset without the images.",
    )
    parser.add_argument(
        "--path_image_dataset_1",
        type=str,
        default="s3://m4-datasets/webdocs/image_dataset/",
        help="Path of the dataset containing the images.",
    )
    parser.add_argument(
        "--path_image_dataset_2",
        type=str,
        default="s3://m4-datasets/webdocs/image_dataset_2/",
        help="Path of the second dataset containing the images.",
    )
    parser.add_argument(
        "--path_save_dir_web_document_dataset",
        type=str,
        default="s3://m4-datasets/webdocs/web_document_dataset/",
        help="Path to save the web document dataset with the images.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for the multiprocessing.",
    )
    return parser.parse_args()