in build_obelics/03_dl_images_create_dataset.py [0:0]
def get_args():
    """Parse the command-line arguments of the image download / dataset creation script.

    The options are declared as an ordered table of ``(name, add_argument kwargs)``
    pairs and registered in that order, so the generated ``--help`` output lists
    them exactly as declared below.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()`` (read from
        ``sys.argv``), with one attribute per option: ``idx_job``, ``U``,
        ``download_only``, ``path_image_urls``, ``path_save_dir_downloaded_images``,
        ``thread_count``, ``number_sample_per_shard``, ``image_size``,
        ``resize_mode``, ``num_proc``, ``path_save_dir_tmp_datasets_images``,
        ``path_save_dir_dataset_images`` and ``path_save_file_map_url_idx``.
    """
    # Ordered on purpose: argparse shows options in registration order.
    option_table = (
        # Only positional (hence required) argument.
        (
            "idx_job",
            {"type": int, "help": "Index of the job (between 0 and 199)."},
        ),
        # Integer-valued switches; both default to 0.
        (
            "--U",
            {
                "type": int,
                "default": 0,
                "help": "Indicate if the download of the images is already done.",
            },
        ),
        (
            "--download_only",
            {
                "type": int,
                "default": 0,
                "help": "Indicate if we only want to download the images, and not create the image dataset.",
            },
        ),
        (
            "--path_image_urls",
            {
                "type": str,
                "default": "s3://m4-datasets/webdocs/image_urls_2/",
                "help": "The path of the file containing the urls of all images.",
            },
        ),
        (
            "--path_save_dir_downloaded_images",
            {
                "type": str,
                "default": "/scratch/storage_hugo/downloaded_images",
                "help": "The directory to save all images.",
            },
        ),
        (
            "--thread_count",
            {
                "type": int,
                "default": 128,
                "help": "The number of threads used for downloading the pictures.",
            },
        ),
        (
            "--number_sample_per_shard",
            {
                "type": int,
                "default": 10_000,
                "help": "The number of images that will be downloaded in one shard.",
            },
        ),
        (
            "--image_size",
            {
                "type": int,
                "default": 256,
                "help": "The size to resize image to. Not used if --resize_mode=no.",
            },
        ),
        (
            "--resize_mode",
            {
                "type": str,
                "default": "no",
                "help": "The way to resize pictures, can be no, border or keep_ratio.",
            },
        ),
        # The default is computed on every call, not once at import time.
        (
            "--num_proc",
            {
                "type": int,
                "default": cpu_count(),
                "help": "Number of processes to use for the multiprocessing.",
            },
        ),
        (
            "--path_save_dir_tmp_datasets_images",
            {
                "type": str,
                "default": "/scratch/storage_hugo/tmp_datasets_images",
                "help": (
                    "The directory to save the temporary datasets containing all images (useful for the code but can be"
                    " forgotten after)."
                ),
            },
        ),
        (
            "--path_save_dir_dataset_images",
            {
                "type": str,
                "default": "s3://m4-datasets/webdocs/image_dataset_2/",
                "help": "The directory to save the dataset containing all images.",
            },
        ),
        (
            "--path_save_file_map_url_idx",
            {
                "type": str,
                "default": "s3://m4-datasets/webdocs/map_url_idx_2/",
                "help": "The file to save the map to go from urls to indices of the dataset containing all images.",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="Download images and create a dataset containing them.")
    for option_name, option_kwargs in option_table:
        cli.add_argument(option_name, **option_kwargs)
    return cli.parse_args()