in build_obelics/02_extract_html_get_image_urls.py [0:0]
def get_args(argv=None) -> argparse.Namespace:
    """Parse the command-line arguments of the html-extraction job.

    Args:
        argv: Optional sequence of argument strings to parse instead of the
            process command line. When left as ``None`` (the default),
            ``argparse`` falls back to ``sys.argv[1:]``, which is the
            behavior existing callers of ``get_args()`` rely on.

    Returns:
        The parsed namespace with the attributes:

        * ``idx_job`` (int, positional): index of the job. The help text
          advertises the range 0-199, but the range is not enforced here.
        * ``path_warc_dataset`` (str): location of the dataset holding the
          warc files.
        * ``path_save_file_image_urls`` (str): file receiving the image urls.
        * ``path_save_dir_html_dataset`` (str): directory receiving the html
          dataset.
        * ``num_proc`` (int): number of processes for the multiprocessing;
          defaults to ``cpu_count()`` evaluated when the parser is built.
    """
    parser = argparse.ArgumentParser(
        description="Extract html from warc files, simplify them, get the urls of the images."
    )
    parser.add_argument(
        "idx_job",
        type=int,
        help="Index of the job (between 0 and 199).",
    )
    parser.add_argument(
        "--path_warc_dataset",
        type=str,
        default="s3://m4-datasets/webdocs/warc_dataset/",
        help="Path of the dataset containing the warc files to retrieve the html.",
    )
    parser.add_argument(
        "--path_save_file_image_urls",
        type=str,
        default="/scratch/storage_hugo/image_urls.txt",
        help="The file to save the urls of all images.",
    )
    parser.add_argument(
        "--path_save_dir_html_dataset",
        type=str,
        default="s3://m4-datasets/webdocs/html_dataset/",
        help="The directory to save the html dataset.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for the multiprocessing.",
    )
    # Forwarding ``argv`` keeps the default CLI behavior (None -> sys.argv[1:])
    # while letting tests and other entry points supply their own arguments.
    return parser.parse_args(argv)