build_obelics/02_bis_extract_html_get_image_urls_new_rules.py [68:91]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Create the local input directory without going through a shell, so an
    # already-existing directory or a path containing spaces is not a problem.
    os.makedirs(path_save_disk_input, exist_ok=True)

    # The sync is deliberately run three times in a row.
    # NOTE(review): presumably to pick up objects missed by an earlier pass —
    # TODO confirm. All passes always run; a failing pass is only reported.
    command_sync_s3 = f"aws s3 sync {path_sync_s3} {path_save_disk_input}"
    num_sync_passes = 3
    for sync_pass in range(1, num_sync_passes + 1):
        exit_status = os.system(command_sync_s3)
        if exit_status != 0:
            logger.warning(
                f"aws s3 sync pass {sync_pass}/{num_sync_passes} returned a non-zero status: {exit_status}"
            )

    warc_dataset = load_from_disk(path_save_disk_input)
    # Add empty "html" / "html_error" columns only when both are absent; a
    # dataset that already carries either column is left untouched.
    if ("html" not in warc_dataset.column_names) and ("html_error" not in warc_dataset.column_names):
        warc_dataset = warc_dataset.add_column("html", [""] * len(warc_dataset))
        warc_dataset = warc_dataset.add_column("html_error", [""] * len(warc_dataset))
    logger.info("Finished loading the warc or previous html dataset")

    html_extractor = HtmlExtractor()
    logger.info("Starting retrieving the html")
    html_dataset = warc_dataset.map(html_extractor, num_proc=args.num_proc)
    logger.info("Finished retrieving the html")

    logger.info("Starting computing the success rate for the html extraction")
    # A row counts as a success when its "html_error" value is falsy.
    num_rows = len(html_dataset)
    num_successes = sum(1 for el in html_dataset["html_error"] if not el)
    # Guard against an empty dataset, which would otherwise raise
    # ZeroDivisionError while formatting the log message.
    success_rate = num_successes / num_rows * 100 if num_rows else 0.0
    logger.info(
        f"Success rate for the html extraction: {num_successes} /"
        f" {num_rows} ({success_rate}%)"
    )
    logger.info("Finished computing the success rate for the html extraction")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


build_obelics/02_extract_html_get_image_urls.py [75:98]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Create the local input directory without going through a shell, so an
    # already-existing directory or a path containing spaces is not a problem.
    os.makedirs(path_save_disk_input, exist_ok=True)

    # The sync is deliberately run three times in a row.
    # NOTE(review): presumably to pick up objects missed by an earlier pass —
    # TODO confirm. All passes always run; a failing pass is only reported.
    command_sync_s3 = f"aws s3 sync {path_sync_s3} {path_save_disk_input}"
    num_sync_passes = 3
    for sync_pass in range(1, num_sync_passes + 1):
        exit_status = os.system(command_sync_s3)
        if exit_status != 0:
            logger.warning(
                f"aws s3 sync pass {sync_pass}/{num_sync_passes} returned a non-zero status: {exit_status}"
            )

    warc_dataset = load_from_disk(path_save_disk_input)
    # Add empty "html" / "html_error" columns only when both are absent; a
    # dataset that already carries either column is left untouched.
    if ("html" not in warc_dataset.column_names) and ("html_error" not in warc_dataset.column_names):
        warc_dataset = warc_dataset.add_column("html", [""] * len(warc_dataset))
        warc_dataset = warc_dataset.add_column("html_error", [""] * len(warc_dataset))
    logger.info("Finished loading the warc or previous html dataset")

    html_extractor = HtmlExtractor()
    logger.info("Starting retrieving the html")
    html_dataset = warc_dataset.map(html_extractor, num_proc=args.num_proc)
    logger.info("Finished retrieving the html")

    logger.info("Starting computing the success rate for the html extraction")
    # A row counts as a success when its "html_error" value is falsy.
    num_rows = len(html_dataset)
    num_successes = sum(1 for el in html_dataset["html_error"] if not el)
    # Guard against an empty dataset, which would otherwise raise
    # ZeroDivisionError while formatting the log message.
    success_rate = num_successes / num_rows * 100 if num_rows else 0.0
    logger.info(
        f"Success rate for the html extraction: {num_successes} /"
        f" {num_rows} ({success_rate}%)"
    )
    logger.info("Finished computing the success rate for the html extraction")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -