build_obelics/02_bis_extract_html_get_image_urls_new_rules.py [14:41]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
)


# Set up root logging for the script: every record is prefixed with a
# timestamp, its level, and the name of the emitting logger.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger; its level is pinned to INFO explicitly so it does not
# depend on whatever level the root logger ends up with.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def get_args():
    parser = argparse.ArgumentParser(
        description="Extract html from warc files, simplify them, get the urls of the images."
    )
    parser.add_argument(
        "idx_job",
        type=int,
        help="Index of the job (between 0 and 199).",
    )
    parser.add_argument(
        "--path_warc_dataset",
        type=str,
        default="s3://m4-datasets/webdocs/warc_dataset/",
        help="Path of the dataset containing the warc files to retrieve the html.",
    )
    parser.add_argument(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



build_obelics/02_extract_html_get_image_urls.py [13:40]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
)


# Set up root logging for the script: every record is prefixed with a
# timestamp, its level, and the name of the emitting logger.
logging.basicConfig(
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger; its level is pinned to INFO explicitly so it does not
# depend on whatever level the root logger ends up with.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def get_args():
    parser = argparse.ArgumentParser(
        description="Extract html from warc files, simplify them, get the urls of the images."
    )
    parser.add_argument(
        "idx_job",
        type=int,
        help="Index of the job (between 0 and 199).",
    )
    parser.add_argument(
        "--path_warc_dataset",
        type=str,
        default="s3://m4-datasets/webdocs/warc_dataset/",
        help="Path of the dataset containing the warc files to retrieve the html.",
    )
    parser.add_argument(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



