build_obelics/10_final_cleaning.py [124:135]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Count how often each image URL ("src") appears across all web documents,
    # pickle the resulting Counter to local disk, then copy it to S3.
    logger.info("Starting saving the image urls in the web document dataset")
    # Each metadata entry is a JSON string decoding to a sequence of items;
    # falsy items are skipped (presumably placeholders for non-image nodes --
    # TODO confirm). Feeding a generator to Counter avoids materializing the
    # per-document and flattened URL lists for the whole dataset.
    img_urls = Counter(
        el["src"] for md in web_docs["metadata"] for el in json.loads(md) if el
    )

    with open(PATH_SAVE_DISK_IMG_URLS_IN_FINAL_WEB_DOCS, "wb") as f:
        pickle.dump(img_urls, f, pickle.HIGHEST_PROTOCOL)
    command_sync_s3 = (
        f"aws s3 cp {PATH_SAVE_DISK_IMG_URLS_IN_FINAL_WEB_DOCS} {PATH_SAVE_S3_IMG_URLS_IN_FINAL_WEB_DOCS}"
    )
    # os.system returns the command's exit status; a non-zero value means the
    # upload did not succeed, so report it instead of claiming success.
    exit_status = os.system(command_sync_s3)
    if exit_status != 0:
        logger.error(
            "Uploading the image urls to S3 failed (exit status %s): %s",
            exit_status,
            command_sync_s3,
        )
    else:
        logger.info("Finished saving the image urls in the web document dataset")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



build_obelics/13_final_processing.py [315:326]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Count how often each image URL ("src") appears across all web documents,
    # pickle the resulting Counter to local disk, then copy it to S3.
    logger.info("Starting saving the image urls in the web document dataset")
    # Each metadata entry is a JSON string decoding to a sequence of items;
    # falsy items are skipped (presumably placeholders for non-image nodes --
    # TODO confirm). Feeding a generator to Counter avoids materializing the
    # per-document and flattened URL lists for the whole dataset.
    img_urls = Counter(
        el["src"] for md in web_docs["metadata"] for el in json.loads(md) if el
    )

    with open(PATH_SAVE_DISK_IMG_URLS_IN_FINAL_WEB_DOCS, "wb") as f:
        pickle.dump(img_urls, f, pickle.HIGHEST_PROTOCOL)
    command_sync_s3 = (
        f"aws s3 cp {PATH_SAVE_DISK_IMG_URLS_IN_FINAL_WEB_DOCS} {PATH_SAVE_S3_IMG_URLS_IN_FINAL_WEB_DOCS}"
    )
    # os.system returns the command's exit status; a non-zero value means the
    # upload did not succeed, so report it instead of claiming success.
    exit_status = os.system(command_sync_s3)
    if exit_status != 0:
        logger.error(
            "Uploading the image urls to S3 failed (exit status %s): %s",
            exit_status,
            command_sync_s3,
        )
    else:
        logger.info("Finished saving the image urls in the web document dataset")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



