def urls_to_images()

in build_obelics/04_merge_web_docs_with_images.py [0:0]
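Replaces every URL in the `images` column of a web-document dataset with the bytes of the corresponding downloaded image, dropping URLs that are absent from `map_url_idx` or that contain one of the `url_ban_words`.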


# Module-level imports and logger setup, assumed here so the snippet is self-contained
import logging
from copy import deepcopy

import datasets

logger = logging.getLogger(__name__)


def urls_to_images(web_document_dataset_without_images, image_dataset, map_url_idx, url_ban_words, num_proc):
    def retrieve_image(url):
        # Skip URLs that were never downloaded or whose address contains a banned word
        if url not in map_url_idx:
            return None
        if any(url_ban_word in url for url_ban_word in url_ban_words):
            return None
        # Uncomment if a process seems to be silently killed without raising any error in the `map` function.
        # It's rare, but approximately 1 in 100M pages contains many huge pictures that break things.
        # The 2M-byte threshold was chosen by looking at the distribution of picture sizes in bytes:
        # it would remove only about 1 in 1,000 pictures.
        # if len(image_dataset[map_url_idx[url]]["image"]) > 2_000_000:
        #     return None
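        # A dict with "path" and "bytes" keys is the encoded form understood by `datasets.Image`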
        image = {"path": None, "bytes": image_dataset[map_url_idx[url]]["image"]}
        return image

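    # Replace every URL in an example's `images` column with its image dict (or None)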
    def func_urls_to_images_urls_in_images_col(example):
        # Uncomment if a process seems to be silently killed without raising any error in the `map` function.
        # It's rare, but approximately 1 in 100M pages contains many huge pictures that break things.
        # num_images = len([1 for url in example["images"] if url in map_url_idx])
        # if num_images > 50:
        #     example["images"] = [None for url in example["images"]]
        #     return example
        example["images"] = [retrieve_image(url) if url else None for url in example["images"]]
        return example

    logger.info("Starting replacing urls by images")
    new_features = deepcopy(web_document_dataset_without_images.features)
    new_features["images"] = datasets.Sequence(datasets.Image())
    web_document_dataset = web_document_dataset_without_images.map(
        func_urls_to_images_urls_in_images_col,
        features=new_features,
        num_proc=num_proc,
        load_from_cache_file=False,
    )
    logger.info("Finished replacing urls by images")
    return web_document_dataset
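
A minimal usage sketch, assuming the image dataset stores the raw bytes in an `image` column (as the code above implies) and the source URL in a `url` column; the dataset paths, ban words, and process count below are placeholders:

from datasets import load_from_disk

web_docs = load_from_disk("path/to/web_document_dataset_without_images")  # hypothetical path
images = load_from_disk("path/to/image_dataset")  # hypothetical path

# Map each downloaded URL to its row index in `images`
# (the `url` column name is an assumption)
map_url_idx = {url: idx for idx, url in enumerate(images["url"])}

web_docs_with_images = urls_to_images(
    web_document_dataset_without_images=web_docs,
    image_dataset=images,
    map_url_idx=map_url_idx,
    url_ban_words=["logo", "avatar"],  # hypothetical ban list
    num_proc=32,
)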