in build_obelics/04_merge_web_docs_with_images.py
import logging
from copy import deepcopy

import datasets

logger = logging.getLogger(__name__)


def urls_to_images(web_document_dataset_without_images, image_dataset, map_url_idx, url_ban_words, num_proc):
    """Replace the URLs in the `images` column of a web-document dataset
    with the actual image bytes looked up in `image_dataset` via `map_url_idx`."""

    def retrieve_image(url):
        if url not in map_url_idx:
            return None
        if any(url_ban_word in url for url_ban_word in url_ban_words):
            return None
        # Uncomment if a process seems to be silently killed without raising any error in the `map` function.
        # It's rare, but approximately 1 in 100M pages contains many huge pictures that break things.
        # The 2M-byte threshold was chosen by looking at the distribution of picture sizes in bytes:
        # it would remove only about 1 in 1,000 pictures.
        # if len(image_dataset[map_url_idx[url]]["image"]) > 2_000_000:
        #     return None
        return {"path": None, "bytes": image_dataset[map_url_idx[url]]["image"]}

    def func_urls_to_images_urls_in_images_col(example):
        # Uncomment if a process seems to be silently killed without raising any error in the `map` function.
        # It's rare, but approximately 1 in 100M pages contains many huge pictures that break things.
        # num_images = len([1 for url in example["images"] if url in map_url_idx])
        # if num_images > 50:
        #     example["images"] = [None for url in example["images"]]
        #     return example
        example["images"] = [retrieve_image(url) if url else None for url in example["images"]]
        return example

    logger.info("Starting to replace URLs with images")
    # Cast the `images` column from URL strings to actual images, so that `map`
    # encodes the returned {"path", "bytes"} dicts as image features.
    new_features = deepcopy(web_document_dataset_without_images.features)
    new_features["images"] = datasets.Sequence(datasets.Image())
    web_document_dataset = web_document_dataset_without_images.map(
        func_urls_to_images_urls_in_images_col,
        features=new_features,
        num_proc=num_proc,
        load_from_cache_file=False,
    )
    logger.info("Finished replacing URLs with images")
    return web_document_dataset
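
# For reference, a minimal, self-contained sketch of how this function might be
# called. The toy datasets, column contents, and ban words below are illustrative
# assumptions, not taken from the actual pipeline; the only requirement carried
# over from the code above is that `map_url_idx` maps each image URL to its row
# index in `image_dataset`, whose `image` column holds raw bytes.
#
#     import io
#
#     import datasets
#     from PIL import Image as PILImage
#
#     # Tiny image dataset whose "image" column holds raw bytes (hypothetical contents).
#     buf = io.BytesIO()
#     PILImage.new("RGB", (2, 2), color="red").save(buf, format="PNG")
#     image_dataset = datasets.Dataset.from_dict(
#         {
#             "url": ["https://example.com/a.png"],
#             "image": [buf.getvalue()],
#         }
#     )
#
#     # Web-document dataset whose "images" column still contains URLs
#     # (or None for text-only positions), as the function expects.
#     web_document_dataset_without_images = datasets.Dataset.from_dict(
#         {
#             "texts": [["Some paragraph", None]],
#             "images": [[None, "https://example.com/a.png"]],
#         }
#     )
#
#     # Index mapping each URL to its row in `image_dataset`.
#     map_url_idx = {url: idx for idx, url in enumerate(image_dataset["url"])}
#
#     web_document_dataset = urls_to_images(
#         web_document_dataset_without_images,
#         image_dataset,
#         map_url_idx,
#         url_ban_words=["logo", "avatar"],  # illustrative ban words, an assumption
#         num_proc=1,
#     )
#
# After the call, each entry of `web_document_dataset[0]["images"]` should be a
# decoded image (or None) in place of the original URL, since the column was cast
# to `datasets.Sequence(datasets.Image())`.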