# vision/m4/sourcing/data_collection/processors/web_document_extractor.py
def urls_to_images(dataset, dataset_images, map_url_idx, num_proc, some_urls_are_already_retrieved=False):
    """Replace image URLs in `dataset` with the corresponding downloaded image bytes.

    Args:
        dataset: source dataset. Its "images" column holds URLs (or, when
            `some_urls_are_already_retrieved` is True, a mix of already-retrieved
            image dicts and None, with the URLs kept in "images_urls").
        dataset_images: dataset of downloaded images; row `i` exposes the raw
            bytes under the "image" key.
        map_url_idx: mapping from an image URL to its row index in `dataset_images`.
        num_proc: number of processes passed to `Dataset.map`.
        some_urls_are_already_retrieved: when True, only the still-missing images
            are filled in from `dataset_images`.

    Returns:
        The mapped dataset, with "images" typed as `Sequence(Image())`,
        "images_urls" as `Sequence(Value("string"))`, and per-example
        "num_found" / "num_not_found" int32 counters.

    Raises:
        ValueError: if `some_urls_are_already_retrieved` is True but the dataset
            lacks the "images_urls" or "images" feature.
    """
    if some_urls_are_already_retrieved:
        if "images_urls" not in dataset.features or "images" not in dataset.features:
            raise ValueError(
                "If some urls are already retrieved, the dataset must contain the features 'images_urls' and 'images'"
            )

    def retrieve_image(url):
        # Look up the downloaded bytes for `url`; None when it was never downloaded.
        if url not in map_url_idx:
            return None
        return {"path": None, "bytes": dataset_images[map_url_idx[url]]["image"]}

    def func_urls_to_images_urls_in_images_col(example):
        # "images" currently holds URLs: preserve them in "images_urls", then
        # swap each URL for its image dict (None when missing or empty).
        # URLs are immutable strings/None, so a shallow copy is sufficient
        # (the original deep copy did the same thing, slower).
        example["images_urls"] = list(example["images"])
        num_urls = sum(url is not None and url != "" for url in example["images_urls"])
        example["images"] = [retrieve_image(url) if url else None for url in example["images"]]
        num_found = sum(img is not None for img in example["images"])
        example["num_found"] = num_found
        example["num_not_found"] = num_urls - num_found
        return example

    def func_urls_to_images_urls_in_images_urls_col(example):
        # Some images were already retrieved: keep those, only resolve the gaps.
        num_urls = sum(url is not None and url != "" for url in example["images_urls"])
        example["images"] = [
            img if img is not None else retrieve_image(url) if url else None
            for img, url in zip(example["images"], example["images_urls"])
        ]
        num_found = sum(img is not None for img in example["images"])
        example["num_found"] = num_found
        example["num_not_found"] = num_urls - num_found
        return example

    func_urls_to_images = (
        func_urls_to_images_urls_in_images_urls_col
        if some_urls_are_already_retrieved
        else func_urls_to_images_urls_in_images_col
    )

    logger.info("Starting replacing urls by images")
    # Features must be deep-copied: feature objects are mutable and shared with `dataset`.
    new_features = deepcopy(dataset.features)
    new_features["images"] = Sequence(Image())
    new_features["images_urls"] = Sequence(Value("string"))
    new_features["num_found"] = Value("int32")
    new_features["num_not_found"] = Value("int32")
    dataset = dataset.map(
        func_urls_to_images,
        features=new_features,
        num_proc=num_proc,
        load_from_cache_file=False,
    )
    logger.info("Finished replacing urls by images")
    return dataset