in obelics/processors/web_document_extractor.py [0:0]
def get_image_urls(dataset, num_proc, path_save_file_image_urls):
    """Collect the distinct image URLs of `dataset` and write them to a file.

    Every example's "images" entry is scanned, falsy entries are dropped,
    and the remaining values are de-duplicated across the whole dataset.
    The result is joined with newlines and handed to `write_file`.

    Args:
        dataset: Object exposing `map`, `column_names` and column access by
            name (presumably a Hugging Face `datasets.Dataset` -- confirm
            against the caller).
        num_proc: Forwarded to `dataset.map` as `num_proc`.
        path_save_file_image_urls: Forwarded to `write_file` as `path_file`.

    Returns:
        None. The output is whatever `write_file` does with the joined text.
    """

    def keep_non_empty_images(example):
        # Falsy entries (e.g. None or "") are discarded; the example is
        # updated in place and returned, as the mapped function must do.
        example["urls"] = list(filter(None, example["images"]))
        return example

    logger.info("Starting getting the urls of all images")

    # All pre-existing columns are removed, so only "urls" is left.
    urls_only = dataset.map(
        keep_non_empty_images,
        remove_columns=dataset.column_names,
        num_proc=num_proc,
    )

    # Flatten the per-example lists and de-duplicate in a single pass.
    # The resulting order is arbitrary, as with any set.
    distinct_urls = {url for per_example in urls_only["urls"] for url in per_example if url}
    unique_urls = list(distinct_urls)

    write_file(path_file=path_save_file_image_urls, to_write="\n".join(unique_urls))
    logger.info("Finished getting the urls of all images")