in build_obelics/13_final_processing.py [0:0]
def remove_duplicated_images(texts, images, metadata):
    """Drop repeated images, keeping the first occurrence of each URL.

    ``metadata`` is scanned in order. Falsy entries are ignored; every
    other entry is looked up by its ``"src"`` key. The first time a given
    ``"src"`` value appears it is kept, and the position of each later
    entry with the same value is marked for removal.

    The marked positions are then dropped from ``texts``, ``images`` and
    ``metadata`` alike (the three are presumably parallel sequences of the
    same length -- confirm against the caller).

    Args:
        texts: Sequence filtered by position.
        images: Sequence filtered by position.
        metadata: Sequence whose truthy entries support ``entry["src"]``.

    Returns:
        A ``(texts, images, metadata)`` tuple. When no duplicate is found
        the three input objects are returned as-is; otherwise three new
        lists without the duplicated positions are returned.

    Raises:
        KeyError: If a truthy metadata entry has no ``"src"`` key.
    """
    seen_urls = set()
    duplicate_positions = set()

    for position, entry in enumerate(metadata):
        if not entry:
            # Falsy metadata entries carry no URL and are never duplicates.
            continue
        src = entry["src"]
        if src in seen_urls:
            duplicate_positions.add(position)
        else:
            seen_urls.add(src)

    if not duplicate_positions:
        # Nothing to drop: hand back the caller's own objects untouched.
        return texts, images, metadata

    def _without_duplicates(sequence):
        return [
            item
            for position, item in enumerate(sequence)
            if position not in duplicate_positions
        ]

    return (
        _without_duplicates(texts),
        _without_duplicates(images),
        _without_duplicates(metadata),
    )