def remove_duplicated_images()

in build_obelics/13_final_processing.py [0:0]


def remove_duplicated_images(texts, images, metadata):
    """Drop positions whose image URL has already appeared earlier.

    ``metadata`` is scanned in order. Falsy entries are skipped; for every
    other entry its ``"src"`` value is treated as the image URL. The first
    position carrying a given URL is kept, and every later position with
    the same URL is removed from ``texts``, ``images`` and ``metadata``
    alike (the three are filtered by the same set of indices).

    Args:
        texts: Sequence filtered by index alongside ``metadata``.
        images: Sequence filtered by index alongside ``metadata``.
        metadata: Sequence whose truthy entries support ``entry["src"]``.

    Returns:
        A ``(texts, images, metadata)`` tuple. When no duplicate is found
        the very same input objects are returned; otherwise three new
        lists without the duplicated positions are returned.

    Raises:
        KeyError: If a truthy metadata entry is a mapping without ``"src"``.
    """
    seen_urls = set()
    duplicate_positions = set()

    for position, entry in enumerate(metadata):
        if not entry:
            # Nothing to compare for this position.
            continue
        src = entry["src"]
        if src in seen_urls:
            duplicate_positions.add(position)
        else:
            seen_urls.add(src)

    # Nothing to drop: hand the inputs back untouched (no copies made).
    if not duplicate_positions:
        return texts, images, metadata

    def _without_duplicates(sequence):
        # Keep only the items whose index was not flagged as a repeat.
        return [
            item
            for position, item in enumerate(sequence)
            if position not in duplicate_positions
        ]

    return (
        _without_duplicates(texts),
        _without_duplicates(images),
        _without_duplicates(metadata),
    )