in build_obelics/06_03_remove_image_duplicates.py [0:0]
def __call__(self, web_document):
    """Remove from a web document the images whose URL is flagged for removal.

    ``web_document["metadata"]`` is a JSON-encoded list. Every position whose
    decoded entry is not ``None`` and whose ``"src"`` value is contained in
    ``self.image_urls_to_remove`` is dropped, at the same index, from
    ``web_document["texts"]``, ``web_document["images"]`` and the metadata
    list (which is then re-encoded to JSON).

    The document is modified in place and also returned. When no entry
    matches, it is returned untouched and the metadata string is not
    re-encoded.

    NOTE(review): the three lists are assumed to be index-aligned (same
    length, same ordering) -- confirm against the producer of the documents.
    """
    metadata = json.loads(web_document["metadata"])

    # Set comprehension: no intermediate list, O(1) membership checks below.
    indices_to_remove = {
        ind
        for ind, meta in enumerate(metadata)
        if meta is not None and meta["src"] in self.image_urls_to_remove
    }
    if not indices_to_remove:
        return web_document

    def _keep(seq):
        # Keep only the elements whose position was not flagged.
        return [el for ind, el in enumerate(seq) if ind not in indices_to_remove]

    web_document["texts"] = _keep(web_document["texts"])
    web_document["images"] = _keep(web_document["images"])
    web_document["metadata"] = json.dumps(_keep(metadata))
    return web_document