def __call__()

in build_obelics/06_03_remove_image_duplicates.py [0:0]


    def __call__(self, web_document):
        """Strip the nodes of a web document whose image is flagged for removal.

        The ``"metadata"`` field is decoded from JSON into a list. Every
        position holding a non-null entry whose ``"src"`` value belongs to
        ``self.image_urls_to_remove`` is dropped from ``"texts"``,
        ``"images"`` and ``"metadata"`` alike (the latter is re-encoded as
        JSON).

        The document is modified in place and returned. When no position
        matches, it is returned without any field being rewritten.
        """
        node_metadata = json.loads(web_document["metadata"])

        positions_to_drop = {
            position
            for position, node in enumerate(node_metadata)
            if node is not None and node["src"] in self.image_urls_to_remove
        }

        # Nothing flagged: leave every field (including the raw metadata
        # string) exactly as it was received.
        if not positions_to_drop:
            return web_document

        def surviving(items):
            # Keep the entries whose position was not flagged above.
            return [
                item
                for position, item in enumerate(items)
                if position not in positions_to_drop
            ]

        web_document["texts"] = surviving(web_document["texts"])
        web_document["images"] = surviving(web_document["images"])
        web_document["metadata"] = json.dumps(surviving(node_metadata))

        return web_document