def __call__()

in build_obelics/08_02_urldedup.py [0:0]


    def __call__(self, example):
        """Return a boolean verdict for ``example``.

        ``example["general_metadata"]`` and ``example["metadata"]`` are JSON
        strings. The result is ``False`` when either:

        * the document's URL is a key of ``self.dup_urls`` and the value
          stored there differs from the document's ``warc_filename``; or
        * the decoded ``metadata`` contains no truthy entry.

        Otherwise the result is ``True``.
        """
        info = json.loads(example["general_metadata"])
        url = info["url"]
        warc_filename = info["warc_filename"]

        # For a URL listed in dup_urls, only the document whose WARC file
        # matches the recorded one passes this check.
        if url in self.dup_urls and warc_filename != self.dup_urls[url]:
            return False

        # Bonus: removes documents without any images (i.e. documents whose
        # metadata entries are all empty/falsy).
        return any(json.loads(example["metadata"]))