in build_obelics/08_02_urldedup.py [0:0]
def __call__(self, example):
    """Return whether ``example`` should be kept by the URL deduplication.

    ``example["general_metadata"]`` and ``example["metadata"]`` are both
    JSON-encoded strings. The example is rejected (``False``) when:

    * its URL is a key of ``self.dup_urls`` and its WARC filename differs
      from the value stored for that URL, or
    * the decoded ``metadata`` contains no truthy entry (per the original
      note: a document without any images).

    Otherwise the example is kept (``True``).
    """
    general_info = json.loads(example["general_metadata"])
    url = general_info["url"]
    warc_filename = general_info["warc_filename"]

    # For a URL listed in dup_urls, only the copy coming from the recorded
    # WARC file survives; every other copy is dropped.
    if url in self.dup_urls and warc_filename != self.dup_urls[url]:
        return False

    # Bonus: removes documents without any images
    return any(json.loads(example["metadata"]))