in build_obelics/09_06_line_dedup.py [0:0]
def __call__(self, example):
    """Strip paragraphs flagged as duplicated for the example's domain.

    The domain is the network location (``netloc``) of the URL stored under
    the ``"url"`` key of the JSON-encoded ``example["general_metadata"]``.
    When that domain has an entry in ``self.domain_to_duplicated_texts``,
    every non-``None`` item of ``example["texts"]`` is split on blank lines
    (``"\\n\\n"``), paragraphs found in the domain's entry are dropped, and
    the remaining paragraphs are joined back with ``"\\n\\n"``.

    Args:
        example: Mapping with a ``"general_metadata"`` JSON string and a
            ``"texts"`` list whose items are strings or ``None``.

    Returns:
        The same ``example`` object. ``example["texts"]`` is updated in
        place; ``None`` items are left untouched, and a text whose
        paragraphs were all removed becomes ``""`` (not ``None``).
    """
    domain = urlparse(json.loads(example["general_metadata"])["url"]).netloc
    if domain not in self.domain_to_duplicated_texts:
        # Nothing was flagged for this domain: leave the example as is.
        return example

    # Loop-invariant lookup, hoisted so it is not repeated per paragraph.
    # NOTE(review): presumably a set (or other container with fast
    # membership tests) of paragraph strings -- confirm where it is built.
    duplicated_texts = self.domain_to_duplicated_texts[domain]

    texts = example["texts"]
    for idx, text in enumerate(texts):
        if text is None:
            # Empty slot (no text at this position): keep it unchanged.
            continue
        # Only exact, whole-paragraph matches are removed; no normalization
        # (stripping, case folding) is applied before the membership test.
        texts[idx] = "\n\n".join(
            [
                paragraph
                for paragraph in text.split("\n\n")
                if paragraph not in duplicated_texts
            ]
        )
    return example