def __call__()

in build_obelics/09_06_line_dedup.py [0:0]


    def __call__(self, example):
        """Drop paragraphs recorded as duplicated for the example's domain.

        The domain is the network location (``netloc``) of the URL stored
        under the ``"url"`` key of the JSON-encoded
        ``example["general_metadata"]``. When
        ``self.domain_to_duplicated_texts`` has no entry for that domain the
        example is returned untouched. Otherwise every non-``None`` item of
        ``example["texts"]`` is split into paragraphs on two consecutive
        newline characters, paragraphs contained in the domain's entry are
        removed, and the survivors are joined back with the same separator.

        Args:
            example: Mapping holding a ``"general_metadata"`` JSON string
                (with a ``"url"`` key) and a ``"texts"`` list whose items
                are strings or ``None``.

        Returns:
            The same ``example`` object; its ``"texts"`` list is modified
            in place. ``None`` items stay ``None``; a text whose paragraphs
            are all removed becomes the empty string, not ``None``.
        """
        url = json.loads(example["general_metadata"])["url"]
        domain = urlparse(url).netloc
        if domain not in self.domain_to_duplicated_texts:
            return example

        # Resolve the domain's entry once rather than once per paragraph.
        duplicated_texts = self.domain_to_duplicated_texts[domain]
        texts = example["texts"]
        for idx, text in enumerate(texts):
            if text is None:
                continue
            # Only elements are replaced (the list length never changes), so
            # assigning while enumerating is safe.
            texts[idx] = "\n\n".join(
                [
                    paragraph
                    for paragraph in text.split("\n\n")
                    if paragraph not in duplicated_texts
                ]
            )
        return example