def get_domain_to_duplicated_texts()

in obelics/processors/web_document_line_deduplication.py [0:0]


    def get_domain_to_duplicated_texts(self):
        """Find, for each domain, the paragraphs that occur more than once.

        Reads the domain -> {subdataset path -> row indices} mapping from
        ``self.path_save_domain_to_positions`` (JSON), counts paragraph
        occurrences per domain (texts are split on blank lines, ``"\\n\\n"``),
        keeps only paragraphs with count > 1, stores the result in
        ``self.domain_to_duplicated_texts`` and dumps it as JSON to
        ``self.path_save_domain_to_duplicated_texts``.
        """
        from collections import Counter  # stdlib; local import keeps the file's import block untouched

        logger.info("Starting finding the duplicated texts for each domain")
        with open(self.path_save_domain_to_positions) as f:
            self.domain_to_positions = json.load(f)

        self.domain_to_duplicated_texts = {}

        for domain in tqdm(self.domain_to_positions):
            paragraph_counts = Counter()
            positions = self.domain_to_positions[domain]

            for path_subdataset, idx_pos in positions.items():
                sub_ds = load_from_disk(path_subdataset)
                # Keep only the "texts" column so row access does not decode unused columns
                sub_ds = sub_ds.remove_columns([c_n for c_n in sub_ds.column_names if c_n != "texts"])

                for idx in idx_pos:
                    tot_texts = self.remove_empty_els_in_list(sub_ds[idx]["texts"])
                    # Duplication is detected at paragraph granularity, not whole texts
                    for text in tot_texts:
                        paragraph_counts.update(text.split("\n\n"))

            # Retain only paragraphs seen more than once within this domain
            self.domain_to_duplicated_texts[domain] = {
                text: count for text, count in paragraph_counts.items() if count > 1
            }

        with open(self.path_save_domain_to_duplicated_texts, "w") as f:
            json.dump(self.domain_to_duplicated_texts, f)
        logger.info("Finished finding and saving the duplicated texts for each domain")