in obelics/processors/web_document_line_deduplication.py [0:0]
def get_domain_to_duplicated_texts(self):
    """Find, for every domain, the paragraphs that occur more than once.

    Loads the previously saved domain -> positions mapping from
    ``self.path_save_domain_to_positions``, walks the referenced
    sub-datasets, counts how often each double-newline-separated
    paragraph appears within a domain, keeps only those seen at least
    twice, and writes the result as JSON to
    ``self.path_save_domain_to_duplicated_texts``.
    """
    logger.info("Starting finding the duplicated texts for each domain")
    with open(self.path_save_domain_to_positions) as f:
        self.domain_to_positions = json.load(f)
    self.domain_to_duplicated_texts = {}
    for domain, positions in tqdm(self.domain_to_positions.items()):
        # Occurrence count of each paragraph within this domain.
        paragraph_counts = {}
        for subdataset_path, row_indices in positions.items():
            subset = load_from_disk(subdataset_path)
            # Keep only the "texts" column to avoid loading unused data.
            columns_to_drop = [col for col in subset.column_names if col != "texts"]
            subset = subset.remove_columns(columns_to_drop)
            for row_idx in row_indices:
                non_empty_texts = self.remove_empty_els_in_list(subset[row_idx]["texts"])
                # Split each text into paragraphs and count every paragraph.
                for paragraph in (p for text in non_empty_texts for p in text.split("\n\n")):
                    paragraph_counts[paragraph] = paragraph_counts.get(paragraph, 0) + 1
        # Only paragraphs appearing more than once are considered duplicates.
        self.domain_to_duplicated_texts[domain] = {
            text: count for text, count in paragraph_counts.items() if count > 1
        }
    with open(self.path_save_domain_to_duplicated_texts, "w") as f:
        json.dump(self.domain_to_duplicated_texts, f)
    logger.info("Finished finding and saving the duplicated texts for each domain")