in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/09_04_get_domain_to_duplicated_texts.py [0:0]
def get_domain_to_duplicated_texts(domain_to_positions):
    """Count, per domain, the paragraphs that appear more than once.

    Every shard ``0 .. NUM_SHARDS - 1`` is loaded from
    ``PATH_WEB_DOCS_LOCAL/<idx_shard>``. For each row position listed in
    ``domain_to_positions``, the truthy entries of the row's ``"texts"`` field
    are split on double newlines and every resulting paragraph is counted under
    the domain the row is listed for.

    Args:
        domain_to_positions: Mapping ``domain -> {str(idx_shard): positions}``,
            where ``positions`` is an iterable of row indices into that shard.
            Shard keys are strings; only keys matching a shard index in
            ``range(NUM_SHARDS)`` are used.

    Returns:
        A dict ``domain -> {paragraph: count}`` restricted to paragraphs whose
        count is greater than 1. A domain listed for at least one loaded shard
        is always present, possibly mapped to an empty dict.
    """
    # Function-scope import: the file's import block is not guaranteed to
    # provide these names.
    from collections import Counter, defaultdict

    # Invert domain -> shard -> positions into shard -> domain -> positions in
    # one pass, instead of re-scanning every domain for every shard index.
    shard_to_domain_to_positions = defaultdict(dict)
    for domain, shard_to_positions in domain_to_positions.items():
        for shard_key, positions in shard_to_positions.items():
            shard_to_domain_to_positions[shard_key][domain] = positions

    domain_to_paragraph_counts = defaultdict(Counter)
    for idx_shard in tqdm(range(NUM_SHARDS)):
        ds_shard = load_from_disk(os.path.join(PATH_WEB_DOCS_LOCAL, str(idx_shard)), keep_in_memory=True)
        # `.get` avoids inserting empty entries for shards nobody references.
        domains_in_shard = shard_to_domain_to_positions.get(str(idx_shard), {})
        for domain, positions in domains_in_shard.items():
            # Indexing registers the domain even when `positions` is empty.
            paragraph_counts = domain_to_paragraph_counts[domain]
            for pos in positions:
                # Skip falsy texts (None / ""), then count every paragraph.
                # Empty paragraphs produced by the split itself are counted
                # like any other paragraph.
                paragraph_counts.update(
                    paragraph
                    for text in ds_shard[pos]["texts"]
                    if text
                    for paragraph in text.split("\n\n")
                )

    # Keep only paragraphs seen at least twice; emit plain dicts.
    return {
        domain: {paragraph: count for paragraph, count in paragraph_counts.items() if count > 1}
        for domain, paragraph_counts in domain_to_paragraph_counts.items()
    }