def get_domain_to_duplicated_texts()

in vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/09_04_get_domain_to_duplicated_texts.py [0:0]


def get_domain_to_duplicated_texts(domain_to_positions):
    """Count, per domain, the paragraphs that occur more than once across all shards.

    Each shard ``0 .. NUM_SHARDS - 1`` is loaded from
    ``PATH_WEB_DOCS_LOCAL/<idx_shard>``. For every position listed for a domain in
    that shard, the non-empty entries of the example's ``"texts"`` field are split
    on ``"\\n\\n"`` and each resulting paragraph is counted for that domain.

    Args:
        domain_to_positions: Mapping ``domain -> {shard key -> positions}``, where
            a shard key is ``str(idx_shard)`` and ``positions`` is an iterable of
            indices into the dataset loaded for that shard. Shard keys that do not
            name one of the ``NUM_SHARDS`` shards are ignored.

    Returns:
        A dict ``domain -> {paragraph: count}`` keeping only paragraphs whose count
        is greater than 1. Every domain that has an entry for at least one of the
        ``NUM_SHARDS`` shards is present, possibly mapped to an empty dict.
    """
    from collections import Counter

    # Invert the mapping in a single pass over its entries rather than probing
    # every domain once per shard. Iterating domains in their original order
    # keeps the per-shard domain order identical to a per-shard filter.
    shard_to_domain_to_positions = {str(idx_shard): {} for idx_shard in range(NUM_SHARDS)}
    for domain, shard_to_positions in domain_to_positions.items():
        for shard_key, positions in shard_to_positions.items():
            domain_positions = shard_to_domain_to_positions.get(shard_key)
            if domain_positions is not None:
                domain_positions[domain] = positions

    domain_to_paragraph_counts = {}

    for idx_shard in tqdm(range(NUM_SHARDS)):
        shard_key = str(idx_shard)
        ds_shard = load_from_disk(os.path.join(PATH_WEB_DOCS_LOCAL, shard_key), keep_in_memory=True)

        for domain, positions in shard_to_domain_to_positions[shard_key].items():
            if domain not in domain_to_paragraph_counts:
                domain_to_paragraph_counts[domain] = Counter()
            paragraph_counts = domain_to_paragraph_counts[domain]

            for pos in positions:
                # Stream the paragraphs straight into the counter; falsy entries
                # of "texts" are skipped. Empty paragraphs produced by
                # consecutive separators are counted like any other paragraph.
                paragraph_counts.update(
                    paragraph
                    for text in ds_shard[pos]["texts"]
                    if text
                    for paragraph in text.split("\n\n")
                )

    # Keep only paragraphs seen more than once; return plain dicts.
    return {
        domain: {paragraph: count for paragraph, count in paragraph_counts.items() if count > 1}
        for domain, paragraph_counts in domain_to_paragraph_counts.items()
    }