def get_domain_to_positions()

in build_obelics/09_02_get_domain_to_positions.py [0:0]


def get_domain_to_positions():
    """Index every document position by the domain of its URL, per shard.

    Iterates over shard indices ``0 .. NUM_SHARDS - 1``. Each shard is loaded
    with ``load_from_disk`` from ``PATH_WEB_DOCS_LOCAL/<idx_shard>``, and its
    ``"general_metadata"`` column is read. Every entry of that column is
    decoded with ``json.loads`` and the ``netloc`` of its ``"url"`` field is
    used as the domain.

    Returns:
        A nested dict ``{domain: {str(idx_shard): [row_idx, ...]}}``. Row
        indices are positions within the shard's metadata column, listed in
        increasing order. A shard key is present under a domain only when the
        shard contains at least one document from that domain.

    Raises:
        Whatever ``load_from_disk``, ``json.loads`` or the ``"url"`` lookup
        raise on a missing shard or malformed metadata; nothing is caught
        here.
    """
    domain_to_positions = {}

    for idx_shard in tqdm(range(NUM_SHARDS)):
        path_subdataset = os.path.join(PATH_WEB_DOCS_LOCAL, str(idx_shard))
        sub_ds = load_from_disk(path_subdataset)
        metadata_sub_ds = sub_ds["general_metadata"]

        # Group row positions by domain for this shard only. Appending in
        # place keeps this linear in the shard size; rebuilding the list with
        # `+ [idx]` on every row would copy it each time (quadratic for a
        # domain with many documents).
        shard_positions = {}
        for idx, meta in enumerate(metadata_sub_ds):
            domain = urlparse(json.loads(meta)["url"]).netloc
            shard_positions.setdefault(domain, []).append(idx)

        # Shard indices are stored as string keys in the result.
        shard_key = str(idx_shard)
        for domain, positions in shard_positions.items():
            domain_to_positions.setdefault(domain, {})[shard_key] = positions

    return domain_to_positions