def get_domain_to_positions()

in obelics/processors/web_document_line_deduplication.py [0:0]


    def get_domain_to_positions(self):
        """Build and save the mapping from domain name to document positions.

        For every path in ``self.paths_subdatasets`` the sub-dataset is loaded
        from disk and only its ``"metadata"`` column is read. Each metadata
        entry is a JSON string; once decoded and passed through
        ``self.remove_empty_els_in_list``, its first element provides the
        ``"document_url"`` whose network location (``urlparse(...).netloc``)
        is used as the domain.

        The result is stored in ``self.domain_to_positions`` with the shape
        ``{domain: {path_subdataset: [row_index, ...]}}`` and dumped as JSON
        to ``self.path_save_domain_to_positions``.
        """
        logger.info(
            "Starting creating the dictionary to go from a domain name to positions in the web document dataset"
        )
        self.domain_to_positions = {}

        for path_subdatasets in tqdm(self.paths_subdatasets):
            sub_ds = load_from_disk(path_subdatasets)
            # Drop every other column before reading so only the metadata is materialized.
            sub_ds = sub_ds.remove_columns([c_n for c_n in sub_ds.column_names if c_n != "metadata"])

            # Row indices grouped by domain for this sub-dataset only.
            positions_by_domain = {}
            for idx, raw_meta in enumerate(sub_ds["metadata"]):
                # NOTE(review): presumably the decoded metadata is a list with one
                # entry per document node, some of them empty; the first non-empty
                # entry is the one carrying "document_url" -- confirm against
                # `remove_empty_els_in_list`.
                meta = self.remove_empty_els_in_list(json.loads(raw_meta))[0]
                domain = urlparse(meta["document_url"]).netloc
                # Append in place: rebuilding the list with `+ [idx]` on every row
                # is quadratic in the number of documents sharing a domain.
                positions_by_domain.setdefault(domain, []).append(idx)

            for domain, positions in positions_by_domain.items():
                self.domain_to_positions.setdefault(domain, {})[path_subdatasets] = positions

        with open(self.path_save_domain_to_positions, "w") as f:
            json.dump(self.domain_to_positions, f)
        logger.info(
            "Finished creating and saving the dictionary to go from a domain name to positions in the web document"
            " dataset"
        )