in obelics/processors/web_document_line_deduplication.py [0:0]
def get_domain_to_positions(self):
    """Build and save the mapping from domain name to document positions.

    Each path in ``self.paths_subdatasets`` is loaded from disk, the domain
    (the ``netloc`` of ``document_url``) is extracted for every row of its
    ``metadata`` column, and row indices are grouped by domain.

    The result is stored in ``self.domain_to_positions`` with the shape
    ``{domain: {path_subdataset: [row_index, ...]}}`` and is written as JSON
    to ``self.path_save_domain_to_positions``.
    """
    logger.info(
        "Starting creating the dictionary to go from a domain name to positions in the web document dataset"
    )
    self.domain_to_positions = {}
    for path_subdatasets in tqdm(self.paths_subdatasets):
        sub_ds = load_from_disk(path_subdatasets)
        # Only the "metadata" column is needed; drop the others before reading it.
        sub_ds = sub_ds.remove_columns([c_n for c_n in sub_ds.column_names if c_n != "metadata"])
        # Each metadata entry is JSON-encoded and must be decoded first.
        metadata_sub_ds = [json.loads(meta) for meta in sub_ds["metadata"]]
        # NOTE(review): presumably the decoded metadata is a list with empty
        # placeholders and the first remaining element carries the
        # document-level fields -- confirm against remove_empty_els_in_list.
        metadata_sub_ds = [self.remove_empty_els_in_list(meta)[0] for meta in metadata_sub_ds]
        domains = [urlparse(meta["document_url"]).netloc for meta in metadata_sub_ds]

        # Group row indices by domain. Appending in place avoids re-copying
        # the whole list for every row (the previous `get(...) + [idx]` form
        # was quadratic in the number of documents per domain).
        new_domain_to_pos = {}
        for idx, domain in enumerate(domains):
            new_domain_to_pos.setdefault(domain, []).append(idx)

        # Merge this sub-dataset's positions into the global mapping; a path
        # seen again overwrites its previous entry for that domain.
        for domain, positions in new_domain_to_pos.items():
            self.domain_to_positions.setdefault(domain, {})[path_subdatasets] = positions

    with open(self.path_save_domain_to_positions, "w") as f:
        json.dump(self.domain_to_positions, f)
    logger.info(
        "Finished creating and saving the dictionary to go from a domain name to positions in the web document"
        " dataset"
    )