in filtering/deduplication/add_dedup_info.py [0:0]
def get_url(row, dataset_name):
    """Look up the target URI recorded for ``row``, if the dataset has one.

    Args:
        row: A record of the dataset. For ``"oscar"`` it is indexed as
            ``row["meta"]["warc_headers"]["warc-target-uri"]``.
        dataset_name: Name of the dataset the record belongs to.

    Returns:
        The value stored under ``"warc-target-uri"`` when ``dataset_name``
        is ``"oscar"``; ``None`` for every other dataset name (this
        includes ``"the_pile"`` and ``"roots_en"``).

    Raises:
        KeyError: If an ``"oscar"`` row lacks one of the expected keys.
    """
    if dataset_name != "oscar":
        # "the_pile", "roots_en" and any unrecognised name: no URL is returned.
        return None

    warc_headers = row["meta"]["warc_headers"]
    return warc_headers["warc-target-uri"]