# def get_url() — in filtering/deduplication/add_dedup_info.py [0:0]


def get_url(row, dataset_name):
    """Look up the target URI stored on a row, if the dataset records one.

    Args:
        row: A record of the dataset. It is only inspected when
            ``dataset_name`` is ``"oscar"``, in which case it must support
            the nested lookup ``row["meta"]["warc_headers"]["warc-target-uri"]``.
        dataset_name: Name of the dataset the row comes from.

    Returns:
        The value stored under ``"warc-target-uri"`` for ``"oscar"`` rows;
        ``None`` for every other dataset name (including ``"the_pile"`` and
        ``"roots_en"``).

    Raises:
        KeyError: If an ``"oscar"`` row (a plain mapping) is missing one of
            the nested keys.
    """
    if dataset_name != "oscar":
        # "the_pile", "roots_en" and any unrecognised dataset yield no URL.
        return None

    warc_headers = row["meta"]["warc_headers"]
    return warc_headers["warc-target-uri"]