in filtering/deduplication/add_dedup_info.py [0:0]
def add_duplication_info(example, idx):
    """Attach deduplication metadata for row ``idx`` to ``example``.

    The function reads several names from the enclosing scope
    (``dataset_raw``, ``dataset_name``, ``get_url``, ``dup_len``,
    ``doc_pairs``, ``doc_bytes``, ``included_docs``, ``clusters``) and
    writes the following keys into ``example``:

    * ``text`` / ``text_length`` -- the raw text of row ``idx`` and its
      ``len()``.
    * ``url`` -- whatever ``get_url(example, dataset_name)`` returns.
    * ``domain`` -- the third ``"/"``-separated piece of the URL, or
      ``None`` when the URL is ``None`` or has no such piece.
    * ``dup_ratio`` -- ``dup_len[idx] / text_length`` (``0.0`` for an
      empty text).
    * ``pairs``, ``repetitions``, ``cluster`` -- ``doc_pairs[idx]``,
      ``doc_bytes[idx]`` and ``clusters[idx]`` respectively.
    * ``included_in_dedup`` -- whether ``idx`` is in ``included_docs``.

    NOTE(review): the ``(example, idx)`` signature looks like a
    ``datasets.Dataset.map(..., with_indices=True)`` callback -- confirm
    against the caller.

    Args:
        example: Mutable mapping for the current row; updated in place.
        idx: Index of the row, used to look up every per-document table.

    Returns:
        The same ``example`` object, with the keys above filled in.
    """
    text = dataset_raw[idx]["text"]
    text_length = len(text)
    example["text"] = text
    example["text_length"] = text_length

    # get_url receives the example after "text"/"text_length" are set,
    # exactly as in the original statement order.
    url = get_url(example, dataset_name)
    example["url"] = url

    # For "scheme://host/path" the pieces are ["scheme:", "", "host", ...],
    # so index 2 is the host part. URLs with fewer than three pieces
    # (empty string, bare host name) have no such part; report None
    # instead of raising IndexError.
    domain = None
    if url is not None:
        pieces = url.split("/")
        if len(pieces) > 2:
            domain = pieces[2]
    example["domain"] = domain

    # Look the duplicated length up first so a missing idx still fails
    # loudly, then avoid dividing by zero for empty documents.
    duplicated_length = dup_len[idx]
    example["dup_ratio"] = duplicated_length / text_length if text_length else 0.0

    example["pairs"] = doc_pairs[idx]
    example["repetitions"] = doc_bytes[idx]
    example["included_in_dedup"] = idx in included_docs
    example["cluster"] = clusters[idx]
    return example