def add_duplication_info()

in filtering/deduplication/add_dedup_info.py [0:0]


    def add_duplication_info(example, idx):
        """Attach text, URL and duplication statistics to ``example``.

        The lookup tables (``dataset_raw``, ``dup_len``, ``doc_pairs``,
        ``doc_bytes``, ``included_docs``, ``clusters``) as well as
        ``dataset_name`` and ``get_url`` are taken from the enclosing scope,
        which is not part of this function.

        NOTE(review): the ``(example, idx)`` signature looks like a per-row
        callback for a dataset ``map`` call with indices enabled -- confirm
        against the caller.

        Args:
            example: Mutable mapping for one record; it is updated in place.
            idx: Index of the record, used as the key into every lookup table.

        Returns:
            The same ``example`` object with the keys ``text``,
            ``text_length``, ``url``, ``domain``, ``dup_ratio``, ``pairs``,
            ``repetitions``, ``included_in_dedup`` and ``cluster`` set.
            ``domain`` is ``None`` when there is no URL or the URL has no host
            component; ``dup_ratio`` is ``0.0`` for an empty text.
        """
        text = dataset_raw[idx]["text"]
        text_length = len(text)
        # Populate text fields before calling get_url, which receives example.
        example["text"] = text
        example["text_length"] = text_length

        url = get_url(example, dataset_name)
        example["url"] = url

        domain = None
        if url is not None:
            parts = url.split("/")
            # "scheme://host/path" splits into ["scheme:", "", "host", ...];
            # anything shorter has no host part, so avoid the IndexError.
            if len(parts) > 2:
                domain = parts[2]
        example["domain"] = domain

        # An empty document cannot contain duplicated content; guard the
        # division instead of raising ZeroDivisionError.
        example["dup_ratio"] = dup_len[idx] / text_length if text_length else 0.0
        example["pairs"] = doc_pairs[idx]
        example["repetitions"] = doc_bytes[idx]
        example["included_in_dedup"] = idx in included_docs
        example["cluster"] = clusters[idx]
        return example