def get_pairs()

in filtering/deduplication/add_dedup_info.py [0:0]


def get_pairs(byterange):
    """
    Read the integer pairs listed after the "out" marker in a text file.

    The file is expected to hold the pairs generated by
    https://github.com/google-research/deduplicate-text-datasets#collecting-the-duplicates-together

    Every line up to and including the first line that consists solely of
    "out" (ignoring surrounding whitespace) is discarded. Each later
    non-blank line must hold two whitespace-separated integers; later
    "out" lines and blank lines are skipped.

    Args:
        byterange: Path of the text file to read.

    Returns:
        A list of ``(left, right)`` integer tuples in file order
        (presumably the start/end byte offsets of each duplicated span;
        verify against the tool's documentation).

    Raises:
        ValueError: If a non-blank line after the "out" marker does not
            consist of exactly two integer fields.
    """
    print("Getting pairs")
    pairs = []
    with open(byterange, "r") as f:
        save = False
        for line in tqdm(f):
            stripped = line.strip()
            if stripped == "out":
                save = True
                continue
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no pair; unpacking their empty split() would raise.
            if not save or not stripped:
                continue
            left, right = stripped.split()
            pairs.append((int(left), int(right)))
    print("num pairs", len(pairs))
    return pairs