in filtering/deduplication/add_dedup_info.py [0:0]
def get_pairs(byterange):
    """
    Parse the integer pairs listed after the "out" marker in a byte-range file.

    The file is expected to be the output of the "collect" step described at
    https://github.com/google-research/deduplicate-text-datasets#collecting-the-duplicates-together
    Everything before the first line that is exactly "out" (ignoring
    surrounding whitespace) is discarded. Each later non-blank line must hold
    two whitespace-separated integers.

    Args:
        byterange: Path of the text file to read.

    Returns:
        A list of ``(left, right)`` integer tuples in file order. Presumably
        these are the start/end byte offsets of duplicated spans -- confirm
        against the tool's documentation.

    Raises:
        ValueError: If a non-blank line after the marker does not consist of
            exactly two integer fields.
    """
    print("Getting pairs")
    pairs = []
    with open(byterange, "r") as f:
        save = False
        for line in tqdm(f):
            # Strip once; the stripped text drives every check below.
            line = line.strip()
            if line == "out":
                # The marker itself is never parsed as a pair.
                save = True
                continue
            # Ignore the preamble, and tolerate blank lines in the payload
            # (e.g. a trailing empty line) instead of failing on the unpack.
            if not save or not line:
                continue
            left, right = line.split()
            pairs.append((int(left), int(right)))
    print("num pairs", len(pairs))
    return pairs