in filtering/deduplication/add_dedup_info.py [0:0]
def get_bytes(pairs, data):
    """
    Collect the slice of `data` delimited by each `(left, right)` pair.

    Per the original author's note, these slices are the bytes constituting the
    duplicated substrings. NOTE: there seems to be something off here, see:
    https://github.com/google-research/deduplicate-text-datasets/issues/24

    Args:
        pairs: iterable of `(left, right)` offsets; each pair is applied as
            `data[left:right]`.
        data: sliceable buffer the offsets index into (presumably a bytes-like
            object -- TODO confirm against the caller).

    Returns:
        A list holding one slice per pair, in iteration order.
    """
    print("Getting bytes")
    # tqdm only wraps the iteration to report progress; it does not alter the pairs.
    chunks = [data[start:end] for start, end in tqdm(pairs)]
    print("byte_array size", len(chunks))
    return chunks