def get_bytes()

in filtering/deduplication/add_dedup_info.py [0:0]


def get_bytes(pairs, data):
    """
    Collect the slices of ``data`` that constitute the duplicated substrings.

    Each element of ``pairs`` is unpacked as ``(left, right)`` and used as a
    half-open slice ``data[left:right]``; the resulting slices are returned
    as a list in the same order as ``pairs``. Progress is reported through
    ``tqdm`` and two status lines are printed to stdout.

    NOTE: the extracted ranges may not be fully reliable, see:
    https://github.com/google-research/deduplicate-text-datasets/issues/24
    """
    print("Getting bytes")
    # One slice per (start, end) offset pair, in input order.
    chunks = [data[start:end] for start, end in tqdm(pairs)]
    print("byte_array size", len(chunks))
    return chunks