def CompressText()

in src/indexing.py [0:0]


def CompressText(txt_fname):
    """
    Generate a binary reference file storing the starting byte offset of
    each line (sentence) of a text file.

    The offsets are written as a flat array of int64 values (native byte
    order, no header) via ``numpy.ndarray.tofile``. The i-th value is the
    byte position in ``txt_fname`` at which line i begins, so it can be
    passed straight to ``file.seek`` on the file opened in binary mode.

    Args:
        txt_fname: path of the text file to index. The output path is this
            path with every '.txt' replaced by '.ref.bin64'; if the path
            contains no '.txt', '.ref.bin64' is appended instead.
    """
    fname = txt_fname.replace('.txt', '.ref.bin64')
    if fname == txt_fname:
        # No '.txt' in the path: writing to `fname` would overwrite the
        # input text with the offset table, so derive a distinct name.
        fname = txt_fname + '.ref.bin64'

    offsets = []
    position = 0
    # Read raw bytes: text mode would translate '\r\n' to '\n' and (with
    # errors='ignore') drop undecodable bytes, so lengths measured on the
    # decoded text would not match the real positions on disk.
    # Lines are delimited by b'\n' only.
    with open(txt_fname, 'rb') as fin:
        for line in fin:
            offsets.append(position)
            position += len(line)

    np.array(offsets, dtype=np.int64).tofile(fname)