in src/indexing.py [0:0]
def CompressText(txt_fname):
    """
    Generate a ref binary file storing the starting byte offset of each line.

    The text file is expected to hold one sentence per line. For every line
    the byte position at which it starts in ``txt_fname`` is recorded, and
    the resulting sequence is written as raw ``int64`` values to a sibling
    file whose name is ``txt_fname`` with ``'.txt'`` replaced by
    ``'.ref.bin64'``. An empty input produces an empty index file.

    Args:
        txt_fname: path of the text file to index; must contain ``'.txt'``.

    Raises:
        ValueError: if ``txt_fname`` contains no ``'.txt'``, because the
            derived output path would then be the input file itself and
            writing the index would destroy the text.
    """
    # Naming is kept as a plain substring replacement so that any reader
    # deriving the index name the same way still finds the file.
    fname = txt_fname.replace('.txt', '.ref.bin64')
    if fname == txt_fname:
        raise ValueError(
            "cannot derive index file name: %r does not contain '.txt'" % (txt_fname,)
        )

    offsets = []
    position = 0
    # newline='' keeps '\r\n' / '\r' untranslated (lines are still split on
    # them), and 'surrogateescape' round-trips undecodable bytes, so the
    # re-encoded length of every line equals its real length on disk.
    with open(txt_fname, 'r', encoding='utf-8', errors='surrogateescape',
              newline='') as fin:
        for line in fin:
            offsets.append(position)
            position += len(line.encode('utf-8', errors='surrogateescape'))

    np.array(offsets, dtype=np.int64).tofile(fname)