in src/indexing.py [0:0]
def IndexTextOpen(txt_fname):
    """Memory-map a text corpus and its companion sentence-offset file.

    The text file is mapped read-only as raw bytes (``np.uint8``).  The
    offsets file is looked up next to it, under the same file name with
    ``.txt`` replaced by ``.ref.bin32`` (mapped as ``np.uint32``) or, if
    that file does not exist, ``.ref.bin64`` (mapped as ``np.uint64``).

    Args:
        txt_fname: Path of the text corpus.  Its file name must contain
            ``.txt`` so that the offsets file name can be derived from it.

    Returns:
        A ``(txt_mmap, ref_mmap)`` tuple of read-only ``np.memmap`` arrays:
        the corpus bytes and the sentence start offsets.

    Raises:
        SystemExit: With status 1 when no offsets file can be located; a
            diagnostic is written to stderr first.
    """
    txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8)

    # Derive the companion names from the file name only: replacing on the
    # whole path would also rewrite any directory that contains '.txt'.
    dirname, basename = os.path.split(txt_fname)
    tried = []
    # Without '.txt' in the name the replacement would be a no-op and the
    # text file itself would be picked up as its own offsets file.
    if '.txt' in basename:
        # The 32-bit variant takes precedence over the 64-bit one.
        for suffix, dtype in (('.ref.bin32', np.uint32),
                              ('.ref.bin64', np.uint64)):
            fname = os.path.join(dirname, basename.replace('.txt', suffix))
            if os.path.isfile(fname):
                ref_mmap = np.memmap(fname, mode='r', dtype=dtype)
                return txt_mmap, ref_mmap
            tried.append(fname)

    sys.stderr.write(
        'ERROR: no file with sentence start offsets found for {} '
        '(tried: {})\n'.format(txt_fname, ', '.join(tried) or 'nothing')
    )
    sys.exit(1)