def IndexTextOpen()

in src/indexing.py [0:0]


def IndexTextOpen(txt_fname):
    """Memory-map a text corpus together with its sentence start offsets.

    The offset file is expected next to the text file and is named after it,
    with the trailing ``.txt`` swapped for ``.ref.bin32`` (read as uint32)
    or, when that file does not exist, ``.ref.bin64`` (read as uint64).
    If ``txt_fname`` does not end in ``.txt`` the suffix is appended to the
    full name instead.

    Args:
        txt_fname: Path of the text file, as a string.

    Returns:
        A ``(txt_mmap, ref_mmap)`` tuple of read-only ``np.memmap`` arrays:
        the bytes of the text file (uint8) and the contents of the offset
        file (uint32 or uint64, depending on which file was found).

    Exits the process with status 1, after writing a message to stderr,
    when neither offset file exists.
    """
    txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8)

    # Swap only a trailing '.txt'. A plain str.replace() would also rewrite
    # '.txt' inside directory names, and would leave a name without '.txt'
    # unchanged, so the text file itself would be opened as the offset file.
    if txt_fname.endswith('.txt'):
        stem = txt_fname[:-len('.txt')]
    else:
        stem = txt_fname

    # Prefer the 32-bit offsets; fall back to the 64-bit variant.
    for suffix, dtype in (('.ref.bin32', np.uint32),
                          ('.ref.bin64', np.uint64)):
        fname = stem + suffix
        if os.path.isfile(fname):
            ref_mmap = np.memmap(fname, mode='r', dtype=dtype)
            return txt_mmap, ref_mmap

    # Keep the original exit status, but say why the process is terminating.
    print('ERROR: no file with sentence start offsets found for {}'
          .format(txt_fname), file=sys.stderr)
    sys.exit(1)