def IndexTextOpen()

in source/lib/indexing.py [0:0]


def _companion_path(txt_fname, suffix):
    """Return the companion file name for *txt_fname*, or None.

    The companion name is built by substituting *suffix* for '.txt' in the
    final path component only, so directory names that happen to contain
    '.txt' are left untouched.  None is returned when the final component
    has no '.txt': the substitution would then yield the text file itself.
    """
    base = os.path.basename(txt_fname)
    if '.txt' not in base:
        return None
    # basename is always a suffix of the path, so slicing keeps the
    # directory part byte-for-byte as the caller wrote it.
    dir_part = txt_fname[:len(txt_fname) - len(base)]
    return dir_part + base.replace('.txt', suffix)


def _load_meta(fname):
    """Parse a meta file made of '<lang> <count>' lines.

    Returns a list of {'lang': <lang>, 'n': <running total of counts>}
    dicts, one per line.  Exits the process with status 1 when a line does
    not have exactly two whitespace-separated fields.
    """
    entries = []
    total = 0
    with open(fname, 'r') as fp:
        for line in fp:
            fields = line.strip().split()
            if len(fields) != 2:
                print('ERROR: format error in meta file')
                sys.exit(1)
            total += int(fields[1])
            entries.append({'lang': fields[0], 'n': total})
    return entries


def IndexTextOpen(txt_fname):
    """Memory-map a text corpus together with its companion index files.

    Companion files are looked up next to *txt_fname*, with '.txt' in the
    file name replaced by:
      '.ref.bin32' / '.ref.bin64'  sentence start offsets (uint32 is tried
                                   first, then uint64); one is required
      '.nw.bin8'                   word counts (uint8); optional
      '.meta'                      '<lang> <count>' lines; optional

    Args:
        txt_fname: path of the text file (str or os.PathLike).

    Returns:
        Tuple (txt_mmap, ref_mmap, nbw_mmap, M):
          txt_mmap  read-only uint8 memmap of the text file
          ref_mmap  read-only uint32/uint64 memmap of the offsets file
          nbw_mmap  read-only uint8 memmap of the word counts, or None
          M         list of {'lang', 'n'} dicts where 'n' is the cumulative
                    count up to and including that language, or None

    Raises:
        SystemExit: status 1 if no offsets file is found or the meta file
            is malformed.
    """
    txt_fname = os.fspath(txt_fname)
    print('Reading text corpus')
    print(' - texts: {:s}'.format(txt_fname))
    txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8)

    ref_mmap = None
    for suffix, bits, dtype in (('.ref.bin32', 32, np.uint32),
                                ('.ref.bin64', 64, np.uint64)):
        fname = _companion_path(txt_fname, suffix)
        if fname is not None and os.path.isfile(fname):
            print(' - sentence start offsets ({:d} bit): {}'
                  .format(bits, fname))
            ref_mmap = np.memmap(fname, mode='r', dtype=dtype)
            break
    if ref_mmap is None:
        print('ERROR: no file with sentence start offsets found')
        sys.exit(1)
    print(' - found {:d} sentences'.format(ref_mmap.shape[0]))

    nbw_mmap = None
    fname = _companion_path(txt_fname, '.nw.bin8')
    if fname is not None and os.path.isfile(fname):
        print(' - word counts: {:s}'.format(fname))
        nbw_mmap = np.memmap(fname, mode='r', dtype=np.uint8)

    M = None
    fname = _companion_path(txt_fname, '.meta')
    if fname is not None and os.path.isfile(fname):
        print(' - metafile: {:s}'.format(fname))
        M = _load_meta(fname)
        langs = ''.join(' {:s}'.format(entry['lang']) for entry in M)
        print(' - found {:d} languages:{}'.format(len(M), langs))

    return txt_mmap, ref_mmap, nbw_mmap, M