in source/lib/indexing.py [0:0]
def _companion_path(txt_fname, ext):
    """Return the path of the side file *ext* that belongs to *txt_fname*.

    Only the final path component is rewritten: every '.txt' in the file name
    is replaced by *ext*.  Directory components are left untouched, so a
    directory whose name happens to contain '.txt' is not renamed.  When the
    file name contains no '.txt', *ext* is appended, so the result can never
    be the corpus file itself.
    """
    # Slice instead of os.path.split/join so the directory prefix is kept
    # byte-for-byte (no separator normalisation in the printed paths).
    base = os.path.basename(txt_fname)
    head = txt_fname[:len(txt_fname) - len(base)]
    if '.txt' in base:
        return head + base.replace('.txt', ext)
    return head + base + ext


def IndexTextOpen(txt_fname):
    """Memory-map a text corpus and the side files stored next to it.

    Side files are looked up next to *txt_fname*, with '.txt' in the file
    name replaced by the extensions below:

    * '.ref.bin32' or '.ref.bin64' (required; the 32-bit file is preferred):
      sentence start offsets, mapped as uint32 or uint64 respectively.
    * '.nw.bin8' (optional): word counts, mapped as uint8.
    * '.meta' (optional): text file with one '<lang> <count>' pair per line.

    Args:
        txt_fname: path of the corpus text file (a str).

    Returns:
        A tuple ``(txt_mmap, ref_mmap, nbw_mmap, M)``:
        ``txt_mmap`` is a read-only uint8 memmap of the corpus,
        ``ref_mmap`` is the read-only offsets memmap,
        ``nbw_mmap`` is the read-only word-count memmap, or None if absent,
        ``M`` is a list of ``{'lang': <lang>, 'n': <running total>}`` dicts,
        one per meta line, where 'n' is the cumulative sum of the counts up
        to and including that line; None if there is no meta file.

    Exits the process with status 1 (after printing an error) when no
    offsets file exists or when a meta line does not have exactly two
    whitespace-separated fields.
    """
    print('Reading text corpus')
    print(' - texts: {:s}'.format(txt_fname))
    txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8)

    # Sentence start offsets: try the 32-bit file first, then the 64-bit one.
    offset_variants = (
        ('.ref.bin32', np.uint32, ' - sentence start offsets (32 bit): {}'),
        ('.ref.bin64', np.uint64, ' - sentence start offsets (64 bit): {}'),
    )
    for ext, dtype, message in offset_variants:
        fname = _companion_path(txt_fname, ext)
        if os.path.isfile(fname):
            print(message.format(fname))
            ref_mmap = np.memmap(fname, mode='r', dtype=dtype)
            break
    else:
        # Neither variant exists: the corpus cannot be used without offsets.
        print('ERROR: no file with sentence start offsets found')
        sys.exit(1)
    print(' - found {:d} sentences'.format(ref_mmap.shape[0]))

    # Optional word counts.
    nbw_mmap = None
    fname = _companion_path(txt_fname, '.nw.bin8')
    if os.path.isfile(fname):
        print(' - word counts: {:s}'.format(fname))
        nbw_mmap = np.memmap(fname, mode='r', dtype=np.uint8)

    # Optional meta file: '<lang> <count>' per line.
    M = None
    fname = _companion_path(txt_fname, '.meta')
    if os.path.isfile(fname):
        M = []
        n = 0
        print(' - metafile: {:s}'.format(fname))
        with open(fname, 'r') as fp:
            for line in fp:
                fields = line.strip().split()
                if len(fields) != 2:
                    print('ERROR: format error in meta file')
                    sys.exit(1)
                # 'n' is stored as a running total, not the per-line count.
                n += int(fields[1])
                M.append({'lang': fields[0], 'n': n})
        print(' - found {:d} languages:'.format(len(M)), end='')
        for L in M:
            print(' {:s}'.format(L['lang']), end='')
        print('')

    return txt_mmap, ref_mmap, nbw_mmap, M