in source/mine_bitexts.py [0:0]
def TextLoadUnify(fname, args):
    """Load sentences from a text file, optionally dropping duplicate lines.

    Args:
        fname: Path of the text file to read, one sentence per line.
        args: Options object. The attributes read here are ``verbose``
            (print progress to stdout), ``encoding`` (passed to ``open``)
            and ``unify`` (keep only the first occurrence of each sentence).

    Returns:
        A tuple ``(inds, sents)``:

        * ``inds`` has one entry per input line: the index that line's
          sentence received in first-seen order among distinct sentences.
          It is computed this way whether or not ``args.unify`` is set.
        * ``sents`` holds the sentences without their line terminator:
          only first occurrences when ``args.unify`` is true, otherwise
          every line in file order.
    """
    if args.verbose:
        print(' - loading texts {:s}: '.format(fname), end='')

    inds = []
    sents = []
    sent2ind = {}
    # The context manager guarantees the handle is closed, even on a
    # decoding or I/O error part-way through the file.
    with open(fname, encoding=args.encoding, errors='surrogateescape') as fin:
        for line in fin:
            # Strip only a real terminator: the last line of a file may lack
            # one, and blindly dropping the final character would corrupt it.
            sent = line[:-1] if line.endswith('\n') else line
            # Key on the stripped sentence so an unterminated final line
            # still matches an earlier identical sentence.
            new_ind = len(sent2ind)
            ind = sent2ind.setdefault(sent, new_ind)
            inds.append(ind)
            # ind == new_ind means this sentence was not seen before.
            if not args.unify or ind == new_ind:
                sents.append(sent)

    if args.verbose:
        # Without unify every line is kept, so both counts are equal.
        print('{:d} lines, {:d} unique'.format(len(inds), len(sents)))
    return inds, sents