def TextLoadUnify()

in source/mine_bitexts.py [0:0]


def TextLoadUnify(fname, args):
    """Load the sentences of a text file, optionally dropping duplicates.

    Args:
        fname: path of the text file to read, one sentence per line.
        args: object providing the attributes ``verbose`` (print progress
            information to stdout), ``encoding`` (passed to ``open``) and
            ``unify`` (keep only the first occurrence of each sentence).

    Returns:
        A tuple ``(inds, sents)``:

        * ``inds`` has one entry per input line: the index of that line's
          sentence among the distinct sentences, numbered in order of first
          appearance. It is computed this way whether or not ``args.unify``
          is set.
        * ``sents`` holds the sentences without their line terminator:
          only the distinct ones (in order of first appearance) when
          ``args.unify`` is true, otherwise every line of the file.
    """
    if args.verbose:
        print(' - loading texts {:s}: '.format(fname), end='')

    inds = []
    sents = []
    sent2ind = {}
    # The context manager guarantees the file is closed, even on a decode
    # error part-way through the file.
    with open(fname, encoding=args.encoding, errors='surrogateescape') as fin:
        for line in fin:
            # Strip only the line terminator. The last line of a file may
            # lack one, in which case a blind line[:-1] would eat a real
            # character and the raw line would never match an identical,
            # newline-terminated sentence seen earlier.
            sent = line[:-1] if line.endswith('\n') else line
            new_ind = len(sent2ind)
            ind = sent2ind.setdefault(sent, new_ind)
            inds.append(ind)
            # ind == new_ind means the sentence was not seen before.
            if not args.unify or ind == new_ind:
                sents.append(sent)

    if args.verbose:
        # Without unify every line is kept, so both numbers are equal.
        print('{:d} lines, {:d} unique'.format(len(inds), len(sents)))
    return inds, sents