def SplitLines()

in source/lib/text_processing.py [0:0]


def SplitLines(ifname, of_txt, of_sid):
    """Split the lines of a whitespace-tokenized text file at '.' tokens.

    Every input line is cut after each stand-alone '.' token that is not the
    last token of the line. Each resulting segment is written as one line of
    ``of_txt`` (tokens joined by single spaces), and for every line written
    to ``of_txt`` the zero-based index of the originating input line is
    written to ``of_sid``, so both files always have the same number of lines.
    An empty (or whitespace-only) input line yields one empty segment so that
    its ID is still paired with a text line.

    If ``of_txt`` already exists nothing is done. A short summary (number of
    lines and maximum number of tokens per line, before and after splitting)
    is printed to stdout.

    Args:
        ifname: path of the input text file.
        of_txt: path of the output file receiving the split segments.
        of_sid: path of the output file receiving one input-line index per
            segment.

    Raises:
        OSError: if the input cannot be read or an output cannot be written.
    """
    if os.path.isfile(of_txt):
        print(' - SplitLines: {} already exists'.format(of_txt))
        return
    nl = 0       # number of input lines
    nl_sp = 0    # number of lines after splitting
    maxw = 0     # max tokens per input line
    maxw_sp = 0  # max tokens per split segment
    # The input is opened first so that a missing/unreadable input does not
    # leave an empty of_txt behind (which would make later calls skip work).
    # The context managers close all three files even when an error occurs.
    with open(ifname, 'r') as ifp, \
            open(of_sid, 'w') as fp_sid, \
            open(of_txt, 'w') as fp_txt:
        for line in ifp:
            words = line.split()
            maxw = max(maxw, len(words))
            last = len(words) - 1
            # Cut after every '.' that still has tokens following it; a new
            # segment is only started when it is guaranteed to be non-empty.
            segments = [[]]
            for i, word in enumerate(words):
                segments[-1].append(word)
                if word == '.' and i != last:
                    segments.append([])
            for segment in segments:
                print(' '.join(segment), file=fp_txt)
                # store the sentence ID of the line this segment came from
                print('{:d}'.format(nl), file=fp_sid)
                nl_sp += 1
                maxw_sp = max(maxw_sp, len(segment))
            nl += 1
    print(' - Split sentences: {}'.format(ifname))
    print(' -                  lines/max words: {:d}/{:d} -> {:d}/{:d}'
          .format(nl, maxw, nl_sp, maxw_sp))