in source/lib/text_processing.py [0:0]
def _split_at_periods(words):
    """Cut *words* into segments, ending one after each non-final '.' token."""
    segments = []
    current = []
    last = len(words) - 1
    for i, word in enumerate(words):
        current.append(word)
        # A '.' in last position never splits, so no empty trailing
        # segment is produced for a sentence that simply ends with '.'.
        if word == '.' and i != last:
            segments.append(current)
            current = []
    # `current` is empty only when `words` is empty; it is still returned so
    # that a blank input line yields exactly one (empty) segment.
    segments.append(current)
    return segments


def SplitLines(ifname, of_txt, of_sid):
    """Split each line of a text file at stand-alone '.' tokens.

    Every input line is whitespace-tokenized and cut after each '.' token
    that is not the last token of the line.  Each resulting segment is
    written as one line of `of_txt` (tokens joined by single spaces), and
    the 0-based index of the input line it came from is written as the
    matching line of `of_sid`.  Both output files therefore always have the
    same number of lines; a blank input line produces one empty text line.

    If `of_txt` already exists, a message is printed and nothing is done.
    Otherwise a two-line summary is printed to stdout when finished.

    Args:
        ifname: path of the input text file.
        of_txt: path of the output file receiving the split segments.
        of_sid: path of the output file receiving one sentence ID per
            line of `of_txt`.

    Raises:
        OSError: if the input cannot be opened for reading or an output
            cannot be opened for writing.
    """
    if os.path.isfile(of_txt):
        print(' - SplitLines: {} already exists'.format(of_txt))
        return
    nl = 0        # number of input lines
    nl_sp = 0     # number of lines after splitting
    maxw = 0      # max words per input line
    maxw_sp = 0   # max words per line after splitting
    # The input is opened first: if it is missing, no (empty) output file is
    # left behind that would make later calls skip the work via the
    # "already exists" check above.  The `with` closes all three files even
    # when an exception is raised part-way through.
    with open(ifname, 'r') as ifp, \
            open(of_sid, 'w') as fp_sid, \
            open(of_txt, 'w') as fp_txt:
        for line in ifp:
            words = line.strip().split()
            maxw = max(maxw, len(words))
            for segment in _split_at_periods(words):
                # one sentence ID per text line keeps both files aligned
                print('{:d}'.format(nl), file=fp_sid)
                print(' '.join(segment), file=fp_txt)
                nl_sp += 1
                maxw_sp = max(maxw_sp, len(segment))
            nl += 1
    print(' - Split sentences: {}'.format(ifname))
    print(' - lines/max words: {:d}/{:d} -> {:d}/{:d}'
          .format(nl, maxw, nl_sp, maxw_sp))