recipes/self_training/librispeech/lm/sentence_ify.py:
"""Split the raw LM text into one sentence per line using NLTK's punkt tokenizer."""

import nltk
import tqdm


def load():
    # Read the raw LM text, one stripped line per list entry.
    with open("lmtext_no_am.txt", "r") as fid:
        lines = [l.strip() for l in fid]
    return lines


if __name__ == "__main__":
    lines = load()
    # Pretrained English sentence tokenizer (requires the NLTK punkt data package).
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    step = 10000
    with open("lmtext_sentences_no_am.txt", "w") as fid:
        # Tokenize the text in chunks of `step` lines to keep memory bounded,
        # writing one sentence per output line.
        for i in tqdm.tqdm(range(0, len(lines), step)):
            sentences = tokenizer.tokenize(" ".join(lines[i : i + step]))
            for l in sentences:
                fid.write(l)
                fid.write("\n")
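
# Usage sketch (an assumption about the typical setup, not part of the original script):
# the punkt model must be available locally, which can be done once with
#   python -c "import nltk; nltk.download('punkt')"
# and the script expects lmtext_no_am.txt in the current working directory, e.g.
#   python recipes/self_training/librispeech/lm/sentence_ify.py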