recipes/self_training/librispeech/lm/sentence_ify.py

# Split the line-wise LM training text into one sentence per line using
# NLTK's pre-trained Punkt sentence tokenizer.

import nltk
import tqdm


def load():
    # Read the raw LM text, one stripped line per entry.
    with open("lmtext_no_am.txt", "r") as fid:
        lines = [line.strip() for line in fid]
    return lines


if __name__ == "__main__":
    lines = load()
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    step = 10000
    with open("lmtext_sentences_no_am.txt", "w") as fid:
        # Process the text in chunks of `step` lines: join each chunk into a
        # single string, split it into sentences, and write one per line.
        for i in tqdm.tqdm(range(0, len(lines), step)):
            sentences = tokenizer.tokenize(" ".join(lines[i : i + step]))
            for sentence in sentences:
                fid.write(sentence)
                fid.write("\n")
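Note: the Punkt pickle loaded above ("tokenizers/punkt/english.pickle") is not bundled with the NLTK package itself; it must already be present in the local NLTK data directory. Assuming a standard NLTK installation (this setup step is an assumption about the environment, not part of the original recipe), a one-time download would be:

    python -c "import nltk; nltk.download('punkt')"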