recipes/self_training/librispeech/lm/clean_lm_text.py (32 lines of code) (raw):

import re from multiprocessing import Pool import nltk import tqdm PUNCTUATION = set(list(",'\"?!#&(){}[]*+=;:.-")) PUNCTUATION.add("") def clean(line): # try: # new_line = normalise.normalise(line, verbose=False) # except: # print("Could not normalize:", line) # new_line = (t for t in new_line if t not in PUNCTUATION) # new_line = " ".join(new_line).lower() new_line = re.sub('[,"?!#&\(\)\{\}\[\]\*+=;:..]', "", line) new_line = re.sub("-", " ", new_line) return " ".join(new_line.split()).lower() def write(lines, fid): for line in lines: if line: fid.write(line) fid.write("\n") def load(): with open("lmtext_sentences_no_am.txt.filtered", "r") as fid: lines = [l for l in fid] return lines if __name__ == "__main__": lines = load() fid = open("lmtext_clean_no_am.txt", "w") clean_lines = [] step = 1000000 for i in range(0, len(lines), step): print("Cleaning lines {} - {}".format(i, i + step)) pool = Pool() clean_lines = pool.map(clean, lines[i : i + step]) pool.close() pool.join() write(clean_lines, fid) fid.close()