in recipes/self_training/librispeech/lm/filter_contractions.py [0:0]
def _load_contractions(path):
    """Return the set of whitespace-stripped, non-blank lines found in ``path``."""
    with open(path, "r") as c:
        stripped = (line.strip() for line in c)
        # Blank lines are dropped so that the empty string never counts as a
        # "known contraction".
        return {entry for entry in stripped if entry}


def _filter_word(word, contractions):
    """Return ``word`` with stray apostrophes removed.

    Rules, applied in order:

    1. A known contraction wrapped in one extra leading or trailing
       apostrophe (``'you'd``, ``can't'``) is unwrapped.
    2. A known contraction, or any word containing ``'s``, is kept as is.
    3. A word whose first apostrophe is strictly inside the word (neither the
       first nor the last character) is kept as is.
    4. Otherwise every apostrophe is removed from the word.
    """
    word = word.strip()
    # Only peel a character off when that character really is an apostrophe;
    # otherwise "she'd" would be rewritten to "he'd" and "where's" to "here's".
    if word.startswith("'") and word[1:] in contractions:
        return word[1:]
    if word.endswith("'") and word[:-1] in contractions:
        return word[:-1]
    if word in contractions or "'s" in word:
        return word

    idx = word.find("'")
    if idx == -1:
        return word
    # Only the first apostrophe is inspected: it is considered valid when it
    # has at least one character on each side.
    if 0 < idx < len(word) - 1:
        return word
    return word.replace("'", "")


def run_for_id(file_name, contractions_file=None):
    """Filter stray apostrophes out of a text file.

    Reads ``file_name`` line by line, splits each line on single spaces,
    cleans every word with ``_filter_word`` and writes the result to
    ``file_name + ".filtered"`` (one output line per input line, words joined
    by single spaces).

    Args:
        file_name: Path of the text file to filter.
        contractions_file: Path of a file listing one accepted contraction per
            line. Defaults to the module-level ``CONTRACTIONS`` path.
    """
    print("Starting thread")
    if contractions_file is None:
        contractions_file = CONTRACTIONS
    # A set gives O(1) membership tests; a list would be scanned up to three
    # times for every word of the input.
    contractions = _load_contractions(contractions_file)

    print("Parsing input file")
    with open(file_name, "r") as f:
        lines = f.readlines()
    print("Done parsing input file")

    filtered_lines = []
    for counter, line in enumerate(lines, 1):
        if counter % 10000 == 0:
            print("Counter at " + str(counter))
        filtered_lines.append(
            " ".join(
                _filter_word(word, contractions)
                for word in line.strip().split(" ")
            )
        )

    print("Writing output file")
    with open(file_name + ".filtered", "w") as f:
        f.writelines(line + "\n" for line in filtered_lines)