in recipes/self_training/pseudo_labeling/generate_synthetic_lexicon.py [0:0]
def generate(infile):
    """Build a lexicon of word-piece spellings, with counts, from a log file.

    The file is read line by line:

    * A line containing ``|P|`` holds a prediction in the form
      ``|P|: _[wp]...``. All spaces are removed and every ``_`` becomes a
      space, so the text splits into whole words.
    * A line containing ``|p|`` holds the raw word-piece tokens. They are
      split on spaces and passed to ``generate_wp_selling`` (defined
      elsewhere in this file; presumably it groups the tokens per word --
      confirm against that helper). The result is zipped with the words of
      the most recent prediction, so any surplus on either side is ignored.
    * Lines containing ``|T|``, ``|t|``, ``sample``, ``WARNING``,
      ``CHRONOS`` or ``---`` are skipped.

    Args:
        infile: Path of the log file to parse.

    Returns:
        A dict mapping each predicted word to a dict that maps a
        space-joined word-piece spelling to the number of times that
        spelling was seen for the word.

    Raises:
        Exception: If a line matches none of the known markers, or if a
            ``|p|`` line appears before any ``|P|`` line.
    """
    # Lines carrying any of these markers hold no spelling information.
    ignored_markers = ("|T|", "|t|", "sample", "WARNING", "CHRONOS", "---")

    # maps word --> dict mapping wp spellings to the number of
    # times that spelling appears
    lexicon = {}
    with open(infile, "r") as f:
        # Most recent prediction; reused by every following "|p|" line
        # until the next "|P|" line replaces it.
        prediction = None
        for line in f:
            if "|P|" in line:
                # format is "|P|: _[wp]..."
                prediction = (
                    line[line.find("|P|: ") + len("|P|: ") :]
                    .replace(" ", "")
                    .replace("_", " ")
                )
                continue

            if "|p|" not in line:
                if any(marker in line for marker in ignored_markers):
                    continue
                raise Exception("Format invalid; extraneous line: " + line)

            if prediction is None:
                # Without this guard the code below fails with an opaque
                # AttributeError on None instead of a format error.
                raise Exception(
                    "Format invalid; |p| line found before any |P| line: "
                    + line
                )

            wp_spelling_raw = line[line.find("|p|:") + len("|p|: ") :]
            transcription = prediction.strip().split(" ")
            wp_spelling = [
                e.strip() for e in wp_spelling_raw.strip().split(" ") if e
            ]
            wp_spelling = generate_wp_selling(wp_spelling)
            for transcription_word, wp_spelling_word in zip(
                transcription, wp_spelling
            ):
                wp_key = " ".join(wp_spelling_word)
                spelling_counts = lexicon.setdefault(transcription_word, {})
                spelling_counts[wp_key] = spelling_counts.get(wp_key, 0) + 1
    return lexicon