def generate()

in recipes/self_training/pseudo_labeling/generate_synthetic_lexicon.py [0:0]


def generate(infile):
    """Build a lexicon of word-piece spelling counts from a decoder log file.

    The file is read line by line. A line containing ``|P|`` supplies the
    predicted transcription: everything after ``"|P|: "`` has its spaces
    removed and every ``_`` turned into a space, so that it splits into
    words. A line containing ``|p|`` supplies the raw word-piece tokens,
    which are paired with the words of the most recent ``|P|`` line. Lines
    containing ``|T|``, ``|t|``, ``sample``, ``WARNING``, ``CHRONOS`` or
    ``---`` are ignored.

    Args:
        infile: Path of the log file to parse.

    Returns:
        A dict mapping each transcription word to a dict that maps a
        space-joined word-piece spelling to the number of times that
        spelling was seen for the word.

    Raises:
        ValueError: If a line matches none of the recognised markers, or if
            a ``|p|`` line is seen before any ``|P|`` line.
    """
    # Lines carrying any of these markers hold nothing needed here.
    ignored_markers = ("|T|", "|t|", "sample", "WARNING", "CHRONOS", "---")

    # maps word --> dict mapping wp spellings to the number of
    # times that spelling appears
    lexicon = {}
    with open(infile, "r") as f:
        prediction = None
        for line in f:
            if "|P|" in line:
                # format is "|P|: _[wp]..."
                prediction = (
                    line[line.find("|P|: ") + len("|P|: ") :]
                    .replace(" ", "")
                    .replace("_", " ")
                )
                continue

            if "|p|" not in line:
                if any(marker in line for marker in ignored_markers):
                    continue
                raise ValueError("Format invalid; extraneous line: " + line)

            # From here on the line is a "|p|" line.
            if prediction is None:
                # Without a preceding "|P|" line there are no words to pair
                # the word pieces with; fail with a clear message instead of
                # an AttributeError on None.
                raise ValueError(
                    "Format invalid; |p| line appears before any |P| line: " + line
                )

            # Locates "|p|:" and skips the marker plus the single character
            # that follows the colon.
            wp_spelling_raw = line[line.find("|p|:") + len("|p|: ") :]

            transcription = prediction.strip().split(" ")
            wp_spelling = [e.strip() for e in wp_spelling_raw.strip().split(" ") if e]
            # generate_wp_selling is defined outside this block; its result is
            # consumed below as one iterable of word-piece strings per word --
            # presumably it groups the flat token list by word (TODO confirm).
            wp_spelling = generate_wp_selling(wp_spelling)

            # NOTE: zip stops at the shorter sequence, so surplus words or
            # spellings are dropped silently when the two lengths differ.
            for transcription_word, wp_spelling_word in zip(transcription, wp_spelling):
                wp_key = " ".join(wp_spelling_word)
                spelling_counts = lexicon.setdefault(transcription_word, {})
                spelling_counts[wp_key] = spelling_counts.get(wp_key, 0) + 1
    return lexicon