def run()

in recipes/self_training/pseudo_labeling/generate_synthetic_lexicon.py [0:0]


def run():
    """Merge a lexicon built from decoder output into an existing lexicon.

    Command-line arguments (all required):
        -i / --inputhyp: path to the decoder output file.
        -l / --inputlexicon: path to the existing lexicon to merge into.
        -o / --output: path the merged lexicon is written to.

    The hypothesis file is turned into lexicon entries through the
    module-level helpers ``generate``, ``order_lexicon`` and
    ``create_spellings``. Each resulting entry is then merged into the
    entries loaded by ``read_spellings_from_file``: for a word that is
    already present, spellings not yet listed are appended after the
    existing ones; a word that is not present is added as-is. The merged
    entries are written, ordered by word, with ``write_spellings_to_file``.

    Raises:
        FileNotFoundError: if either input path is not an existing file.
    """
    parser = argparse.ArgumentParser(
        description="Converts decoder output into train-ready lexicon format"
    )

    parser.add_argument(
        "-i",
        "--inputhyp",
        type=str,
        required=True,
        help="Path to decoder output using --usewordpiece=false file",
    )
    parser.add_argument(
        "-l",
        "--inputlexicon",
        type=str,
        required=True,
        help="Path to the existing lexicon with which to merge a lexicon from the hyp",
    )
    parser.add_argument(
        "-o", "--output", type=str, required=True, help="Path to output lexicon file"
    )

    args = parser.parse_args()

    # Validate both inputs up front so nothing is processed (or written)
    # when one of them is missing.
    for input_path in (args.inputhyp, args.inputlexicon):
        if not os.path.isfile(input_path):
            raise FileNotFoundError(
                "'" + input_path + "' - input file doesn't exist"
            )

    lexicon = generate(args.inputhyp)
    sorted_spellings = order_lexicon(lexicon)
    spellings = create_spellings(sorted_spellings)
    new_lexicon = [spellings[key] for key in sorted(spellings.keys())]

    # Index the existing lexicon by word; a later duplicate word replaces
    # an earlier one.
    old = {
        entry.word: entry for entry in read_spellings_from_file(args.inputlexicon)
    }

    for count, entry in enumerate(new_lexicon, start=1):
        if count % 1000 == 0:
            print("Processed " + str(count) + " entries in new lexicon.")

        if entry.word not in old:
            # OOV case: create a new lexicon entry with these spellings
            old[entry.word] = entry
            continue

        # Word already in the lexicon: append only the spellings it lacks,
        # keeping the existing spellings (and their order) in front.
        known_spellings = old[entry.word].sorted_spellings
        for spelling in entry.sorted_spellings:
            if spelling not in known_spellings:
                known_spellings.append(spelling)

    # Emit the merged entries ordered by word.
    final = [old[key] for key in sorted(old)]

    write_spellings_to_file(final, args.output)