in recipes/self_training/pseudo_labeling/generate_synthetic_lexicon.py [0:0]
def run():
    """Build a lexicon from decoder output and merge it into an existing one.

    Command-line arguments:
        -i / --inputhyp      path to the decoder output file
        -l / --inputlexicon  path to the existing lexicon to merge into
        -o / --output        path the merged lexicon is written to

    The hypothesis file is turned into lexicon entries via ``generate``,
    ``order_lexicon`` and ``create_spellings``. Each resulting entry is then
    merged into the entries read from the existing lexicon: for a word that
    is already present, any spelling not yet listed is appended to the end of
    its spelling list; a word that is absent is added as a new entry. The
    merged entries are written out sorted by word.

    Raises:
        FileNotFoundError: if either input path is not an existing file.
    """
    parser = argparse.ArgumentParser(
        description="Converts decoder output into train-ready lexicon format"
    )
    parser.add_argument(
        "-i",
        "--inputhyp",
        type=str,
        required=True,
        help="Path to decoder output using --usewordpiece=false file",
    )
    parser.add_argument(
        "-l",
        "--inputlexicon",
        type=str,
        required=True,
        help="Path to the existing lexicon with which to merge a lexicon from the hyp",
    )
    parser.add_argument(
        "-o", "--output", type=str, required=True, help="Path to output lexicon file"
    )
    args = parser.parse_args()

    # Validate both inputs up front (hypothesis file first) so nothing is
    # processed when a path is wrong. FileNotFoundError is still an Exception,
    # so callers with a broad handler are unaffected.
    for path in (args.inputhyp, args.inputlexicon):
        if not os.path.isfile(path):
            raise FileNotFoundError(f"'{path}' - input file doesn't exist")

    # Lexicon derived from the hypothesis file, ordered by key.
    lexicon = generate(args.inputhyp)
    sorted_spellings = order_lexicon(lexicon)
    spellings = create_spellings(sorted_spellings)
    new_lexicon = [spellings[key] for key in sorted(spellings.keys())]

    # Index the existing lexicon by word; if a word occurs more than once in
    # the file, the last entry read is the one kept.
    old = {entry.word: entry for entry in read_spellings_from_file(args.inputlexicon)}

    for count, entry in enumerate(new_lexicon, start=1):
        if count % 1000 == 0:
            print(f"Processed {count} entries in new lexicon.")

        if entry.word not in old:
            # OOV case: create a new lexicon entry with these spellings
            old[entry.word] = entry
            continue

        # Word already in the lexicon: append only the spellings it lacks,
        # keeping the existing spellings in their original order.
        known_spellings = old[entry.word].sorted_spellings
        for spelling in entry.sorted_spellings:
            if spelling not in known_spellings:
                known_spellings.append(spelling)

    # Emit the merged entries sorted by word.
    final = [old[key] for key in sorted(old)]
    write_spellings_to_file(final, args.output)