in recipes/lexicon_free/wsj/prepare.py [0:0]
def remap_words_with_same_spelling(data_dst, decoder_dst):
    """Group words that share a spelling and map each to one representative.

    Tokens are read from ``lists/si284.lst`` (every space-separated field
    after the first three on each line) and from ``text/lm.txt`` (every
    field) under ``data_dst``.  Parenthesised chunks such as ``(...)`` are
    stripped from each token, and the token's "spelling" is what remains
    after removing every character other than ``a-z``, ``'`` and ``.``.

    Word counts are taken from ``si284.lst`` only; spelling counts are taken
    from both files.  Ordering of spellings and of the words within a
    spelling is delegated to ``compare`` (defined elsewhere in this file).

    The mapping is also written to ``dict-remap.txt`` in ``decoder_dst``,
    one ``"<word> <target>"`` pair per line.

    Args:
        data_dst: directory containing ``lists/si284.lst`` and
            ``text/lm.txt``.
        decoder_dst: directory in which ``dict-remap.txt`` is created.

    Returns:
        dict mapping every seen word to the word chosen for its spelling.
    """
    annotation_re = re.compile(r"\(\S+\)")
    non_spelling_re = re.compile("[^a-z'.]+")

    words_dict = defaultdict(int)
    variants_by_spelling = defaultdict(set)
    spelling_freq = defaultdict(int)

    def register(token, count_word):
        # Shared per-token bookkeeping for both input files.
        cleaned = annotation_re.sub("", token)  # not pronounced
        if count_word:
            words_dict[cleaned] += 1
        key = non_spelling_re.sub("", cleaned)
        variants_by_spelling[key].add(cleaned)
        spelling_freq[key] += 1

    # Training transcriptions: the first three fields of a line are skipped.
    with open(os.path.join(data_dst, "lists/si284.lst"), "r") as flist:
        for line in flist:
            for token in line.strip().split(" ")[3:]:
                register(token, True)

    # LM text: contributes to spelling statistics but not to word counts.
    with open(os.path.join(data_dst, "text/lm.txt"), "r") as flm:
        for line in flm:
            for token in line.strip().split(" "):
                register(token, False)

    order_key = functools.cmp_to_key(compare)
    ranked_spellings = sorted(spelling_freq.items(), key=order_key)

    # Spellings whose target is fixed instead of taken from the ranking.
    special_mapping = {"al": "al-", "st": "st", "nd": "nd", "rd": "rd"}

    remap_result = {}
    with open(os.path.join(decoder_dst, "dict-remap.txt"), "w") as fmap:
        for spelling, _ in ranked_spellings:
            ranked_words = sorted(
                [(w, words_dict[w]) for w in variants_by_spelling[spelling]],
                key=order_key,
            )
            # Every recorded spelling has at least one word, so index 0 exists.
            if spelling in special_mapping:
                target = special_mapping[spelling]
            else:
                target = ranked_words[0][0]
            for word, _ in ranked_words:
                remap_result[word] = target
                fmap.write("{} {}\n".format(word, target))
    return remap_result