def remap_words_with_same_spelling()

in recipes/lexicon_free/wsj/prepare.py [0:0]


def remap_words_with_same_spelling(data_dst, decoder_dst):
    """Map every word form that shares a spelling onto one representative.

    Word forms are read from ``<data_dst>/lists/si284.lst`` (the first three
    space-separated fields of each line are skipped) and from
    ``<data_dst>/text/lm.txt`` (every field is used). Parenthesised
    non-whitespace runs are removed from each form, and its "spelling" is
    what remains after dropping every character other than ``a-z``, ``'``
    and ``.``. Forms with the same spelling are grouped together.

    One ``"<form> <representative>"`` line per form is written to
    ``<decoder_dst>/dict-remap.txt``. Both the order of the groups and the
    choice of representative come from the module-level ``compare``
    function (presumably "most frequent first" -- confirm against its
    definition). A few spellings have a hard-coded representative instead.

    Args:
        data_dst: directory containing ``lists/si284.lst`` and
            ``text/lm.txt``.
        decoder_dst: directory in which ``dict-remap.txt`` is created.

    Returns:
        dict mapping each observed word form to its representative.
    """
    unpronounced_re = re.compile(r"\(\S+\)")  # not pronounced
    non_spelling_re = re.compile("[^a-z'.]+")

    # Occurrences of each form in the si284 list only.
    train_counts = defaultdict(int)
    # Spelling -> every form seen with that spelling (both files).
    forms_by_spelling = defaultdict(set)
    # Spelling -> total occurrences across both files.
    spelling_counts = defaultdict(int)

    def _scan(rel_path, skip_fields, count_forms):
        # Accumulate the forms found in one input file into the tables above.
        with open(os.path.join(data_dst, rel_path), "r") as fin:
            for row in fin:
                for form in row.strip().split(" ")[skip_fields:]:
                    form = unpronounced_re.sub("", form)
                    if count_forms:
                        train_counts[form] += 1
                    spelling = non_spelling_re.sub("", form)
                    forms_by_spelling[spelling].add(form)
                    spelling_counts[spelling] += 1

    _scan("lists/si284.lst", 3, True)
    _scan("text/lm.txt", 0, False)

    rank_key = functools.cmp_to_key(compare)
    ranked_spellings = sorted(spelling_counts.items(), key=rank_key)

    # Spellings whose representative is fixed rather than picked by rank.
    overrides = {"al": "al-", "st": "st", "nd": "nd", "rd": "rd"}
    remap_result = {}
    with open(os.path.join(decoder_dst, "dict-remap.txt"), "w") as fmap:
        for spelling, _ in ranked_spellings:
            # Forms seen only in lm.txt get a count of 0 here.
            form_counts = {}
            for form in forms_by_spelling[spelling]:
                form_counts[form] = train_counts[form]
            ranked_forms = sorted(form_counts.items(), key=rank_key)

            # Every spelling has at least one form, so index 0 always exists.
            if spelling in overrides:
                target = overrides[spelling]
            else:
                target = ranked_forms[0][0]

            for form, _ in ranked_forms:
                remap_result[form] = target
                fmap.write("{} {}\n".format(form, target))
    return remap_result