def compute_upper_limit_ppl_for_kenlm()

in recipes/lexicon_free/utilities/compute_upper_ppl_kenlm.py [0:0]


def _mean_neg_logp(total_logp, count):
    """Return ``-total_logp / count``, or NaN when ``count`` is zero."""
    if count == 0:
        return float("nan")
    return -total_logp / count


def compute_upper_limit_ppl_for_kenlm(known_words_file, kenlm_file):
    """Print letter-level and upper-bound word-level perplexities.

    The first line of ``known_words_file`` is split on single spaces and each
    entry is passed through ``transform_asg`` to build the known-word set.

    Only lines of ``kenlm_file`` that contain ``"Total"`` are scored. Each such
    line is split on tabs; for every field, the text before the first ``"="``
    is the token and the last space-separated value is parsed as its log10
    probability. A ``"|"`` token closes the current word, whose accumulated
    log-probability (including the ``"|"`` itself) is credited to the known or
    unknown bucket. A ``"</s>"`` token is counted as a known word and ends the
    line.

    Args:
        known_words_file: Path to the file whose first line lists known words.
        kenlm_file: Path to the per-token scoring output described above.

    Returns:
        None. Results are written to stdout. A statistic whose denominator is
        zero (e.g. no unknown words were seen) is reported as ``nan`` rather
        than raising ``ZeroDivisionError``.
    """
    # log(10 ** x) == x * log(10); the product form cannot underflow to
    # log(0) == -inf for very negative log10 probabilities.
    ln10 = numpy.log(10)

    with open(known_words_file, "r") as f:
        known_words = {
            transform_asg(entry) for entry in f.readline().strip().split(" ")
        }

    sum_logp = 0
    sum_logp_unk = 0
    n_words = 0
    n_words_unk = 0
    n_letters = 0

    with open(kenlm_file, "r") as f:
        for line in f:
            if "Total" not in line:
                continue
            word = ""
            word_logp = 0
            for token in line.strip().split("\t"):
                token_val = token.split("=")[0]
                logp = float(token.split(" ")[-1]) * ln10
                # Every processed token, including "|" and "</s>", counts
                # towards the letter-level statistics.
                n_letters += 1
                if token_val == "|":
                    if word in known_words:
                        sum_logp += word_logp + logp
                        n_words += 1
                    else:
                        sum_logp_unk += word_logp + logp
                        n_words_unk += 1
                    word = ""
                    word_logp = 0
                elif token_val == "</s>":
                    sum_logp += logp
                    n_words += 1
                    # Nothing after the end-of-sentence token is scored.
                    break
                else:
                    word += token_val
                    word_logp += logp

    total_logp = sum_logp + sum_logp_unk
    loss_letter = _mean_neg_logp(total_logp, n_letters)
    ppl_word_no_unk = numpy.exp(_mean_neg_logp(sum_logp, n_words))
    ppl_word_unk = numpy.exp(_mean_neg_logp(sum_logp_unk, n_words_unk))
    ppl_word = numpy.exp(_mean_neg_logp(total_logp, n_words + n_words_unk))

    print(
        "Letter loss: {}, letter perplexity: {}".format(
            loss_letter, numpy.exp(loss_letter)
        )
    )
    print("Upper word perplexity for all words: {}".format(ppl_word))
    print("Upper word perplexity for unknown words: {}".format(ppl_word_unk))
    print(
        "(Reported in the paper) "
        "Upper word perplexity for known words: {}".format(ppl_word_no_unk)
    )