in recipes/lexicon_free/utilities/compute_upper_ppl_kenlm.py [0:0]
def compute_upper_limit_ppl_for_kenlm(known_words_file, kenlm_file):
    """Compute upper-limit word perplexities from KenLM query output.

    Parses the per-sentence "Total" lines produced by KenLM's ``query``
    binary run on a character-level LM, accumulates per-letter base-10
    log-probabilities into word-level scores (words are delimited by the
    "|" token), and prints the letter loss/perplexity plus upper-bound
    word perplexities for all words, unknown words only, and known words
    only.

    Args:
        known_words_file: path to a file whose first line is a
            space-separated list of in-vocabulary words; each word is
            passed through ``transform_asg`` before membership tests.
        kenlm_file: path to KenLM query output. Only lines containing
            "Total" are parsed; each tab-separated token is assumed to
            look like ``symbol=<vocab_id> <ngram_len> <log10_prob>``.

    Returns:
        None. Results are printed to stdout.
    """
    with open(known_words_file, "r") as f:
        known_words = set(map(transform_asg, f.readline().strip().split(" ")))

    # Convert KenLM's base-10 log-probs to natural log by multiplication.
    # The previous numpy.log(numpy.power(10, logp)) underflows to
    # log(0) = -inf for strongly negative logp (e.g. 10 ** -400 == 0.0).
    ln10 = numpy.log(10.0)

    sum_logp = 0.0
    sum_logp_unk = 0.0
    n_words = 0
    n_words_unk = 0
    n_letters = 0
    with open(kenlm_file, "r") as f:
        for line in f:
            if "Total" not in line:
                continue
            tokens = line.strip().split("\t")
            word = ""
            word_logp = 0.0
            for token in tokens:
                token_val = token.split("=")[0]
                logp = float(token.split(" ")[-1])
                if token_val == "|":
                    # "|" marks a word boundary: commit the accumulated
                    # per-letter score plus the boundary token's own score.
                    if word in known_words:
                        sum_logp += word_logp + logp * ln10
                        n_words += 1
                    else:
                        sum_logp_unk += word_logp + logp * ln10
                        n_words_unk += 1
                    word = ""
                    word_logp = 0.0
                elif token_val == "</s>":
                    # End of sentence counts as a (known) word event.
                    sum_logp += logp * ln10
                    n_words += 1
                else:
                    # Ordinary letter: extend the current word.
                    word += token_val
                    word_logp += logp * ln10
                    n_letters += 1
                if token_val == "</s>":
                    break

    def _safe_div(num, den):
        # Guard against ZeroDivisionError (e.g. a corpus with no unknown
        # words, or no "Total" lines at all); report NaN for an
        # undefined ratio instead of crashing.
        return num / den if den > 0 else float("nan")

    loss_letter = _safe_div(-(sum_logp + sum_logp_unk), n_letters)
    ppl_word_no_unk = numpy.exp(_safe_div(-sum_logp, n_words))
    ppl_word_unk = numpy.exp(_safe_div(-sum_logp_unk, n_words_unk))
    ppl_word = numpy.exp(
        _safe_div(-(sum_logp + sum_logp_unk), n_words + n_words_unk)
    )
    print(
        "Letter loss: {}, letter perplexity: {}".format(
            loss_letter, numpy.exp(loss_letter)
        )
    )
    print("Upper word perplexity for all words: {}".format(ppl_word))
    print("Upper word perplexity for unknown words: {}".format(ppl_word_unk))
    print(
        "(Reported in the paper) "
        "Upper word perplexity for known words: {}".format(ppl_word_no_unk)
    )