recipes/lexicon_free/utilities/compute_upper_ppl_kenlm.py (70 lines of code) (raw):

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Compute upper limit on word perplexity for kenlm ngram models Command : python3 compute_upper_ppl_kenlm.py --vocab_file [...] --kenlm_preds [...] Replace [...] with appropriate paths """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import numpy from utils import transform_asg def compute_upper_limit_ppl_for_kenlm(known_words_file, kenlm_file): with open(known_words_file, "r") as f: known_words = set(list(map(transform_asg, f.readline().strip().split(" ")))) with open(kenlm_file, "r") as f: sum_logp = 0 sum_logp_unk = 0 n_words = 0 n_words_unk = 0 n_letters = 0 for line in f: if "Total" not in line: continue line = line.strip().split("\t") word = "" word_logp = 0 for token in line: token_val = token.split("=")[0] logp = float(token.split(" ")[-1]) if token_val == "|": if word in known_words: sum_logp += word_logp + numpy.log(numpy.power(10, logp)) n_words += 1 else: sum_logp_unk += word_logp + numpy.log(numpy.power(10, logp)) n_words_unk += 1 word = "" word_logp = 0 elif token_val == "</s>": sum_logp += numpy.log(numpy.power(10, logp)) n_words += 1 else: word += token_val word_logp += numpy.log(numpy.power(10, logp)) n_letters += 1 if token_val == "</s>": break loss_letter = -(sum_logp + sum_logp_unk) / n_letters ppl_word_no_unk = numpy.exp(-sum_logp / n_words) ppl_word_unk = numpy.exp(-sum_logp_unk / n_words_unk) ppl_word = numpy.exp(-(sum_logp + sum_logp_unk) / (n_words + n_words_unk)) print( "Letter loss: {}, letter perplexity: {}".format( loss_letter, numpy.exp(loss_letter) ) ) print("Upper word perplexity for all words: {}".format(ppl_word)) print("Upper word perplexity for unknown words: {}".format(ppl_word_unk)) print( "(Reported in the paper) " "Upper word perplexity for known words: {}".format(ppl_word_no_unk) ) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Upper limit on word perplexity for kenlm predictions" ) parser.add_argument( "--vocab_file", help="vocabulary of known words, use file " "from --limit_vocab_file during word kenLM training.", ) parser.add_argument( "--kenlm_preds", help="file with kenlm predictions after query run" ) args = parser.parse_args() print("Evaluate file {}".format(args.kenlm_preds)) compute_upper_limit_ppl_for_kenlm(args.vocab_file, args.kenlm_preds)