in src/mlm/loaders.py [0:0]
def from_nmt(cls, fp: TextIO, max_utts=None, vocab=None, tokenizer=None):
    """Loads hypotheses from Toan's NMT beam output format

    The stream is read line by line. Every non-blank line is split on
    whitespace; its last two fields are parsed as scores and the remaining
    fields, re-joined with single spaces, form the hypothesis text. Only the
    second-to-last field (the "LN" score; presumably length-normalized --
    TODO confirm) is kept alongside the text. A blank line closes the
    current group of hypotheses and stores it under the next integer index.

    A final group that is not followed by a blank line is stored as well,
    so the input does not need to end with an empty line.

    Args:
        fp (TextIO): Open text stream over a .nobpe file
        max_utts (None, optional): Number of utterances (groups) to process;
            None means no limit
        vocab (None, optional): Vocabulary, forwarded to ``Hypotheses``
        tokenizer (None, optional): Tokenizer, forwarded to ``Hypotheses``

    Returns:
        Predictions: Initialized predictions object (an instance of ``cls``)
        mapping each group index to its ``Hypotheses``

    Raises:
        ValueError: If either of the last two fields of a line cannot be
            parsed as a float
        IndexError: If a non-blank line has fewer than two fields
    """
    # Just a dictionary for now
    # but equipped with this factory method
    preds = cls()
    pair_idx = 0
    sents = []
    scores = []
    for line in fp:
        if max_utts is not None and max_utts <= pair_idx:
            break
        line = line.strip()
        if not line:
            # Blank line: the current group is complete
            preds[pair_idx] = Hypotheses(sents, scores, vocab, tokenizer)
            pair_idx += 1
            sents = []
            scores = []
            continue
        line_parts = line.split()
        # The last field is not stored; parsing it still rejects lines
        # whose final column is not numeric
        float(line_parts[-1])
        # TEMPORARY: FOR CATCHING IMPROPER PROCESSING, e.g. ... gedi-25.58
        neg_log_prob_ln_str = line_parts[-2]
        str_parts = neg_log_prob_ln_str.split('-')
        # Were they adjoined? A token that does not start with '-' keeps
        # only the text after its last '-' (the whole token if it has none)
        # and is given a leading '-'; whatever preceded it is discarded
        if len(str_parts[0]) > 0:
            neg_log_prob_ln_str = '-' + str_parts[-1]
        neg_log_prob_ln = float(neg_log_prob_ln_str)
        hyp = ' '.join(line_parts[:-2])
        sents.append(hyp)
        scores.append(neg_log_prob_ln)
    # Flush a trailing group that was not terminated by a blank line.
    # After a max_utts break `sents` is always empty, but guard on the
    # limit anyway so the cap can never be exceeded.
    if sents and (max_utts is None or pair_idx < max_utts):
        preds[pair_idx] = Hypotheses(sents, scores, vocab, tokenizer)
    return preds