in XLM/src/evaluation/evaluator.py [0:0]
def evaluate_mt(self, scores, data_set, lang1, lang2, eval_bleu):
    """
    Evaluate perplexity, next-word prediction accuracy and, optionally,
    BLEU and other generation statistics.
    """
    params = self.params
    assert data_set in ['valid', 'test']
    assert lang1 in params.langs
    assert lang2 in params.langs

    self.encoder.eval()
    self.decoder.eval()
    encoder = self.encoder.module if params.multi_gpu else self.encoder
    decoder = self.decoder.module if params.multi_gpu else self.decoder

    lang1_id = params.lang2id[lang1]
    lang2_id = params.lang2id[lang2]
    n_words = 0
    xe_loss = 0
    n_valid = 0

    # only save states / evaluate memory usage on the validation set
    eval_memory = params.use_memory and data_set == 'valid' and self.params.is_master
    HashingMemory.EVAL_MEMORY = eval_memory
    if eval_memory:
        all_mem_att = {k: [] for k, _ in self.memory_list}

    # store hypotheses to compute BLEU scores
    if eval_bleu:
        hypothesis = []
        back_hypothesis = []
    for batch in self.get_iterator(data_set, lang1, lang2):

        # generate batch
        (x1, len1), (x2, len2) = batch
        langs1 = x1.clone().fill_(lang1_id)
        langs2 = x2.clone().fill_(lang2_id)

        # target words to predict
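        # the target sequence is shifted by one position below, so that each unmasked
        # position is evaluated on predicting the token that follows it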
        alen = torch.arange(len2.max(), dtype=torch.long, device=len2.device)
        pred_mask = alen[:, None] < len2[None] - 1  # do not predict anything given the last target word
        y = x2[1:].masked_select(pred_mask[:-1])
        assert len(y) == (len2 - 1).sum().item()

        # cuda
        x1, len1, langs1, x2, len2, langs2, y = to_cuda(x1, len1, langs1, x2, len2, langs2, y)
        # encode source sentence
        enc1 = encoder('fwd', x=x1, lengths=len1, langs=langs1, causal=False)
        enc1 = enc1.transpose(0, 1)
        enc1 = enc1.half() if params.fp16 else enc1
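        # the encoder output is sequence-first; it is transposed to batch-first, the layout
        # the decoder expects for src_enc, and cast to half precision to match the decoder
        # when fp16 is enabled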
        # decode target sentence
        dec2 = decoder('fwd', x=x2, lengths=len2, langs=langs2, causal=True, src_enc=enc1, src_len=len1)

        # loss
        word_scores, loss = decoder('predict', tensor=dec2, pred_mask=pred_mask, y=y, get_scores=True)
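        # word_scores holds the output-vocabulary scores at each masked position; loss is the
        # mean cross-entropy against y, re-scaled to a sum via `* len(y)` below so that a
        # corpus-level perplexity can be computed at the end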
        # update stats
        n_words += y.size(0)
        xe_loss += loss.item() * len(y)
        n_valid += (word_scores.max(1)[1] == y).sum().item()
        if eval_memory:
            for k, v in self.memory_list:
                all_mem_att[k].append((v.last_indices, v.last_scores))

        # generate translation - translate / convert to text
        if eval_bleu:
            max_len = int(1.5 * len1.max().item() + 10)
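            # generation is capped at 1.5x the longest source length plus 10 tokens;
            # greedy decoding when beam_size == 1, beam search otherwise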
            if params.beam_size == 1:
                generated, lengths = decoder.generate(enc1, len1, lang2_id, max_len=max_len)
            else:
                generated, lengths = decoder.generate_beam(
                    enc1, len1, lang2_id, beam_size=params.beam_size,
                    length_penalty=params.length_penalty,
                    early_stopping=params.early_stopping,
                    max_len=max_len
                )
            hypothesis.extend(convert_to_text(generated, lengths, self.dico, params))
            # Back-BLEU: re-encode the generated sentence and translate it back to lang1
            langs2_generated = generated.clone().fill_(lang2_id)
            enc2 = encoder('fwd', x=generated, lengths=lengths, langs=langs2_generated, causal=False)
            enc2 = enc2.transpose(0, 1)
            enc2 = enc2.half() if params.fp16 else enc2
            if params.beam_size == 1:
                back_generated, back_lengths = decoder.generate(enc2, lengths, lang1_id, max_len=max_len)
            else:
                back_generated, back_lengths = decoder.generate_beam(
                    enc2, lengths, lang1_id, beam_size=params.beam_size,
                    length_penalty=params.length_penalty,
                    early_stopping=params.early_stopping,
                    max_len=max_len
                )
            back_hypothesis.extend(convert_to_text(back_generated, back_lengths, self.dico, params))
    # compute perplexity and prediction accuracy
    scores['%s_%s-%s_mt_ppl' % (data_set, lang1, lang2)] = np.exp(xe_loss / n_words)
    scores['%s_%s-%s_mt_acc' % (data_set, lang1, lang2)] = 100. * n_valid / n_words
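    # perplexity is exp(total cross-entropy / number of predicted tokens); accuracy is the
    # percentage of target tokens whose argmax prediction matches the reference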
    # compute memory usage
    if eval_memory:
        for mem_name, mem_att in all_mem_att.items():
            eval_memory_usage(scores, '%s_%s-%s_%s' % (data_set, lang1, lang2, mem_name), mem_att, params.mem_size)

    # compute BLEU
    if eval_bleu:

        # hypothesis / reference paths
        hyp_name = 'hyp{0}.{1}-{2}.{3}.txt'.format(scores['epoch'], lang1, lang2, data_set)
        hyp_path = os.path.join(params.hyp_path, hyp_name)
        back_hyp_name = 'hyp{0}.{1}-{2}-{3}.{4}.txt'.format(scores['epoch'], lang1, lang2, lang1, data_set)
        back_hyp_path = os.path.join(params.hyp_path, back_hyp_name)
        ref_path = params.ref_paths[(lang1, lang2, data_set)]
        input_path = params.ref_paths[(lang2, lang1, data_set)]
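        # ref_path is the target-side reference of lang1 -> lang2, while input_path (the
        # reference of the reverse direction) is the source-side input; the latter is used
        # for back-BLEU, input-BLEU and the heuristic statistics below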
        # export sentences to hypothesis files / restore BPE segmentation
        with open(hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(hypothesis) + '\n')
        restore_segmentation(hyp_path)
        with open(back_hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(back_hypothesis) + '\n')
        restore_segmentation(back_hyp_path)

        # evaluate BLEU score
        bleu = eval_moses_bleu(ref_path, hyp_path)
        logger.info("BLEU %s %s : %f" % (hyp_path, ref_path, bleu))
        scores['%s_%s-%s_mt_bleu' % (data_set, lang1, lang2)] = bleu

        # evaluate Back-BLEU score
        back_bleu = eval_moses_bleu(input_path, back_hyp_path)
        logger.info("Back-BLEU %s %s : %f" % (back_hyp_path, input_path, back_bleu))
        scores['%s_%s-%s-%s_mt_back_bleu' % (data_set, lang1, lang2, lang1)] = back_bleu
        # ratio of mean generated length to mean training-target length (1 is ideal)
        hyp_mean_num_words = mean_num_words(hyp_path)
        train_tgt_path = f"{params.data_path.rstrip('/').rsplit('/', 1)[0]}/train.{lang2}.tok"
        if os.path.exists(train_tgt_path):
            train_tgt_mean_num_words = mean_num_words(train_tgt_path)
            scores['%s_%s-%s_mt_hyp2train_num_words_ratio' % (data_set, lang1, lang2)] = hyp_mean_num_words / train_tgt_mean_num_words

        # BLEU between hypotheses and the input (should be neither too high nor too low)
        input_bleu = eval_moses_bleu(input_path, hyp_path)
        logger.info("Input BLEU %s %s : %f" % (hyp_path, input_path, input_bleu))
        scores['%s_%s-%s_mt_input_bleu' % (data_set, lang1, lang2)] = input_bleu
        # other unsupervised statistics, computed against the input or on the hypotheses alone
        hyp_lines = read_lines_from_path(hyp_path)
        input_lines = read_lines_from_path(input_path)
        back_hyp_lines = read_lines_from_path(back_hyp_path)
        doubles, contains, unchanged, too_few_qs, too_many_qs, all_q_words_in_subq, subq_longer_than_q, bads = 0, 0, 0, 0, 0, 0, 0, 0
        good_inps, good_hyps, good_back_hyps = [], [], []
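        # heuristic filters applied below: a hypothesis is marked "bad" if it has fewer than
        # two '?' (too few sub-questions), more than two (unless params.one_to_variable allows
        # a variable number), repeats the same sub-question twice, has a sub-question that
        # contains every input word or is at least as long as the input, or contains the
        # input verbatim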
        for inp, hyp, back_hyp in zip(input_lines, hyp_lines, back_hyp_lines):
            bad = False
            if hyp.count('?') == 2:
                # strip surrounding whitespace so the duplicate check below is not defeated
                # by the space that typically follows the first '?'
                l, r, _ = hyp.split('?')
                l = l.strip() + '?'
                r = r.strip() + '?'
                if l == r:
                    doubles += 1
                    bad = True  # not strictly necessary to include doubles in the "bad" criteria
                l_toks = l.split()
                r_toks = r.split()
                inp_toks = inp.split()
                for subq_toks in [l_toks, r_toks]:
                    if set(inp_toks).issubset(set(subq_toks)):
                        all_q_words_in_subq += 1
                        bad = True
                        break
                for subq_toks in [l_toks, r_toks]:
                    if len(subq_toks) >= len(inp_toks):
                        subq_longer_than_q += 1
                        bad = True
                        break
            elif hyp.count('?') < 2:
                too_few_qs += 1
                bad = True
            else:
                too_many_qs += 1
                if not self.params.one_to_variable:
                    bad = True
            if inp in hyp:
                contains += 1
                bad = True
            if inp == hyp:
                unchanged += 1
            bads += bad
            if not bad:
                good_inps.append(inp)
                good_hyps.append(hyp)
                good_back_hyps.append(back_hyp)
        scores['%s_%s-%s_mt_doubles' % (data_set, lang1, lang2)] = 100. * doubles / len(hyp_lines)
        scores['%s_%s-%s_mt_contains' % (data_set, lang1, lang2)] = 100. * contains / len(hyp_lines)
        scores['%s_%s-%s_mt_unchanged' % (data_set, lang1, lang2)] = 100. * unchanged / len(hyp_lines)
        scores['%s_%s-%s_mt_too_few_qs' % (data_set, lang1, lang2)] = 100. * too_few_qs / len(hyp_lines)
        scores['%s_%s-%s_mt_too_many_qs' % (data_set, lang1, lang2)] = 100. * too_many_qs / len(hyp_lines)
        scores['%s_%s-%s_mt_all_q_words_in_subq' % (data_set, lang1, lang2)] = 100. * all_q_words_in_subq / len(hyp_lines)
        scores['%s_%s-%s_mt_subq_longer_than_q' % (data_set, lang1, lang2)] = 100. * subq_longer_than_q / len(hyp_lines)
        scores['%s_%s-%s_mt_bads' % (data_set, lang1, lang2)] = 100. * bads / len(hyp_lines)
        # evaluate BLEU score on good generations
        good_hyp_path = hyp_path.replace('.txt', '.good.txt')
        with open(good_hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(good_hyps) + '\n')
        good_inp_path = good_hyp_path.replace(f'/hyp{scores["epoch"]}', f'/ref{scores["epoch"]}')
        with open(good_inp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(good_inps) + '\n')
        good_back_hyp_path = back_hyp_path.replace('.txt', '.good.txt')
        with open(good_back_hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(good_back_hyps) + '\n')
        goods_frac = 1. - (bads / len(hyp_lines))
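        # goods_frac is the fraction of hypotheses that pass all heuristics; the "effective"
        # scores below multiply BLEU on the good subset by this fraction, so a model cannot
        # improve them simply by producing fewer, easier "good" generations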
        goods_input_bleu = eval_moses_bleu(good_inp_path, good_hyp_path)
        logger.info("Input BLEU on Good Hyps %s %s : %f" % (good_hyp_path, good_inp_path, goods_input_bleu))
        scores['%s_%s-%s_mt_goods_input_bleu' % (data_set, lang1, lang2)] = goods_input_bleu
        scores['%s_%s-%s_mt_effective_goods_input_bleu' % (data_set, lang1, lang2)] = goods_input_bleu * goods_frac
        goods_back_bleu = eval_moses_bleu(good_inp_path, good_back_hyp_path)
        logger.info("Back-BLEU on Good Hyps %s %s : %f" % (good_back_hyp_path, good_inp_path, goods_back_bleu))
        scores['%s_%s-%s-%s_mt_goods_back_bleu' % (data_set, lang1, lang2, lang1)] = goods_back_bleu
        scores['%s_%s-%s-%s_mt_effective_goods_back_bleu' % (data_set, lang1, lang2, lang1)] = goods_back_bleu * goods_frac