in XLM/src/evaluation/evaluator.py [0:0]
def run_all_evals(self, trainer):
    """
    Run all evaluations.
    """
    params = self.params
    scores = OrderedDict({'epoch': trainer.epoch})

    with torch.no_grad():

        for data_set in ['valid', 'test']:

            # causal prediction task (evaluate perplexity and accuracy)
            for lang1, lang2 in params.clm_steps:
                self.evaluate_clm(scores, data_set, lang1, lang2)

            # prediction task (evaluate perplexity and accuracy)
            for lang1, lang2 in params.mlm_steps:
                self.evaluate_mlm(scores, data_set, lang1, lang2)

            # machine translation task (evaluate perplexity and accuracy)
            for lang1, lang2 in set(params.mt_steps + [(l2, l3) for _, l2, l3 in params.bt_steps]):
                eval_bleu = params.eval_bleu and params.is_master and 'cl' not in lang1
                eval_computation = params.eval_computation and params.is_master and 'cl' not in lang1
                self.evaluate_mt(scores, data_set, lang1, lang2, eval_bleu, eval_computation)

            # report average metrics per language
            _clm_mono = [l1 for (l1, l2) in params.clm_steps if l2 is None]
            if len(_clm_mono) > 0:
                scores['%s_clm_ppl' % data_set] = np.mean(
                    [scores['%s_%s_clm_ppl' % (data_set, lang)] for lang in _clm_mono])
                scores['%s_clm_acc' % data_set] = np.mean(
                    [scores['%s_%s_clm_acc' % (data_set, lang)] for lang in _clm_mono])
            _mlm_mono = [l1 for (l1, l2) in params.mlm_steps if l2 is None]
            if len(_mlm_mono) > 0:
                scores['%s_mlm_ppl' % data_set] = np.mean(
                    [scores['%s_%s_mlm_ppl' % (data_set, lang)] for lang in _mlm_mono])
                scores['%s_mlm_acc' % data_set] = np.mean(
                    [scores['%s_%s_mlm_acc' % (data_set, lang)] for lang in _mlm_mono])

    return scores
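
For reference, a minimal sketch of how the OrderedDict returned above can be consumed at the end of an epoch. The helper name log_eval_scores and the logging pattern are assumptions (modeled on the end-of-epoch loop in XLM's train.py), not part of this file; key names such as 'valid_en_clm_ppl' follow the '%s_%s_clm_ppl' format visible in the method body.

    # Sketch (assumption): log per-metric values and a single machine-readable line.
    import json
    import logging

    logger = logging.getLogger(__name__)

    def log_eval_scores(scores, is_master=True):
        # print individual metrics, e.g. 'epoch', 'valid_en_clm_ppl', 'test_en_mlm_acc'
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        # on the master process, also emit one JSON line for downstream parsing
        if is_master:
            logger.info("__log__:%s" % json.dumps(dict(scores)))

A typical call site would be log_eval_scores(evaluator.run_all_evals(trainer), is_master=params.is_master), run once per epoch after training steps complete.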