in codegen_sources/model/src/evaluation/evaluator.py [0:0]
def run_all_evals(self, trainer):
"""
Run all evaluations.
"""
params = self.params
scores = OrderedDict({"epoch": trainer.epoch})
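        # evaluate deobfuscation at the preset probabilities in EVAL_OBF_PROBAS, and
        # always include the probability implied by the training rate (1 - obf_proba)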
        deobf_probas_to_eval = list(EVAL_OBF_PROBAS)  # copy so the shared constant is not mutated
        deobfuscation_proba = 1 - params.obf_proba
        if deobfuscation_proba not in deobf_probas_to_eval:
            deobf_probas_to_eval.append(deobfuscation_proba)
with torch.no_grad():
for data_set in EVAL_DATASET_SPLITS:
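                # each evaluation below is run separately on every data split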
# causal prediction task (evaluate perplexity and accuracy)
for lang1, lang2 in params.clm_steps:
self.evaluate_clm(scores, data_set, lang1, lang2)
                # masked language model (MLM) prediction task (evaluate perplexity and accuracy)
for lang1, lang2 in params.mlm_steps:
self.evaluate_mlm(scores, data_set, lang1, lang2)
# machine translation task (evaluate perplexity and accuracy)
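                # directions to evaluate: explicit MT steps, target pairs of
                # back-translation steps, both directions of self-training steps
                # (including pairs between self-training targets), and MT-with-spans steps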
for keys in set(
params.mt_steps
+ [(l2, l3) for _, l2, l3 in params.bt_steps]
+ [(l1, l2) for l1, langs2 in params.st_steps for l2 in langs2]
+ [(l2, l1) for l1, langs2 in params.st_steps for l2 in langs2]
+ [
(l2_1, l2_2)
for l1, langs2 in params.st_steps
for l2_1 in langs2
for l2_2 in langs2
if l2_1 != l2_2
]
+ params.mt_spans_steps
):
                    spans = None
                    assert len(keys) in (2, 3), keys
                    lang1, lang2 = keys[0], keys[1]
                    if len(keys) == 3:
                        spans = keys[2]  # only mt_spans_steps entries carry a third, spans element
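                    # perplexity/accuracy are always computed; BLEU, computational
                    # accuracy and subtoken scores follow the corresponding eval_* flags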
self.evaluate_mt(
scores,
data_set,
lang1,
lang2,
params.eval_bleu,
params.eval_computation,
params.eval_subtoken_score,
spans,
)
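                # denoising auto-encoding: evaluate each AE language against itself,
                # skipping BLEU / computational accuracy / subtoken scores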
if self.params.eval_denoising:
for lang in set(params.ae_steps):
assert lang in params.langs, lang
self.evaluate_mt(
scores,
data_set,
lang,
lang,
eval_bleu=False,
eval_computation=False,
eval_subtoken_score=False,
span=None,
)
                # deobfuscation task (evaluate perplexity and accuracy)
for lang1, lang2 in set(params.do_steps):
assert len(deobf_probas_to_eval) == len(
set(deobf_probas_to_eval)
), f"deobf_probas_to_eval should have no duplicates, was {deobf_probas_to_eval}"
self.evaluate_mt(
scores,
data_set,
lang1,
lang2,
params.eval_bleu,
eval_computation=False,
eval_subtoken_score=params.eval_subtoken_score,
span=None,
deobfuscate=True,
deobfuscate_probas=deobf_probas_to_eval,
)
                # classification task
for lang1, lang2 in params.classif_steps:
self.evaluate_classif(scores, data_set, lang1, lang2)
# report average metrics per language
if len(params.do_steps) > 0 and params.is_master:
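                    # average the per-pair deobfuscation subtoken scores over all
                    # do_steps directions, one entry per obfuscation probability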
for obfuscation_proba in deobf_probas_to_eval:
for score_type in ["precision", "recall", "F1"]:
scores[
"%s_obf_proba_%s_mt_subtoken_%s"
% (data_set, 1 - obfuscation_proba, score_type)
] = np.mean(
[
scores[
"%s_%s_mt_subtoken_%s"
% (
data_set,
get_l1l2_string(
lang1, lang2, obfuscation_proba
),
score_type,
)
]
for lang1, lang2 in params.do_steps
]
)
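                # average CLM perplexity and accuracy over the monolingual CLM steps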
_clm_mono = [l1 for (l1, l2) in params.clm_steps if l2 is None]
if len(_clm_mono) > 0:
scores["%s_clm_ppl" % data_set] = np.mean(
[
scores["%s_%s_clm_ppl" % (data_set, lang)]
for lang in _clm_mono
]
)
scores["%s_clm_acc" % data_set] = np.mean(
[
scores["%s_%s_clm_acc" % (data_set, lang)]
for lang in _clm_mono
]
)
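                # average MLM perplexity and accuracy over the monolingual MLM steps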
_mlm_mono = [l1 for (l1, l2) in params.mlm_steps if l2 is None]
if len(_mlm_mono) > 0:
scores["%s_mlm_ppl" % data_set] = np.mean(
[
scores["%s_%s_mlm_ppl" % (data_set, lang)]
for lang in _mlm_mono
]
)
scores["%s_mlm_acc" % data_set] = np.mean(
[
scores["%s_%s_mlm_acc" % (data_set, lang)]
for lang in _mlm_mono
]
)
return scores