in evals/elsuite/identifying_variables/eval.py [0:0]
def _evaluate_sample(self, preds: Optional[Answer], gold: Answer, num_not_ctrl: int) -> Dict:
    """
    Score one solver answer against the gold answer.

    If the gold hypothesis is invalid, then all other metrics are skipped, and we
    only evaluate whether the solver correctly identified the hypothesis as invalid.
    Mistakes are propagated: If the solver incorrectly identifies a hypothesis as
    invalid, then its missing answers for the remaining tasks are counted as wrong.
    In case of violations, the worst possible metrics are recorded, accounting for
    the gold hypothesis validity caveat above (e.g. if the gold hypothesis is
    invalid, then the worst case ctrl_nDCG is NaN since we'd skip this anyway,
    whereas if the gold hypothesis were valid, then the worst case ctrl_nDCG would
    be 0.0)

    Args:
        preds: The solver's answer, or None. None is reported as a violation.
        gold: The reference answer.
        num_not_ctrl: Forwarded unchanged to the nDCG and fallout helpers.

    Returns:
        A dict with the keys "ctrl_nDCG", "ctrl_recall", "ctrl_fallout",
        "ind_correct", "dep_correct", "hyp_valid_correct" and "violation".
        The first five are NaN whenever the gold hypothesis is invalid.
    """
    # Test for None explicitly rather than by truthiness, so that the metrics
    # always agree with the "violation" flag below (which is `preds is None`).
    has_preds = preds is not None

    hyp_valid_correct = (
        preds.valid_hypothesis == gold.valid_hypothesis if has_preds else False
    )

    if gold.valid_hypothesis:
        ind_correct = preds.ind_var == gold.ind_var if has_preds else False
        dep_correct = preds.dep_var == gold.dep_var if has_preds else False

        if has_preds and preds.ctrl_vars is not None:
            ctrl_nDCG = self._ctrl_vars_nDCG(preds.ctrl_vars, gold.ctrl_vars, num_not_ctrl)
            ctrl_recall = self._ctrl_vars_recall(preds.ctrl_vars, gold.ctrl_vars)
            # not in final report, since experiments had already been run
            ctrl_fallout = self._ctrl_vars_fallout(
                preds.ctrl_vars, gold.ctrl_vars, num_not_ctrl
            )
        else:
            # No control variables to score: record the worst value of each
            # metric (fallout is worst at 1.0, the other two at 0.0).
            ctrl_nDCG = 0.0
            ctrl_recall = 0.0
            ctrl_fallout = 1.0
    else:
        # Gold hypothesis is invalid, so these metrics are skipped (NaN).
        ctrl_nDCG = np.nan
        ctrl_recall = np.nan
        ctrl_fallout = np.nan
        ind_correct = np.nan
        dep_correct = np.nan

    return {
        "ctrl_nDCG": ctrl_nDCG,
        "ctrl_recall": ctrl_recall,
        "ctrl_fallout": ctrl_fallout,
        "ind_correct": ind_correct,
        "dep_correct": dep_correct,
        "hyp_valid_correct": hyp_valid_correct,
        "violation": preds is None,
    }