def _evaluate_sample()

in evals/elsuite/identifying_variables/eval.py [0:0]
35 lines of code
11 McCabe index (conditional complexity)

    def _evaluate_sample(self, preds: Optional[Answer], gold: Answer, num_not_ctrl: int) -> Dict:
        """
        If the gold hypothesis is invalid, then all other metrics are skipped, and we
        only evaluate whether the solver correctly identified the hypothesis as invalid.

        Mistakes are propagated: If the solver incorrectly identifies a hypothesis as
        invalid, then its missing answers for the remaining tasks are counted as wrong.

        In case of violations, the worst possible metrics are recorded, accounting for
        the gold hypothesis validity caveat above (e.g. if the gold hypothesis is
        invalid, then the worst case ctrl_nDCG is NaN since we'd skip this anyway,
        whereas if the gold hypothesis were valid, then the worst case ctrl_nDCG would
        be 0.0)
        """
        hyp_valid_correct = preds.valid_hypothesis == gold.valid_hypothesis if preds else False

        if gold.valid_hypothesis:
            ind_correct = preds.ind_var == gold.ind_var if preds else False
            dep_correct = preds.dep_var == gold.dep_var if preds else False
            ctrl_nDCG = (
                self._ctrl_vars_nDCG(preds.ctrl_vars, gold.ctrl_vars, num_not_ctrl)
                if preds and preds.ctrl_vars is not None
                else 0.0
            )
            ctrl_recall = (
                self._ctrl_vars_recall(preds.ctrl_vars, gold.ctrl_vars)
                if preds and preds.ctrl_vars is not None
                else 0.0
            )
            # not in final report, since experiments had already been run
            ctrl_fallout = (
                self._ctrl_vars_fallout(preds.ctrl_vars, gold.ctrl_vars, num_not_ctrl)
                if preds and preds.ctrl_vars is not None
                else 1.0
            )

        else:
            ctrl_nDCG = np.nan
            ctrl_recall = np.nan
            ctrl_fallout = np.nan
            ind_correct = np.nan
            dep_correct = np.nan

        return {
            "ctrl_nDCG": ctrl_nDCG,
            "ctrl_recall": ctrl_recall,
            "ctrl_fallout": ctrl_fallout,
            "ind_correct": ind_correct,
            "dep_correct": dep_correct,
            "hyp_valid_correct": hyp_valid_correct,
            "violation": preds is None,
        }