in evals/elsuite/identifying_variables/eval.py
def _compute_agg_metrics(self, metrics: List[Dict]) -> Dict[str, float]:
    """
    Computes aggregate metrics across all samples.
    """
    main_metrics = {
        "hyp_valid_acc": np.mean([x["hyp_valid_correct"] for x in metrics]),
        "violation_count": np.sum([x["violation"] for x in metrics]),
        "violation_rate": np.mean([x["violation"] for x in metrics]),
        # Some per-sample values are NaN when the target hypothesis is
        # invalid, so use NaN-aware aggregation to skip those samples.
        "ctrl_nDCG": np.nanmean([x["ctrl_nDCG"] for x in metrics]),
        "ctrl_recall": np.nanmean([x["ctrl_recall"] for x in metrics]),
        "ctrl_fallout": np.nanmean([x["ctrl_fallout"] for x in metrics]),
        "ind_acc": np.nanmean([x["ind_correct"] for x in metrics]),
        "dep_acc": np.nanmean([x["dep_correct"] for x in metrics]),
        "n_valid_hyp": np.sum([x["valid_hyp"] for x in metrics]),
    }
    # Optionally add metrics computed over groups of samples.
    if self.group_metrics:
        grouped_metrics = self._compute_grouped_metrics(metrics)
    else:
        grouped_metrics = {}
    # Merge and cast numpy scalars to plain Python floats for reporting.
    total_metrics = {**main_metrics, **grouped_metrics}
    total_metrics = {k: float(v) for k, v in total_metrics.items()}
    return total_metrics
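A minimal, self-contained sketch of the per-sample records this aggregation expects, with made-up values purely for illustration (only numpy is assumed); it shows how the NaN-aware reductions skip samples whose target hypothesis is invalid:

```python
import numpy as np

# Hypothetical per-sample metric dicts with the keys used above.
metrics = [
    {
        "hyp_valid_correct": 1, "violation": 0, "valid_hyp": 1,
        "ctrl_nDCG": 0.9, "ctrl_recall": 1.0, "ctrl_fallout": 0.0,
        "ind_correct": 1, "dep_correct": 1,
    },
    {
        # Invalid target hypothesis: variable-level metrics are NaN and
        # are ignored by the np.nanmean calls.
        "hyp_valid_correct": 1, "violation": 0, "valid_hyp": 0,
        "ctrl_nDCG": np.nan, "ctrl_recall": np.nan, "ctrl_fallout": np.nan,
        "ind_correct": np.nan, "dep_correct": np.nan,
    },
]

print(np.mean([m["hyp_valid_correct"] for m in metrics]))  # 1.0
print(np.nanmean([m["ctrl_nDCG"] for m in metrics]))       # 0.9 (NaN sample skipped)
print(np.sum([m["valid_hyp"] for m in metrics]))           # 1
```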