in evals/elsuite/function_deduction/eval.py
def _get_sample_std(self, metrics):
    # Per-sample spread across the repeated attempts for each sample:
    adjusted = []  # std of num_rounds, with failed attempts counted as self.failed_sample_rounds
    no_failed = []  # std of num_rounds over successful attempts only
    solved_ratio_if_any_solved = []

    sample_ixs = set(metric["sample_ix"] for metric in metrics)
    for sample_ix in sample_ixs:
        sample_metrics = [metric for metric in metrics if metric["sample_ix"] == sample_ix]
        sample_adjusted = [
            metric["num_rounds"] or self.failed_sample_rounds for metric in sample_metrics
        ]
        sample_no_failed = [
            metric["num_rounds"] for metric in sample_metrics if metric["success"]
        ]
        solved_ratio = sum(1 for metric in sample_metrics if metric["success"]) / len(
            sample_metrics
        )

        if len(sample_adjusted) > 1:
            adjusted.append(np.std(sample_adjusted))
        if len(sample_no_failed) > 1:
            no_failed.append(np.std(sample_no_failed))
        if solved_ratio:
            solved_ratio_if_any_solved.append(solved_ratio)

    return {
        "avg_sample_rounds_std_adjusted": sum(adjusted) / len(adjusted) if adjusted else None,
        "avg_sample_rounds_std_no_failed": sum(no_failed) / len(no_failed)
        if no_failed
        else None,
        # This is solved_ratio, but excluding samples that had no successful attempt.
        # So 1 means full stability (if a sample was solved once, it was solved on every repeat),
        # and (1/self.n_repeat) means no sample was solved more than once.
"solved_ratio_if_any_solved": sum(solved_ratio_if_any_solved)
/ len(solved_ratio_if_any_solved)
if solved_ratio_if_any_solved
else None,
}
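
A minimal sketch (not part of eval.py) of how this aggregation behaves on toy per-sample metrics. Here `eval_instance` is assumed to be an already-configured instance of the eval class, and the `failed_sample_rounds=40` / three-repeats setup is an assumption for illustration only.

import numpy as np  # used by _get_sample_std via np.std

# Toy metrics: two samples, three repeats each.
metrics = [
    # sample 0: solved in 5 and 7 rounds, one failed repeat (num_rounds is None)
    {"sample_ix": 0, "success": True, "num_rounds": 5},
    {"sample_ix": 0, "success": True, "num_rounds": 7},
    {"sample_ix": 0, "success": False, "num_rounds": None},
    # sample 1: never solved on any repeat
    {"sample_ix": 1, "success": False, "num_rounds": None},
    {"sample_ix": 1, "success": False, "num_rounds": None},
    {"sample_ix": 1, "success": False, "num_rounds": None},
]

result = eval_instance._get_sample_std(metrics)
# avg_sample_rounds_std_adjusted: mean of np.std([5, 7, 40]) and np.std([40, 40, 40]),
#   i.e. failed repeats are counted as failed_sample_rounds (40 in this sketch)
# avg_sample_rounds_std_no_failed: np.std([5, 7]) only; sample 1 is skipped because
#   it has no successful repeats
# solved_ratio_if_any_solved: 2/3, since only sample 0 had any success (solved on 2 of 3 repeats)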