def _get_sample_std()

in evals/elsuite/function_deduction/eval.py [0:0]


    def _get_sample_std(self, metrics):
        """Aggregate per-sample variability across repeated attempts.

        ``metrics`` is a list of per-attempt dicts with ``sample_ix``,
        ``success`` and ``num_rounds`` keys; attempts are grouped by
        ``sample_ix`` before standard deviations are computed.
        """
        adjusted = []
        no_failed = []
        solved_ratio_if_any_solved = []
        sample_ixs = set(metric["sample_ix"] for metric in metrics)
        for sample_ix in sample_ixs:
            sample_metrics = [metric for metric in metrics if metric["sample_ix"] == sample_ix]
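            # Rounds per attempt, substituting the failure penalty when the
            # attempt did not solve the sample (num_rounds is None).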
            sample_adjusted = [
                metric["num_rounds"] or self.failed_sample_rounds for metric in sample_metrics
            ]
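            # Rounds per attempt, counting successful attempts only.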
            sample_no_failed = [
                metric["num_rounds"] for metric in sample_metrics if metric["success"]
            ]
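            # Fraction of attempts on this sample that succeeded.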
            solved_ratio = sum(1 for metric in sample_metrics if metric["success"]) / len(
                sample_metrics
            )

            if len(sample_adjusted) > 1:
                adjusted.append(np.std(sample_adjusted))
            if len(sample_no_failed) > 1:
                no_failed.append(np.std(sample_no_failed))
            if solved_ratio > 0:
                solved_ratio_if_any_solved.append(solved_ratio)

        return {
            "avg_sample_rounds_std_adjusted": sum(adjusted) / len(adjusted) if adjusted else None,
            "avg_sample_rounds_std_no_failed": sum(no_failed) / len(no_failed)
            if no_failed
            else None,
            #   This is just solved_ratio, but excluding samples that had no successful attempt.
            #   So 1 means full stability (i.e. if a sample was solved once, it is solved always),
            #   and (1/self.n_repeat) means "no sample was solved more than once".
            "solved_ratio_if_any_solved": sum(solved_ratio_if_any_solved)
            / len(solved_ratio_if_any_solved)
            if solved_ratio_if_any_solved
            else None,
        }
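
Usage sketch (not from the source): the dict keys below mirror the code above, while the penalty of 20 rounds for a failed attempt and the concrete metric values are assumptions for illustration. It reproduces the per-sample aggregation so the returned averages can be checked by hand.

import numpy as np

# Hypothetical stand-in for self.failed_sample_rounds.
FAILED_SAMPLE_ROUNDS = 20

# Per-attempt metrics: two attempts each on two samples.
metrics = [
    {"sample_ix": 0, "success": True, "num_rounds": 4},
    {"sample_ix": 0, "success": False, "num_rounds": None},
    {"sample_ix": 1, "success": False, "num_rounds": None},
    {"sample_ix": 1, "success": False, "num_rounds": None},
]

for sample_ix in sorted({m["sample_ix"] for m in metrics}):
    attempts = [m for m in metrics if m["sample_ix"] == sample_ix]
    # Failed attempts (num_rounds is None) count as the penalty value.
    adjusted_rounds = [m["num_rounds"] or FAILED_SAMPLE_ROUNDS for m in attempts]
    solved_ratio = sum(m["success"] for m in attempts) / len(attempts)
    print(sample_ix, np.std(adjusted_rounds), solved_ratio)

# Prints "0 8.0 0.5" and "1 0.0 0.0", so the method above would return
# avg_sample_rounds_std_adjusted == 4.0, solved_ratio_if_any_solved == 0.5
# (only sample 0 was ever solved), and avg_sample_rounds_std_no_failed is
# None because no sample was solved more than once.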