def evaluate()

in aepsych/benchmark/problem.py [0:0]


    def evaluate(self, strat: "aepsych.strategy.SequentialStrategy") -> "Dict[str, float]":
        """Evaluate the strategy with respect to this problem.

        Computes error metrics between the true latent function ``f`` (and its
        probability-space transform ``p = norm.cdf(f)``) and the strategy's
        estimate — both at the point estimate (``self.f_hat``) and in
        expectation over 1000 posterior samples drawn at ``self.eval_grid``.

        Extend this in subclasses to add additional metrics.

        Args:
            strat (aepsych.strategy.SequentialStrategy): Strategy to evaluate.

        Returns:
            Dict[str, float]: A dictionary containing metrics and their values.

        Raises:
            AssertionError: If the shapes of the true and estimated ``f``
                arrays do not match.
        """
        # always eval f
        f_true = self.f_true().numpy()
        f_hat = self.f_hat(strat).detach().numpy()
        # Explicit check rather than `assert` so the guard survives `python -O`;
        # AssertionError is kept for backward compatibility with callers.
        if f_true.shape != f_hat.shape:
            raise AssertionError(
                f"f_true.shape=={f_true.shape} != f_hat.shape=={f_hat.shape}"
            )
        # Probability-space counterparts via the standard normal CDF
        # (probit link).
        p_true = norm.cdf(f_true)
        p_hat = norm.cdf(f_hat)

        # Pointwise errors of the point estimate; compute the difference
        # arrays once and reuse them for each metric.
        err_f = f_true - f_hat
        err_p = p_true - p_hat
        mae_f = np.mean(np.abs(err_f))
        mse_f = np.mean(err_f ** 2)
        max_abs_err_f = np.max(np.abs(err_f))
        corr_f = pearsonr(f_true.flatten(), f_hat.flatten())[0]
        mae_p = np.mean(np.abs(err_p))
        mse_p = np.mean(err_p ** 2)
        max_abs_err_p = np.max(np.abs(err_p))
        corr_p = pearsonr(p_true.flatten(), p_hat.flatten())[0]

        # eval in samp-based expectation over posterior instead of just mean
        fsamps = strat.sample(self.eval_grid, num_samples=1000).detach().numpy()
        ferrs = fsamps - f_true[None, :]
        miae_f = np.mean(np.abs(ferrs))
        mise_f = np.mean(ferrs ** 2)

        perrs = norm.cdf(fsamps) - norm.cdf(f_true[None, :])
        miae_p = np.mean(np.abs(perrs))
        mise_p = np.mean(perrs ** 2)

        metrics = {
            "mean_abs_err_f": mae_f,
            "mean_integrated_abs_err_f": miae_f,
            "mean_square_err_f": mse_f,
            "mean_integrated_square_err_f": mise_f,
            "max_abs_err_f": max_abs_err_f,
            "pearson_corr_f": corr_f,
            "mean_abs_err_p": mae_p,
            "mean_integrated_abs_err_p": miae_p,
            "mean_square_err_p": mse_p,
            "mean_integrated_square_err_p": mise_p,
            "max_abs_err_p": max_abs_err_p,
            "pearson_corr_p": corr_p,
        }

        # Cast NumPy scalars to plain Python floats so the return value
        # actually matches the declared Dict[str, float] signature (and
        # serializes cleanly, e.g. to JSON).
        return {k: float(v) for k, v in metrics.items()}