def evaluate()

in aepsych/benchmark/problem.py
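
Note: this method relies on names imported at module level in problem.py, namely numpy as np, scipy.stats.norm, scipy.stats.pearsonr, and the grid/contour helpers dim_grid and get_lse_contour (likely from aepsych.utils).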


    def evaluate(self, strat: aepsych.strategy.SequentialStrategy) -> Dict[str, float]:
        """Evaluate the strategy with respect to this problem.

        Args:
            strat (aepsych.strategy.SequentialStrategy): Strategy to evaluate.

        Returns:
            Dict[str, float]: A dictionary mapping metric names to values,
            including parent class metrics plus threshold-estimation errors:
            mean/max absolute error, mean squared error, Pearson correlation
            (for multi-dimensional problems), and sample-based integrated errors.
        """
        metrics = super().evaluate(strat)
        assert (
            strat.has_model
        ), "Can only evaluate a strat that has an underlying model!"
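        # Threshold probability level and per-dimension grid size; both can be
        # overridden through the problem's options dict.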
        thresh = self.options.get("thresh", 0.75)
        gridsize = self.options.get("gridsize", 10)
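        # Posterior mean of the latent function on the dense evaluation grid;
        # below it is reshaped to the grid and squashed through the standard
        # normal CDF to get response probabilities.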
        post_mean, _ = strat.predict(self.eval_grid)

        dim = self.eval_grid.shape[1]
        post_mean_reshape = post_mean.reshape((gridsize,) * dim)
        phi_post_mean = norm.cdf(post_mean_reshape.detach().numpy())
        # assume mono_dim is last dim (TODO make this better)
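        # x1 is a 1D grid along the monotone (last) dimension; x2_hat is the
        # estimated location of the `thresh`-level contour along it.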

        x1 = dim_grid(
            lower=strat.lb.numpy()[-1],
            upper=strat.ub.numpy()[-1],
            dim=1,
            gridsize=gridsize,
        ).squeeze()
        x2_hat = get_lse_contour(phi_post_mean, x1, level=thresh, lb=-1.0, ub=1.0)

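        # Ground truth: evaluate the true latent function on the same grid and
        # extract its threshold contour the same way.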
        true_f = self.f(self.eval_grid)

        true_f_reshape = true_f.reshape((gridsize,) * dim)
        true_x2 = get_lse_contour(
            norm.cdf(true_f_reshape), x1, level=thresh, lb=-1.0, ub=1.0
        )
        assert x2_hat.shape == true_x2.shape, (
            "x2_hat.shape != true_x2.shape, something went wrong! "
            + f"x2_hat.shape={x2_hat.shape}, true_x2.shape={true_x2.shape}"
        )
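        # Pointwise errors between the estimated and true threshold contours.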
        mae = np.mean(np.abs(true_x2 - x2_hat))
        mse = np.mean((true_x2 - x2_hat) ** 2)
        max_abs_err = np.max(np.abs(true_x2 - x2_hat))

        metrics["mean_abs_err_thresh"] = mae
        metrics["mean_square_err_thresh"] = mse
        metrics["max_abs_err_thresh"] = max_abs_err

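        # Correlation between contours is only defined when the contour has
        # more than one point, i.e. for problems with dim > 1.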
        if dim != 1:
            corr = pearsonr(true_x2.flatten(), x2_hat.flatten())[0]
            metrics["pearson_corr_thresh"] = corr

        # now construct integrated error on thresh
        fsamps = strat.sample(self.eval_grid, num_samples=1000).detach().numpy()

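        # Extract a contour from each posterior sample to get a sample-based
        # (integrated) estimate of the threshold error.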
        square_samps = [s.reshape((gridsize,) * strat.model.dim) for s in fsamps]
        contours = np.stack(
            [
                get_lse_contour(norm.cdf(s), x1, level=thresh, mono_dim=-1, lb=-1, ub=1)
                for s in square_samps
            ]
        )
        thresh_err = contours - true_x2[None, :]

        metrics["mean_integrated_abs_err_thresh"] = np.mean(np.abs(thresh_err))
        metrics["mean_integrated_square_err_thresh"] = np.mean(thresh_err ** 2)

        return metrics
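
A minimal usage sketch (not from the source): `problem` stands in for a concrete Problem subclass instance and `strat` for a SequentialStrategy that has already been run; setting options directly is shown only for illustration and may instead be populated from the benchmark configuration.

    # Sketch only: `problem` and `strat` are assumed to exist already.
    # evaluate() requires a fitted model, i.e. strat.has_model must be True.
    problem.options = {"thresh": 0.75, "gridsize": 10}  # the defaults read above
    metrics = problem.evaluate(strat)
    print(metrics["mean_abs_err_thresh"])             # pointwise contour MAE
    print(metrics["mean_integrated_abs_err_thresh"])  # MAE averaged over posterior samples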