# bayesmark/experiment_analysis.py
# NOTE: module-level imports reconstructed for context; the exact paths below
# follow the bayesmark package layout and may differ from the original header.
import numpy as np
import xarray as xr

import bayesmark.quantiles as qt
import bayesmark.xr_util as xru
from bayesmark.np_util import linear_rescale, t_EB

# The dimension/variable-name constants (ITER, SUGGEST, TEST_CASE, METHOD,
# TRIAL, PERF_*, LB_*, UB_*, NORMED_*, EVAL_Q, ALPHA) and the helpers
# validate_agg_perf / get_perf_array are assumed to be defined in
# bayesmark.constants and elsewhere in this module.

def compute_aggregates(perf_da, baseline_ds, visible_perf_da=None):
"""Aggregate function evaluations in the experiments to get performance summaries of each method.
Parameters
----------
    perf_da : :class:`xarray:xarray.DataArray`
        Aggregated experimental results, with one entry per function evaluation, scored according to true
        loss (e.g., generalization). `perf_da` has dimensions ``(ITER, SUGGEST, TEST_CASE, METHOD, TRIAL)``
        and is assumed to have no NaN values.
    baseline_ds : :class:`xarray:xarray.Dataset`
        Dataset with baseline performance. It has variables ``(PERF_MED, PERF_MEAN, PERF_CLIP, PERF_BEST)``
        with dimensions ``(ITER, TEST_CASE)``, ``(ITER, TEST_CASE)``, ``(TEST_CASE,)``, and ``(TEST_CASE,)``,
        respectively. `PERF_MED` is a baseline of performance based on random search when using medians to
        summarize performance. Likewise, `PERF_MEAN` is for means. `PERF_CLIP` is an upper bound used to clip
        poor performance when using the mean. `PERF_BEST` is an estimate of the global minimum.
    visible_perf_da : :class:`xarray:xarray.DataArray`, optional
        Aggregated experimental results, with one entry per function evaluation, scored according to visible
        loss (e.g., validation). `visible_perf_da` has dimensions ``(ITER, SUGGEST, TEST_CASE, METHOD, TRIAL)``
        and is assumed to have no NaN values. If `None`, we set ``visible_perf_da = perf_da``.

    Returns
    -------
    agg_result : :class:`xarray:xarray.Dataset`
        Dataset with a summary of performance for each method and test case combination. Contains variables
        ``(PERF_MED, LB_MED, UB_MED, NORMED_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN)``,
        each with dimensions ``(ITER, METHOD, TEST_CASE)``. `PERF_MED` is a median summary of performance,
        with `LB_MED` and `UB_MED` as error bars. `NORMED_MED` is `PERF_MED` rescaled so that optimal
        performance is expected to be 0 and random search gives 1 at all `ITER`. Likewise, `PERF_MEAN`,
        `LB_MEAN`, `UB_MEAN`, and `NORMED_MEAN` are for mean performance.
    summary : :class:`xarray:xarray.Dataset`
        Dataset with an overall summary of the performance of each method. Contains variables
        ``(PERF_MED, LB_MED, UB_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN, LB_NORMED_MEAN, UB_NORMED_MEAN)``,
        each with dimensions ``(ITER, METHOD)``.
"""
validate_agg_perf(perf_da, min_trial=1)
assert isinstance(baseline_ds, xr.Dataset)
assert tuple(baseline_ds[PERF_BEST].dims) == (TEST_CASE,)
assert tuple(baseline_ds[PERF_CLIP].dims) == (TEST_CASE,)
assert tuple(baseline_ds[PERF_MED].dims) == (ITER, TEST_CASE)
assert tuple(baseline_ds[PERF_MEAN].dims) == (ITER, TEST_CASE)
assert xru.coord_compat((perf_da, baseline_ds), (ITER, TEST_CASE))
assert not any(np.any(np.isnan(baseline_ds[kk].values)) for kk in baseline_ds)
# Now actually get the aggregate performance numbers per test case
agg_result = xru.ds_like(
perf_da,
(PERF_MED, LB_MED, UB_MED, NORMED_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN),
(ITER, METHOD, TEST_CASE),
)
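    # Per-(ITER, TEST_CASE) normalizer: the random-search mean curve rescaled so
    # the estimated optimum maps to 0 and the clip value to 1; its mean over
    # test cases is used at the end to norm the overall summary.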
baseline_mean_da = xru.only_dataarray(xru.ds_like(perf_da, ["ref"], (ITER, TEST_CASE)))
    # Using .values here since it is clearer to work with raw items than xr objects for func_name
for func_name in perf_da.coords[TEST_CASE].values:
rand_perf_med = baseline_ds[PERF_MED].sel({TEST_CASE: func_name}, drop=True).values
rand_perf_mean = baseline_ds[PERF_MEAN].sel({TEST_CASE: func_name}, drop=True).values
best_opt = baseline_ds[PERF_BEST].sel({TEST_CASE: func_name}, drop=True).values
base_clip_val = baseline_ds[PERF_CLIP].sel({TEST_CASE: func_name}, drop=True).values
assert np.all(np.diff(rand_perf_med) <= 0), "Baseline should be decreasing with iteration"
assert np.all(np.diff(rand_perf_mean) <= 0), "Baseline should be decreasing with iteration"
assert np.all(rand_perf_med > best_opt)
assert np.all(rand_perf_mean > best_opt)
assert np.all(rand_perf_mean <= base_clip_val)
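        # Rescale the random-search mean onto [0, 1] (assumed linear_rescale
        # semantics): x -> (x - best_opt) / (base_clip_val - best_opt), with
        # enforce_bounds=False allowing values outside [0, 1].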
baseline_mean_da.loc[{TEST_CASE: func_name}] = linear_rescale(
rand_perf_mean, best_opt, base_clip_val, 0.0, 1.0, enforce_bounds=False
)
for method_name in perf_da.coords[METHOD].values:
            # Take the minimum over all suggestions at a given iter + sanity check perf_da
curr_da = perf_da.sel({METHOD: method_name, TEST_CASE: func_name}, drop=True)
assert curr_da.dims == (ITER, SUGGEST, TRIAL)
if visible_perf_da is None:
perf_array = get_perf_array(curr_da.values, curr_da.values)
curr_da_ = perf_da.sel({METHOD: method_name, TEST_CASE: func_name}, drop=True).min(dim=SUGGEST)
assert curr_da_.dims == (ITER, TRIAL)
perf_array_ = np.minimum.accumulate(curr_da_.values, axis=0)
assert np.allclose(perf_array, perf_array_)
else:
curr_visible_da = visible_perf_da.sel({METHOD: method_name, TEST_CASE: func_name}, drop=True)
assert curr_visible_da.dims == (ITER, SUGGEST, TRIAL)
perf_array = get_perf_array(curr_da.values, curr_visible_da.values)
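            # perf_array[i, t] holds, for iteration i of trial t, the true loss of
            # the suggestion that has scored best on the visible loss so far, i.e.,
            # a cumulative best-so-far curve (cf. the cummin check in the None branch).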
# Compute median perf and CI on it
med_perf, LB, UB = qt.quantile_and_CI(perf_array, EVAL_Q, alpha=ALPHA)
assert med_perf.shape == rand_perf_med.shape
agg_result[PERF_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = med_perf
agg_result[LB_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = LB
agg_result[UB_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = UB
# Now store normed version, which is better for aggregation
normed = linear_rescale(med_perf, best_opt, rand_perf_med, 0.0, 1.0, enforce_bounds=False)
agg_result[NORMED_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = normed
# Store normed mean version
normed = linear_rescale(perf_array, best_opt, base_clip_val, 0.0, 1.0, enforce_bounds=False)
            # Also clip the normalized score to [-1, 1] to limit the max influence
            # of a single run on the final average
            normed = np.clip(normed, -1.0, 1.0)
normed = np.mean(normed, axis=1)
agg_result[NORMED_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = normed
# Compute mean perf and CI on it
perf_array = np.minimum(base_clip_val, perf_array)
mean_perf = np.mean(perf_array, axis=1)
assert mean_perf.shape == rand_perf_mean.shape
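            # t_EB is taken to give the half-width of a two-sided (1 - ALPHA)
            # Student-t confidence interval on the mean (assumed semantics).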
EB = t_EB(perf_array, alpha=ALPHA, axis=1)
assert EB.shape == rand_perf_mean.shape
agg_result[PERF_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = mean_perf
agg_result[LB_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = mean_perf - EB
agg_result[UB_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = mean_perf + EB
assert not any(np.any(np.isnan(agg_result[kk].values)) for kk in agg_result)
    # Compute a summary score over all test cases to summarize the performance of each method
summary = xru.ds_like(
perf_da,
(PERF_MED, LB_MED, UB_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN, LB_NORMED_MEAN, UB_NORMED_MEAN),
(ITER, METHOD),
)
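    # Median of NORMED_MED across test cases, with its CI, per (ITER, METHOD):
    # apply_ufunc maps quantile_and_CI over the TEST_CASE core dimension.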
summary[PERF_MED], summary[LB_MED], summary[UB_MED] = xr.apply_ufunc(
qt.quantile_and_CI,
agg_result[NORMED_MED],
input_core_dims=[[TEST_CASE]],
kwargs={"q": EVAL_Q, "alpha": ALPHA},
output_core_dims=[[], [], []],
)
summary[PERF_MEAN] = agg_result[NORMED_MEAN].mean(dim=TEST_CASE)
EB = xr.apply_ufunc(t_EB, agg_result[NORMED_MEAN], input_core_dims=[[TEST_CASE]])
summary[LB_MEAN] = summary[PERF_MEAN] - EB
summary[UB_MEAN] = summary[PERF_MEAN] + EB
normalizer = baseline_mean_da.mean(dim=TEST_CASE)
summary[NORMED_MEAN] = summary[PERF_MEAN] / normalizer
summary[LB_NORMED_MEAN] = summary[LB_MEAN] / normalizer
summary[UB_NORMED_MEAN] = summary[UB_MEAN] / normalizer
assert all(tuple(summary[kk].dims) == (ITER, METHOD) for kk in summary)
return agg_result, summary
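

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): build tiny
# synthetic inputs with the expected dimensions and call compute_aggregates.
# All names, sizes, and values below are made up for demonstration, and it is
# assumed that validate_agg_perf only checks dimensions and NaN-freeness.
# Sample sizes are chosen large enough that the distribution-free CIs in
# quantile_and_CI are well defined at the usual ALPHA.
# ---------------------------------------------------------------------------
def _demo_compute_aggregates():
    rng = np.random.default_rng(0)
    n_iter, n_suggest, n_trial = 8, 2, 16
    test_cases = ["case_%d" % ii for ii in range(8)]
    methods = ["random-search", "my-optimizer"]

    # True-loss evaluations, one per (iteration, suggestion, test case, method, trial)
    perf_da = xr.DataArray(
        rng.uniform(0.5, 2.0, size=(n_iter, n_suggest, len(test_cases), len(methods), n_trial)),
        dims=(ITER, SUGGEST, TEST_CASE, METHOD, TRIAL),
        coords={
            ITER: np.arange(1, n_iter + 1),
            SUGGEST: np.arange(n_suggest),
            TEST_CASE: test_cases,
            METHOD: methods,
            TRIAL: np.arange(n_trial),
        },
    )

    # Baseline: the random-search curves must be decreasing in ITER and stay
    # above the estimated optimum (0 here), with the mean at or below the clip.
    decreasing = np.tile(np.linspace(2.0, 1.0, n_iter)[:, None], (1, len(test_cases)))
    baseline_ds = xr.Dataset(
        {
            PERF_MED: ((ITER, TEST_CASE), decreasing),
            PERF_MEAN: ((ITER, TEST_CASE), decreasing),
            PERF_BEST: ((TEST_CASE,), np.zeros(len(test_cases))),
            PERF_CLIP: ((TEST_CASE,), np.full(len(test_cases), 3.0)),
        },
        coords={ITER: np.arange(1, n_iter + 1), TEST_CASE: test_cases},
    )

    agg_result, summary = compute_aggregates(perf_da, baseline_ds)
    return agg_result, summary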