in bayesmark/experiment_analysis.py [0:0]
def main():
"""See README for instructions on calling analysis.
"""
description = "Analyze results from aggregated studies"
args = parse_args(general_parser(description))
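    # args maps CmdArgs options (e.g., db root, db name, verbosity) to their parsed values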
    # Metric used on leaderboard
    leaderboard_metric = cc.VISIBLE_TO_OPT
    logger.setLevel(logging.INFO)  # Note this is the module-wide logger
    if args[CmdArgs.verbose]:
        logger.addHandler(logging.StreamHandler())
    # Load in the eval data and sanity check
    perf_ds, meta = XRSerializer.load_derived(args[CmdArgs.db_root], db=args[CmdArgs.db], key=cc.EVAL_RESULTS)
    logger.info("Meta data from source file: %s" % str(meta["args"]))
    # Check if there is a baselines file, otherwise make one
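    # The baseline dataset provides the per-test-case reference scores used to normalize
    # results below; if it has not been computed yet, build it now via do_baseline.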
    if cc.BASELINE not in XRSerializer.get_derived_keys(args[CmdArgs.db_root], db=args[CmdArgs.db]):
        warnings.warn("Baselines not found. Need to construct baseline.")
        do_baseline(args)
    # Load in baseline scores data and sanity check (including compatibility with eval data)
    baseline_ds, meta_ref = XRSerializer.load_derived(args[CmdArgs.db_root], db=args[CmdArgs.db], key=cc.BASELINE)
    logger.info("baseline data from source ref file: %s" % str(meta_ref["args"]))
    # Check test case signatures match between eval data and baseline data
    sig_errs, signatures = analyze_signature_pair(meta["signature"], meta_ref["signature"])
    logger.info("Signature errors:\n%s" % sig_errs.to_string())
    print(json.dumps({"exp-anal sig errors": sig_errs.T.to_dict()}))
    # Subset baseline to only the test cases run in the experiments
    test_cases_run = perf_ds.coords[TEST_CASE].values.tolist()
    assert set(test_cases_run) <= set(
        baseline_ds.coords[TEST_CASE].values.tolist()
    ), "Data set contains test cases not found in baseline."
    baseline_ds = baseline_ds.sel({TEST_CASE: test_cases_run})
    # Also subset to allow shorter runs
    iters_run = perf_ds.coords[ITER].values.tolist()
    assert set(iters_run) <= set(
        baseline_ds.coords[ITER].values.tolist()
    ), "Data set not same batch size or too many iters compared to baseline."
    baseline_ds = baseline_ds.sel({ITER: iters_run})
    # Do the actual computation
    perf_visible = perf_ds[cc.VISIBLE_TO_OPT]
    agg_result = OrderedDict()
    summary = OrderedDict()
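    # Score each objective separately: compute_aggregates returns the per-test-case
    # aggregates and an overall summary, which are concatenated along a new OBJECTIVE
    # dimension once all objectives have been processed.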
    for metric_for_scoring in sorted(perf_ds):
        perf_da = perf_ds[metric_for_scoring]
        baseline_ds_ = baseline_ds.sel({OBJECTIVE: metric_for_scoring}, drop=True)
        agg_result[(metric_for_scoring,)], summary[(metric_for_scoring,)] = compute_aggregates(
            perf_da, baseline_ds_, perf_visible
        )
    agg_result = xru.ds_concat(agg_result, dims=(cc.OBJECTIVE,))
    summary = xru.ds_concat(summary, dims=(cc.OBJECTIVE,))
    for metric_for_scoring in sorted(perf_ds):
        # Print summary by problem
        # Recall that:
        # ... summary[PERF_MEAN] = agg_result[NORMED_MEAN].mean(dim=TEST_CASE)
        # ... summary[NORMED_MEAN] = summary[PERF_MEAN] / normalizer
        # Where normalizer is constant across all problems, optimizers
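        # Only the scores at the final iteration ([{ITER: -1}]) are reported in the
        # JSON dumps and log lines below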
print("Scores by problem (JSON):\n")
agg_df = agg_result[NORMED_MEAN].sel({cc.OBJECTIVE: metric_for_scoring}, drop=True)[{ITER: -1}].to_pandas().T
print(json.dumps({metric_for_scoring: agg_df.to_dict()}))
print("\n")
        final_score = summary[PERF_MED].sel({cc.OBJECTIVE: metric_for_scoring}, drop=True)[{ITER: -1}]
        logger.info("median score @ %d:\n%s" % (summary.sizes[ITER], xru.da_to_string(final_score)))
        final_score = summary[PERF_MEAN].sel({cc.OBJECTIVE: metric_for_scoring}, drop=True)[{ITER: -1}]
        logger.info("mean score @ %d:\n%s" % (summary.sizes[ITER], xru.da_to_string(final_score)))
        print("Final scores (JSON):\n")
        print(json.dumps({metric_for_scoring: final_score.to_series().to_dict()}))
        print("\n")
        final_score = summary[NORMED_MEAN].sel({cc.OBJECTIVE: metric_for_scoring}, drop=True)[{ITER: -1}]
        logger.info("normed mean score @ %d:\n%s" % (summary.sizes[ITER], xru.da_to_string(final_score)))
    # Now saving results
    meta = {"args": serializable_dict(args), "signature": signatures}
    XRSerializer.save_derived(agg_result, meta, args[CmdArgs.db_root], db=args[CmdArgs.db], key=cc.PERF_RESULTS)
    XRSerializer.save_derived(summary, meta, args[CmdArgs.db_root], db=args[CmdArgs.db], key=cc.MEAN_SCORE)
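    # The leaderboard score is 100 x (1 - loss), where loss is the mean normalized
    # performance on the visible-to-optimizer metric at the final iteration, so higher
    # is better and zero loss maps to a score of 100.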
    final_msg = xru.da_to_string(
        100 * (1.0 - summary[PERF_MEAN].sel({cc.OBJECTIVE: leaderboard_metric}, drop=True)[{ITER: -1}])
    )
    logger.info("-" * 20)
    logger.info("Final score `100 x (1-loss)` for leaderboard:\n%s" % final_msg)