in fairdiplomacy/utils/h2h_sweep.py [0:0]
def maybe_launch_and_get_metrics(self, exp: H2HItem) -> Optional[Dict[str, Union[str, float]]]:
    num_powers_in_game = {"CLASSIC": 7, "FVA": 2}[self.VARIANT]
    num_powers_being_tested = (
        len(self.ACTIVE_POWERS) if self.ACTIVE_POWERS else num_powers_in_game
    )
    print(exp)
    num_seeds = exp.num_seeds or self.NUM_SEEDS
    total_games = num_seeds * num_powers_being_tested
    eval_run = self.maybe_launch(exp)
    # Caching aggregation of .json files. For runs that are already done,
    # we don't want to re-gather results.
    cache_path = eval_run.out_dir / "cache.pth"
    cache_key = (
        frozenset(p.name for p in eval_run.out_dir.iterdir() if p.name != cache_path.name),
        num_seeds,
        total_games,
    )
    power_scores_list = None
    if cache_path.exists():
        cache_content = torch.load(cache_path)
        if cache_content["key"] != cache_key:
            print("Invalidating", cache_path)
            cache_path.unlink()
        else:
            power_scores_list = cache_content["power_scores_list"]
    if power_scores_list is None:
        power_scores_list = fairdiplomacy.compare_agents_array.get_power_scores_from_folder(
            eval_run.out_dir,
        )
    if not power_scores_list:
        metrics = {}
        num_missing = total_games
    else:
        _, scores_list = zip(*power_scores_list)
        means, stds = fairdiplomacy.utils.game_scoring.average_game_scores(scores_list)
        num_missing = total_games - means.num_games
        metrics = means._asdict()
        metrics.update((f"{k}_err", v) for k, v in stds._asdict().items())
        metrics["square_score_std"] = f"%.{self.PRECISION}f+-%.{self.PRECISION}f" % (
            means.square_score,
            stds.square_score,
        )
        # +/- 1 standard_error confidence interval
        metrics["square_score_1sigma"] = f"%.{self.PRECISION}f:%.{self.PRECISION}f" % (
            means.square_score - stds.square_score,
            means.square_score + stds.square_score,
        )
        # +/- 2 standard_error confidence interval
        metrics["square_score_2sigma"] = f"%.{self.PRECISION}f:%.{self.PRECISION}f" % (
            means.square_score - stds.square_score * 2,
            means.square_score + stds.square_score * 2,
        )
        # number of standard errors away from null hypothesis of 1/num_powers
        if stds.square_score > 0:
            metrics["null_sigmas"] = f"%.{self.PRECISION}f" % (
                (means.square_score - 1 / num_powers_in_game) / stds.square_score
            )
        else:
            metrics["null_sigmas"] = ""
        if num_missing:
            metrics["square_score_std"] += "*"
            metrics["square_score_1sigma"] += "*"
            metrics["square_score_2sigma"] += "*"
            metrics["null_sigmas"] += "*"
    if not num_missing and not cache_path.exists():
        print("Saving cache", cache_path)
        torch.save(dict(key=cache_key, power_scores_list=power_scores_list), cache_path)
    metrics["progress"] = "%s/%s" % (total_games - num_missing, total_games)
    metrics["num_missing"] = num_missing
    metrics["total_games"] = total_games
    metrics["folder"] = str(eval_run.out_dir)
    return metrics
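
# Illustrative sketch (not part of the original file): a minimal, self-contained
# rendition of the summary statistics assembled above, assuming per-game square
# scores in [0, 1] and a 7-power CLASSIC game. The function name and arguments
# are hypothetical; the real pipeline takes its means and standard errors from
# fairdiplomacy.utils.game_scoring.average_game_scores.
def _sketch_square_score_summary(square_scores, num_powers_in_game=7, precision=3):
    import statistics

    mean = statistics.mean(square_scores)
    # Standard error of the mean, the analogue of stds.square_score above.
    std_err = statistics.stdev(square_scores) / len(square_scores) ** 0.5
    return {
        "square_score_std": f"{mean:.{precision}f}+-{std_err:.{precision}f}",
        "square_score_1sigma": f"{mean - std_err:.{precision}f}:{mean + std_err:.{precision}f}",
        "square_score_2sigma": f"{mean - 2 * std_err:.{precision}f}:{mean + 2 * std_err:.{precision}f}",
        # Standard errors above the "no edge" baseline of an even 1/num_powers split.
        "null_sigmas": f"{(mean - 1 / num_powers_in_game) / std_err:.{precision}f}" if std_err > 0 else "",
    }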