def maybe_launch_and_get_metrics()

in fairdiplomacy/utils/h2h_sweep.py [0:0]


    def maybe_launch_and_get_metrics(self, exp: H2HItem) -> Optional[Dict[str, Union[str, float]]]:
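        # One evaluation game is played per (seed, tested power) pair; if
        # ACTIVE_POWERS is unset, every power in the variant is tested.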
        num_powers_in_game = {"CLASSIC": 7, "FVA": 2}[self.VARIANT]
        num_powers_being_tested = (
            len(self.ACTIVE_POWERS) if self.ACTIVE_POWERS else num_powers_in_game
        )
        print(exp)
        num_seeds = exp.num_seeds or self.NUM_SEEDS
        total_games = num_seeds * num_powers_being_tested
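        # Launch the evaluation for this matchup if it is not already running
        # (hence "maybe") and get a handle whose out_dir holds per-game results.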
        eval_run = self.maybe_launch(exp)

        # Cache the aggregation of the per-game .json result files so that runs
        # which are already done don't have to re-gather results on every call.
        cache_path = eval_run.out_dir / "cache.pth"
        cache_key = (
            frozenset(p.name for p in eval_run.out_dir.iterdir() if p.name != cache_path.name),
            num_seeds,
            total_games,
        )
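        # The key is the set of files currently in out_dir (minus the cache itself)
        # plus the requested game counts, so new results or a changed sweep config
        # invalidate the cached aggregation below.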

        power_scores_list = None
        if cache_path.exists():
            cache_content = torch.load(cache_path)
            if cache_content["key"] != cache_key:
                print("Invalidating", cache_path)
                cache_path.unlink()
            else:
                power_scores_list = cache_content["power_scores_list"]
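        # No valid cache: aggregate per-game scores from the output folder.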
        if power_scores_list is None:
            power_scores_list = fairdiplomacy.compare_agents_array.get_power_scores_from_folder(
                eval_run.out_dir,
            )

        if not power_scores_list:
            metrics = {}
            num_missing = total_games
        else:
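            # power_scores_list holds (power, game_scores) pairs. Average the scores
            # across finished games: means populate the base metrics, and the
            # standard errors are stored under "<field>_err".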
            _, scores_list = zip(*power_scores_list)
            means, stds = fairdiplomacy.utils.game_scoring.average_game_scores(scores_list)
            num_missing = total_games - means.num_games
            metrics = means._asdict()
            metrics.update((f"{k}_err", v) for k, v in stds._asdict().items())
            metrics["square_score_std"] = f"%.{self.PRECISION}f+-%.{self.PRECISION}f" % (
                means.square_score,
                stds.square_score,
            )
            # +/- 1 standard error confidence interval
            metrics["square_score_1sigma"] = f"%.{self.PRECISION}f:%.{self.PRECISION}f" % (
                means.square_score - stds.square_score,
                means.square_score + stds.square_score,
            )
            # +/- 2 standard error confidence interval
            metrics["square_score_2sigma"] = f"%.{self.PRECISION}f:%.{self.PRECISION}f" % (
                means.square_score - stds.square_score * 2,
                means.square_score + stds.square_score * 2,
            )
            # number of standard errors away from the null hypothesis score of 1/num_powers_in_game
            if stds.square_score > 0:
                metrics["null_sigmas"] = f"%.{self.PRECISION}f" % (
                    (means.square_score - 1 / num_powers_in_game) / stds.square_score
                )
            else:
                metrics["null_sigmas"] = ""
            if num_missing:
                metrics["square_score_std"] += "*"
                metrics["square_score_1sigma"] += "*"
                metrics["square_score_2sigma"] += "*"
                metrics["null_sigmas"] += "*"
        if not num_missing and not cache_path.exists():
            print("Saving cache", cache_path)
            torch.save(dict(key=cache_key, power_scores_list=power_scores_list), cache_path)
        metrics["progress"] = "%s/%s" % (total_games - num_missing, total_games)
        metrics["num_missing"] = num_missing
        metrics["total_games"] = total_games
        metrics["folder"] = str(eval_run.out_dir)
        return metrics
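
As a quick, self-contained illustration of the interval strings built above (not taken from this file), the snippet below recomputes square_score_1sigma, square_score_2sigma, and null_sigmas for made-up values of the mean square score and its standard error in a 7-power CLASSIC game:

    # Hypothetical numbers, for illustration only.
    precision = 3
    mean_square_score = 0.21  # average square score across finished games
    std_err = 0.015           # standard error of that mean
    num_powers_in_game = 7    # CLASSIC variant

    fmt = f"%.{precision}f"
    one_sigma = f"{fmt % (mean_square_score - std_err)}:{fmt % (mean_square_score + std_err)}"
    two_sigma = f"{fmt % (mean_square_score - 2 * std_err)}:{fmt % (mean_square_score + 2 * std_err)}"
    null_sigmas = fmt % ((mean_square_score - 1 / num_powers_in_game) / std_err)

    print(one_sigma)    # 0.195:0.225
    print(two_sigma)    # 0.180:0.240
    print(null_sigmas)  # 4.476 -> about 4.5 standard errors above the 1/7 null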