def generate_rl_eval_results()

in jat/utils.py


def generate_rl_eval_results(evaluations: Dict[str, List[float]]) -> List[EvalResult]:
    """
    Generate a list of `EvalResult` objects summarizing per-task and per-domain
    evaluation scores, including IQM aggregates with confidence intervals.

    Args:
        evaluations (`Dict[str, List[float]]`):
            Dictionary mapping each task name to its list of evaluation scores (total rewards).

    Returns:
        `List[EvalResult]`:
            A list of `EvalResult` objects, one per reported metric.
    """
    eval_results = []

    # Aggregate the results
    for domain in ["atari", "babyai", "metaworld", "mujoco"]:
        domain_scores = {
            task_name: scores for task_name, scores in evaluations.items() if task_name.startswith(domain)
        }

        if not domain_scores:
            continue

        # Normalize the scores
        norm_scores = {
            task_name: normalize(np.array(scores), task_name, "expert") for task_name, scores in domain_scores.items()
        }
        # Exclude tasks for which normalization returned None
        norm_scores = {k: v for k, v in norm_scores.items() if v is not None}

        # Compute the stratified interquartile mean and confidence interval
        scores_dict = {"a": np.array(list(norm_scores.values())).T}

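        # rliable expects a callable that maps a (num_runs, num_tasks) score
        # matrix to an array of aggregate metrics; here only the IQM is reported.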
        def aggregate_func(x):
            return np.array([metrics.aggregate_iqm(x)])

        aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(scores_dict, aggregate_func)
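        # For each key, the CI array has shape (2, num_metrics): row 0 holds the
        # lower bounds and row 1 the upper bounds of the confidence interval.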
        iqm, low, high = aggregate_scores["a"][0], aggregate_score_cis["a"][0][0], aggregate_score_cis["a"][1][0]

        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=domain,
                dataset_name=PRETTY_DOMAIN_NAMES[domain],
                metric_type="iqm_expert_normalized_total_reward",
                metric_name="IQM expert normalized total reward",
                metric_value=f"{iqm:.2f} [{low:.2f}, {high:.2f}]",
            )
        )

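    # In addition to the expert-normalized aggregates above, report a
    # human-normalized IQM for the Atari domain.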
    atari_scores = {task_name: scores for task_name, scores in evaluations.items() if task_name.startswith("atari")}

    # Normalize the scores
    norm_scores = {
        task_name: normalize(np.array(scores), task_name, "human") for task_name, scores in atari_scores.items()
    }

    # Compute the stratified interquartile mean and confidence interval
    scores_dict = {"a": np.array(list(norm_scores.values())).T}

    def aggregate_func(x):
        return np.array([metrics.aggregate_iqm(x)])

    aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(scores_dict, aggregate_func)
    iqm, low, high = aggregate_scores["a"][0], aggregate_score_cis["a"][0][0], aggregate_score_cis["a"][1][0]

    eval_results.append(
        EvalResult(
            task_type="reinforcement-learning",
            task_name="Reinforcement Learning",
            dataset_type="atari",
            dataset_name=PRETTY_DOMAIN_NAMES["atari"],
            metric_type="iqm_human_normalized_total_reward",
            metric_name="IQM human normalized total reward",
            metric_value=f"{iqm:.2f} [{low:.2f}, {high:.2f}]",
        )
    )

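    # Per-task raw total reward: mean and standard deviation over the recorded scores.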
    for task_name, scores in evaluations.items():
        mean_reward = np.mean(scores)
        std_reward = np.std(scores)

        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=task_name,
                dataset_name=PRETTY_TASK_NAMES[task_name],
                metric_type="total_reward",
                metric_name="Total reward",
                metric_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
            )
        )

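    # Per-task expert-normalized total reward (tasks without expert reference scores are skipped).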
    for task_name, scores in evaluations.items():
        norm_scores = normalize(np.array(scores), task_name, "expert")
        if norm_scores is None:
            continue
        mean_scores = np.mean(norm_scores)
        std_scores = np.std(norm_scores)

        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=task_name,
                dataset_name=PRETTY_TASK_NAMES[task_name],
                metric_type="expert_normalized_total_reward",
                metric_name="Expert normalized total reward",
                metric_value=f"{mean_scores:.2f} +/- {std_scores:.2f}",
            )
        )

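    # Per-task human-normalized total reward, for Atari tasks only.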
    for task_name, scores in evaluations.items():
        if not task_name.startswith("atari"):
            continue
        norm_scores = normalize(np.array(scores), task_name, "human")
        mean_scores = np.mean(norm_scores)
        std_scores = np.std(norm_scores)

        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=task_name,
                dataset_name=PRETTY_TASK_NAMES[task_name],
                metric_type="human_normalized_total_reward",
                metric_name="Human normalized total reward",
                metric_value=f"{mean_scores:.2f} +/- {std_scores:.2f}",
            )
        )

    return eval_results
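
Example usage (a minimal sketch; the task names and scores below are hypothetical
and only illustrate the expected input format, a mapping from task name to a list
of evaluation scores):

from jat.utils import generate_rl_eval_results

# Hypothetical evaluation results: real keys must match the task names known to
# normalize() and PRETTY_TASK_NAMES.
evaluations = {
    "atari-pong": [17.0, 21.0, 19.0],
    "mujoco-walker": [3200.0, 2950.0, 3100.0],
}

eval_results = generate_rl_eval_results(evaluations)
for result in eval_results:
    print(result.dataset_name, result.metric_name, result.metric_value)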