in jat/utils.py [0:0]
def generate_rl_eval_results(evaluations: Dict[str, List[float]]) -> List[EvalResult]:
    """
    Generate a list of EvalResult objects.

    Args:
        evaluations (`Dict[str, List[float]]`):
            Dictionary mapping task names (prefixed with their domain, e.g. ``"atari-..."``,
            ``"babyai-..."``, ``"metaworld-..."``, ``"mujoco-..."``) to the list of episode
            total rewards collected for that task.

    Returns:
        `List[EvalResult]`:
            One expert-normalized IQM aggregate per domain, a human-normalized IQM
            aggregate for Atari, then per-task raw, expert-normalized, and (Atari only)
            human-normalized total-reward results.
    """

    def _iqm_with_ci(norm_scores: Dict[str, np.ndarray]):
        """Return (iqm, ci_low, ci_high) from a stratified bootstrap over the task scores."""
        # rliable expects a (num_runs, num_tasks) matrix per algorithm key.
        scores_dict = {"a": np.array(list(norm_scores.values())).T}

        def aggregate_func(x):
            return np.array([metrics.aggregate_iqm(x)])

        aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(scores_dict, aggregate_func)
        return aggregate_scores["a"][0], aggregate_score_cis["a"][0][0], aggregate_score_cis["a"][1][0]

    eval_results = []

    # Aggregate the expert-normalized results, one IQM entry per domain
    for domain in ["atari", "babyai", "metaworld", "mujoco"]:
        domain_scores = {
            task_name: scores for task_name, scores in evaluations.items() if task_name.startswith(domain)
        }
        if not domain_scores:
            continue
        # Normalize the scores
        norm_scores = {
            task_name: normalize(np.array(scores), task_name, "expert") for task_name, scores in domain_scores.items()
        }
        # Exclude tasks for which no expert reference exists (normalize returns None)
        norm_scores = {k: v for k, v in norm_scores.items() if v is not None}
        if not norm_scores:
            continue
        # Compute the stratified interquartile mean and confidence interval
        iqm, low, high = _iqm_with_ci(norm_scores)
        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=domain,
                dataset_name=PRETTY_DOMAIN_NAMES[domain],
                metric_type="iqm_expert_normalized_total_reward",
                metric_name="IQM expert normalized total reward",
                metric_value=f"{iqm:.2f} [{low:.2f}, {high:.2f}]",
            )
        )

    # Aggregate the human-normalized results for Atari
    atari_scores = {task_name: scores for task_name, scores in evaluations.items() if task_name.startswith("atari")}
    # Normalize the scores
    norm_scores = {
        task_name: normalize(np.array(scores), task_name, "human") for task_name, scores in atari_scores.items()
    }
    # Exclude tasks with no human reference; skip entirely when there are no Atari tasks
    # (the unguarded version crashed on an empty score matrix).
    norm_scores = {k: v for k, v in norm_scores.items() if v is not None}
    if norm_scores:
        # Compute the stratified interquartile mean and confidence interval
        iqm, low, high = _iqm_with_ci(norm_scores)
        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type="atari",
                dataset_name=PRETTY_DOMAIN_NAMES["atari"],
                metric_type="iqm_human_normalized_total_reward",
                metric_name="IQM human normalized total reward",
                metric_value=f"{iqm:.2f} [{low:.2f}, {high:.2f}]",
            )
        )

    # Per-task raw total reward (mean +/- std over episodes)
    for task_name, scores in evaluations.items():
        mean_reward = np.mean(scores)
        std_reward = np.std(scores)
        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=task_name,
                dataset_name=PRETTY_TASK_NAMES[task_name],
                metric_type="total_reward",
                metric_name="Total reward",
                metric_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
            )
        )

    # Per-task expert-normalized total reward
    for task_name, scores in evaluations.items():
        norm_scores = normalize(np.array(scores), task_name, "expert")
        if norm_scores is None:
            continue
        mean_scores = np.mean(norm_scores)
        std_scores = np.std(norm_scores)
        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=task_name,
                dataset_name=PRETTY_TASK_NAMES[task_name],
                metric_type="expert_normalized_total_reward",
                metric_name="Expert normalized total reward",
                metric_value=f"{mean_scores:.2f} +/- {std_scores:.2f}",
            )
        )

    # Per-task human-normalized total reward (Atari only)
    for task_name, scores in evaluations.items():
        if not task_name.startswith("atari"):
            continue
        norm_scores = normalize(np.array(scores), task_name, "human")
        if norm_scores is None:  # guard matches the expert loop above; avoids np.mean(None)
            continue
        mean_scores = np.mean(norm_scores)
        std_scores = np.std(norm_scores)
        eval_results.append(
            EvalResult(
                task_type="reinforcement-learning",
                task_name="Reinforcement Learning",
                dataset_type=task_name,
                dataset_name=PRETTY_TASK_NAMES[task_name],
                metric_type="human_normalized_total_reward",
                metric_name="Human normalized total reward",
                metric_value=f"{mean_scores:.2f} +/- {std_scores:.2f}",
            )
        )
    return eval_results