in gym/gym/benchmarks/scoring.py [0:0]
def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
scores = {}
solves = {}
start_times = []
end_times = []
elapsed_times = []
# N.B. for each env_id, our benchmark_results will have a list of scores,
# solves, and times corresponding to the different tasks for that env_id. If
# we don't have enough trials, we zero out the score.
# TODO could do smarter matching of results to trials if we have extras
# TODO for now, baked in assumption that the number of trials is the
# same for all tasks involving a particular env.
for env_id in benchmark.env_ids:
task_list = benchmark.task_specs(env_id)
num_trials = task_list[0].trials
benchmark_results = env_id_to_benchmark_results.get(env_id, [])
for trial in range(num_trials):
if trial < len(benchmark_results):
# okay process this benchmark result against this trial
benchmark_result = benchmark_results[trial]
env_scores = scores.setdefault(env_id, [])
env_scores.append(benchmark_result['scores'])
# note: solves is a list of lists - for each task for this env,
# does each episode solve that task. We consider the env solved
# if every episode for every task is individually solved.
solved = solves.setdefault(env_id, True)
solves[env_id] = solved and np.sum(benchmark_result['solves'])
# these timestamps are a list of the first / last valid timestamp
# for each task involving this env.
start_times.append(benchmark_result['initial_reset_timestamp'])
end_times.append(max(benchmark_result['timestamps']))
elapsed_times.extend(benchmark_result['elapsed_times'])
else:
# no matching benchmark result for this trial
# TODOJT bug?
env_scores = scores.setdefault(env_id, [])
env_scores.append([benchmark.scorer.null_score for _ in task_list])
solves[env_id] = False
score = benchmark.score_benchmark(scores)
num_envs_solved = len([s for s in solves.values() if s])
start_to_finish_seconds = max(end_times) - min(start_times) if end_times and start_times else 0.0
summed_task_wall_time = np.sum([end - start for end, start in zip(end_times, start_times)])
summed_training_seconds = np.sum(elapsed_times)
return dict(
score=score,
num_envs_solved=num_envs_solved,
start_to_finish_seconds=start_to_finish_seconds,
summed_task_wall_time=summed_task_wall_time,
summed_training_seconds=summed_training_seconds,
)