in evals/elsuite/error_recovery/scripts/make_plots.py [0:0]
def performance_loss_per_task(metrics_df: pd.DataFrame, results_df: pd.DataFrame, out_dir: Path):
    """Plot per-task performance loss for each model as a grouped bar chart.

    For every (model, task) pair, computes the performance loss (and its error)
    from the "CR_correct" / "IR_correct" columns of `metrics_df`, then renders
    one bar group per task with one bar per model (with error bars) and saves
    the figure to ``out_dir / "results_split_by_model.png"``.

    Args:
        metrics_df: Per-sample metrics; must contain columns "solver", "task",
            "CR_correct", and "IR_correct".
        results_df: Results table used only to enumerate the models and tasks
            to plot.
        out_dir: Directory the PNG is written into.
    """
    unique_models = get_unique_models(results_df)
    # NOTE(review): the original also called get_all_tasks(results_df) here and
    # discarded the result — removed as a dead call.
    all_tasks_renamed = get_all_tasks_renamed(results_df)
    all_tasks_pretty = [TASK_NAMES[i] for i in all_tasks_renamed]

    # Build {pretty model name -> [loss per task]} and the matching errors.
    all_metrics = {}
    all_errors = {}
    for model in unique_models:
        metrics = []
        errors = []
        for task in all_tasks_renamed:
            model_mask = metrics_df.solver == model
            task_mask = metrics_df.task == task
            CR_corrects = metrics_df[model_mask & task_mask]["CR_correct"]
            IR_corrects = metrics_df[model_mask & task_mask]["IR_correct"]
            performance_loss, performance_loss_error = corrects_to_performance_loss_and_error(
                CR_corrects, IR_corrects
            )
            metrics.append(performance_loss)
            errors.append(performance_loss_error)
        pretty_model_name = MODEL_NAMES[model]
        all_metrics[pretty_model_name] = metrics
        all_errors[pretty_model_name] = errors

    fig, ax = plt.subplots(figsize=(20, 6), constrained_layout=True)
    plot_df = pd.DataFrame(all_metrics, index=all_tasks_pretty)
    errs_df = pd.DataFrame(all_errors, index=all_tasks_pretty)
    # Colors are keyed on the raw model ids, in the same order as the columns.
    colors = [MODEL_COLOR_MAP[model] for model in unique_models]
    ax = plot_df.plot.bar(rot=0.0, color=colors, ax=ax, width=0.8, yerr=errs_df, capsize=4)
    annotate_axes(ax, errs_df)

    # Shrink current axis by 20% to make room for the legend
    box = ax.get_position()
    ax.set_position((box.x0, box.y0, box.width * 0.8, box.height))
    # Loss is in [-1, 1]; a little headroom at the top for the annotations.
    ax.set_ylim(bottom=-1, top=1.1)
    ax.legend()
    # Zero line separates improvement (below) from degradation (above).
    ax.axhline(0, 0, 1, color="black", linestyle="-")
    ax.set_title("Performance loss per task (lower is better)")
    ax.set_xlabel("Task type")
    ax.set_ylabel("Performance loss")

    outpath = out_dir / "results_split_by_model.png"
    fig.savefig(outpath)
    maybe_show(fig)