in evals/elsuite/bugged_tools/eval.py [0:0]
def _log_additional_metrics(self, metrics: Sequence[Event], results: dict):
"""
Modifies results in-place, breaks results down per tool and per bug
"""
    all_tools = list(set([j for i in metrics for j in i["tools"]]))
    all_bugs = list(set([j for i in metrics for j in i["bugs"]]))

    # Log bug-detection metrics per type of tool
    for tool in all_tools:
        filtered_metrics = [i for i in metrics if i["tools"][0] == tool]
        tp, fp, tn, fn, accuracy, precision, recall, f1 = precision_recall_fscore(
            filtered_metrics
        )
        results[f"tool_{tool}_f1"] = f1
        results[f"tool_{tool}_precision"] = precision
        results[f"tool_{tool}_recall"] = recall
        results[f"tool_{tool}_accuracy"] = accuracy
        results[f"tool_{tool}_tp"] = tp
        results[f"tool_{tool}_fp"] = fp
        results[f"tool_{tool}_tn"] = tn
        results[f"tool_{tool}_fn"] = fn

    # Log bug-detection metrics per type of bug. Only log accuracy, since every
    # example here is positive (bugged)
    for bug in all_bugs:
        filtered_metrics = [i for i in metrics if len(i["bugs"]) > 0]
        filtered_metrics = [i for i in filtered_metrics if i["bugs"][0] == bug]
        tp, fp, tn, fn, accuracy, precision, recall, f1 = precision_recall_fscore(
            filtered_metrics
        )
        results[f"bug_{bug}_accuracy"] = accuracy
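The `precision_recall_fscore` helper is defined elsewhere in the eval and is not shown here. Below is a minimal sketch of how such a helper could compute the returned tuple, assuming ground truth is a non-empty "bugs" list and the solver's verdict lives in a hypothetical "solver_flagged_bug" field; both field names are assumptions for illustration, not the real implementation.

from typing import Sequence


def precision_recall_fscore(metrics: Sequence[dict]):
    # Illustrative sketch only: "solver_flagged_bug" is an assumed field name.
    tp = fp = tn = fn = 0
    for m in metrics:
        actually_bugged = len(m["bugs"]) > 0      # ground truth: any bug present
        flagged = bool(m["solver_flagged_bug"])   # assumed prediction field
        if actually_bugged and flagged:
            tp += 1
        elif actually_bugged and not flagged:
            fn += 1
        elif not actually_bugged and flagged:
            fp += 1
        else:
            tn += 1

    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return tp, fp, tn, fn, accuracy, precision, recall, f1

With metrics of that shape, the per-tool loop writes keys like results[f"tool_{tool}_f1"] for each tool seen in the run, while the per-bug loop keeps only accuracy because its filtered set contains no negative (unbugged) examples.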