in evaluation/datasets/common.py [0:0]
def eval(self, predictions: list, targets=None, perturb_prefix=None) -> dict:
    """
    Adapted from common.helpers.validate_prediction; computes accuracy / F1, etc.
    If targets is passed, weighted delta metrics are computed against the given
    targets, whose ids should be the original (unperturbed) example ids.
    """
    # load target examples
    target_examples = self.read_labels(perturb_prefix) if not targets else targets
    target_ids = [x["id"] for x in target_examples]
    # note: to compute accuracy with changed labels,
    # each perturbed example needs to have a unique id too
    target_labels_dict = {t["id"]: t["answer"] for t in target_examples}
    target_labels = [target_labels_dict[id] for id in target_ids]
    target_tags_dict = {t["id"]: t["tags"] for t in target_examples}
    target_tags = [target_tags_dict[id] for id in target_ids]
    predictions = [
        self.pred_field_converter(prediction) for prediction in predictions
    ]
    score_obj = {}  # score_obj keys always correspond to scores table columns
    if targets and perturb_prefix:  # i.e. compute perturb percentage
        predictions_dict = {id: [] for id in target_ids}
        try:
            id_mapping = self.read_labels(perturb_prefix)
            id_mapping = {m["id"]: m["input_id"] for m in id_mapping}
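            # id_mapping: perturbed example id -> original input_id, so each
            # perturbed prediction can be grouped under its unperturbed target id
            # (predictions becomes a list of lists aligned with target_ids)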
            for p in predictions:
                predictions_dict[id_mapping[p["id"]]].append(p["pred"])
            predictions = [predictions_dict[id] for id in target_ids]
        except KeyError as ex:
            logger.exception(f"Prediction and target file example mismatch: {ex}")
        except Exception as ex:
            logger.exception(f"Unknown exception {ex}")
        else:
            delta_metrics_dict = get_delta_metrics(
                self.task, predictions, target_labels, perturb_prefix
            )
            score_obj = {
                **delta_metrics_dict,
                "metadata_json": json.dumps(delta_metrics_dict),
            }
    else:  # compute normal eval metrics
        # validate alignment of prediction and target labels
        predictions = {p["id"]: p["pred"] for p in predictions}
        try:
            predictions = [predictions[id] for id in target_ids]
            assert len(predictions) == len(target_labels)
        except AssertionError:
            logger.exception("Prediction and target file length mismatch")
        except KeyError as ex:
            logger.exception(f"Prediction and target file example mismatch: {ex}")
        except Exception as ex:
            logger.exception(f"Unknown exception {ex}")
        else:
            # Get performance
            perf, perf_dict = get_eval_metrics(
                self.task, predictions, target_labels
            )
            score_obj["perf"] = perf
            score_obj["perf_std"] = perf_dict.get("perf_std", None)
            score_obj["pretty_perf"] = str(perf) + " %"
            score_obj["metadata_json"] = perf_dict
            # Get performance breakdown for this round across tags
            if target_tags:
                examples_by_tag = {}
                for pred, target_label, example_tags in zip(
                    predictions, target_labels, target_tags
                ):
                    for tag in example_tags:
                        examples_by_tag.setdefault(tag, []).append(
                            (pred, target_label)
                        )
                perf_by_tag_tuple_dict = {
                    k: get_eval_metrics(self.task, *list(zip(*examples)))
                    for k, examples in examples_by_tag.items()
                }
                score_obj["metadata_json"]["perf_by_tag"] = score_obj[
                    "metadata_json"
                ].get("perf_by_tag", []) + [
                    {
                        "tag": tag,
                        "pretty_perf": str(perf) + " %",
                        "perf": perf,
                        "perf_std": perf_dict.get("perf_std", None),
                        "perf_dict": perf_dict,
                    }
                    for tag, (perf, perf_dict) in perf_by_tag_tuple_dict.items()
                ]
            score_obj["metadata_json"] = json.dumps(score_obj["metadata_json"])
    return score_obj
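
# Minimal usage sketch (hypothetical names: `SentimentDataset` is assumed to be a
# concrete subclass implementing read_labels / pred_field_converter, the task metric
# is assumed to be accuracy, and the raw prediction fields are purely illustrative):
#
#     dataset = SentimentDataset()
#     predictions = [{"id": "ex-1", "label": "positive"}, ...]
#     score_obj = dataset.eval(predictions)
#     score_obj["perf"], score_obj["pretty_perf"]   # e.g. 87.5, "87.5 %"
#
#     # Delta metrics for a perturbed copy of the dataset, scored against the
#     # original (unperturbed) targets:
#     score_obj = dataset.eval(
#         perturbed_predictions,
#         targets=original_targets,
#         perturb_prefix="robustness",  # assumed prefix value, for illustration
#     )
#     score_obj["metadata_json"]  # JSON string of the delta metrics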