in evaluation/datasets/common.py [0:0]
def eval(self, predictions: list, targets=None, perturb_prefix=None) -> dict:
    """
    Adapted from common.helpers.validate_prediction; computes accuracy / F1, etc.
    If targets is passed, weighted delta metrics are computed against the given
    targets, whose ids should be the original (unperturbed) example ids.
    """
    # load target examples
    target_examples = self.read_labels(perturb_prefix) if not targets else targets
    target_ids = [x["id"] for x in target_examples]
    # note: to compute accuracy with changed labels,
    # each perturbed example needs to have a unique id too
    target_labels_dict = {t["id"]: t["answer"] for t in target_examples}
    target_labels = [target_labels_dict[id] for id in target_ids]
    target_tags_dict = {t["id"]: t["tags"] for t in target_examples}
    target_tags = [target_tags_dict[id] for id in target_ids]
    predictions = [
        self.pred_field_converter(prediction) for prediction in predictions
    ]
    score_obj = {}  # score_obj keys always correspond to scores table columns
    if targets and perturb_prefix:  # i.e. compute perturb percentage
        predictions_dict = {id: [] for id in target_ids}
        try:
            id_mapping = self.read_labels(perturb_prefix)
            id_mapping = {m["id"]: m["input_id"] for m in id_mapping}
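            # id_mapping: perturbed example id -> original input_id, so each
            # perturbed prediction can be grouped under its unperturbed target id
            # (predictions becomes a list of lists aligned with target_ids)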
            for p in predictions:
                predictions_dict[id_mapping[p["id"]]].append(p["pred"])
            predictions = [predictions_dict[id] for id in target_ids]
        except KeyError as ex:
            logger.exception(f"Prediction and target file example mismatch: {ex}")
        except Exception as ex:
            logger.exception(f"Unknown exception {ex}")
        else:
            delta_metrics_dict = get_delta_metrics(
                self.task, predictions, target_labels, perturb_prefix
            )
            score_obj = {
                **delta_metrics_dict,
                "metadata_json": json.dumps(delta_metrics_dict),
            }
    else:  # compute normal eval metrics
        # validate alignment of prediction and target labels
        predictions = {p["id"]: p["pred"] for p in predictions}
        try:
            predictions = [predictions[id] for id in target_ids]
            assert len(predictions) == len(target_labels)
        except AssertionError:
            logger.exception("Prediction and target file length mismatch")
        except KeyError as ex:
            logger.exception(f"Prediction and target file example mismatch: {ex}")
        except Exception as ex:
            logger.exception(f"Unknown exception {ex}")
        else:
            # Get performance
            perf, perf_dict = get_eval_metrics(
                self.task, predictions, target_labels
            )
            score_obj["perf"] = perf
            score_obj["perf_std"] = perf_dict.get("perf_std", None)
            score_obj["pretty_perf"] = str(perf) + " %"
            score_obj["metadata_json"] = perf_dict
            # Get performance breakdown for this round across tags
            if target_tags:
                examples_by_tag = {}
                for pred, target_label, example_tags in zip(
                    predictions, target_labels, target_tags
                ):
                    for tag in example_tags:
                        examples_by_tag.setdefault(tag, []).append(
                            (pred, target_label)
                        )
                perf_by_tag_tuple_dict = {
                    k: get_eval_metrics(self.task, *list(zip(*examples)))
                    for k, examples in examples_by_tag.items()
                }
                score_obj["metadata_json"]["perf_by_tag"] = score_obj[
                    "metadata_json"
                ].get("perf_by_tag", []) + [
                    {
                        "tag": tag,
                        "pretty_perf": str(perf) + " %",
                        "perf": perf,
                        "perf_std": perf_dict.get("perf_std", None),
                        "perf_dict": perf_dict,
                    }
                    for tag, (perf, perf_dict) in perf_by_tag_tuple_dict.items()
                ]
            score_obj["metadata_json"] = json.dumps(score_obj["metadata_json"])
    return score_obj
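
# Minimal usage sketch (hypothetical names: `SentimentDataset` is assumed to be a
# concrete subclass implementing read_labels / pred_field_converter, the task metric
# is assumed to be accuracy, and the raw prediction fields are purely illustrative):
#
#     dataset = SentimentDataset()
#     predictions = [{"id": "ex-1", "label": "positive"}, ...]
#     score_obj = dataset.eval(predictions)
#     score_obj["perf"], score_obj["pretty_perf"]   # e.g. 87.5, "87.5 %"
#
#     # Delta metrics for a perturbed copy of the dataset, scored against the
#     # original (unperturbed) targets:
#     score_obj = dataset.eval(
#         perturbed_predictions,
#         targets=original_targets,
#         perturb_prefix="robustness",  # assumed prefix value, for illustration
#     )
#     score_obj["metadata_json"]  # JSON string of the delta metrics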