in src/lighteval/metrics/metrics_sample.py [0:0]
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
    """Computes all the requested string-distance metrics on the gold reference and the predictions.

    Args:
        doc (Doc): The document; its golds are used as the reference. If it contains more than one gold, only the first one is kept.
        model_response (ModelResponse): The model response; its text fields are the predicted strings.

    Returns:
        dict: The different scores computed.
    """
    predictions = model_response.text
    golds = doc.get_golds()
    if len(golds) > 1:
        logger.warning(
            "Provided more than one gold to compute a string distance metric. Just using the first one."
        )
    reference = golds[0]

    result = {m: [] for m in self.metric_types}
    for sequence in predictions:
        if self.strip_prediction:
            completion = sequence.strip()
        else:
            completion = sequence

        # `reference` is the entire remaining book for each instance.
        # Truncate it here to be of the same length as the completion to ensure edit-distance is meaningful.
        truncated_reference = reference[: len(completion)]

        completion_tokens = np.array(TreebankWordTokenizer().tokenize(completion))
        truncated_reference_tokens = np.array(TreebankWordTokenizer().tokenize(truncated_reference))

        if "edit_distance" in self.metric_types:
            result["edit_distance"].append(edit_distance(s1=completion_tokens, s2=truncated_reference_tokens))
        if "edit_similarity" in self.metric_types:
            result["edit_similarity"].append(
                self.edit_similarity(s1=completion_tokens, s2=truncated_reference_tokens)
            )
        if "longest_common_prefix_length" in self.metric_types:
            result["longest_common_prefix_length"].append(
                self.longest_common_prefix_length(s1=completion_tokens, s2=truncated_reference_tokens)
            )

    final_result = {}
    # We cast to float as final results can be numpy types, not JSON serializable
    for m, v in result.items():
        final_result[m] = float(self.sample_aggregations[m](v))
    return final_result
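

# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the per-sample metrics computed above.
# The bodies of `self.edit_similarity` and `self.longest_common_prefix_length`
# are not shown in this excerpt, so the two helper functions below are assumed
# (hypothetical) definitions: edit similarity as 1 - edit_distance / max length,
# and longest common prefix length as the count of leading tokens shared by both
# sequences. `edit_distance` and `TreebankWordTokenizer` are the nltk APIs the
# method above relies on.
from nltk.metrics.distance import edit_distance
from nltk.tokenize.treebank import TreebankWordTokenizer


def edit_similarity_sketch(s1, s2) -> float:
    """Normalised similarity in [0, 1]; identical token sequences score 1.0."""
    if len(s1) == 0 and len(s2) == 0:
        return 1.0
    return 1.0 - edit_distance(s1, s2) / max(len(s1), len(s2))


def longest_common_prefix_length_sketch(s1, s2) -> int:
    """Number of leading tokens the two sequences have in common."""
    length = 0
    for a, b in zip(s1, s2):
        if a != b:
            break
        length += 1
    return length


if __name__ == "__main__":
    tokenizer = TreebankWordTokenizer()
    completion = "It was the best of times, it was the worst of times"
    # Mirror the method above: truncate the reference to the completion's length
    # before tokenizing, so the edit distance compares spans of comparable size.
    reference = "It was the best of times, it was the age of wisdom"[: len(completion)]

    completion_tokens = tokenizer.tokenize(completion)
    reference_tokens = tokenizer.tokenize(reference)

    print(edit_distance(completion_tokens, reference_tokens))                        # 2 (two substituted tokens)
    print(edit_similarity_sketch(completion_tokens, reference_tokens))               # 1 - 2/13 ≈ 0.846
    print(longest_common_prefix_length_sketch(completion_tokens, reference_tokens))  # 10 leading tokens match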