In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate your autorater with meta-evaluation

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaluate_autorater.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaluate_autorater.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/bigquery/v1/32px.svg" alt="BigQuery Studio logo"><br> Open in BigQuery Studio
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_autorater.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
| Author(s) |  [Yuan (Emily) Xue](https://www.linkedin.com/in/yuan-emily-xue-3483012/), [Irina Sigler](https://www.linkedin.com/in/irina-sigler-298a59b1/)|

## Overview

It's crucial to assess the effectiveness of your Large Language Model (LLM) evaluator to ensure it's guiding you correctly. This process, known as meta-evaluation, is a key step in establishing a task-specific evaluation framework.

Essentially, it involves comparing the performance of automated evaluation systems (evaluators) against human judgments to determine how well they align with human preferences. This calibration is often done using agreement or correlation measures, depending on the specific evaluation task.

This tutorial offers the design of two simple autoraters and a step-by-step guide to evaluate autoraters.

## Get started

### Install Vertex AI SDK and other required packages


In [None]:
# Install/upgrade the Vertex AI SDK into the environment used by this kernel.
# NOTE(review): later cells also import pandas, scipy and scikit-learn and read an
# `hf://` parquet path; none of those are installed here — presumably they are
# preinstalled in the target runtime. Confirm for environments other than Colab.
%pip install --upgrade --user --quiet google-cloud-aiplatform

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Restart the current kernel so that newly installed packages become importable.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says "Your session crashed for an unknown reason." This is expected. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# `google.colab` is only present in `sys.modules` when running on Colab;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Resolve the Google Cloud project and region, then initialize the Vertex AI SDK.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

# Use the environment variable if the user doesn't provide Project ID.
# `os.environ.get` returns None when the variable is unset. Keep that None rather
# than wrapping it in `str()`, which would produce the literal project name "None";
# with `project=None` the SDK resolves the project from the environment's default
# credentials instead.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT") or None

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

## Import libraries

In [None]:
import abc
from collections.abc import Callable
import dataclasses
import random
import re

import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from vertexai.generative_models import GenerativeModel

## Load model as autorater

In [None]:
# ID of the Gemini model that acts as the judge ("autorater").
MODEL_ID = "gemini-2.0-flash"  # @param {type:"string", isTemplate: true}

# Model handle that is passed to the rater classes as their `base_model`.
autorater_model = GenerativeModel(MODEL_ID)

## Set Meta-evaluation components

### Define Autoraters

The following code defines a system for automatically rating responses to prompts, particularly for comparing pairs of responses.

The system has the following components:

- **`base_model`** which is assumed to be a language model capable of generating text.

- **`AutoRater`** class provides an abstract framework for response rating, implemented concretely by `BasicRater` and `SelfConsistencyRater`. **`BasicRater`** rates a single example using a chosen prompt method from a prompt map. **`SelfConsistencyRater`** enhances reliability by calling the base model multiple times with the same example and aggregating the results for a consensus.

- **`RaterInstruction`** dataclass pairs a prompt template with a corresponding parsing function that interprets the model's output. The parsing functions extract the winner (or a tie) from the model's response, mostly with regular expressions, and return `None` when the output does not match the expected format, so malformed responses are handled gracefully. Different prompt methods and rating strategies can be combined for flexibility.


In [None]:
@dataclasses.dataclass
class Example:
    """A pairwise comparison item: one prompt and two candidate responses.

    Attributes:
        prompt: The user prompt that both responses answer.
        response1: The response placed in the first slot of the rater prompt.
        response2: The response placed in the second slot of the rater prompt.
    """

    prompt: str
    response1: str
    response2: str


@dataclasses.dataclass
class RaterInstruction:
    """Pairs a rating prompt template with the parser for the model's reply.

    Attributes:
        prompt: Template string for the rater. The raters fill it in with
            `str.format`, passing `prompt`, `response1` and `response2`.
        result_parser: A function that turns the rater's raw output string into
            an integer rating, or None when the output cannot be parsed.
    """

    prompt: str
    result_parser: Callable[[str], int | None]


def simple_no_tie_result_parser(result_str: str) -> int | None:
    """Parses a result string for simple prompts (no ties allowed).

    Args:
        result_str: The rater's output string.

    Returns:
        -1 if response 1 is better, 1 if response 2 is better, or None if the output is invalid.
    """
    matches = re.findall(r"<winner>(.*?)</winner>", result_str)
    if not matches or len(matches) > 1:
        return None
    if matches[0] == "1":
        return -1
    elif matches[0] == "2":
        return 1
    else:
        return None


def simple_with_tie_result_parser(result_str: str) -> int | None:
    """Parses a result string for simple prompts (ties allowed).

    Args:
        result_str: The rater's output string.

    Returns:
        -1 if response 1 is better, 1 if response 2 is better, 0 if it's a tie, or None if the output is invalid.
    """
    if "<tie>" in result_str:
        return 0

    matches = re.findall(r"<winner>(.*?)</winner>", result_str)
    if not matches or len(matches) > 1:
        return None
    if matches[0] == "1":
        return -1
    elif matches[0] == "2":
        return 1
    else:
        return None


def reward_bench_gemini_no_tie_parser(result_str: str) -> int | None:
    """Parses a result string for reward-bench Gemini prompts (no ties).

    Args:
        result_str: The rater's output string.

    Returns:
        -1 if response 1 (A) is better, 1 if response 2 (B) is better, or None if the output is invalid.
    """
    match = re.search(r"\[\[(A|B)\]\]", result_str)
    if match:
        if match.group(1) == "A":
            return -1
        elif match.group(1) == "B":
            return 1
        else:
            return None
    else:
        return None


# Maps a prompt-method name to its prompt template and the parser that understands
# the output format requested by that template. Every template is filled in with
# `str.format(prompt=..., response1=..., response2=...)`.
PROMPT_MAP = {
    # Verdict format: <winner>1</winner> or <winner>2</winner>.
    "simple_no_tie": RaterInstruction(
        prompt="""
          ## You are an impartial judge to evaluate the quality of two responses to the user prompt.

          [Start of Prompt]{prompt} [end of  prompt]
          [Start of Response 1]{response1}[End of the response 1]
          [Start of Response 2]{response2}[End of the response 2]

          Your output should strictly follow this format: <winner>1</winner>, if response 1 is better; <winner>2</winner>, if response 2 is better.
          """,
        result_parser=simple_no_tie_result_parser,
    ),
    # Verdict format: <winner>1</winner>, <winner>2</winner> or <tie>.
    "simple_with_tie": RaterInstruction(
        prompt="""
          ## You are an impartial judge to evaluate the quality of two responses to the user prompt.

          [Start of Prompt]{prompt} [end of  prompt]
          [Start of Response 1]{response1}[End of the response 1]
          [Start of Response 2]{response2}[End of the response 2]

          Your output should strictly follow this format: <winner>1</winner>, if response 1 is better; <winner>2</winner>, if response 2 is better; <tie> if you cannot determine a winner
          """,
        result_parser=simple_with_tie_result_parser,
    ),
    # Verdict format: [[A]] or [[B]].
    "reward-bench_gemini_no_tie": RaterInstruction(
        # Adjacent string literals are concatenated into ONE str. There must be no
        # trailing comma inside the parentheses: that would turn `prompt` into a
        # 1-tuple and make `prompt.format(...)` fail in the raters.
        prompt=(
            "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. "
            "You should choose the assistant that follows the user's instructions and answers the user's question better. "
            "Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. "
            "Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. "
            "Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. "
            "Be as objective as possible. "
            "Your output should only consist of '[[A]]' if assistant A is better, or '[[B]]' if assistant B is better. Omit any other output.\n"
            "[User Question]\n{prompt}\n\n[The Start of Assistant A's Answer]\n{response1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{response2}\n[The End of Assistant B's Answer]"
        ),
        # This prompt requests '[[A]]' / '[[B]]', so it needs the matching parser;
        # the <winner>-tag parser would reject every answer.
        result_parser=reward_bench_gemini_no_tie_parser,
    ),
}


class AutoRater(abc.ABC):
    """Abstract base class for automatic rating of responses.

    Subclasses implement `rate_one_example`; `rate_batch` applies it to every
    example and keeps only the examples for which a rating was produced.
    """

    def __init__(self, name: str, base_model, prompt_method: str):
        """Initializes the AutoRater.

        Args:
            name: The name of the rater.
            base_model: The base language model to use for rating.
            prompt_method: The name of the prompt method to use.
        """
        self.name = name
        self.base_model = base_model
        self.prompt_method = prompt_method

    @abc.abstractmethod
    def rate_one_example(self, example: Example) -> int | None:
        """Rates a single example.

        Args:
            example: The example to rate.

        Returns:
            The rating for the example, or None when no rating could be
            produced (`rate_batch` skips such examples).
        """
        # for pair-wise: -1, 0, 1
        # for point-wise: scale 1-5

    def rate_batch(self, id_to_example: dict[str, Example]) -> dict[str, int]:
        """Rates a batch of examples.

        Args:
            id_to_example: A dictionary mapping example IDs to examples.

        Returns:
            A dictionary mapping example IDs to ratings. Examples whose rating
            is None are left out, so the result may have fewer entries than the
            input.
        """
        id_to_rating = {}
        # `example_id` rather than `id`, to avoid shadowing the builtin.
        for example_id, example in id_to_example.items():
            rating = self.rate_one_example(example)
            if rating is not None:
                id_to_rating[example_id] = rating
        return id_to_rating


class BasicRater(AutoRater):
    """Rater that queries the language model exactly once per example."""

    def __init__(self, base_model, prompt_method):
        """Creates a rater named "Basic".

        Args:
            base_model: The base language model.
            prompt_method: The prompt method name (a key of `PROMPT_MAP`).
        """
        super().__init__("Basic", base_model, prompt_method)

    def rate_one_example(self, example: Example) -> int | None:
        """Rates one example with a single model call.

        Args:
            example: The example to rate.

        Returns:
            The parsed rating, or None if the model call raised, the response
            had no candidate with content parts, or the output could not be
            parsed.
        """
        instruction = PROMPT_MAP[self.prompt_method]
        filled_prompt = instruction.prompt.format(
            prompt=example.prompt,
            response1=example.response1,
            response2=example.response2,
        )
        try:
            response = self.base_model.generate_content(filled_prompt)
        except Exception as e:
            # Best effort: report the failure and leave this example unrated.
            print(e)
            return None

        has_text = response.candidates and response.candidates[0].content.parts
        if not has_text:
            return None

        result_str = response.candidates[0].content.parts[0].text
        print(result_str)
        result = instruction.result_parser(result_str)
        print(result)
        return result


class SelfConsistencyRater(AutoRater):
    """A self-consistency rater that uses multiple calls to the language model and aggregates the results."""

    def __init__(self, base_model, prompt_method, num_calls):
        """Initializes the SelfConsistencyRater.

        Args:
            base_model: The base language model.
            prompt_method: The prompt method name (a key of `PROMPT_MAP`).
            num_calls: The number of calls to make to the language model.
        """
        super().__init__("SelfConsistency", base_model, prompt_method)
        self.num_calls = num_calls

    def _get_consensus_result(self, result_list: list[int]) -> int:
        """Aggregates the results from multiple calls to the language model.

        The ratings are averaged and a winner is reported only when the mean
        is beyond +/-0.5; anything in between counts as a tie. For example,
        three votes of 1, 1, -1 average to 0.33 and therefore yield 0.

        Args:
            result_list: A non-empty list of integer results.

        Returns:
            1 if the mean is above 0.5, -1 if it is below -0.5, otherwise 0.
        """
        avg = sum(result_list) / len(result_list)
        if avg > 0.5:
            return 1
        if avg < -0.5:
            return -1
        return 0

    def rate_one_example(self, example: Example) -> int | None:
        """Rates a single example using multiple LLM calls and consensus aggregation.

        Args:
            example: The example to rate.

        Returns:
            The aggregated rating for the example, or None if no call produced
            a parseable result.
        """
        # The prompt is identical for every call, so build it once up front.
        instruction = PROMPT_MAP[self.prompt_method]
        autorater_prompt = instruction.prompt.format(
            prompt=example.prompt,
            response1=example.response1,
            response2=example.response2,
        )

        result_list = []
        for _ in range(self.num_calls):
            try:
                response = self.base_model.generate_content(autorater_prompt)
            except Exception as e:
                # Best effort: a failed call simply contributes no vote.
                print(e)
                continue

            if response.candidates and response.candidates[0].content.parts:
                result_str = response.candidates[0].content.parts[0].text
                print(result_str)
                result = instruction.result_parser(result_str)
                print(result)
                if result is not None:
                    result_list.append(result)
        print(result_list)
        if result_list:
            result = self._get_consensus_result(result_list)
            print(result)
            return result
        else:
            return None

### Define datasets import and process utils

After you define the autorater rating system, you prepare your evaluation data. The `process_reward_bench` function prepares data from the [AllenAI reward benchmark](https://github.com/allenai/reward-bench) for use in training or evaluating preference models.

The function reads data from the specified parquet file ('raw' or 'filtered') and creates a dictionary mapping question IDs to `Example` objects, each containing a prompt and two responses. For each example, the order of the chosen and rejected responses is randomized, and a "golden rating" records which position holds the preferred response: -1 if it is response 1, 1 if it is response 2. This preference data can then be used for tasks like training a reward model or, as in this notebook, evaluating an autorater.

Notice how the `process_reward_bench` function also handles limiting the number of processed examples based on the `total_count` parameter.


In [None]:
def process_reward_bench(split: str = "filtered", total_count: int = None):
    """Processes reward benchmark data.

    This function processes reward benchmark data based on the specified split
    and total count. If 'total_count' is less than 0, all data will be used.

    Args:
      split: The split of data to use ('filtered' by default).
      total_count: The maximum number of data points to process. If not set,
        all data will be used.

    Returns:
      This function does not return any value (implicitly returns None).
    """
    split_to_path = {
        "raw": "data/raw-00000-of-00001.parquet",
        "filtered": "data/filtered-00000-of-00001.parquet",
    }
    df = pd.read_parquet("hf://datasets/allenai/reward-bench/" + split_to_path[split])
    id_to_example = {}
    id_to_golden_rating = {}

    for index, (id, row) in enumerate(df.iterrows()):
        if total_count is not None and index > total_count - 1:
            break
        question_id = f'{row["subset"]}-{row["id"]}'
        golden_rating = random.choice([-1, 1])
        if golden_rating == -1:
            response1 = row["chosen"]
            response2 = row["rejected"]
        else:
            response2 = row["chosen"]
            response1 = row["rejected"]
        prompt = row["prompt"]
        example = Example(prompt=prompt, response1=response1, response2=response2)
        id_to_example[question_id] = example
        id_to_golden_rating[question_id] = golden_rating

    return id_to_example, id_to_golden_rating

### Meta Evaluation methods

To implement the meta evaluation you define two functions:

- `get_caliboration_result` computes several metrics to evaluate the performance of a model, including Spearman's rank correlation, Kendall's tau, Cohen's Kappa, and generates a confusion matrix. These metrics are commonly used in machine learning to assess the agreement between predicted and actual labels.

- `get_aligned_lists` takes two dictionaries, each mapping IDs to ratings (e.g., automatically generated ratings and human-assigned golden ratings). It sorts the shared keys between the two dictionaries and creates two aligned lists of ratings corresponding to the sorted IDs, enabling paired comparisons between auto generated and true labels. This function is crucial for preparing data for comparison and evaluation, ensuring that the ratings being compared actually correspond to the same data points.


In [None]:
def get_caliboration_result(
    model_outputs: list[int],
    golden_labels: list[int],
    labels: list[int] = [-1, 0, 1],
    weights: str = "quadratic",
):
    """Calculates various evaluation metrics for model outputs against golden labels.

    Args:
        model_outputs: Predicted labels from the model.
        golden_labels: True labels.
        labels: Unique labels in the dataset. Default is [-1, 0, 1].
        weights: Weighting scheme for Cohen's Kappa. Default is "quadratic".

    Returns:
        A tuple containing the confusion matrix (DataFrame), Cohen's Kappa score, Spearman's rank correlation, and Kendall's rank correlation.
    """
    spearman, _ = spearmanr(model_outputs, golden_labels)
    kendall, _ = kendalltau(model_outputs, golden_labels)
    kappa = cohen_kappa_score(
        model_outputs, golden_labels, labels=labels, weights=weights
    )

    conf_matrix = confusion_matrix(golden_labels, model_outputs, labels=labels)
    conf_matrix_df = pd.DataFrame(
        conf_matrix,
        index=["Gold_1", "Gold_Tie", "Gold_2"],
        columns=["Model_1", "Model_Tie", "Model_2"],
    )

    return conf_matrix_df, kappa, spearman, kendall


def get_aligned_lists(
    id_to_auto_ratings: dict, id_to_golden_rating: dict
) -> tuple[list, list]:
    """Aligns two dictionaries of ratings based on shared keys (IDs).

    Only IDs present in BOTH dictionaries are used, so an ID without a
    counterpart on the other side is skipped instead of raising KeyError.

    Args:
        id_to_auto_ratings: Dictionary mapping IDs to automatically generated ratings.
        id_to_golden_rating: Dictionary mapping IDs to golden ratings.

    Returns:
        A tuple `(auto_ratings, golden_ratings)` of equally long lists, ordered
        by sorted ID, where the same index refers to the same example.
    """
    # Dict key views support set intersection; sorting fixes the order.
    shared_ids = sorted(id_to_auto_ratings.keys() & id_to_golden_rating.keys())
    auto_ratings = [id_to_auto_ratings[shared_id] for shared_id in shared_ids]
    golden_ratings = [id_to_golden_rating[shared_id] for shared_id in shared_ids]
    return auto_ratings, golden_ratings

### Evaluate your autorater

#### Load the evaluation dataset

Load the dataset to evaluate the autorater.

In [None]:
# Load at most 100 rows of the "filtered" split. Each example gets a randomly
# chosen response order and the golden rating that records it.
id_to_example, id_to_golden_rating = process_reward_bench(
    split="filtered", total_count=100
)
# Show how many examples were actually loaded.
len(id_to_example)

#### Basic evaluation


##### Run the evaluation

Rate every example in the dataset with the basic rater, which makes a single model call per example.

In [None]:
# One model call per example, using the <winner>-tag prompt without ties.
rater = BasicRater(autorater_model, "simple_no_tie")
# Examples that could not be rated are absent from the resulting dict.
id_to_basic_auto_ratings = rater.rate_batch(id_to_example)

##### Calculate Meta Evaluation scores

Compute alignment metrics to assess the agreement between predicted and actual labels.

In [None]:
# Pair each autorater rating with the golden rating of the same example ID.
auto_ratings, golden_ratings = get_aligned_lists(
    id_to_basic_auto_ratings, id_to_golden_rating
)
# Agreement between the basic rater and the golden labels.
conf_matrix_df, kappa_score, corr_spearman, corr_kendall = get_caliboration_result(
    model_outputs=auto_ratings,
    golden_labels=golden_ratings,
    labels=[-1, 0, 1],
    weights="quadratic",
)
print(f"kappa_score: {kappa_score}")
print(f"corr_spearman: {corr_spearman}")
print(f"corr_kendall: {corr_kendall}")

In [None]:
# Confusion matrix of the basic rater (rows: golden labels, columns: model outputs).
# A bare last expression uses the notebook's rich DataFrame rendering.
conf_matrix_df

#### Self consistent evaluation

##### Run the evaluation

Evaluate your autorater by calling the base model multiple times with the same example and aggregating the results for a consensus.

In [None]:
# Three model calls per example; the votes are aggregated into one rating.
sc_rater = SelfConsistencyRater(autorater_model, "simple_no_tie", 3)
# Examples without any parseable vote are absent from the resulting dict.
id_to_sc_auto_ratings = sc_rater.rate_batch(id_to_example)

##### Calculate Meta Evaluation scores

Compute alignment metrics to assess the autorater.

In [None]:
# Pair each self-consistency rating with the golden rating of the same example ID.
auto_ratings, golden_ratings = get_aligned_lists(
    id_to_sc_auto_ratings, id_to_golden_rating
)
# Sanity check: both lists must have the same length.
print(len(auto_ratings))
print(len(golden_ratings))

In [None]:
# Agreement between the self-consistency rater and the golden labels.
conf_matrix_df, kappa_score, corr_spearman, corr_kendall = get_caliboration_result(
    model_outputs=auto_ratings,
    golden_labels=golden_ratings,
    labels=[-1, 0, 1],
    weights="quadratic",
)
print(f"kappa_score: {kappa_score}")
print(f"corr_spearman: {corr_spearman}")
print(f"corr_kendall: {corr_kendall}")

In [None]:
# Confusion matrix of the self-consistency rater (rows: golden labels, columns: model outputs).
# A bare last expression uses the notebook's rich DataFrame rendering.
conf_matrix_df