In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate images with Gecko

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_gecko.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaluate_images_with_gecko.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaluate_images_with_gecko.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_gecko.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_images_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

 | | | | |
 |-|-|-|-|
 |Author(s): | [Greg Breard](https://github.com/gregbreard) | Anant Nawalgaria | Olivia Wiles |

## Overview

This Colab notebook shows how to use the Vertex AI evaluation service to run [Gecko](https://arxiv.org/abs/2404.16820).

As with a more standard rubric approach, Gecko proceeds in two stages: a rubric generation step followed by a validator step. The key difference is that the rubric is generated based on the prompt.
This allows for a more fine-grained metric that can be customized to prompts with differing challenges.

In more detail, Gecko proceeds in two key steps: the QA generation step (i.e., the rubric generation step) followed by the VQA step (i.e., the validator step).

## The rubric generation step
Given a prompt, such as `A teddy bear riding a skateboard`, we prompt the Gemini model to generate a set of questions, answer choices and corresponding ground truth (GT) answer. The question is also tagged with a question type. Depending on the prompt, these questions can either be `yes`/`no` questions or multiple choice ones.

`A teddy bear riding a skateboard` -->

- `Q1: Is there a teddy bear? Choices: [yes, no]. GT Answer: yes. Tag: Object.`
- `Q2: Is there a skateboard? Choices: [yes, no]. GT Answer: yes. Tag: Object.`
- `Q3: Is the teddy bear riding a skateboard? Choices: [yes, no]. GT Answer: yes. Tag: Action.`

## The validator step
Given a generated image and the questions above, we query the Gemini model for each question to give an answer. We then check if it matches the GT answer, with a result of 1 if it matches and 0 if it does not. We aggregate these results to give a final overall score, which can be broken down into scores per question. We can also aggregate scores based on tags.

For example, imagine we have a generated image `<image1>` which includes a teddy bear but no skateboard, and Gemini outputs the following results:

- `<image1> Is there a teddy bear? Answer: yes. GT Answer: yes. Result: 1.`
- `<image1> Is there a skateboard? Answer: no. GT Answer: yes. Result: 0.`
- `<image1> Is the teddy bear riding a skateboard? Answer: no. GT Answer: yes. Result: 0.`

The final score will be `0.33`, with a score of `0.5` for the `Object` tag and `0.0` for the `Action` tag.

## Further exploration
We provide two prompts, engineered for video and image generation tasks. Below, we show how to run Gecko for the image modality on a set of generations.

However, these prompts can be modified to suit a developer's needs. The quality can be analysed by exploring which questions are generated as well as the reliability of the validator step. Questions can also be manually added as desired for an application.

## Steps

1. Set up the environment.
2. Define helper functions, prompt templates, and metric.
3. Prepare the dataset for evaluation.
4. Run the evaluation (including model inference).

## Costs
This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
%pip install --upgrade --quiet google-cloud-aiplatform

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only restart when running inside Google Colab, detected by the
# "google.colab" module already being loaded into this interpreter.
if "google.colab" in sys.modules:

    import IPython

    # Shut the current kernel down with the restart flag set so that the
    # freshly installed packages are picked up.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Authentication through google.colab is only attempted inside Google Colab,
# detected by the "google.colab" module already being loaded.
if "google.colab" in sys.modules:

    from google.colab import auth

    # Authenticate the current Colab user for this notebook session.
    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Placeholder value: replace with your own Google Cloud project ID before running.
PROJECT_ID = "your-project-id"  # @param {type:"string"}
# Location passed to vertexai.init below.
LOCATION = "us-central1"  # @param {type:"string"}


import vertexai

# Initialize the Vertex AI SDK with the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

## Import libraries

In [None]:
import pandas as pd
from vertexai.preview.evaluation import (
    CustomOutputConfig,
    EvalTask,
    PointwiseMetric,
    RubricBasedMetric,
    RubricGenerationConfig,
)

# Set up eval metrics for Gecko

## Helper functions

The outputs supported by Gecko are more sophisticated than the default outputs of predefined rubric based metrics. To handle this, custom parsing logic is required.

The following code block defines 2 classes: `QARecord` and `QAResult`. The `QARecord` represents the questions created during rubric generation. The `QAResult` extends the `QARecord` with a result field that is populated after validation.

There are also two parsing methods. The `parse_json_to_qa_records` method converts the text output of rubric generation to `QARecords` and the `parse_rubric_results` method extracts the answers from the validation step. These are passed into the metric definition and parsing is handled automatically during the generation and validation steps.

Finally, the `compute_scores` method compares the `QARecord`s and rubric results to calculate a per row score and appends `QAResult`s and scores to the dataset.

In addition, there are pretty printing methods provided to present the output in a human readable format.

In [None]:
from collections.abc import Sequence
from dataclasses import dataclass, field
import html
import json
import re
from typing import Any

import numpy as np

_QUESTION_REGEX = re.compile(r"Question:(.*?)Verdict:", re.DOTALL)
_VERDICT_REGEX = re.compile("Verdict:(.*)")
_QUESTION_BLOCK_REGEX = re.compile("<question>(.*?)</question>", re.DOTALL)
_TABLE_STYLE = [
    {
        "selector": "th",
        "props": [
            ("background-color", "#f2f2f2"),
            ("border", "1px solid gray"),
            ("color", "black"),
            ("font-size", "11pt"),
            ("text-align", "center"),
            ("word-break", "break-all"),
        ],
    },
    {"selector": "tr:nth-child(even)", "props": [("background-color", "#f9f9f9")]},
    {"selector": "tr:nth-child(odd)", "props": [("background-color", "white")]},
    {"selector": "tr:hover", "props": [("background-color", "#94e6ff")]},
    {"selector": "td:hover", "props": [("background-color", "#ffffb3")]},
]


@dataclass(kw_only=True, frozen=True)
class QARecord:
    """A basic QA Record for storing question-answer pairs.

    Attributes:
        question: Question text.
        question_type: Category of question.
        gt_answer: Ground-truth answer to the question.
        answer_choices: Possible answers for multiple choice questions.
        justification: How the question relates to the prompt.
    """

    question: str = ""
    question_type: str = ""
    gt_answer: str = ""
    answer_choices: Sequence[str] = field(default_factory=list)
    justification: str = ""


@dataclass(kw_only=True, frozen=True, init=False)
class QAResult(QARecord):
    """A QARecord extended with the outcome of the validation step.

    `result` is declared as a real dataclass field, so it is included in
    `repr()` and `==` and is frozen like the inherited fields. The generated
    `__init__` is disabled (`init=False`) to keep the positional
    `QAResult(qa_record, result)` constructor.

    Attributes:
        result: The result of answering the question.
    """

    result: str = ""

    def __init__(self, qa_record: QARecord, result: str):
        """Copies every field of `qa_record` and attaches `result`.

        Args:
            qa_record: The record whose question data is copied.
            result: The result of answering the question.
        """
        super().__init__(
            question=qa_record.question,
            question_type=qa_record.question_type,
            gt_answer=qa_record.gt_answer,
            answer_choices=qa_record.answer_choices,
            justification=qa_record.justification,
        )
        # The class is frozen, so plain attribute assignment is rejected;
        # go through object.__setattr__ exactly as the generated __init__ of
        # a frozen dataclass does.
        object.__setattr__(self, "result", result)


def parse_json_to_qa_records(json_response: str) -> dict[str, Any]:
    """Parses the rubric generation response into questions and QARecords.

    Markdown code fences around the JSON are stripped first. If strict JSON
    parsing fails, parsing is retried once with trailing commas (a comma
    directly before `}` or `]`) removed.

    This function does not raise on malformed responses. Instead it returns a
    dict whose "questions" entry holds an error message, whose "keywords"
    entry is empty and whose "qa_records" entry holds the fence-stripped
    response text.

    Args:
        json_response: JSON string containing the QA data.

    Returns:
        Dict with "questions" (one `<question>...<choices>...` line per QA),
        "keywords" and "qa_records" (list of QARecord objects), or the error
        dict described above.
    """
    # Each alternative is confined to a single line because "." does not match
    # newlines here: an opening "```json" fence (with anything before it on
    # that line) and any "```" fence with the rest of its line.
    json_response = re.sub(
        r"(.*```json|```.*)",
        "",
        json_response.strip(),
    )
    try:
        try:
            data = json.loads(json_response)
        except json.JSONDecodeError:
            # Strict parsing failed; retry once without trailing commas. If
            # this also fails, the error propagates to the handler below.
            data = json.loads(re.sub(r",(\s*[}\]])", r"\1", json_response))

        qa_records = []
        rubrics = []
        # Process each QA pair in the "qas" array.
        for qa in data["qas"]:
            record = QARecord(
                question=qa["question"],
                question_type=qa["question_type"],
                gt_answer=qa["answer"],
                answer_choices=qa["choices"],
                justification=qa["justification"],
            )
            qa_records.append(record)
            rubrics.append(
                f"<question>{record.question}<choices>{','.join(record.answer_choices)}"
            )
        return {
            "questions": "\n".join(rubrics),
            "keywords": data["keywords"],
            "qa_records": qa_records,
        }
    except json.JSONDecodeError as e:
        return {
            "questions": f"Error decoding JSON response: {str(e)}",
            "keywords": "",
            "qa_records": json_response,
        }
    except KeyError as e:
        return {
            "questions": f"Missing required key in JSON structure: {str(e)}",
            "keywords": "",
            "qa_records": json_response,
        }
    except TypeError as e:
        # Valid JSON with the wrong shape, e.g. a top-level list, a non-list
        # "qas" value or non-string answer choices.
        return {
            "questions": f"Unexpected JSON structure: {str(e)}",
            "keywords": "",
            "qa_records": json_response,
        }


def parse_rubric_results(results: list[str]) -> dict[str, Any]:
    """Collects the verdict for every question found in the validator responses.

    Questions and verdicts are lower-cased. When the same question appears
    more than once, the verdict seen last wins.

    Args:
        results: Raw text responses from the rubric validator.

    Returns:
        Dict with a single "rubric_results" entry mapping question to verdict.
    """
    verdict_by_question = {
        question.lower(): answer.lower()
        for response in results
        for question, answer in _parse_question_blocks(response)
    }
    return {"rubric_results": verdict_by_question}


def _parse_question_blocks(txt: str) -> list[tuple[str, bool]]:
    """Parses the question blocks from the rubric validator response."""
    responses = []
    question_blocks = _QUESTION_BLOCK_REGEX.findall(txt)
    if not question_blocks:
        question_blocks = [txt]
    for block in question_blocks:
        q = _parse_question(block)
        v = _parse_verdict(block)
        if q is not None and v is not None:
            responses.append((q, v))
    return responses


def _parse_question(txt: str):
    """Parses the question from the rubric validator response."""
    if not isinstance(txt, str) or not txt:
        return None
    try:
        txt = txt.split("Verdict:")[0]
        if "Question:" in txt:
            return txt.split("Question:")[-1].strip()
        if question := _QUESTION_REGEX.findall(txt):
            return question[0].strip()
    except Exception as e:
        print(f"Failed to parse question: {str(e)}")
        return None


def _parse_verdict(txt: str):
    """Parses the verdict from the rubric validator response."""
    if not isinstance(txt, str) or not txt:
        return None
    try:
        if verdict := _VERDICT_REGEX.findall(txt):
            verdict = verdict[0].strip()
            return verdict
    except Exception as e:
        print(f"Failed to parse question: {str(e)}")
        return None


def compute_scores(df: "pd.DataFrame") -> "pd.DataFrame":
    """Computes scores for each row based on QA results.

    For every row, each record in the "qa_records" column is looked up (by
    lower-cased question) in the row's rubric results, taken from the last
    column whose name contains "rubric_results" and whose value is a dict.
    A question scores 1 when the lower-cased ground-truth answer occurs in the
    verdict, and 0 when it does not or when there is no verdict for it.

    Rows whose "qa_records" value is not a list or tuple (for example the raw
    response text stored when rubric parsing failed) produce no QA results and
    a NaN score.

    Args:
        df: Data frame with a "qa_records" column and a rubric results column.

    Returns:
        A copy of `df` with two added columns: "qa_results" (list of QAResult
        per row) and "final_score" (mean of the per-question scores).
    """
    qa_results = []
    final_scores = []
    for _, row in df.iterrows():
        rubric_results = {}
        for key in row.keys():
            # Ignore non-dict cells (e.g. NaN left behind by a failed row).
            if "rubric_results" in str(key) and isinstance(row[key], dict):
                rubric_results = row[key]

        qa_records = row["qa_records"]
        if not isinstance(qa_records, (list, tuple)):
            qa_records = []

        scores = []
        results = []
        for qa in qa_records:
            verdict = rubric_results.get(qa.question.lower())
            if verdict is None:
                results.append(QAResult(qa, "no result"))
                scores.append(0)
            elif qa.gt_answer.lower() in verdict:
                # U+2713 CHECK MARK
                results.append(QAResult(qa, f"{qa.gt_answer} \u2713"))
                scores.append(1)
            else:
                # U+1F5F4 BALLOT SCRIPT X
                results.append(QAResult(qa, f"{verdict} \U0001F5F4"))
                scores.append(0)
        qa_results.append(results)
        # np.mean([]) would yield NaN with a RuntimeWarning; return NaN quietly.
        final_scores.append(np.mean(scores) if scores else np.nan)
    return df.assign(qa_results=qa_results, final_score=final_scores)


def pretty_print_qa_records_df(
    df: "pd.DataFrame", hide_columns: list[str]
) -> "pd.io.formats.style.Styler":
    """Prints QA records data frame as stylized HTML table.

    Columns that hold lists of QARecord objects are rendered as nested HTML
    tables. Detection looks at every row rather than only the row labelled 0,
    so empty frames and frames with a non-default index are supported. Cells
    in such a column that are not lists of QARecords are left unchanged.

    Args:
        df: Data frame to display.
        hide_columns: Names of the columns to hide in the output.

    Returns:
        A pandas Styler with the index hidden and the table styles applied.
    """

    def _is_qa_list(cell) -> bool:
        return isinstance(cell, list) and all(
            isinstance(item, QARecord) for item in cell
        )

    def _render(cell):
        return _qa_records_to_html_table(cell) if _is_qa_list(cell) else cell

    styled_df = df.copy()
    for col in df.columns:
        # Convert a column when at least one row holds a non-empty QARecord list.
        if any(_is_qa_list(cell) and len(cell) > 0 for cell in df[col]):
            styled_df[col] = df[col].apply(_render)

    # Copy so the shared module-level style list is not mutated.
    styles = _TABLE_STYLE.copy()
    styles.append(
        {
            "selector": "td",
            "props": [
                ("border", "1px solid gray"),
                ("color", "black"),
                ("min-width", "100px"),
                ("text-align", "center"),
            ],
        }
    )
    return (
        styled_df.style.hide(axis="index")
        .hide(subset=hide_columns, axis=1)
        .set_table_styles(styles)
    )


def pretty_print_result_df(
    df: "pd.DataFrame", hide_columns: list[str]
) -> "pd.io.formats.style.Styler":
    """Prints results data frame as stylized HTML table.

    Columns that hold lists of QARecord objects (including QAResult) are
    rendered as nested HTML tables. Detection looks at every row rather than
    only the row labelled 0, so empty frames and frames with a non-default
    index are supported. Cells in such a column that are not lists of
    QARecords are left unchanged. The "final_score" column is formatted with
    one decimal place.

    Args:
        df: Data frame to display.
        hide_columns: Names of the columns to hide in the output.

    Returns:
        A pandas Styler with the index hidden and the table styles applied.
    """

    def _is_qa_list(cell) -> bool:
        return isinstance(cell, list) and all(
            isinstance(item, QARecord) for item in cell
        )

    def _render(cell):
        return _qa_records_to_html_table(cell) if _is_qa_list(cell) else cell

    styled_df = df.copy()
    for col in df.columns:
        # Convert a column when at least one row holds a non-empty QARecord list.
        if any(_is_qa_list(cell) and len(cell) > 0 for cell in df[col]):
            styled_df[col] = df[col].apply(_render)

    # Copy so the shared module-level style list is not mutated.
    styles = _TABLE_STYLE.copy()
    styles.append(
        {
            "selector": "td",
            "props": [
                ("border", "1px solid gray"),
                ("color", "black"),
                ("min-width", "120px"),
                ("text-align", "center"),
            ],
        }
    )
    return (
        styled_df.style.hide(axis="index")
        .hide(subset=hide_columns, axis=1)
        .format({"final_score": "{:,.1f}"})
        .set_table_styles(styles)
    )


def _qa_records_to_html_table(data: list[QARecord]) -> str:
    """Converts a list of QA records to an HTML table.

    The table has "question", "answer_choices" and "gt_answer" columns, plus a
    "result" column when the first element is a QAResult. Cell values are
    HTML-escaped because they are model-generated text inserted into markup.

    Args:
        data: Records to render, one table row each.

    Returns:
        The HTML for the table, or a short placeholder when `data` is empty.
    """
    if not data:
        return "<i>No data to display.</i>"
    # The first element decides which columns are shown.
    keys = ["question", "answer_choices", "gt_answer"]
    if isinstance(data[0], QAResult):
        keys.append("result")

    parts = ["<table style='border-collapse: collapse'><thead><tr>"]
    parts.extend(f"<th>{key}</th>" for key in keys)
    parts.append("</tr></thead><tbody>")
    for item in data:
        parts.append("<tr>")
        for key in keys:
            # quote=False keeps apostrophes and quotes readable; only &, < and
            # > can break out of a table cell.
            cell = html.escape(str(getattr(item, key)), quote=False)
            parts.append(f"<td>{cell}</td>")
        parts.append("</tr>")
    parts.append("</tbody></table>")
    return "".join(parts)

## Prompt Templates

This cell defines the prompt templates that will be used for evaluation. The `RUBRIC_GENERATION_PROMPT` is used to generate questions relevant to the user input. The `RUBRIC_VALIDATOR_PROMPT` is then used to answer the questions for a generated image.

In [None]:
# Prompt for the rubric generation step. `{prompt}` at the end is the
# placeholder for the image description to generate questions for. Every
# few-shot "Answer:" example below is strictly valid JSON (no trailing commas),
# matching what parse_json_to_qa_records parses.
RUBRIC_GENERATION_PROMPT = """
In this task, you will help me generate question-answer pairs to verify
an image description.

You will first identify the key words to be validated, e.g. ignoring filler or
redundant words.

You will then generate a question-answer pair for each key word. The
question should be simple and *cannot* be answered correctly based on common
sense or without reading the description. You will also tag each question as
having a type, which should be one of: object,
human, animal, food, activity, attribute, counting, color, material, spatial,
location, shape, other.

**Important**: There should be one and only one question-answer pair per key word.


Given a "description", your answer must have this format:
{
  "keywords": "Your {1}[itemized] {2}[keywords]",
  "qas": [
    The list of QAs in the format "{
      "question_id": i,
      "question": "the question",
      "answer": "the answer: yes or no",
      "choices": ["yes", "no"],
      "justification": "why is this about the keyword",
      "question_type": "the question type. One of [object, human, animal, food, activity, attribute, counting, color, material, spatial, location, shape, other]."
      }".,
  ]
}

**Important**: There should be one and only one question-answer pair per key word.
===
Some examples are below.

Description: A man posing for a selfie in a jacket and bow tie.
Answer:
{
  "keywords": "A {1}[man] {2}[posing] for a {3}[selfie] in a {4}[jacket] and a {5}[bow tie].",
  "qas": [
    {
      "question_id": 1, "question": "is there a man in the image?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a man in the image.", "question_type": "human"
    },
    {
      "question_id": 2, "question": "is the man posing for a selfie?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The man is posing for a selfie.", "question_type": "activity"
    },
    {
      "question_id": 3, "question": "Is the man taking a selfie?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a selfie.", "question_type": "object"
    },
    {
      "question_id": 4, "question": "Is the man wearing a jacket?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The man is wearing a jacket.", "question_type": "object"
    },
    {
      "question_id": 5, "question": "Is the man wearing a bow tie?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The man is wearing a bow tie.", "question_type": "object"
    }
  ]
}

Description: A horse and several cows feed on hay.
Answer:
{
  "keywords": "A {1}[horse] and {2}[several] {3}[cows] {4}[feed] on {5}[hay]",
  "qas": [
    {
      "question_id": 1, "question": "is there a horse?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a horse in the image.", "question_type": "animal"
    },
    {
      "question_id": 2, "question": "are there several cows?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There are several cows in the image.", "question_type": "counting"
    },
    {
      "question_id": 3, "question": "are there cows?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There are cows in the image.", "question_type": "animal"
    },
    {
      "question_id": 4, "question": "are the horse and cows feeding on hay?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The horse and cows are feeding.", "question_type": "activity"
    },
    {
      "question_id": 5, "question": "is there hay?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is hay in the image.", "question_type": "object"
    }
  ]
}

Description: A red colored dog.
Answer:
{
  "keywords": "A {1}[red colored] {2}[dog].",
  "qas": [
    {
      "question_id": 1, "question": "is the dog red?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a red colored dog in the image.", "question_type": "color"
    },
    {
      "question_id": 2, "question": "is there a dog?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a dog in the image.", "question_type": "animal"
    }
  ]
}

Description: A busy intersection with an ice cream truck driving by.
Answer:
{
  "keywords": "A {1}[busy] {2}[intersection] with an {3}[ice cream truck] {4}[driving by].",
  "qas": [
    {
      "question_id": 1, "question": "is this a busy intersection?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The intersection is busy.", "question_type": "attribute"
    },
    {
      "question_id": 2, "question": "is this an intersection?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a busy intersection.", "question_type": "object"
    },
    {
      "question_id": 3, "question": "is there an ice cream truck?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is an ice cream truck.", "question_type": "object"
    },
    {
      "question_id": 4, "question": "is the ice cream truck driving by?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The ice cream truck is driving by.", "question_type": "activity"
    }
  ]
}

Description: Portrait of a gecko wearing a train conductor's hat and holding a flag that has a yin-yang symbol on it. Woodcut.
Answer:
{
  "keywords": "{1}[Portrait] of a {2}[gecko] {3}[wearing] a {4}[train conductor's hat] and {5}[holding] a {6}[flag] that has a {7}[yin-yang symbol] on it. {8}[Woodcut].",
  "qas": [
    {
      "question_id": 1, "question": "is this a portrait?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a portrait.", "question_type": "attribute"
    },
    {
      "question_id": 2, "question": "is there a gecko?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a gecko.", "question_type": "animal"
    },
    {
      "question_id": 3, "question": "is the gecko wearing a hat?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The gecko is wearing a train conductor's hat.", "question_type": "activity"
    },
    {
      "question_id": 4, "question": "is the gecko wearing a train conductor's hat?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The gecko is wearing a train conductor's hat.", "question_type": "attribute"
    },
    {
      "question_id": 5, "question": "is the gecko holding a flag?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The gecko is holding a flag.", "question_type": "activity"
    },
    {
      "question_id": 6, "question": "is there a flag?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a flag.", "question_type": "object"
    },
    {
      "question_id": 7, "question": "does the flag have a yin-yang symbol?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The flag has a yin-yang symbol on it.", "question_type": "attribute"
    },
    {
      "question_id": 8, "question": "is this a woodcut?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "woodcut.", "question_type": "attribute"
    }
  ]
}

Description: A woman is showing a watermelon slice to a woman on a scooter.
Answer:
{
  "keywords": "A {1}[woman] is {2}[showing] a {3}[watermelon slice] to a {4}[woman] {5}[on] a {6}[scooter].",
  "qas": [
    {
      "question_id": 1, "question": "is there a woman?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a woman.", "question_type": "human"
    },
    {
      "question_id": 2, "question": "is one woman showing a watermelon slice to another woman?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "A woman is showing a watermelon slice to a woman.", "question_type": "activity"
    },
    {
      "question_id": 3, "question": "is there a watermelon slice?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The watermelon slice is a watermelon slice.", "question_type": "food"
    },
    {
      "question_id": 4, "question": "are there two women?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "A woman is showing a watermelon slice to a woman.", "question_type": "human"
    },
    {
      "question_id": 5, "question": "is one of the women on a scooter?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "A woman is on a scooter.", "question_type": "spatial"
    },
    {
      "question_id": 6, "question": "is there a scooter?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a scooter.", "question_type": "object"
    }
  ]
}

Description: A photo of three dogs.
Answer:
{
  "keywords": "A {1}[photo] of {2}[three] {3}[dogs].",
  "qas": [
    {
      "question_id": 1, "question": "is this a photo?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a photo of three dogs.", "question_type": "attribute"
    },
    {
      "question_id": 2, "question": "are there three dogs?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "three dogs.", "question_type": "counting"
    },
    {
      "question_id": 3, "question": "is there a dog?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a dog.", "question_type": "object"
    }
  ]
}

Description: A white milk truck with a license plate that reads 'pro milk'.
Answer:
{
  "keywords": "A {1}[white] {2}[milk truck] with a {3}[license plate] that reads {4}['pro milk'].",
  "qas": [
    {
      "question_id": 1, "question": "is this a white truck?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a white milk truck.", "question_type": "attribute"
    },
    {
      "question_id": 2, "question": "is there a milk truck?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a white milk truck.", "question_type": "object"
    },
    {
      "question_id": 3, "question": "is there a license plate on the vehicle?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "A white milk truck with a license plate.", "question_type": "object"
    },
    {
      "question_id": 4, "question": "does the license plate read 'pro milk'?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The license plate reads 'pro milk'.", "question_type": "attribute"
    }
  ]
}

Description: A person sitting on a horse in air over gate in grass with people and trees in background.
Answer:
{
  "keywords": "A {1}[person] {2}[sitting] {3}[on] a {4}[horse] {5}[in air] {6}[over] {7}[gate] in {8}[grass] with {9}[people] and {10}[trees] in {11}[background].",
  "qas": [
    {
      "question_id": 1, "question": "is there a person?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a person.", "question_type": "human"
    },
    {
      "question_id": 2, "question": "is the person sitting?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "A person is sitting on a horse.", "question_type": "activity"
    },
    {
      "question_id": 3, "question": "is the person on a horse?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "A person is sitting on a horse.", "question_type": "spatial"
    },
    {
      "question_id": 4, "question": "is there a horse?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "A person is sitting on a horse.", "question_type": "object"
    },
    {
      "question_id": 5, "question": "is the horse in the air?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The horse is in the air.", "question_type": "attribute"
    },
    {
      "question_id": 6, "question": "is the horse over the gate?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The horse is over the gate.", "question_type": "spatial"
    },
    {
      "question_id": 7, "question": "is there a gate?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The horse is over the gate.", "question_type": "object"
    },
    {
      "question_id": 8, "question": "is there grass?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "The horse is in the grass.", "question_type": "object"
    },
    {
      "question_id": 9, "question": "are there people?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There are people.", "question_type": "human"
    },
    {
      "question_id": 10, "question": "are there trees?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There are trees.", "question_type": "object"
    },
    {
      "question_id": 11, "question": "are there people and trees in the background?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There are people and trees in the background.", "question_type": "human"
    }
  ]
}

Description: a red blue and yellow train and some people on a platform
Answer:
{
  "keywords": "a {1}[red blue and yellow] {2}[train] and {3}[some] {4}[people] on a {5}[platform]",
  "qas": [
    {
      "question_id": 1, "question": "is the train red blue and yellow?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "This is a red blue and yellow train.", "question_type": "color"
    },
    {
      "question_id": 2, "question": "is there a train?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a train.", "question_type": "object"
    },
    {
      "question_id": 3, "question": "Are there some people?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There are some people.", "question_type": "counting"
    },
    {
      "question_id": 4, "question": "are the people on the platform?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There are some people on the platform.", "question_type": "human"
    },
    {
      "question_id": 5, "question": "is there a platform?", "answer": "yes", "choices": ["yes", "no"],
      "justification": "There is a platform.", "question_type": "object"
    }
  ]
}

Description:
{prompt}
Answer:
"""

# Prompt template for the rubric validation (critique) step; it is passed as
# `metric_prompt_template` to the pointwise metric defined further down.
# It has two placeholders: {rubrics} (the questions to answer) and {image}
# (the image under evaluation).
# NOTE(review): the <question> ... Verdict: yes|no ... </question> output format
# is presumably what `parse_rubric_results` expects -- confirm before changing it.
RUBRIC_VALIDATOR_PROMPT = """
# Instructions
Look at the image carefully and answer each question with a yes or no:
{rubrics}

# Image
{image}

# Output Format
<question>
Question: repeat the original question
Verdict: yes|no
</question>
"""

## Define the metric

The following cell configures rubric generation and the rubric validator metric, then combines them into a single rubric-based Gecko metric.

In [None]:
# Critique step: pointwise metric that answers the rubric questions for an image
# using the validator prompt; the model output is parsed by parse_rubric_results.
pointwise_metric = PointwiseMetric(
    custom_output_config=CustomOutputConfig(
        parsing_fn=parse_rubric_results,
        return_raw_output=True,
    ),
    metric_prompt_template=RUBRIC_VALIDATOR_PROMPT,
    metric="gecko_metric",
)

# Generation step: rubric questions are produced from the generation prompt and
# parsed into QA records by parse_json_to_qa_records.
rubric_generation_config = RubricGenerationConfig(
    parsing_fn=parse_json_to_qa_records,
    prompt_template=RUBRIC_GENERATION_PROMPT,
)

# Gecko metric: ties the rubric generation step to the critique metric.
rubric_based_gecko = RubricBasedMetric(
    critique_metric=pointwise_metric,
    generation_config=rubric_generation_config,
)

# Prepare the dataset

In the following dataset, two prompts are used for each generated image. The first is the prompt that corresponds to the generated content. The second is a counterexample that is similar but does not exactly match the generated content. Comparing the two shows how the Gecko evaluation differs between high-quality and low-quality responses.

In [None]:
# Cloud Storage folder that holds the sample generated images.
IMAGE_URI_PREFIX = "gs://cloud-samples-data/generative-ai/evaluation/images/"

# One entry per generated image:
#   (image file stem, prompt that matches the image, similar counterexample prompt).
# Keeping each image next to its two prompts makes the pairing explicit instead of
# relying on matching positions in two separately maintained lists.
IMAGE_PROMPT_CASES = [
    (
        "coffee",
        "steaming cup of coffee and a croissant on a table",
        "steaming cup of coffee and toast in a cafe",
    ),
    (
        "sunset",
        "sunset over a calm ocean",
        "sunset over a tranquil forest",
    ),
    (
        "butterfly",
        "butterfly with colorful wings on a flower",
        "butterfly fluttering over a leaf",
    ),
    (
        "musician",
        "musician playing guitar on a street corner",
        "musician playing saxophone under lamp post",
    ),
    (
        "camera",
        "vintage camera with a worn leather strap",
        "new camera with a power zoom lens",
    ),
    (
        "abstract",
        "colorful abstract painting",
        "black and white painting",
    ),
    (
        "baker",
        "baker decorating a cake with frosting",
        "baker topping cupcakes with sprinkles",
    ),
    (
        "balloon",
        "hot air balloon floating above a field of lavender",
        "hot air balloon landing in a field of sunflowers",
    ),
]


def build_image_request(image_name: str) -> str:
    """Return the JSON request string that references one sample PNG image.

    Args:
        image_name: File name of the image without the ".png" extension.

    Returns:
        A JSON string containing a single `file_data` part whose `file_uri`
        points at the image under `IMAGE_URI_PREFIX`.
    """
    file_uri = IMAGE_URI_PREFIX + image_name + ".png"
    return (
        '{"contents": [{"parts": [{"file_data": {"mime_type": "image/png", "file_uri": "'
        + file_uri
        + '"}}]}]}'
    )


# Flatten the cases into two parallel lists: every image appears twice, first
# with its matching prompt and then with its counterexample prompt.
prompts = []
images = []
for image_name, matching_prompt, counterexample_prompt in IMAGE_PROMPT_CASES:
    image_request = build_image_request(image_name)
    prompts.extend([matching_prompt, counterexample_prompt])
    images.extend([image_request, image_request])

eval_dataset = pd.DataFrame(
    {
        "prompt": prompts,
        "image": images,
    }
)

# Run evaluation

## Generate rubrics

First we generate rubrics for the user prompts.

In [None]:
# Generate rubrics for every row of the evaluation dataset using the Gecko metric's
# rubric generation step.
dataset_with_rubrics = rubric_based_gecko.generate_rubrics(eval_dataset)
# Display the generated QA records.
# NOTE(review): hide_columns presumably omits these columns from the rendered
# table -- confirm against pretty_print_qa_records_df.
pretty_print_qa_records_df(
    dataset_with_rubrics, hide_columns=["prompt", "image", "rubrics"]
)

## Evaluate with rubrics

Then we use the generated rubrics to evaluate the quality of the responses.

In [None]:
# Evaluate the rubric-augmented dataset with the rubric-based Gecko metric.
eval_task = EvalTask(
    dataset=dataset_with_rubrics,
    metrics=[rubric_based_gecko],
)
# The "image" column is passed as the response column for the evaluation.
eval_result = eval_task.evaluate(response_column_name="image")

# Calculate overall score for metric.
dataset_with_final_scores = compute_scores(eval_result.metrics_table)
# Mean of the "final_score" values; as the last expression of the cell it is
# shown as the cell output.
np.mean(dataset_with_final_scores["final_score"])

In [None]:
# Display the scored evaluation results.
# NOTE(review): hide_columns presumably omits the listed input and intermediate
# columns from the rendered table -- confirm against pretty_print_result_df.
pretty_print_result_df(
    dataset_with_final_scores,
    hide_columns=[
        "prompt",
        "image",
        "rubrics",
        "qa_records",
        "gecko_metric/rubric_results",
    ],
)