In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate videos with Gecko

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaluate_videos_with_gecko.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_videos_with_gecko.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

 | | | | |
 |-|-|-|-|
 |Author(s): | [Greg Breard](https://github.com/gregbreard) | Anant Nawalgaria | Olivia Wiles |

## Overview

This Colab notebook shows how to use the Vertex AI evaluation service to run [Gecko](https://arxiv.org/abs/2404.16820).

As with a more standard rubric approach, Gecko proceeds in two stages: a rubric generation step followed by a validator step. The key difference is that the rubric is generated based on the prompt.
This allows for a more fine-grained metric that can be customized to prompts with differing challenges.

In more detail, Gecko proceeds in two key steps: the QA generation step (i.e., the rubric generation step) followed by the VQA step (i.e., the validator step).

## The rubric generation step
Given a prompt, such as `A teddy bear riding a skateboard`, we prompt the Gemini model to generate a set of questions, answer choices and corresponding ground truth (GT) answer. The question is also tagged with a question type. Depending on the prompt, these questions can either be `yes`/`no` questions or multiple choice ones.

`A teddy bear riding a skateboard` -->

- `Q1: Is there a teddy bear? Choices: [yes, no]. GT Answer: yes. Tag: Object.`
- `Q2: Is there a skateboard? Choices: [yes, no]. GT Answer: yes. Tag: Object.`
- `Q3: Is the teddy bear riding a skateboard? Choices: [yes, no]. GT Answer: yes. Tag: Action.`

## The validator step
Given a generated image and the questions above, we query the Gemini model for each question to give an answer. We then check if it matches the GT answer, with a result of 1 if it matches and 0 if it does not. We aggregate these results to give a final overall score, which can be broken down into scores per question. We can also aggregate scores based on tags.

For example, imagine we have a generated image `<image1>` which includes a teddy bear but no skateboard, and Gemini outputs the following results:

- `<image1> Is there a teddy bear? Answer: yes. GT Answer: yes. Result: 1.`
- `<image1> Is there a skateboard? Answer: no. GT Answer: yes. Result: 0.`
- `<image1> Is the teddy bear riding a skateboard? Answer: no. GT Answer: yes. Result: 0.`

The final score will be `0.33`, with a score of `0.5` for the `Object` tag and `0.0` for the `Action` tag.

## Further exploration
We provide two prompts, engineered for video and image generation tasks. Below, we show how to run Gecko for the video modality on a set of generations.

However, these prompts can be modified to suit a developer's needs. The quality can be analysed by exploring which questions are generated as well as the reliability of the validator step. Questions can also be added manually as desired for an application.

## Steps

1. Set up the environment.
2. Define helper functions, prompt templates, and metric.
3. Prepare the dataset for evaluation.
4. Run the evaluation (including model inference).

## Costs
This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
%pip install --upgrade --quiet google-cloud-aiplatform

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only restart when running inside Google Colab, detected by the presence of
# the already-loaded `google.colab` module.
if "google.colab" in sys.modules:

    import IPython

    # Shut the current kernel down so the freshly installed packages are picked
    # up; the `True` argument asks for a restart rather than a plain shutdown.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
# `sys` is re-imported here because the previous cell restarts the kernel on
# Colab, which clears all earlier imports.
import sys

# Interactive user authentication is only needed (and only available) on Colab.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Placeholder value: replace with your own Google Cloud project ID before
# running. The `@param` markers render these assignments as Colab form fields.
PROJECT_ID = "your-project-id"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}


import vertexai

# Point the Vertex AI SDK at the project and location configured above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

## Import libraries

In [None]:
import pandas as pd
from vertexai.preview.evaluation import (
    CustomOutputConfig,
    EvalTask,
    PointwiseMetric,
    RubricBasedMetric,
    RubricGenerationConfig,
)

# Set up eval metrics for Gecko

## Helper functions

The outputs supported by Gecko are more sophisticated than the default outputs of predefined rubric based metrics. To handle this, custom parsing logic is required.

The following code block defines 2 classes: `QARecord` and `QAResult`. The `QARecord` represents the questions created during rubric generation. The `QAResult` extends the `QARecord` with a result field that is populated after validation.

There are also two parsing methods. The `parse_json_to_qa_records` method converts the text output of rubric generation to `QARecords` and the `parse_rubric_results` method extracts the answers from the validation step. These are passed into the metric definition and parsing is handled automatically during the generation and validation steps.

Finally, the `compute_scores` method compares the `QARecord`s and rubric results to calculate a per row score and appends `QAResult`s and scores to the dataset.

In addition, there are pretty printing methods provided to present the output in a human readable format.

In [None]:
from collections.abc import Sequence
from dataclasses import dataclass, field
import html
import json
import re
from typing import Any

import numpy as np

# Captures the text between "Question:" and the following "Verdict:" marker.
# DOTALL lets the captured question span multiple lines.
_QUESTION_REGEX = re.compile(r"Question:(.*?)Verdict:", re.DOTALL)
# Captures everything after "Verdict:" up to the end of that line (no DOTALL,
# so `.` stops at the first newline).
_VERDICT_REGEX = re.compile("Verdict:(.*)")
# Captures the body of each <question>...</question> block, including newlines.
_QUESTION_BLOCK_REGEX = re.compile("<question>(.*?)</question>", re.DOTALL)
# Base CSS rules (selector + list of property/value pairs) shared by the
# pretty-print helpers, which copy this list, add a `td` rule, and pass the
# result to `Styler.set_table_styles`.
_TABLE_STYLE = [
    {
        "selector": "th",
        "props": [
            ("background-color", "#f2f2f2"),
            ("border", "1px solid gray"),
            ("color", "black"),
            ("font-size", "11pt"),
            ("text-align", "center"),
            ("word-break", "break-all"),
        ],
    },
    # Alternate row backgrounds, plus hover highlights for rows and cells.
    {"selector": "tr:nth-child(even)", "props": [("background-color", "#f9f9f9")]},
    {"selector": "tr:nth-child(odd)", "props": [("background-color", "white")]},
    {"selector": "tr:hover", "props": [("background-color", "#94e6ff")]},
    {"selector": "td:hover", "props": [("background-color", "#ffffb3")]},
]


@dataclass(kw_only=True, frozen=True)
class QARecord:
    """A basic QA Record for storing question-answer pairs.

    Attributes:
        question: Question text.
        question_type: Category of question.
        gt_answer: Ground-truth answer to the question.
        answer_choices: Possible answers for multiple choice questions.
        justification: How the question relates to the prompt.
    """

    question: str = ""
    question_type: str = ""
    gt_answer: str = ""
    answer_choices: Sequence[str] = field(default_factory=list)
    justification: str = ""


class QAResult(QARecord):
    """A QARecord extended with the outcome of the validation step.

    All fields of the source record, including `question_type`, are copied
    onto the result so that no information is lost when a record is promoted
    to a result.

    Attributes:
        result: The result of answering the question.
    """

    result: str = ""

    def __init__(self, qa_record: QARecord, result: str):
        """Copies `qa_record` and attaches `result` to the copy.

        Args:
            qa_record: The question record that was validated.
            result: Display text describing the validation outcome.
        """
        super().__init__(
            question=qa_record.question,
            question_type=qa_record.question_type,
            gt_answer=qa_record.gt_answer,
            answer_choices=qa_record.answer_choices,
            justification=qa_record.justification,
        )
        # QARecord is a frozen dataclass, so bypass its __setattr__ explicitly
        # instead of relying on how frozen checks treat subclasses.
        object.__setattr__(self, "result", result)


def parse_json_to_qa_records(json_response: str) -> dict[str, Any]:
    """Parses the rubric-generation response into questions and QARecords.

    Markdown code-fence markers (```json / ```) are removed before decoding.
    Parsing problems are never raised to the caller; they are reported through
    the returned dict instead (see Returns).

    Args:
        json_response: JSON string containing the QA data.

    Returns:
        On success, a dict with:
            questions: One "<question>...<choices>..." line per QA pair,
                joined with newlines.
            keywords: The value of the "keywords" key of the response.
            qa_records: A list of QARecord objects.
        On failure (invalid JSON, missing key, or unexpected structure), a
        dict where "questions" holds the error message, "keywords" is empty,
        and "qa_records" holds the fence-stripped response text.
    """
    json_response = re.sub(
        r"(.*```json|```.*)",
        "",
        json_response.strip(),
    )
    try:
        # Parse JSON string to Python object
        data = json.loads(json_response)
        qa_records = []

        # Process each QA pair in the QAs array
        rubrics = []
        for qa in data["qas"]:
            record = QARecord(
                question=qa["question"],
                gt_answer=qa["answer"],
                answer_choices=qa["choices"],
                justification=qa["justification"],
            )
            qa_records.append(record)
            rubrics.append(
                f"<question>{record.question}<choices>{','.join(record.answer_choices)}"
            )
        return {
            "questions": "\n".join(rubrics),
            "keywords": data["keywords"],
            "qa_records": qa_records,
        }
    except json.JSONDecodeError as e:
        return {
            "questions": f"Error decoding JSON response: {str(e)}",
            "keywords": "",
            "qa_records": json_response,
        }
    except KeyError as e:
        return {
            "questions": f"Missing required key in JSON structure: {str(e)}",
            "keywords": "",
            "qa_records": json_response,
        }
    except TypeError as e:
        # Valid JSON of the wrong shape (e.g. a top-level list, a non-dict QA
        # entry, or non-string choices) surfaces as TypeError; report it the
        # same way as the other parsing failures instead of crashing.
        return {
            "questions": f"Unexpected JSON structure: {str(e)}",
            "keywords": "",
            "qa_records": json_response,
        }


def parse_rubric_results(results: list[str]) -> dict[str, Any]:
    """Collects question -> verdict pairs from the validator responses.

    Both questions and verdicts are lower-cased. When the same question shows
    up more than once, the verdict seen last is kept.

    Args:
        results: Raw validator response texts.

    Returns:
        A dict with a single "rubric_results" key mapping each lower-cased
        question to its lower-cased verdict.
    """
    verdict_by_question = {
        question.lower(): verdict.lower()
        for response in results
        for question, verdict in _parse_question_blocks(response)
    }
    return {"rubric_results": verdict_by_question}


def _parse_question_blocks(txt: str) -> list[tuple[str, str]]:
    """Parses the question blocks from the rubric validator response.

    Args:
        txt: One validator response. It may contain several
            <question>...</question> blocks; if it contains none, the whole
            text is treated as a single block.

    Returns:
        A list of (question, verdict) string pairs, one per block in which
        both a question and a verdict were found. Non-string input yields an
        empty list.
    """
    if not isinstance(txt, str):
        return []
    question_blocks = _QUESTION_BLOCK_REGEX.findall(txt) or [txt]
    responses = []
    for block in question_blocks:
        question = _parse_question(block)
        verdict = _parse_verdict(block)
        # Keep only blocks where both parts could be extracted.
        if question is not None and verdict is not None:
            responses.append((question, verdict))
    return responses


def _parse_question(txt: str):
    """Parses the question from the rubric validator response."""
    if not isinstance(txt, str) or not txt:
        return None
    try:
        txt = txt.split("Verdict:")[0]
        if "Question:" in txt:
            return txt.split("Question:")[-1].strip()
        if question := _QUESTION_REGEX.findall(txt):
            return question[0].strip()
    except Exception as e:
        print(f"Failed to parse question: {str(e)}")
        return None


def _parse_verdict(txt: str) -> str | None:
    """Parses the verdict from the rubric validator response.

    Args:
        txt: A single question block from the validator response.

    Returns:
        The text following the first "Verdict:" marker, up to the end of that
        line and with surrounding whitespace removed, or None when `txt` is
        empty, is not a string, or contains no "Verdict:" marker.
    """
    if not isinstance(txt, str) or not txt:
        return None
    match = _VERDICT_REGEX.search(txt)
    if match is None:
        return None
    return match.group(1).strip()


def compute_scores(df: "pd.DataFrame") -> "pd.DataFrame":
    """Computes scores for each row based on QA results.

    For every row, each record in the "qa_records" column is looked up (by
    lower-cased question) in the row's rubric results. A question scores 1
    when the lower-cased ground-truth answer occurs in the recorded verdict,
    and 0 otherwise or when no verdict was recorded. The row score is the
    mean of its question scores.

    Rows whose "qa_records" value is not a list (for example the raw response
    text stored when rubric generation failed to parse) or whose rubric
    results are not a dict are treated as having no questions / no verdicts
    instead of raising. Rows without any question get a score of NaN.

    Args:
        df: Data frame with a "qa_records" column and a column whose name
            contains "rubric_results".

    Returns:
        A copy of `df` with two added columns: "qa_results" (list of QAResult
        per row) and "final_score" (float per row).
    """
    qa_results = []
    final_scores = []
    for _, row in df.iterrows():
        # The rubric results column name is only known to contain
        # "rubric_results"; if several columns match, the last one wins.
        rubric_results = {}
        for key in row.keys():
            if isinstance(key, str) and "rubric_results" in key:
                rubric_results = row[key]
        if not isinstance(rubric_results, dict):
            rubric_results = {}

        qa_records = row["qa_records"]
        if not isinstance(qa_records, (list, tuple)):
            qa_records = []

        scores = []
        results = []
        for qa in qa_records:
            q = qa.question.lower()
            if q in rubric_results:
                # NOTE(review): this is a substring test, so a one-letter
                # ground truth such as "a" also matches a verdict like
                # "d) it rotates", and an empty ground truth matches anything.
                # Confirm the verdict format against the validator prompt
                # before tightening.
                if qa.gt_answer.lower() in rubric_results[q]:
                    results.append(QAResult(qa, f"{qa.gt_answer} ✓"))
                    scores.append(1)
                else:
                    results.append(QAResult(qa, f"{rubric_results[q]} 🗴"))
                    scores.append(0)
            else:
                results.append(QAResult(qa, "no result"))
                scores.append(0)
        qa_results.append(results)
        # np.mean([]) also yields NaN but emits a RuntimeWarning; be explicit.
        final_scores.append(np.mean(scores) if scores else np.nan)
    df_with_score = df.assign(qa_results=qa_results, final_score=final_scores)
    return df_with_score


def pretty_print_qa_records_df(
    df: "pd.DataFrame", hide_columns: list[str]
) -> "pd.io.formats.style.Styler":
    """Prints QA records data frame as stylized HTML table.

    Any column in which at least one cell is a non-empty list of QARecord
    objects is rendered as nested HTML tables. Cells in such a column that are
    not lists (for example raw text stored after a parsing failure) are left
    unchanged. Detection scans the cells themselves, so it does not depend on
    the frame's index labels and works on empty frames.

    Args:
        df: Data frame to display; it is not modified.
        hide_columns: Names of the columns to leave out of the display.

    Returns:
        A pandas Styler with the index hidden, `hide_columns` hidden, and the
        shared table styles applied.
    """

    def _is_qa_list(cell: Any) -> bool:
        return isinstance(cell, list) and bool(cell) and isinstance(cell[0], QARecord)

    def _render(cell: Any) -> Any:
        return _qa_records_to_html_table(cell) if isinstance(cell, list) else cell

    styled_df = df.copy()
    for col in df.columns:
        if any(_is_qa_list(cell) for cell in df[col]):
            styled_df[col] = styled_df[col].apply(_render)
    # Extend a copy so the shared module-level style list stays untouched.
    styles = [
        *_TABLE_STYLE,
        {
            "selector": "td",
            "props": [
                ("border", "1px solid gray"),
                ("color", "black"),
                ("min-width", "100px"),
                ("text-align", "center"),
            ],
        },
    ]
    return (
        styled_df.style.hide(axis="index")
        .hide(subset=hide_columns, axis=1)
        .set_table_styles(styles)
    )


def pretty_print_result_df(
    df: "pd.DataFrame", hide_columns: list[str]
) -> "pd.io.formats.style.Styler":
    """Prints results data frame as stylized HTML table.

    Any column in which at least one cell is a non-empty list of QARecord
    objects (QAResult included) is rendered as nested HTML tables. Cells in
    such a column that are not lists are left unchanged. Detection scans the
    cells themselves, so it does not depend on the frame's index labels and
    works on empty frames.

    Args:
        df: Data frame to display; it is not modified.
        hide_columns: Names of the columns to leave out of the display.

    Returns:
        A pandas Styler with the index hidden, `hide_columns` hidden, the
        "final_score" column formatted to one decimal place, and the shared
        table styles applied.
    """

    def _is_qa_list(cell: Any) -> bool:
        return isinstance(cell, list) and bool(cell) and isinstance(cell[0], QARecord)

    def _render(cell: Any) -> Any:
        return _qa_records_to_html_table(cell) if isinstance(cell, list) else cell

    styled_df = df.copy()
    for col in df.columns:
        if any(_is_qa_list(cell) for cell in df[col]):
            styled_df[col] = styled_df[col].apply(_render)
    # Extend a copy so the shared module-level style list stays untouched.
    styles = [
        *_TABLE_STYLE,
        {
            "selector": "td",
            "props": [
                ("border", "1px solid gray"),
                ("color", "black"),
                ("min-width", "120px"),
                ("text-align", "center"),
            ],
        },
    ]
    return (
        styled_df.style.hide(axis="index")
        .hide(subset=hide_columns, axis=1)
        .format({"final_score": "{:,.1f}"})
        .set_table_styles(styles)
    )


def _qa_records_to_html_table(data: list[QARecord]) -> str:
    """Converts a list to an HTML table."""
    if not data:
        return "<i>No data to display.</i>"
    html_table = "<table style='border-collapse: collapse'><thead><tr>"
    # Extract headers from the first element.
    keys = ["question", "answer_choices", "gt_answer"]
    if isinstance(data[0], QAResult):
        keys.append("result")
    else:
        keys.append("justification")
    for key in keys:
        html_table += f"<th>{key}</th>"
    html_table += "</tr></thead><tbody>"
    # Add rows
    for item in data:
        html_table += "<tr>"
        for key in keys:
            html_table += f"<td>{item.__dict__[key]}</td>"
        html_table += "</tr>"
    html_table += "</tbody></table>"
    return html_table

## Prompt Templates

This cell defines the prompt templates that will be used for evaluation. The `RUBRIC_GENERATION_PROMPT` is used to generate questions relevant to the user input. The `RUBRIC_VALIDATOR_PROMPT` is then used to answer the questions for a generated video.

In [None]:
RUBRIC_GENERATION_PROMPT = """Given a video description and the groundable words
in it, generate multiple-choice questions that verify if the video description
is correct.

The goal is to ask questions about entities, objects, attributes, actions, colors,
spatial relations, temporal relations, styles and scenes, when these are present
in the description.

Make sure that all options are substantially different from each other and only
one option can be the correct one based on the description. Do not include other
parts of the description as a non correct option.

Justify why the other options cannot be true based on the description and
question. Also, make sure that the question cannot be answered correctly only
based on common sense and without reading the description.

Each generated question should be independent of the other ones and it should be
able to be understood without knowing the other questions; avoid referring to
entities/objects/places from previous questions.

Finally, avoid asking very general questions, such as 'What is in the video?',
or 'Name a character in the video'.

Generate the multiple-choice questions in the exact same format as the examples
that follow. Do not add asterisks, white spaces, or any other reformatting and
explanation that deviate from the formatting of the following examples.

**Important**: There should be one and only one question-answer pair per key word.
**Important**: answer value MUST BE only one of the following letters a, b, c, or d. And it MUST BE ALWAYS in lowercase!


Given a "description", your answer must respond using this format:
{
  "keywords": "Your {1}[itemized] {2}[keywords]",
  "qas": [
    The list of QAs in the format "{
      "question_id": i,
      "question": "the question",
      "choices": ["a) option 1", "b) option 2", "c) option 3", "d) option 4"],
      "justification": "why is this about the keyword",
      "answer": "the identifier of the right answer (i.e. a, b, c, or d)",
      }",
  ]
}

===
Some examples are below.

Description:

Close up of grapes on a rotating table.
Answer:
{
  "keywords": "{1}[Close up, style, 1.0] of {2}[grapes, object, 1.0] {3}[on a {4}[rotating, action, 1.0] {5}[table, spatial relation, 1.0]",
  "qas": [
    {
      "question_id": 1, "question": " How is the object displayed in the video shot in the camera?", "choices": ["a) long shot", "b) close up", "c) glimpse", "d) slow motion"],
      "justification": "The grapes, which is the main object displayed in the video ({2}) is presented with a close up ({1}). Given this, none of the other options can be correct as they are the opposite or contradict the description."
      "answer": "b"
    },
    {
      "question_id": 2, "question": "What is the object that the camera focuses on during the video?", "choices": ["a) table", "b) pears", "c) blackberries", "d) grapes"],
       "justification": "the close up is happening on the grapes ({2}). A table is also present in the video ({5}) but it is not the main focus (close up) of the video. Pears and blackberries are not present in the video.",
      "answer": "d"
    },
    {
      "question_id": 3, "question": "Where are the grapes placed in the video?", "choices": ["a) table", "b) chair", "c) bowl", "d) plate'],
      "justification": "the grapes are placed on a table ({3}). Chair is not correct, but it is similar furniture to table and could be found next to it, and bowl and plate are reasonable answers for placing grapes but not true here based on the description.",
      "answer": "a",

    },
    {
      "question_id": 4, "question": "What movement does the table in the video follows?", "choices": ["a) it stays still", "b) it is moved to the right", "c) it is moved to the left", "d) it rotates"],
      "justification": "the table is rotating ({4}, {5}). Staying still is typically how a table is depicted in videos, and moving it right or left are other movements that we often see but they are not true according to the description.",
      "answer": d,
    }
  ]
}

Description:

Turtle swimming in ocean.

Answer:

{
  "keywords": "{1}[Turtle, entity, 1.0] {2}[swimming, action, 1.0] {3}[in ocean, spatial relation, 1.0]",
  "qas": [
    {
      "question_id": 1,
      "question": "What animal is present in the video?",
      "choices": ["a) fish", "b) dolphin", "c) turtle", "d) whale"],
      "justification": "turtle is the correct answer ({1}). All of fish, dolphin and whale are animals that live and swim in the ocean, so they are reasonable responses to such a question, but not the correct ones according to the description.",
      "answer": "c"
    },
    {
      "question_id": 2,
      "question": "What is the turtle doing in the video?",
      "choices": ["a) swims", "b) walks", "c) stays still", "d) moves the legs statically"],
      "justification": "the turtle is swimming ({2}). Staying still, walking or moving the legs without walking are typical movements that a turtle does, but they are not true according to the description.",
      "answer": "a"
    },
    {
      "question_id": 3,
      "question": "Where is the video taking place?",
      "choices": ["a) in the beach", "b) in the ocean", "c) in a boat", "d) in a lake"],
      "justification": "the turtle is swimming in the ocean ({3}). All other options are not true, but they would look similar to an ocean and they are of similar topic.",
      "answer": "b"
    }
  ]
}

Description:

A fat rabbit wearing a purple robe walking through a fantasy landscape.

Answer:

{
  "keywords": "A {1}[fat, attribute, 1.0] {2}[rabbit, entity, 1.0] {3}[wearing a {4}[purple, color, 1.0] robe, attribute, 1.0] {5}[walking, action, 1.0] through a {6}[fantasy landscape, scene, 1.0]",
  "qas": [
    {
      "question_id": 1,
      "question": "What is the most appropriate description for the animal of the video?",
      "choices": ["a) thin", "b) regular", "c) slim", "d) fat"],
      "justification": "the rabbit in the video is fat ({1}). The options thin and slim are opposite of the attribute mentioned in the description and the regular adjective checks whether it is obvious that the rabbit has a weight above normal.",
      "answer": "d"
    },
    {
      "question_id": 2,
      "question": "Who wears a robe in the video?",
      "choices": ["a) rabbit", "b) hare", "c) squirrel", "d) rat"],
      "justification": "the rabbit is the animal that wears a robe in the video ({2}). Hare is an animal very similar to rabbit, and the other two options (squirrel and rat) are also similar but not true according to the description.",
      "answer": "a"
    },
    {
      "question_id": 3,
      "question": "What is the rabbit wearing in the video?",
      "choices": ["a) nothing", "b) dress", "c) robe", "d) jumpsuit"],
      "justification": "the rabbit is wearing a robe ({3}). Nothing is what normally an animal is wearing, and the options dress and jumpsuit are similar to the robe but not true according to the description.",
      "answer": "c"
    },
    {
      "question_id": 4,
      "question": "What is the color of the clothing that the rabbit wears in the video?",
      "choices": ["a) purple", "b) blue", "c) pink", "d) green"],
      "justification": "the rabbit is wearing a purple robe ({4}). the options blue, pink and green are colors similar to purple.",
      "answer": "a"
    },
    {
      "question_id": 5,
      "question": "What is the rabbit doing in the video?",
      "choices": ["a) running", "b) walking", "c) standing", "d) jumping"],
      "justification": "the rabbit is walking through a fantasy landscape ({5}, {6}). The options running and standing are similar to walking, and jumping is an action that could be performed by a rabbit, but not true according to the description.",
      "answer": "b"
    },
    {
      "question_id": 6,
      "question": "Where is the video taking place?",
      "choices": ["a) fields", "b) countryside", "c) fantasy landscape", "d) mountains"],
      "justification": "the rabbit is walking through a fantasy landscape ({6}). The options fields, countryside, and mountains are different types of landscapes, but they are real-world scenes instead of fantasy ones.",
      "answer": "c"
    }
  ]
}

Description:

A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo

Answer:

{
  "keywords": "A {1}[beautiful coastal beach, scene, 1.0] {2}[in spring, temporal relation, 1.0], {3}[waves, scene, 1.0] {4}[lapping, action, 1.0] {5}[on sand, spatial relation, 1.0] {6}[by Hokusai, style, 1.0], {7}[in the style of Ukiyo, style, 1.0]",
  "qas": [
    {
      "question_id": 1,
      "question": "Where is the video taking place?",
      "choices": ["a) cliffs", "b) harbor", "c) coastal park", "d) coastal beach"],
      "justification": "the main scene is a beautiful coastal beach ({1}). The options cliffs, harbor, and coastal park are similar to coastal beach but not true according to the description.",
      "answer": "d"
    },
    {
      "question_id": 2,
      "question": "Which season is most likely during the video?",
      "choices": ["a) spring", "b) summer", "c) autumn", "d) winter"],
      "justification": "the video shows a coastal beach in spring ({2}). The options summer, autumn and winter are other seasons that are not true according to the description.",
      "answer": "a"
    },
    {
      "question_id": 3,
      "question": "What is the level of movement of the sea during the video?",
      "choices": ["a) calm", "b) wavy", "c) slightly moving", "d) ripply"],
      "justification": "the sea is wavy ({3}). The options calm, slightly moving, and ripply are different levels of movement of the sea and they are all different enough from wavy.",
      "answer": "b"
    },
    {
      "question_id": 4,
      "question": "What is the movement of the sea during the video?",
      "choices": ["a) gentle waves are coming to the shore", "b) there is a tide", "c) waves are lapping on the shore", "d) there are sea ripples"],
      "justification": "the sea is lapping on the shore ({4}). The other provided options are either of less intensity (gentle waves are coming to the shore, there are sea ripples) or the exact opposite (there is a tide).",
      "answer": "c"
    },
    {
      "question_id": 5,
      "question": "Where does the sea move to during the video?",
      "choices": ["a) sand", "b) rocks", "c) cliffs", "d) pebbles"],
      "justification": "the waves are lapping on sand ({5}). The options pebbles, rocks, and cliffs are different types of ground typically by the sea and have different levels of solidity.",
      "answer": "a"
    },
    {
      "question_id": 6,
      "question": "Whose artist is the theme of the scene similar to?",
      "choices": ["a) Utamaro", "b) Hokusai", "c) Hiroshige", "d) Yoshitoshi"],
      "justification": "the theme of the scene resembles a painting of Hokusai. The other options are other Japanese artists that are similar to Hokusai.",
      "answer": "b"
    },
    {
      "question_id": 7,
      "question": "Which Japanese painting style is most similar to the video?",
      "choices": ["a) Ukiyo", "b) Nihonga", "c) Sumi", "d) ink calligraphy"],
      "justification": "the video scene is in the style of Ukiyo ({7}). The other options are other types of Japanese painting styles that are not similar to the video according to the description.",
      "answer": "a"
    }
  ]
}

Descripion:

Mysterious scene of Sherlock Holmes investigating a crime scene at 221B Baker Street, forced perspective

Answer:

{
  "keywords": "{1}[Mysterious scene, style, 1.0] of {2}[Sherlock Holmes, entity, 1.0] {3}[investigating, action, 1.0] a {4}[crime scene, scene, 1.0] {5}[at 221B Baker Street, spatial relation, 1.0], {6}[forced perspective, style, 1.0]",
  "qas": [
    {
      "question_id": 1,
      "question": "What is the vibe of the video?",
      "choices": ["a) light", "b) mysterious", "c) scary", "d) calm"],
      "justification": "the vibe of the video is mysterious ({1}). The options light and calm are opposite vibes to mysterious, and scary is similar to mysterious but more exaggerated and not true according to the description.",
      "answer": "b"
    },
    {
      "question_id": 2,
      "question": "What is the name of the person investigating the scene in the video?",
      "choices": ["a) Sherlock Holmes", "b) Watson", "c) John Luther", "d) Hercule Poirot"],
      "justification": "the video shows Sherlock Holmes in the scene ({2}). Watson is another character from the Sherlock Holmes show but not the correct one according to the description, and John Luther and Hercule Poirot are other detective characters from shows.",
      "answer": "a"
    },
    {
      "question_id": 3,
      "question": "What is the man doing in the video?",
      "choices": ["a) walking in a street", "b) walking indoors", "c) investigating a scene", "d) leaving a scene"],
      "justification": "the man is investigating the scene ({3}). The options walking in a street, and walking indoors are general descriptions but not specific enough to the contents of the video, and leaving a scene is the opposite of investigating.",
      "answer": "c"
    },
    {
      "question_id": 4,
      "question": "Where is the video taking place?",
      "choices": ["a) house", "b) basement", "c) street", "d) crime scene"],
      "justification": "the video is taking place in a crime scene ({4}). The other provided options are common places, but not as specific as a crime scene.",
      "answer": "d"
    },
    {
      "question_id": 5,
      "question": "Which street appears in the video?",
      "choices": ["a) Liverpool", "b) Baker", "c) Oxford", "d) Bond"],
      "justification": "the street appearing in the video is the Baker Street ({5}). The options Liverpool, Baker, Oxford and Bond are different names of streets.",
      "answer": "b"
    },
    {
      "question_id": 6,
      "question": "What is the perspective of the video?",
      "choices": ["a) close up", "b) forced", "c) farther away", "d) top down"],
      "justification": "the perspective of the video is forced. The other options are other perspective styles in video.",
      "answer": "b"
    }
  ]
}

Description:

Larry David costumed as Bob Ross is drawing a nature scene but spills the paint

Answer:

{
  "keywords": "{1}[Larry David, entity, 1.0] as {2}[Bob Ross, entity, 1.0] {3}[is drawing, action, 1.0] a {4}[nature scene, object, 1.0] {5}[but, temporal relation, 1.0] {6}[spills, spatial relation, 1.0], {7}[the paint, object, 1.0]",
  "qas": [
    {
      "question_id": 1,
      "question": "Who is the character that draws a painting in the video?",
      "choices": ["a) Bob Ross", "b) Larry David", "c) Bill Alexander", "d) George Costanza"],
      "justification": "Larry David is present in the video ({1}). The option Bob Ross is the person that Larry is dressed as, Bill Alexander is a painter with similar style as Bob Ross, and George Costanza is a character similar to Larry David.",
      "answer": "b"
    },
    {
      "question_id": 2,
      "question": "Who is the painter of the video dressed as?",
      "choices": ["a) Bill Alexander", "b) William Alexander", "c) Thomas Kinkade", "d) Bob Ross"],
      "justification": "the main character is dressed like Bob Ross ({2}). The other options are all painters that are similar to Bob Ross.",
      "answer": "d"
    },
    {
      "question_id": 3,
      "question": "What is the painter doing in the video?",
      "choices": ["a) looking at a painting", "b) sitting next to a painting", "c) drawing a painting", "d) hanging up a painting"],
      "justification": "the man is drawing a painting ({3}). The other options are still involving a painting; looking at and sitting next to a painting are more static, and hanging up a painting is a different action from drawing the painting.",
      "answer": "c"
    },
    {
      "question_id": 4,
      "question": "What is depicted in the painting in the video?",
      "choices": ["a) nature scene", "b) abstract art", "c) geometric shapes", "d) blank canvas"],
      "justification": "the painting in the video depicts a nature scene ({4}). The other options are all different types of paintings that are mutually exclusive with depicting a nature scene.",
      "answer": "a"
    },
    {
      "question_id": 5,
      "question": "What is happening in the end of the video?",
      "choices": ["a) the man looks at the painting", "b) the man spills the paint", "c) the main draws the painting", "d) the man leaves the painting"],
      "justification": "towards the end of the video the man spills the paint ({5}, {6}). The option of drawing the painting happens earlier in the video, and the other two options are alternative actions around the painting.",
      "answer": "b"
    },
    {
      "question_id": 6,
      "question": "What does the man overturn in the end of the video?",
      "choices": ["a) the paint", "b) the painting", "c) the hat", "d) the brushes"],
      "justification": "the man overturns the paint. The option of the painting is another object present in the video, but not the correct one given the question, and the hat and brushes are related objects that are likely in the space in the video.",
      "answer": "a"
    }
  ]
}

Description:

Child swings high on tire swing

Answer:

{
  "keywords": "{1}[Child, entity, 1.0] {2}[swings, action, 1.0] {3}[high, spatial relation, 1.0] {4}[on tire swing, spatial relation, 1.0]",
  "qas": [
    {
      "question_id": 1,
      "question": "What is the age of the character in the video?",
      "choices": ["a) child", "b) young man", "c) baby", "d) old man"],
      "justification": "the main character of the video is a child ({1}). The options young man, baby and old man are characters of different ages.",
      "answer": "a"
    },
    {
      "question_id": 2,
      "question": "What is the child doing in the video?",
      "choices": ["a) sits on swing", "b) pushes the swing", "c) swings on swing", "d) walks away from the swing"],
      "justification": "the child swings on the swing ({2}). The option sits on the swing is similar, but it does not have any movement. The options pushes the swing and walks away from the swing require a different position of the child relative to the swing.",
      "answer": "c"
    },
    {
      "question_id": 3,
      "question": "What is the child doing on the swing?",
      "choices": ["a) sits", "b) swings high", "c) moves slightly", "d) gets off"],
      "justification": "the child swings high on the swing ({3}). The options sits and moves slightly are different movements of different intensity that the child could have been doing on the swing and the option gets off the swing is the opposite.",
      "answer": "b"
    },
    {
      "question_id": 4,
      "question": "Where is the child sitting on?",
      "choices": ["a) circular swing", "b) flat swing", "c) classic swing", "d) tire swing"],
      "justification": "The child sits in a tire swing ({4}). The other options are all different types of swings that are similar to tire swing.",
      "answer": "d"
    }
  ]
}

Description:

Frog jumps in a pond, forced perspective

Answer:

{
  "keywords": "{1}[Frog, entity, 1.0] {2}[jumps, action, 1.0] {3}[in a pond, spatial relation, 1.0] {4}[forced perspective, style, 1.0]",
  "qas": [
    {
      "question_id": 1,
      "question": "What animal is present in the video?",
      "choices": ["a) toad", "b) salamander", "c) frogs", "d) frog"],
      "justification": "the animal of the video is a frog ({1}). The option frogs is the plural which is not correct given the description. The options salamander and toad are animals similar to frog.",
      "answer": "d"
    },
    {
      "question_id": 2,
      "question": "What is the frog doing in the video?",
      "choices": ["a) sits next to a pond", "b) jumps in a pond", "c) jumps out of a pond", "d) slides in a pond"],
      "justification": "the frog jumps in a pond ({2}). The option sits next to a pond is related to the pond, but it does not have any movement. The option slides in a pond has a similar movement but it is a different action of different intensity. The option jumps out of a pond is the opposite.",
      "answer": "b"
    },
    {
      "question_id": 3,
      "question": "Where is the frog jumping in?",
      "choices": ["a) lake", "b) reservoir", "c) pond", "d) fountain"],
      "justification": "the frog jumps in a pond ({3}). The other options are all different types of water masses of different sizes.",
      "answer": "c"
    },
    {
      "question_id": 4,
      "question": "What is the perspective that the video is filmed?",
      "choices": ["a) aerial perspective", "b) forced perspective", "c) linear perspective", "d) one point perspective"],
      "justification": "the video is filmed in a forced perspective. The other options are all different perspective styles in video.",
      "answer": "b"
    }
  ]
}

Description:
{prompt}
Answer:
"""

# Prompt template for the rubric validation step. It is supplied to
# PointwiseMetric as `metric_prompt_template` when the metric is defined.
# It contains two placeholders, {rubrics} and {video} — presumably filled from
# the dataset columns of the same names; confirm against the evaluation SDK.
# NOTE(review): the instructions allow only the letters a-d, while the output
# format also lists "e" as a possible verdict. Presumably an extra choice is
# appended by the rubric parsing helper — confirm the mismatch is intentional.
RUBRIC_VALIDATOR_PROMPT = """
# Instructions
Watch the video below carefully and answer the question based on the choices
provided. Only answer with the letter (a, b, c, or d) that corresponds to the
correct answer.

{rubrics}

# Video
{video}

# Output Format
<question>
Question: repeat the original question
Verdict: a|b|c|d|e
</question>
"""

## Define the metric

This cell configures the rubric generation step and the rubric validation metric, then combines them into a single rubric-based metric for evaluation.

In [None]:
# Rubric Generation
# Uses RUBRIC_GENERATION_PROMPT as the prompt template and
# parse_json_to_qa_records as the parsing function for the generated rubrics.
rubric_generation_config = RubricGenerationConfig(
    prompt_template=RUBRIC_GENERATION_PROMPT,
    parsing_fn=parse_json_to_qa_records,
)

# Rubric Validation
# Registers the validator prompt under the metric name "gecko_metric". The
# custom output config requests the raw output and sets parse_rubric_results
# as its parsing function.
pointwise_metric = PointwiseMetric(
    metric="gecko_metric",
    metric_prompt_template=RUBRIC_VALIDATOR_PROMPT,
    custom_output_config=CustomOutputConfig(
        return_raw_output=True,
        parsing_fn=parse_rubric_results,
    ),
)

# Rubric Metric
# Combines the generation config with the pointwise metric, which is passed in
# as the critique metric.
rubric_based_gecko = RubricBasedMetric(
    generation_config=rubric_generation_config,
    critique_metric=pointwise_metric,
)

# Prepare the dataset

In the following dataset, each generated video is paired with two prompts. The first is the prompt that corresponds to the generated content. The second is a counterexample that is similar but does not exactly match the generated content. This demonstrates how the Gecko evaluation differs between high-quality and low-quality responses.

In [None]:
# Prompts come in pairs, one pair per video clip: the first prompt of a pair
# corresponds to the generated video, the second is a similar counterexample
# that does not exactly match it.
prompts = [
    # mountain.mp4
    "Snow blanketed rocky mountains surround and shadow deep canyons. the canyons bend through the high elevated mountain peaks. black and white",
    "Lush green valley is carved between rocky cliffs. the valley winds through the high elevated rock faces. misty morning",
    # couple.mp4
    "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas",
    "Two friends, dressed in casual summer clothes, are caught in a light summer rain while running home",
    # desert.mp4
    "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon",
    "A eerie panorama of the Arizona desert, with ancient ruins silhouetted against the setting sun",
    # plum.mp4
    "Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up",
    "A large red apple rotating on the turntable. water drops appear on the skin during rotation. isolated on the black background. close-up",
    # boat.mp4
    "A boat sailing leisurely along the Seine River with the Eiffel Tower in background",
    "A boat cruising rapidly along the Thames River with Big Ben behind",
]

# One JSON request string per prompt. Every clip is emitted twice in a row so
# that videos[i] lines up with prompts[i] (matching prompt, then counterexample).
videos = [
    '{"contents": [{"parts": [{"file_data": {"mime_type": "video/mp4", "file_uri": "'
    "gs://cloud-samples-data/generative-ai/evaluation/videos/" + clip_name + '.mp4"}}]}]}'
    for clip_name in ("mountain", "couple", "desert", "plum", "boat")
    for _ in range(2)
]

# Two-column frame: the text prompt and the video it is evaluated against.
eval_dataset = pd.DataFrame(data={"prompt": prompts, "video": videos})

# Run evaluation

## Generate rubrics

First we generate rubrics for the user prompts.

In [None]:
# Generate rubrics from eval_dataset with the rubric-based Gecko metric.
dataset_with_rubrics = rubric_based_gecko.generate_rubrics(eval_dataset)
# Display the result with the "prompt", "video" and "rubrics" columns hidden.
pretty_print_qa_records_df(
    dataset_with_rubrics, hide_columns=["prompt", "video", "rubrics"]
)

## Evaluate with rubrics

Then we use the generated rubrics to evaluate the quality of the responses.

In [None]:
# Evaluate the rubric-annotated dataset with the rubric-based Gecko metric.
eval_task = EvalTask(
    dataset=dataset_with_rubrics,
    metrics=[rubric_based_gecko],
)
# The "video" column is named as the response column for the evaluation.
eval_result = eval_task.evaluate(response_column_name="video")

# Calculate overall score for metric.
# compute_scores is applied to the metrics table of the evaluation result; the
# mean of its "final_score" column is the last expression and therefore the
# value this cell displays.
dataset_with_final_scores = compute_scores(eval_result.metrics_table)
np.mean(dataset_with_final_scores["final_score"])

In [None]:
# Display the scored dataset. The input columns, the generated rubrics/QA
# records and the "gecko_metric/rubric_results" column are hidden from the view.
pretty_print_result_df(
    dataset_with_final_scores,
    hide_columns=[
        "prompt",
        "video",
        "rubrics",
        "qa_records",
        "gecko_metric/rubric_results",
    ],
)