In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hugging Face DLCs: Using Gemma for running evaluations with Vertex AI Gen AI Evaluation

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fevaluation%2Fvertex_ai_tgi_gemma_with_genai_evaluation.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/evaluation/vertex_ai_tgi_gemma_with_genai_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
|Author(s) | [Ivan Nardini](https://github.com/inardini) |

## Overview

Assessing the performance of Large Language Models (LLMs) remains a complex task, especially when it comes to integrating them into production systems. Unlike conventional software and non-generative machine learning models, evaluating LLMs is subjective, challenging to automate, and prone to highly visible errors.

To tackle these challenges, Vertex AI offers a comprehensive evaluation framework through its Gen AI Evaluation service. This framework encompasses the entire LLM lifecycle, from prompt engineering and model comparison to operationalizing automated model evaluation in production environments.

Learn more about [Vertex AI Gen AI Evaluation service](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluate-models).

## Objective

In this tutorial, you learn how to use the Vertex AI Gen AI Evaluation framework to evaluate Gemma 2 in a summarization task.

This tutorial uses the following Google Cloud ML services and resources:

- Vertex AI Model Garden
- Vertex AI Prediction
- Vertex AI Model Eval

The steps performed include:

- Evaluate Gemma 2 for summarization task.
- Use Gemma 2 as LLM-as-Judge to evaluate generated summaries.

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]
%pip install --upgrade --user --quiet fsspec datasets
%pip install --upgrade --user --quiet plotly

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Outside Colab this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    # Passing True asks the kernel to restart so the new packages are picked up.
    colab_kernel = IPython.Application.instance().kernel
    colab_kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Interactive Google authentication is only performed when running on Colab.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Authenticate your Hugging Face account

As [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) is a gated model, you are required to review and agree to Google's usage license on the Hugging Face Hub for any of the models from the [Gemma 2 release collection](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b); the access request will be processed immediately.

Once this is done, you need to generate a new user access token with read-only access so that the weights can be downloaded from the Hub in the Hugging Face DLC for TGI.

> Note that the user access token can only be generated via [the Hugging Face Hub UI](https://huggingface.co/settings/tokens/new), where you can either select read-only access to your account, or follow the recommendations and generate a fine-grained token with read-only access to [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it).

Then use the `huggingface_hub` library (which also ships with a CLI) to authenticate with the token generated in advance, so that the token can later be safely retrieved via `huggingface_hub.get_token`.

In [None]:
from huggingface_hub import interpreter_login

# Authenticate with the Hugging Face access token generated above; the token is
# read back later with get_token() when the model is uploaded.
interpreter_login()

Read more about [Hugging Face Security](https://huggingface.co/docs/hub/en/security), specifically about [Hugging Face User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens).

### Requirements

#### Set Project ID and Location

To get started using Vertex AI, you must have an existing Google Cloud project and [enable these APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,artifactregistry.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user does not provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

PROJECT_NUMBER = !gcloud projects describe {PROJECT_ID} --format="get(projectNumber)"[0]
PROJECT_NUMBER = PROJECT_NUMBER[0]

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

#### Set Service Account and permissions

You will need to have the Vertex AI User (roles/aiplatform.user) IAM role.

For more information about granting roles, see [Manage access](https://cloud.google.com/iam/docs/granting-changing-revoking-access).


> If you run the following commands using Vertex AI Workbench, run them directly in the terminal.


In [None]:
# Service account address built from the project number; the next cell grants
# it the Vertex AI User role.
SERVICE_ACCOUNT = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

In [None]:
# Grant roles/aiplatform.user on the project to SERVICE_ACCOUNT.
! gcloud projects add-iam-policy-binding {PROJECT_ID} \
      --member=serviceAccount:{SERVICE_ACCOUNT} \
      --role=roles/aiplatform.user --condition=None

### Initiate Vertex AI SDK for Python

Initiate Vertex AI client session.

In [None]:
import vertexai

# Start the SDK session with the project and region configured above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

Import relevant libraries.

In [None]:
import asyncio
import json
import logging
import random
import string
from typing import Any
import warnings

from IPython.display import Markdown, display
import datasets
from google.cloud import aiplatform
from huggingface_hub import get_token
import pandas as pd
import plotly.graph_objects as go
from tenacity import retry, wait_random_exponential
from transformers import AutoTokenizer
from vertexai import generative_models
from vertexai.evaluation import CustomMetric, EvalTask
from vertexai.generative_models import (
    Content,
    GenerationConfig,
    GenerativeModel,
    Part,
    SafetySetting,
)

### Library settings

In [None]:
# Only let ERROR-level (and above) records through from urllib3's connection pool logger.
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
# NOTE(review): no category or module is given, so this hides every Python
# warning for the rest of the session, deprecation notices included.
warnings.filterwarnings("ignore")

### Helper functions

In [None]:
def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of exactly `length` characters. Despite the name, this is a
        short random suffix, not a standards-based UUID.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked_chars = random.choices(alphabet, k=length)
    return "".join(picked_chars)


def init_new_model(
    model_name: str,
    generation_config: GenerationConfig | None = None,
    safety_settings: list[SafetySetting] | None = None,
    **kwargs: Any,
) -> GenerativeModel:
    """Build a `GenerativeModel`, filling in default generation and safety settings.

    Args:
        model_name: Name of the model to instantiate.
        generation_config: Generation settings. When omitted, a single
            candidate with up to 2048 output tokens at temperature 0 is used.
        safety_settings: Safety settings. When omitted, hate speech, dangerous
            content, sexually explicit and harassment categories are all set
            to BLOCK_NONE using the SEVERITY method.
        **kwargs: Passed through unchanged to `GenerativeModel`.

    Returns:
        The configured `GenerativeModel` instance.
    """
    if generation_config is None:
        generation_config = GenerationConfig(
            candidate_count=1, max_output_tokens=2048, temperature=0
        )

    if safety_settings is None:
        # Same four categories, in the same order, all with blocking disabled.
        unblocked_categories = (
            generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
        )
        safety_settings = [
            generative_models.SafetySetting(
                category=harm_category,
                method=generative_models.SafetySetting.HarmBlockMethod.SEVERITY,
                threshold=generative_models.HarmBlockThreshold.BLOCK_NONE,
            )
            for harm_category in unblocked_categories
        ]

    return GenerativeModel(
        model_name=model_name,
        generation_config=generation_config,
        safety_settings=safety_settings,
        **kwargs,
    )


@retry(wait=wait_random_exponential(multiplier=1, max=120))
async def async_generate(
    prompt: str,
    model: GenerativeModel,
    **kwargs: Any,
) -> str | None:
    """Asynchronously generate a text response for `prompt` with `model`.

    Args:
        prompt: Text passed to the model as-is.
        model: Model whose `generate_content_async` is awaited.
        **kwargs: Forwarded unchanged to `generate_content_async`.

    Returns:
        The text of the first part of the first candidate; `None` when the
        response has no candidates or no parts; or a fixed fallback message
        when the call (or the text extraction) raises.
    """
    # NOTE(review): every exception raised inside the `try` is handled here, so
    # the `@retry` decorator never sees those failures. The previously unused
    # Content/Part object was built outside the `try`, where any error would
    # have been handed to `@retry`, which is configured without a stop condition.
    try:
        response = await model.generate_content_async(
            prompt,
            **kwargs,
        )

        # Return the first text part only when the response actually has one.
        if response and response.candidates and response.candidates[0].content.parts:
            return response.candidates[0].content.parts[0].text
        return None

    except Exception as e:  # pylint: disable=broad-except
        # Best effort: report the error and return a placeholder so that one
        # failed request does not abort the whole batch.
        print(f"Error calling the model: {e}")
        return "Could not call the model. Please try it again in a few minutes."


def display_eval_report(
    eval_result: pd.DataFrame, title: str, metrics: list[str] = None
) -> None:
    """Display the evaluation results."""

    summary_metrics, report_df = eval_result.summary_metrics, eval_result.metrics_table
    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T
    if metrics:
        metrics_df = metrics_df.filter(
            [
                metric
                for metric in metrics_df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )
        report_df = report_df.filter(
            [
                metric
                for metric in report_df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )

    # Display the title with Markdown for emphasis
    display(Markdown(f"## {title}"))

    # Display the metrics DataFrame
    display(Markdown("### Summary Metrics"))
    display(metrics_df)

    # Display the detailed report DataFrame
    display(Markdown("### Report Metrics"))
    display(report_df)


def display_explanations(
    df: pd.DataFrame, metrics: list[str] = None, n: int = 1
) -> None:
    """Display the explanations for the evaluation results."""

    # Sample the DataFrame
    df = df.sample(n=n)

    # Filter the DataFrame based on the selected metrics
    if metrics:
        df = df.filter(
            ["instruction", "context", "reference", "completed_prompt", "response"]
            + [
                metric
                for metric in df.columns
                if any(selected_metric in metric for selected_metric in metrics)
            ]
        )

    # Display the explanations using Markdown for consistent styling
    for index, row in df.iterrows():
        display(Markdown("---"))  # Section separator
        for col in df.columns:
            display(Markdown(f"### {col}"))
            display(Markdown(f"{row[col]}"))


def plot_bar_plot(
    eval_result: pd.DataFrame, title: str, metrics: list[str] = None
) -> None:
    fig = go.Figure()
    data = []

    summary_metrics = eval_result.summary_metrics
    if metrics:
        summary_metrics = {
            k: summary_metrics[k]
            for k, v in summary_metrics.items()
            if any(selected_metric in k for selected_metric in metrics)
        }

    data.append(
        go.Bar(
            x=list(summary_metrics.keys()),
            y=list(summary_metrics.values()),
            name=title,
        )
    )

    fig = go.Figure(data=data)

    # Change the bar mode
    fig.update_layout(barmode="group")
    fig.show()

## Initiate Gemma 2 on Vertex AI from Hugging Face Hub

To use Gemma 2 with Vertex AI Gen AI evaluation, you need to deploy the model on Vertex AI.

To deploy Gemma 2 on Vertex AI from Hugging Face Hub, register the model on Vertex AI Model Registry using the Hugging Face Deep Learning Container. This requires specifying the container image for serving the model and configuring essential environment variables. Before deploying the model, create an endpoint, a dedicated resource on Vertex AI that serves as an entry point for predictions. Finally, deploy the registered model to the newly created endpoint.

To learn more about serving open models on Vertex AI using the Hugging Face Deep Learning Container, check out [this tutorial](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/vertex_ai_text_generation_inference_gemma.ipynb).

In [None]:
# Environment handed to the serving container.
# NOTE(review): the Hugging Face token is passed as a plain environment
# variable of the uploaded model - confirm this is acceptable for your project.
gemma_serving_env = {
    "MODEL_ID": "google/gemma-2-9b-it",
    "NUM_SHARD": "2",
    "MAX_INPUT_TOKENS": "4095",
    "MAX_TOTAL_TOKENS": "4096",
    "MAX_BATCH_PREFILL_TOKENS": "4145",
    "HUGGING_FACE_HUB_TOKEN": get_token(),
}

# Register the model with the Hugging Face TGI serving image.
gemma_model = aiplatform.Model.upload(
    display_name="google--gemma-2-9b-it",
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311",
    serving_container_environment_variables=gemma_serving_env,
    serving_container_ports=[8080],
)
gemma_model.wait()

# Create the endpoint first, then deploy onto it; NUM_SHARD above matches the
# two accelerators requested here.
gemma_endpoint = aiplatform.Endpoint.create(
    display_name="google--gemma-2-9b-it-endpoint"
)
deployed_gemma_model = gemma_model.deploy(
    endpoint=gemma_endpoint,
    machine_type="g2-standard-24",
    accelerator_type="NVIDIA_L4",
    accelerator_count=2,
)

## Using Gemma 2 with Vertex AI Gen AI Evaluation

To run evaluations using Gemma 2 with Vertex AI Gen AI Evaluation, you use the `EvalTask` class.

The `EvalTask` requires an evaluation dataset (DataFrame, dictionary, or URI) and a list of [supported metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval). Datasets can use standard column names like prompt, reference, response, and baseline_model_response, customizable via parameters like response_column_name.

EvalTask supports three scenarios: bring-your-own-response (BYOR), inference without a prompt template (using a prompt column), and inference with a prompt template (using columns matching template variables). These scenarios are compatible with Gemini, 3P models, and custom functions, and support various metrics.

After defining your EvalTask, use `evaluate()` method to run the evaluation, optionally providing a model, prompt template, logging configuration, and other parameters. See the [Gen AI Evaluation package](https://cloud.google.com/vertex-ai/generative-ai/docs/reference/python/latest/vertexai.evaluation) documentation for more details.

This tutorial shows the two main ways to use Gemma 2 with Vertex AI Gen AI Evaluation:

1. Gemma 2 as model to evaluate (`Evaluate Gemma 2` scenario)
2. Gemma 2 as the evaluator (judge) model (`Gemma 2 as LLM-as-Judge` scenario)

### Scenario 1: `Evaluate Gemma 2` for summarization

To evaluate Gemma 2 for text summarization using Vertex AI Gen AI evaluation, cover the following steps:

1.   Prepare the dataset
2.   Define a model function
3.   Set a base prompt and metrics
4.   Initiate an `EvalTask`
5.   Run an evaluation job

#### Prepare the evaluation dataset

To start, prepare the evaluation dataset.

The XSum dataset is loaded and preprocessed for evaluation. Documents and summaries of 2,048 characters or more are filtered out, columns are renamed to "context" and "reference", and the "id" column is removed.

A random 10-sample subset is created for efficient evaluation.

In [None]:
eval_model_dataset = datasets.load_dataset("xsum", split="test", trust_remote_code=True)

# Keep only examples whose article and reference summary are each shorter than
# 2048 characters, then rename the columns to the names used by the evaluation.
eval_model_dataset = (
    eval_model_dataset.filter(lambda example: len(example["document"]) < 2048)
    .filter(lambda example: len(example["summary"]) < 2048)
    .rename_columns({"document": "context", "summary": "reference"})
    .remove_columns(["id"])
)

n = 10  # @param {type: "integer", placeholder: "10", isTemplate: true}
# The seeded shuffle already randomizes the row order, so taking the first `n`
# rows yields a random sample that is the same on every run. Picking unseeded
# random indices afterwards would undo that reproducibility. `min` keeps the
# cell working when `n` exceeds the number of remaining rows.
eval_model_sample_df = (
    eval_model_dataset.shuffle(seed=8)
    .select(range(min(n, len(eval_model_dataset))))
    .to_pandas()
)

In [None]:
# Preview the first rows of the sampled evaluation set.
eval_model_sample_df.head()

#### Define a model function

Define a model function which is a wrapper to generate predictions.

In [None]:
# Tokenizer of the deployed model; gemma_fn uses its chat template to format prompts.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")

# Default generation parameters sent with each prediction request by gemma_fn.
generation_config = {
    "max_new_tokens": 256,
    "do_sample": True,
    "temperature": 0.2,
}


def gemma_fn(prompt, generation_config=generation_config):
    """Send one user prompt to the deployed Gemma 2 endpoint and return its prediction.

    Args:
        prompt: User message; it is wrapped as a single-turn chat and formatted
            with the tokenizer's chat template (generation prompt appended).
        generation_config: Generation parameters sent as the request's
            "parameters". Defaults to the module-level `generation_config`
            captured when this function was defined.

    Returns:
        The first element of the endpoint's `predictions`.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    request_text = tokenizer.apply_chat_template(
        chat_messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # One instance per request: the formatted text plus the generation parameters.
    request_instances = [{"inputs": request_text, "parameters": generation_config}]
    prediction = deployed_gemma_model.predict(instances=request_instances)
    return prediction.predictions[0]

#### Set base prompt and metrics to evaluate your task

Define the prompt template and metrics to use to evaluate the summarization task. Vertex AI Gen AI Evaluation provides several metric prompt templates for model-based evaluation that you can use. Check out [the documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/metrics-templates) to learn more.

In [None]:
# `{context}` is the only template variable; it is filled from the dataset's
# "context" column for every row.
prompt_template = (
    "Summarize the following article in one sentence: {context}.\nSummary:"
)

In [None]:
# Metrics for scenario 1 (evaluating Gemma 2's own summaries).
metrics = ["rouge_l_sum", "summarization_quality", "fluency"]

#### Run the evaluation

To run evaluations for prompt templates, you run an evaluation job repeatedly against an evaluation dataset and its associated metrics. With EvalTask, you leverage integration with Vertex AI Experiments to track settings and results for each evaluation run.

In [None]:
# A fresh random suffix gives every run its own name inside the experiment.
run_id = generate_uuid()
experiment_name = "eval-gemma-base-prompt-sum"
experiment_run_name = f"{experiment_name}-{run_id}"

# The task ties together the sampled dataset, the metrics and the experiment
# used to track the run.
eval_task = EvalTask(
    dataset=eval_model_sample_df,
    metrics=metrics,
    experiment=experiment_name,
)

# Gemma 2 (through gemma_fn) produces the responses from the prompt template.
eval_result = eval_task.evaluate(
    model=gemma_fn,
    prompt_template=prompt_template,
    experiment_run_name=experiment_run_name,
)

#### Display Evaluation reports and explanations

Display detailed evaluation reports, explanations, and useful charts to summarize key metrics in an informative manner.

In [None]:
# No metric filter is passed, so every summary and row-level column is shown.
display_eval_report(eval_result, "Gemma 2 evaluation report")

In [None]:
# Shows one randomly sampled row (n defaults to 1) restricted to the text
# columns and the columns whose name contains "fluency".
display_explanations(eval_result.metrics_table, metrics=["fluency"])

In [None]:
# Metric names are matched as substrings of the summary metric keys.
plot_bar_plot(
    eval_result,
    title="Evaluate Gemma 2",
    metrics=["summarization_quality/mean", "fluency/mean"],
)

### Scenario 2: `Gemma 2 as LLM-as-Judge` to evaluate generated summaries

To use Gemma 2 as LLM-as-Judge for text summarization using Vertex AI Gen AI evaluation, cover the following steps:

1.   Define a Model function (see above)
2.   Define a Custom metric to set Gemma 2 as autorater
3.   Initiate an `EvalTask`
4.   Run an evaluation job

#### Prepare the dataset

In this scenario, generate summaries to evaluate using Gemini API on Vertex AI by leveraging concurrent prediction requests for increased efficiency. This approach is particularly useful when evaluating against a large dataset of summaries.


In [None]:
eval_model_sample_df["prompt"] = eval_model_sample_df.apply(
    lambda row: prompt_template.format(context=row["context"]), axis=1
)
gemini_llm = init_new_model(model_name="gemini-2.0-flash")
gemini_predictions = [
    async_generate(p, model=gemini_llm) for p in eval_model_sample_df["prompt"]
]
gemini_predictions_col = await tqdm_asyncio.gather(*gemini_predictions)
eval_model_sample_df["response"] = gemini_predictions_col

In [None]:
# Preview the sample again, now with the "prompt" and "response" columns added.
eval_model_sample_df.head()

#### Define a metric function to use Gemma 2 as an evaluator

Define a custom model-based metric function, `catchiness_fn` in this case, to evaluate the "catchiness" of an AI-generated response given a user prompt.

It uses Gemma 2 as an evaluator, based on the detailed prompt template.

In [None]:
def catchiness_fn(instance: dict) -> dict:
    """Score the catchiness of one response using Gemma 2 as the judge.

    Args:
        instance: Evaluation row; must be a dict containing the "prompt" and
            "response" keys.

    Returns:
        A dict with "catchiness" (the judge's score) and "explanation" (the
        judge's reasoning). When the instance is invalid, the judge call
        fails, or no JSON object can be parsed from the judge output, a
        warning is logged and {"catchiness": 0, "explanation": ""} is returned.
    """

    metric_prompt_template = """

# Instruction
You are an expert evaluator. Your task is to evaluate the catchiness of responses generated by AI models.
We will provide you with the user input (prompt) and an AI-generated response.
You should first read the user input carefully to understand the task, and then evaluate the catchiness of the response based on the criteria provided in the Evaluation section below.
Then you will assign the response a score (float) and a explanation (string) following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.
Finally ONLY return the score and the explanation in a JSON as shown in Examples section.

# Evaluation

## Metric Definition
Catchiness:  The response uses creative language, vivid imagery, memorable phrasing, and a compelling tone to create a lasting impression on the reader. It might employ techniques like humor, wordplay, or strong emotional appeals. It should be relevant to the prompt and avoid being overly repetitive or generic.

## Criteria
* **Creative Language:**  Does the response utilize figurative language (metaphors, similes, personification, etc.), evocative descriptions, and interesting vocabulary?  Is the language fresh and original?
* **Memorable Phrasing:** Does the response contain turns of phrase, slogans, or other linguistic devices that stick with the reader? Are there any particularly quotable lines?
* **Relevance:** Is the catchiness relevant to the response? Does it enhance the core message or distract from it?

## Rating Rubric
5: (Exceptionally Catchy) The response is highly creative, uses vivid imagery and memorable phrasing, and maintains a compelling tone.  It leaves a strong and lasting impression.  It is perfectly relevant to the prompt and avoids generic language.
4: (Very Catchy) The response is creative and engaging, with clear use of imagery and memorable phrasing.  It leaves a positive impression. It is relevant to the prompt and mostly avoids generic language.
3: (Moderately Catchy) The response shows some creativity and uses some imagery and memorable phrasing, but the impact is less pronounced. It is relevant to the prompt but might contain some generic language.
2: (Slightly Catchy) The response demonstrates limited creativity and uses minimal imagery or memorable phrasing.  The impact is weak. It might be somewhat relevant to the prompt and contains quite a bit of generic language.
1: (Not Catchy) The response lacks creativity, vivid imagery, and memorable phrasing. It leaves no lasting impression. It might be irrelevant to the prompt and relies heavily on generic language.

## Evaluation Steps
STEP 1: Assess Creative Language:  Identify the use of figurative language, evocative descriptions, and interesting vocabulary. Judge the originality and freshness of the language.
STEP 2: Assess Vivid Imagery and Memorable Phrasing:  Analyze the use of sensory details and identify any phrases or lines that are particularly memorable or quotable.
STEP 3: Assess Compelling Tone and Relevance: Determine the tone of the response and evaluate its appropriateness for the prompt and target audience.  Assess the relevance of the catchy elements to the core message of the prompt.
STEP 4: Assess Avoidance of Repetition and Generic Language: Identify any instances of clichés, overused phrases, or repetitive sentence structures.

# User Inputs and AI-generated Response
## User Inputs
### Prompt
{prompt}

## AI-generated Response
{response}

# Examples
```json {{"score": 5, "explanation": "The summary is perfectly relevant to the prompt, highly creative, and avoids generic language. It uses vivid imagery, memorable phrasing, and a compelling tone to leave a strong and lasting impression."}} ```
```json {{"score": 3, "explanation": "The summary is relevant to the prompt, but the impact is somewhat lessened by generic language and less vivid imagery and phrasing. It shows some creativity, though."}} ```
```json {{"score": 1, "explanation": "The summary is irrelevant, lacks creativity, and fails to make a lasting impression. It relies on generic language and lacks vivid imagery or memorable phrasing."}} ```

# Evaluation JSON:
"""

    default_result = {"catchiness": 0, "explanation": ""}

    def parse_json_output(json_string: str) -> dict:
        """Extract the score and explanation from the judge's raw text output."""
        # The judge may wrap the JSON in a ```json fence and/or add text before
        # or after it, so parse only the span from the first "{" to the last "}".
        start = json_string.find("{")
        end = json_string.rfind("}")
        if start == -1 or end < start:
            logging.warning(
                "catchiness_fn: no JSON object found in judge output: %r", json_string
            )
            return default_result

        try:
            data = json.loads(json_string[start : end + 1])
        except json.JSONDecodeError as error:
            logging.warning(
                "catchiness_fn: could not parse judge output (%s): %r",
                error,
                json_string,
            )
            return default_result

        return {
            "catchiness": data.get("score", 0),
            "explanation": data.get("explanation", ""),
        }

    try:
        # Input validation
        if not isinstance(instance, dict) or not all(
            k in instance for k in ["prompt", "response"]
        ):
            raise ValueError(
                "Instance must be a dict with 'prompt' and 'response' keys"
            )

        metric_prompt = metric_prompt_template.format(
            prompt=instance["prompt"], response=instance["response"]
        )

        # Temperature 0 is requested for the judge call.
        rater_config = {"max_new_tokens": 256, "temperature": 0}

        eval_response = gemma_fn(metric_prompt, rater_config)
        return parse_json_output(eval_response)

    except Exception as e:  # pylint: disable=broad-except
        # Best effort: one bad row must not abort the whole evaluation, but the
        # failure is logged so a default score of 0 is not mistaken for a rating.
        logging.warning("catchiness_fn: returning the default result: %s", e)
        return default_result

Create a `Custom Metric` instance to evaluate generated summaries using Vertex AI Gen AI evaluation.

In [None]:
# Wrap catchiness_fn as a custom metric; the name matches the "catchiness" key
# of the dict returned by the function.
catchiness_metric = CustomMetric(
    name="catchiness",
    metric_function=catchiness_fn,
)

#### Set metrics to evaluate your task

Define metrics to use to evaluate the summarization task.

In [None]:
# Metrics for scenario 2; this replaces the scenario 1 list and adds the
# Gemma-judged custom metric.
metrics = ["rouge_l_sum", "fluency", catchiness_metric]

#### Run the evaluation

Run the evaluation job using Gemma 2 as an evaluator.

In [None]:
# A fresh random suffix gives every run its own name inside the experiment.
run_id = generate_uuid()
experiment_name = "gemma-judge-base-prompt-sum"
experiment_run_name = f"{experiment_name}-{run_id}"

# The dataset now carries the "prompt" and "response" columns added above.
eval_task = EvalTask(
    dataset=eval_model_sample_df,
    metrics=metrics,
    experiment=experiment_name,
)

# No model is passed here: the responses already in the dataset are evaluated
# (bring-your-own-response).
eval_result = eval_task.evaluate(
    experiment_run_name=experiment_run_name,
)

#### Display Evaluation reports and explanations

Visualize reports and useful charts to evaluate the model in the summarization task.

In [None]:
# No metric filter is passed, so every summary and row-level column is shown.
display_eval_report(eval_result, "Gemma 2 Judging result")

In [None]:
# Shows one randomly sampled row (n defaults to 1) restricted to the text
# columns and the columns whose name contains "catchiness".
display_explanations(eval_result.metrics_table, metrics=["catchiness"])

In [None]:
# Metric names are matched as substrings of the summary metric keys.
plot_bar_plot(
    eval_result,
    title="Evaluate Gemini using Gemma 2",
    metrics=["fluency/mean", "catchiness/mean"],
)

## Cleaning up

In [None]:
delete_experiment = False  # @param {type:"boolean", isTemplate: false}
delete_endpoint = False  # @param {type:"boolean", isTemplate: false}

if delete_experiment:
    from google.cloud import aiplatform

    aiplatform.init(project=PROJECT_ID, location=LOCATION)
    # A dedicated loop variable avoids overwriting the notebook-level
    # `experiment_name` used by the evaluation cells.
    for experiment_to_delete in [
        "eval-gemma-base-prompt-sum",
        "gemma-judge-base-prompt-sum",
    ]:
        experiment = aiplatform.Experiment(experiment_name=experiment_to_delete)
        experiment.delete()

if delete_endpoint:
    # The endpoint created earlier keeps its GPU machine allocated until it is
    # removed. force=True undeploys the model before deleting the endpoint;
    # the uploaded model is then deleted from the registry.
    # Requires `deployed_gemma_model` and `gemma_model` from the deployment
    # cell to still be defined in this kernel session.
    deployed_gemma_model.delete(force=True)
    gemma_model.delete()