In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluating Vertex RAG Engine Generation with Vertex AI Python SDK for Gen AI Evaluation Service

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Frag-engine%2Frag_engine_eval_service_sdk.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/rag-engine/rag_engine_eval_service_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
| Author(s) | [Noa Ben-Efraim](https://github.com/noabenefraim/) |

## Overview

This notebook demonstrates how to evaluate the performance of a Retrieval Augmented Generation (RAG) engine built with Vertex AI using the Vertex AI Python SDK for Gen AI Evaluation Service. Using a small collection of classic children's books (including "Alice's Adventures in Wonderland") as our knowledge base, we'll walk through the process of creating an evaluation dataset and applying custom metrics to assess the quality of generated responses.

Specifically, this notebook will guide you through:

* **Setting up a RAG Corpus:** Creating and populating a RAG corpus with a PDF document.
* **Generating Grounded Responses:** Using the Vertex AI Gemini model to produce responses based on retrieved contexts.
* **Creating an Evaluation Dataset:** Constructing a dataset with prompts, retrieved contexts, and generated responses.
* **Defining Custom Evaluation Metrics:** Implementing a custom metric to assess the accuracy, completeness, and groundedness of the generated responses.
* **Running Evaluation Tasks:** Utilizing the Vertex AI Gen AI Evaluation Service to evaluate the RAG engine's performance.
* **Analyzing Evaluation Results:** Visualizing and interpreting the evaluation results using the provided SDK tools.

## Get started

### Install Google Gen AI SDK and other required packages


In [None]:
%pip install --upgrade --quiet google-genai google-cloud-aiplatform[evaluation] vertexai

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Shut the running kernel down and ask for a restart so that the packages
# installed above become importable.
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says "Your session crashed for an unknown reason." This is expected. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The `google.colab` module is only loaded when running inside Colab;
# in any other environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

from google import genai
import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Default to "" rather than wrapping the lookup in str(): str(None) would
    # yield the literal project ID "None" and only fail later, on the first
    # API call, with a confusing error.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project configured. Set PROJECT_ID in this cell or "
        "define the GOOGLE_CLOUD_PROJECT environment variable."
    )
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Gen AI SDK client (used for generation) and Vertex AI SDK initialization
# (used by the `rag` and `evaluation` modules) share the same project/location.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)
vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [None]:
from google.genai.types import GenerateContentConfig, Retrieval, Tool, VertexRagStore
import pandas as pd
from tqdm import tqdm
from vertexai import rag
from vertexai.evaluation import (
    EvalTask,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
    notebook_utils,
)

### Load model

In [None]:
# Model identifier passed to `client.models.generate_content` when producing
# the grounded responses.
MODEL_ID = "gemini-2.0-flash-001"  # @param {type:"string"}

### Create `RAGCorpus`

In [None]:
# Currently supports Google first-party embedding models
EMBEDDING_MODEL = "publishers/google/models/text-embedding-005"  # @param {type:"string", isTemplate: true}

# Build the backend configuration from the inside out: publisher endpoint ->
# embedding model config -> vector DB config.
embedding_endpoint = rag.VertexPredictionEndpoint(publisher_model=EMBEDDING_MODEL)
embedding_model_config = rag.RagEmbeddingModelConfig(
    vertex_prediction_endpoint=embedding_endpoint
)
vector_db_config = rag.RagVectorDbConfig(
    rag_embedding_model_config=embedding_model_config
)

rag_corpus = rag.create_corpus(
    display_name="rag-eval-corpus",
    description="A test corpus for generation evaluation",
    backend_config=vector_db_config,
)

# Fetch the newly created corpus back and show it as the cell output
rag.get_corpus(rag_corpus.name)

### Import files from Google Cloud Storage into `RagCorpus` (configure chunk size, chunk overlap etc as desired)

For this step you will need to create a GCS bucket, and then copy over the data from the public GCS bucket. Remember to grant "Viewer" access to the "Vertex RAG Data Service Agent" (with the format of service-{project_number}@gcp-sa-vertex-rag.iam.gserviceaccount.com) for your Google Cloud Storage bucket.

For this example, we'll use a dataset that comprises the full texts of five classic children's literature books: "The Wizard of Oz," "Gulliver's Travels," "Peter Pan," "Alice's Adventures in Wonderland," and "Through the Looking-Glass." This collection provides a rich corpus for exploring themes, characters, and settings across these iconic stories.


##### Copy data from public GCS bucket

In [None]:
CURRENT_BUCKET_PATH = "gs://"  # @param {type:"string"},

PUBLIC_DATA_PATH = (
    "gs://github-repo/generative-ai/gemini/rag-engine/rag_engine_eval_service/"
)

!gsutil -m rsync -r -d $PUBLIC_DATA_PATH $CURRENT_BUCKET_PATH

##### Import dataset into `RagCorpus`

In [None]:
# Chunking parameters applied to the files while they are imported.
chunking_config = rag.ChunkingConfig(chunk_size=512, chunk_overlap=100)
transformation_config = rag.TransformationConfig(chunking_config=chunking_config)

# Import everything under the bucket path into the corpus; the returned
# response is shown as the cell output.
rag.import_files(
    paths=[CURRENT_BUCKET_PATH],
    corpus_name=rag_corpus.name,
    transformation_config=transformation_config,  # Optional
)

In [None]:
# Show which files the corpus now contains
rag.list_files(corpus_name=rag_corpus.name)

### Create RAG Retrieval Tool

In [None]:
# Describe the RAG store the model should retrieve from ...
rag_store = VertexRagStore(
    rag_corpora=[rag_corpus.name],
    similarity_top_k=10,
    vector_distance_threshold=0.5,
)

# ... and wrap it in a retrieval tool that can be passed to `generate_content`.
rag_retrieval_tool = Tool(retrieval=Retrieval(vertex_rag_store=rag_store))

In [None]:
def get_generated_response(prompt: str) -> str:
    """
    Ask the model for an answer with the RAG retrieval tool attached.

    Args:
        prompt: The question to send to the model.

    Returns:
        The text of the model's response.
    """
    grounded_config = GenerateContentConfig(tools=[rag_retrieval_tool])
    return client.models.generate_content(
        model=MODEL_ID,
        contents=prompt,
        config=grounded_config,
    ).text

In [None]:
def get_retrieved_contexts(
    prompt: str,
    top_k: int = 10,
    vector_distance_threshold: float = 0.5,
) -> str:
    """
    Retrieves relevant contexts based on a given prompt using a RAG system.

    The defaults mirror the settings of the retrieval tool used for generation
    (`similarity_top_k=10`, `vector_distance_threshold=0.5`) so that the
    context handed to the evaluator is retrieved with the same settings as the
    context available to the model when it produced the response.

    Args:
        prompt: The input prompt for context retrieval.
        top_k: Maximum number of contexts to retrieve.
        vector_distance_threshold: Distance threshold passed to the retrieval
            filter.

    Returns:
        A single string with the retrieved context texts joined by spaces;
        newlines inside the texts are replaced with spaces.
    """
    retrieval_config = rag.RagRetrievalConfig(
        top_k=top_k,
        filter=rag.utils.resources.Filter(
            vector_distance_threshold=vector_distance_threshold
        ),
    )

    response = rag.retrieval_query(
        rag_resources=[
            rag.RagResource(
                rag_corpus=rag_corpus.name,
                # Optional: supply IDs from `rag.list_files()`.
                # rag_file_ids=["rag-file-1", "rag-file-2", ...],
            )
        ],
        text=prompt,
        rag_retrieval_config=retrieval_config,
    )

    joined = " ".join(ctx.text for ctx in response.contexts.contexts)
    # Replace newlines with a space (not the empty string) so that words on
    # either side of a line break are not fused together.
    return joined.replace("\n", " ")

### Create Evaluation Dataset

Now we are prepared to create the evaluation dataset. The dataset will include:

+ Prompt: What the user is asking the RAG engine. The prompts will be a mix of inter-document and intra-document analysis.
+ Retrieved Context: The top-k contexts retrieved from Vertex RAG Engine.
+ Generated Response: The LLM generated responses grounded in the retrieved context.

In [None]:
# Questions sent to the RAG engine; each entry becomes one row of the
# evaluation dataset.
prompts = [
    # Questions about Alice, the tea party and the croquet game.
    "Compare and contrast the behaviors of the Mad Hatter and the March Hare during the tea party.",
    "What happened during Alice's croquet game with the Queen of Hearts?",
    "How did the Mad Hatter and March Hare act at the tea party?",
    "What was special about the cakes Alice ate?",
    # Questions about Gulliver and Captain Hook.
    "What happened when Gulliver first arrived in Lilliput?",
    "What was Captain Hook's main goal in Neverland?",
]

In [None]:
# For every prompt, fetch the retrieved context first and the generated
# response second; both lists stay aligned with `prompts` by position.
retrieved_context, generated_response = [], []
for prompt in tqdm(prompts):
    retrieved_context += [get_retrieved_contexts(prompt)]
    generated_response += [get_generated_response(prompt)]

In [None]:
# One row per prompt, with its retrieved context and generated response.
eval_columns = dict(
    prompt=prompts,
    retrieved_context=retrieved_context,
    response=generated_response,
)
eval_dataset = pd.DataFrame(eval_columns)

eval_dataset

## Use Gen AI Evaluation Service SDK

Before diving into the evaluation process, we've set up the necessary components: a RAG corpus containing our documents, a retrieval tool, and functions to generate grounded responses and retrieve relevant contexts. We've also compiled an evaluation dataset with a set of questions, the corresponding retrieved contexts, and the model's responses.

This dataset will serve as the foundation for our evaluation. We'll now leverage the Vertex AI Gen AI Evaluation Service SDK to define and apply custom metrics, allowing us to quantitatively assess the RAG engine's performance. The Gen AI Evaluation Service provides a robust framework for creating and running evaluation tasks, enabling us to gain valuable insights into the quality of our generated responses.

In [None]:
# What the judge should look at in each response.
qa_criteria = {
    "accuracy": (
        "The response provides completely accurate information, consistent with the retrieved context, with no errors or omissions."
    ),
    "completeness": (
        "The response answers all parts of the question fully, utilizing the information available in the retrieved context."
    ),
    "groundedness": (
        "The response uses only the information provided in the retrieved context and does not introduce any external information or hallucinations."
    ),
}

# Meaning of each score, from 5 (best) down to 1 (worst).
qa_rating_rubric = {
    "5": "(Very good). The answer is completely accurate, complete, concise, grounded in the retrieved context, and follows all instructions.",
    "4": "(Good). The answer is mostly accurate, complete, and grounded in the retrieved context, with minor issues in conciseness or instruction following.",
    "3": "(Ok). The answer is partially accurate and complete but may have some inaccuracies, omissions, or significant issues with conciseness, groundedness, or instruction following, based on the retrieved context.",
    "2": "(Bad). The answer contains significant inaccuracies, is largely incomplete, or fails to follow key instructions, considering the information available in the retrieved context.",
    "1": "(Very bad). The answer is completely inaccurate, irrelevant, or fails to address the question in any meaningful way, based on the retrieved context.",
}

# The template declares the `prompt` and `retrieved_context` dataset columns
# as its input variables.
qa_prompt_template = PointwiseMetricPromptTemplate(
    criteria=qa_criteria,
    rating_rubric=qa_rating_rubric,
    input_variables=["prompt", "retrieved_context"],
)

custom_question_answering_correctness = PointwiseMetric(
    metric="custom_question_answering_correctness",
    metric_prompt_template=qa_prompt_template,
)

# Display the serialized metric prompt template
print(custom_question_answering_correctness.metric_prompt_template)

### Run Eval Task

The Gen AI Evaluation SDK has many useful utilities to graph, summarize, and explain the evaluation results. 

In [None]:
# Evaluate the dataset with the custom_question_answering_correctness metric,
# associating the run with the "test" experiment.
eval_task = EvalTask(
    experiment="test",
    metrics=[custom_question_answering_correctness],
    dataset=eval_dataset,
)
eval_result = eval_task.evaluate()

In [None]:
# Render the evaluation result in the notebook.
notebook_utils.display_eval_result(eval_result=eval_result)

In [None]:
# Example for graphing: each entry pairs a plot title with an evaluation result.
radar_series = [("Question answering correctness", eval_result)]

notebook_utils.display_radar_plot(
    metrics=["custom_question_answering_correctness"],
    eval_results_with_title=radar_series,
)

In [None]:
# Display the judge's explanations; `num=1` limits the output to one row.
notebook_utils.display_explanations(eval_result=eval_result, num=1)

## Cleaning up

Delete the `ExperimentRun` created by the evaluation, then delete the RAG corpus.

In [None]:
# Imported in this cell because `aiplatform` is needed only for cleanup;
# without it the call below raises NameError.
from google.cloud import aiplatform

# Look up the experiment run recorded in the evaluation result's metadata
# and delete it.
aiplatform.ExperimentRun(
    run_name=eval_result.metadata["experiment_run"],
    experiment=eval_result.metadata["experiment"],
).delete()

In [None]:
# Delete the RAG corpus created earlier in this notebook.
rag.delete_corpus(name=rag_corpus.name)