In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build and deploy a Hugging Face smolagent using DeepSeek-r1 on Vertex AI

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fuse-cases%2Fvertex_ai_deepseek_smolagents.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>



<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/use-cases/vertex_ai_deepseek_smolagents.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
| Author(s) |  [Ivan Nardini](https://github.com/inardini) |

## Overview

> [DeepSeek-R1 from DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-R1) is a powerful language model developed with a focus on enhancing reasoning capabilities. The release includes DeepSeek-R1-Zero, DeepSeek-R1, and a collection of six distilled, dense models derived from DeepSeek-R1. These distilled models, based on the popular Llama and Qwen architectures, offer a range of sizes and capabilities to suit diverse research needs.

> [HuggingFace's smol-agents](https://huggingface.co/docs/smolagents/en/index) library provides a lightweight and flexible framework for building and experimenting with language agents.

> [Vertex AI](https://cloud.google.com/vertex-ai/docs) provides a comprehensive platform for the entire machine learning lifecycle.  It empowers you to build, train, and deploy ML models and AI applications, including customizing powerful large language models (LLMs).

This notebook showcases how to deploy DeepSeek R1 Distill Qwen 7B from the Hugging Face Hub on Vertex AI using Vertex AI Model Garden. It also shows how to prototype and deploy a simple agent using HuggingFace's smol-agents library on Vertex AI Reasoning Engine.


By the end of this notebook, you will learn how to:

- Register and deploy a DeepSeek-R1 distilled model from the Hugging Face Hub on Vertex AI
- Prototype and evaluate a DeepSeek-R1 agent with the Vertex AI Gen AI Evaluation service
- Prototype and deploy a DeepSeek-R1 agent on Vertex AI Reasoning Engine


## Get started

### Install Vertex AI SDK and other required packages


In [None]:
# Install the Vertex AI SDK (with the reasoning-engine and evaluation extras), the
# OpenAI client, and smolagents. cloudpickle is pinned to an exact version.
%pip install --upgrade --user --quiet "google-cloud-aiplatform[reasoningengine, evaluation]" "openai" "smolagents" \
    "cloudpickle==3.0.0" \
    "pydantic>=2.10" \
    "requests"

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Shut the running kernel down with the restart flag set, so the packages installed
# above are importable once it comes back up.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says "Your session crashed for an unknown reason." This is expected. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# `google.colab` is only present in `sys.modules` when running inside Colab;
# everywhere else this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Authenticate your Hugging Face account

Next, authenticate with Hugging Face using the `huggingface_hub` library and a user access token that you generated in advance. Once you are logged in, the token can be safely retrieved later via `huggingface_hub.get_token`.


In [None]:
from huggingface_hub import interpreter_login

# Log in to Hugging Face so the token can later be read back with
# `huggingface_hub.get_token` when the model is registered.
interpreter_login()

Read more about [Hugging Face Security](https://huggingface.co/docs/hub/en/security), specifically about [Hugging Face User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens).


### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}

if not BUCKET_NAME or BUCKET_NAME == "[your-bucket-name]":
    BUCKET_NAME = f"{PROJECT_ID}-bucket"

BUCKET_URI = f"gs://{BUCKET_NAME}"

! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI

vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

## Import libraries

In [None]:
import random
import string
import threading
import time

from IPython.display import HTML, Markdown, display
import google.auth
from google.auth import default
import google.auth.transport.requests
from google.cloud import aiplatform
from huggingface_hub import get_token
import openai
import pandas as pd
import plotly.graph_objects as go
from smolagents import ChatMessage, CodeAgent, Model
from smolagents.agents import ActionStep
from smolagents.tools import Tool
from vertexai.preview import reasoning_engines
from vertexai.preview.evaluation import EvalTask

## Helpers

In [None]:
def get_id(length: int = 8) -> str:
    """Return a random identifier made of `length` lowercase letters and digits (8 by default)."""
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def _tool_arguments_to_dict(arguments) -> dict:
    """Convert the arguments of one tool call into a `tool_input` mapping."""
    if arguments is None:
        return {}
    if isinstance(arguments, dict):
        # Already a name -> value mapping; copy it so the log entry is not shared.
        return dict(arguments)
    # Text arguments: one entry per non-empty line, numbered in order of appearance.
    stripped_lines = (line.strip() for line in str(arguments).split("\n"))
    return {
        f"arg_{idx}": line for idx, line in enumerate(line for line in stripped_lines if line)
    }


def parse_smolagents_output_to_dictionary(agent, agent_outcome):
    """
    Parse SmolAgent output into a structured dictionary format.

    Args:
        agent: Agent whose `logs` attribute holds the steps of the run to parse.
        agent_outcome: Value returned by the agent run; stored as `str(agent_outcome)`.

    Returns:
        dict: `response` (the outcome as text) and `predicted_trajectory`, a list
        with one `{"tool_name", "tool_input"}` entry per tool call found in the
        `ActionStep` entries of the logs. If reading the logs fails, an `error`
        key describes the failure and the entries collected so far are kept.
    """

    final_output = {"response": str(agent_outcome), "predicted_trajectory": []}

    try:
        for log in agent.logs:
            # Only action steps can carry tool calls.
            if not isinstance(log, ActionStep):
                continue
            # `tool_calls` may be absent or None on a step; both mean "no calls".
            # Iterating None would raise and abort parsing of all later steps.
            for tool_call in getattr(log, "tool_calls", None) or []:
                final_output["predicted_trajectory"].append(
                    {
                        "tool_name": tool_call.name,
                        "tool_input": _tool_arguments_to_dict(tool_call.arguments),
                    }
                )

    except Exception as e:
        final_output["error"] = f"Error parsing tools results: {str(e)}"

    return final_output


def format_output_as_markdown(output: dict) -> str:
    """
    Convert the output dictionary to a detailed execution report.

    Args:
        output: Dictionary containing `response` and, optionally,
            `predicted_trajectory` (a list of `{"tool_name", "tool_input"}` dicts).

    Returns:
        str: Formatted string with the final result followed by one section per
        tool call. The details section is omitted when there is no trajectory.
    """
    separator = "-" * 50
    parts = [
        "📊 Execution Report\n",
        "=" * 50 + "\n\n",
        "🎯 Final Result:\n",
        f"{output['response']}\n\n",
    ]

    # A missing trajectory is treated like an empty one.
    trajectory = output.get("predicted_trajectory")
    if trajectory:
        parts.append("🔍 Execution Details:\n")
        # No newline here: every operation header starts with its own "\n".
        parts.append(separator)
        for idx, call in enumerate(trajectory, 1):
            parts.append(f"\n📌 Operation {idx}:\n")
            parts.append(f"Tool: {call['tool_name']}\n")
            parts.append("Args:\n")
            parts.extend(f"  ▶ {command}\n" for command in call["tool_input"].values())
            parts.append(separator + "\n")

    return "".join(parts)


def display_dataframe_rows(
    df: pd.DataFrame,
    columns: list[str] | None = None,
    num_rows: int = 3,
    display_drilldown: bool = False,
) -> None:
    """Displays a subset of rows from a DataFrame, optionally including a drill-down view."""

    if columns:
        df = df[columns]

    base_style = "font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;"
    header_style = base_style + "font-weight: bold;"

    for _, row in df.head(num_rows).iterrows():
        for column in df.columns:
            display(
                HTML(
                    f"<span style='{header_style}'>{column.replace('_', ' ').title()}: </span>"
                )
            )
            display(HTML(f"<span style='{base_style}'>{row[column]}</span><br>"))

        display(HTML("<hr>"))

        if (
            display_drilldown
            and "predicted_trajectory" in df.columns
            and "reference_trajectory" in df.columns
        ):
            display_drilldown(row)


def display_eval_report(eval_result: pd.DataFrame) -> None:
    """Render an evaluation result: summary metrics first, then the per-row table."""
    # Summary metrics are keyed by metric name; transposing the index-oriented
    # frame puts one metric per column on a single row.
    summary = eval_result.summary_metrics
    summary_frame = pd.DataFrame.from_dict(summary, orient="index").transpose()

    display(Markdown("### Summary Metrics"))
    display(summary_frame)

    display(Markdown("### Row-wise Metrics"))
    row_wise = eval_result.metrics_table
    display(row_wise)


def plot_bar_plot(
    eval_result: pd.DataFrame, title: str, metrics: list[str] = None
) -> None:
    fig = go.Figure()
    data = []

    summary_metrics = eval_result.summary_metrics
    if metrics:
        summary_metrics = {
            k: summary_metrics[k]
            for k, v in summary_metrics.items()
            if any(selected_metric in k for selected_metric in metrics)
        }

    data.append(
        go.Bar(
            x=list(summary_metrics.keys()),
            y=list(summary_metrics.values()),
            name=title,
        )
    )

    fig = go.Figure(data=data)

    # Change the bar mode
    fig.update_layout(barmode="group")
    fig.show()

## Set model

Set the model ID from Hugging Face Hub. In this case, you use DeepSeek-R1-Distill-Qwen-7B, a dense model distilled from DeepSeek-R1 good at math.

In [None]:
# Hugging Face Hub repository ID of the model; reused below for the vLLM `--model`
# argument and to derive the model and endpoint display names.
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"  # @param {type:"string", isTemplate: true}

## Register and Deploy DeepSeek model on Vertex AI


### Register a DeepSeek model on Vertex AI Model Registry

Deploying a DeepSeek model on Vertex AI begins with importing the model into the [Vertex AI Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction), a central hub for managing your ML model lifecycle.  This registry stores model configurations, enabling streamlined organization, tracking, and versioning.  

The `aiplatform.Model.upload` method specifies the display name, the serving container image URI (pointing to the vLLM inference container on Vertex AI Model Garden), and arguments for the vLLM API server. Key arguments include the model name, tensor parallelism size, maximum model length, and enforcement of eager execution.

It also defines the serving container port, predict route, health route, and crucial environment variables, notably the Hugging Face token for downloading the model from the Hugging Face Hub.

See the [vLLM documentation](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#quickstart-online) and [aiplatform.Model.upload](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_upload) Python reference for a complete list of arguments.

In [None]:
# Register the model: the display name is the model ID with "/" replaced by "--",
# lowercased, and the serving container is the vLLM inference image.
deepseek_model = aiplatform.Model.upload(
    display_name=MODEL_ID.replace("/", "--").lower(),
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-6.ubuntu2204.py310",
    # Command line of the vLLM API server started inside the container.
    serving_container_args=[
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={MODEL_ID}",
        # vLLM engine settings
        "--tensor-parallel-size=1",
        "--max-model-len=16384",
        "--enforce-eager",
    ],
    # Must match the --port passed to the server above.
    serving_container_ports=[8080],
    serving_container_predict_route="/generate",
    serving_container_health_route="/ping",
    serving_container_environment_variables={
        # NOTE(review): the Hugging Face token is stored as a plain environment
        # variable on the registered model; anyone who can read the model
        # configuration can read it — confirm this is acceptable.
        "HF_TOKEN": get_token(),
        "DEPLOY_SOURCE": "notebook",
    },
)
deepseek_model.wait()

### Deploy DeepSeek model on Vertex AI Prediction

After the model is registered on Vertex AI, you can deploy the model to an endpoint.

First, create the endpoint with the `aiplatform.Endpoint.create` method. Then deploy the model to this endpoint, specifying the machine type (`g2-standard-12`), accelerator type (`NVIDIA_L4`), and the number of accelerators (`1`).

> This deployment configuration is based on [Vertex AI Model Garden](https://console.cloud.google.com/vertex-ai/model-garden/featured-partners/hugging-face). Be sure you have enough GPU quota for deploying the model.

For more information on the supported `aiplatform.Model.deploy` arguments, you can check its [Python reference](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_deploy).

In [None]:
# The endpoint display name is the model display name plus "-endpoint"; the agent
# assembly step later finds the endpoint again by this exact name.
deepseek_endpoint = aiplatform.Endpoint.create(
    display_name=MODEL_ID.replace("/", "--").lower() + "-endpoint"
)

# Deploy on one NVIDIA L4 GPU. sync=False requests a non-blocking call, so the
# deployment may still be in progress when this cell returns.
deployed_deepseek_model = deepseek_model.deploy(
    endpoint=deepseek_endpoint,
    machine_type="g2-standard-12",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
    sync=False,
)

> Note that the model deployment on Vertex AI can take around 20 minutes to get deployed.


### Generate predictions with Vertex AI API

After deploying the model, you can use the `aiplatform.Endpoint.predict` method to generate online predictions. This sends requests to the deployed endpoint, utilizing the predict route (`/generate`) configured for the container and adhering to Vertex AI's input/output payload formatting requirements.

> Note that the instance request format is aligned with the [vLLM OpenAI Completions API interface](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-completions-api-with-vllm).

In [None]:
# One chat-style instance: a single user message plus sampling settings.
# NOTE(review): "@requestFormat": "chatCompletions" presumably tells the serving
# container to treat the instance as a chat completion request — confirm.
prediction_request = {
    "instances": [
        {
            "@requestFormat": "chatCompletions",
            "messages": [
                {
                    "role": "user",
                    "content": "Count the number of 'r' in the word Strawberry",
                }
            ],
            "max_tokens": 2048,
            "temperature": 0.7,
        }
    ]
}

# `deployed_deepseek_model` is the object returned by `deepseek_model.deploy`.
output = deployed_deepseek_model.predict(instances=prediction_request["instances"])
# The first prediction is iterated as a list of choices, each carrying a message.
for prediction in output.predictions[0]:
    print("------- DeepSeek prediction -------")
    print(prediction["message"]["content"])
    print("---------------------------------\n")

## Build a simple math agent with Hugging Face's smolagents

With your DeepSeek model now deployed on Vertex AI, let's leverage its mathematical capabilities. The `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` excels at mathematical reasoning, making it an ideal tool for an agent designed to verify math results.  

Let's create a simple agent that combines the strengths of Gemini's function calling for orchestration and answer generation with DeepSeek's verification abilities on Vertex AI. This agent will use Hugging Face's smol-agents library.

### Create a VertexAIServerModel class

To integrate Gemini with Vertex AI for agent development, a custom [Model](https://huggingface.co/docs/smolagents/v1.5.0/en/reference/agents#models) class is required. This class will represent the Gemini text generation model, serving as the engine for your agent.

> Note the code is based on the official [Model](https://github.com/huggingface/smolagents/blob/main/src/smolagents/models.py) implementation.

In [None]:
class VertexAIServerModel(Model):
    """This model connects to a Vertex AI-compatible API server.

    Requests are sent with the OpenAI client pointed at a Vertex AI endpoint URL,
    using a Google Cloud access token as the API key. A daemon thread refreshes
    the token periodically and rebuilds the client with the new token.

    Args:
        model_id: Model name sent with every chat completion request.
        project_id: Google Cloud project used in the endpoint URL.
        location: Region used for both the API host and the resource path.
        endpoint_id: Endpoint segment appended to the base URL.
        **kwargs: Forwarded to `Model.__init__` and used as default completion
            arguments for every call.
    """

    # Seconds between background token refreshes. Kept below one hour so a new
    # token is fetched before the previous one is likely to lapse (assumes the
    # usual ~1 hour access-token lifetime — confirm for your credentials).
    TOKEN_REFRESH_INTERVAL_SECONDS = 3000

    def __init__(
        self, model_id: str, project_id: str, location: str, endpoint_id: str, **kwargs
    ):
        #  Try to import dependencies
        try:
            from google.auth import default
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Please install 'openai, google-auth and requests' extra to use VertexAIServerModel as described in the official documentation: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/call-vertex-using-openai-library"
            ) from None

        # Initialize parent class with any additional keyword arguments
        super().__init__(**kwargs)
        self.model_id = model_id
        self.project_id = project_id
        self.location = location
        self.endpoint_id = endpoint_id
        self.kwargs = kwargs
        self._refresh_task = None  # unused; kept so existing attribute access still works
        self.client = None

        # Initialize credentials and set up Google Cloud authentication with required permissions
        self.credentials, _ = default(
            scopes=["https://www.googleapis.com/auth/cloud-platform"]
        )
        # A successful refresh already builds the client; only build it explicitly
        # when the refresh failed, so setup problems still surface here.
        self._refresh_token()
        if self.client is None:
            self._setup_client()
        self._start_refresh_loop()

    def __call__(
        self,
        messages: list[dict[str, str]],
        **kwargs,
    ) -> ChatMessage:
        """Send `messages` to the endpoint and return the first choice.

        Args:
            messages: Chat messages to send.
            **kwargs: Per-call completion arguments (for example the stop
                sequences an agent passes); they override the constructor
                defaults of the same name.

        Returns:
            ChatMessage: Built from the role, content and tool calls of the
            first choice in the response.
        """
        # Prepare the API call parameters. Call-time arguments must be forwarded:
        # dropping them would silently discard what the caller asked for.
        completion_kwargs = self._prepare_completion_kwargs(
            messages=messages,
            model=self.model_id,
            **{**self.kwargs, **kwargs},
        )

        # Make the API call to Vertex AI
        response = self.client.chat.completions.create(**completion_kwargs)
        self.last_input_token_count = response.usage.prompt_tokens
        self.last_output_token_count = response.usage.completion_tokens

        # Convert API response to ChatMessage format
        message = ChatMessage.from_dict(
            response.choices[0].message.model_dump(
                include={"role", "content", "tool_calls"}
            )
        )
        return message

    def _refresh_token(self):
        """Refresh the Google Cloud token and rebuild the client; failures are only printed."""
        try:
            self.credentials.refresh(google.auth.transport.requests.Request())
            # The client stores the token as its API key, so it has to be recreated.
            self._setup_client()
        except Exception as e:
            # Best effort: the background loop must survive a failed refresh.
            print(f"Token refresh failed: {e}")

    def _setup_client(self):
        """Setup OpenAI client with current credentials"""
        self.client = openai.OpenAI(
            base_url=f"https://{self.location}-aiplatform.googleapis.com/v1beta1/projects/{self.project_id}/locations/{self.location}/endpoints/{self.endpoint_id}",
            api_key=self.credentials.token,
        )

    def _start_refresh_loop(self):
        """Start the token refresh loop in a daemon thread."""

        def refresh_loop():
            while True:
                time.sleep(self.TOKEN_REFRESH_INTERVAL_SECONDS)
                self._refresh_token()

        # Daemon thread: it never blocks interpreter shutdown.
        self._refresh_thread = threading.Thread(target=refresh_loop, daemon=True)
        self._refresh_thread.start()

### Create a math tool using a DeepSeek model

In the context of language agents, a tool is a self-contained function the agent can utilize.  For a language model to effectively use a tool, the tool must have a well-defined API, including a name, a concise description, specifications for input types and their descriptions, and a defined output type.  

To integrate our deployed DeepSeek model on Vertex AI as a tool within the smol-agents framework, a custom [Tool](https://huggingface.co/docs/smolagents/en/guided_tour#tools) class is required. This class will represent the DeepSeek model, serving as the means by which your agent takes action. In this case, the tool verifies math results.


In [None]:
class DeepSeekMathVerifierTool(Tool):
    """A tool that verifies math responses

    The text to verify is sent as a single user message to a model deployed on a
    Vertex AI endpoint, and the content of the first returned message is the
    tool result.

    Args:
        project_id: Google Cloud project that owns the endpoint.
        location: Region of the endpoint.
        endpoint_id: ID of the Vertex AI endpoint serving the model.
        **kwargs: Extra arguments forwarded to `vertexai.init`.
    """

    name = "math_verifier"
    description = """This is a tool that verifies math responses"""
    inputs = {
        "content": {
            "type": "string",
            "description": "a text containing math",
        }
    }
    output_type = "string"

    # Seconds between background token refreshes. Kept below one hour so a new
    # token is fetched before the previous one is likely to lapse (assumes the
    # usual ~1 hour access-token lifetime — confirm for your credentials).
    TOKEN_REFRESH_INTERVAL_SECONDS = 3000

    def __init__(self, project_id: str, location: str, endpoint_id: str, **kwargs):
        # Import every dependency locally so the class does not rely on names
        # that happen to be imported elsewhere in the notebook.
        try:
            from google.auth import default
            from google.cloud import aiplatform
            import vertexai
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Please install 'vertexai' and 'google-cloud-aiplatform' extra to use DeepSeekMathVerifierTool"
            ) from None

        super().__init__()
        self.endpoint_id = endpoint_id
        self.project_id = project_id
        self.location = location
        self.kwargs = kwargs
        self._refresh_task = None  # unused; kept so existing attribute access still works

        # Initialize credentials and set up Google Cloud authentication with required permissions
        self.credentials, _ = default(
            scopes=["https://www.googleapis.com/auth/cloud-platform"]
        )
        self._refresh_token()
        self._start_refresh_loop()

        # Initialize Vertex ai session and the endpoint
        vertexai.init(
            project=self.project_id,
            location=self.location,
            credentials=self.credentials,
            **self.kwargs,
        )
        self.endpoint = aiplatform.Endpoint(
            endpoint_name=f"projects/{self.project_id}/locations/{self.location}/endpoints/{self.endpoint_id}"
        )

    def forward(self, content: str):
        """Submit the prediction request

        Args:
            content: Text to verify; coerced with `str()` because callers do not
                always pass a plain string.

        Returns:
            str: Content of the first message of the first prediction, or a
            "Prediction failed: ..." message when the request fails or the
            response does not have the expected shape.
        """
        content = str(content)
        instances = [
            {
                "@requestFormat": "chatCompletions",
                "messages": [{"role": "user", "content": content}],
            }
        ]

        try:
            output = self.endpoint.predict(instances=instances)
            return output.predictions[0][0]["message"]["content"]
        except Exception as e:
            # Return the failure as text: the declared output type is "string",
            # and this way the caller sees why no verification is available.
            failure = f"Prediction failed: {e}"
            print(failure)
            return failure

    def _refresh_token(self):
        """Refresh the Google Cloud token; failures are only printed."""
        try:
            self.credentials.refresh(google.auth.transport.requests.Request())
        except Exception as e:
            # Best effort: the background loop must survive a failed refresh.
            print(f"Token refresh failed: {e}")

    def _start_refresh_loop(self):
        """Start the token refresh loop in a daemon thread."""

        def refresh_loop():
            while True:
                time.sleep(self.TOKEN_REFRESH_INTERVAL_SECONDS)
                self._refresh_token()

        # Daemon thread: it never blocks interpreter shutdown.
        self._refresh_thread = threading.Thread(target=refresh_loop, daemon=True)
        self._refresh_thread.start()

### Assemble the agent

Having defined both the model and the tool, we can now assemble a basic agent.  

`smolagents` provides a default implementation called `CodeAgent`, which is designed to write and execute Python code at each step of its process.  

For more detailed information on agent construction and capabilities, refer to the `smolagents` [Agent](https://huggingface.co/docs/smolagents/en/guided_tour#codeagent-and-toolcallingagent) documentation.


In [None]:
# Look the endpoint up by the display name given at creation time and keep the
# `name` of the first match; `endpoint_id` is None when no endpoint matches.
endpoint_id = next(
    (
        endpoint.name
        for endpoint in aiplatform.Endpoint.list()
        if endpoint.display_name == MODEL_ID.replace("/", "--").lower() + "-endpoint"
    ),
    None,
)

In [None]:
# Orchestrating model. `endpoint_id` becomes the last segment of the base URL
# that VertexAIServerModel builds, so "openapi" targets .../endpoints/openapi.
model = VertexAIServerModel(
    model_id="google/gemini-2.0-flash",
    endpoint_id="openapi",
    project_id=PROJECT_ID,
    location=LOCATION,
)

# The only tool: the math verifier backed by the endpoint found above.
tools = [
    DeepSeekMathVerifierTool(
        endpoint_id=endpoint_id, project_id=PROJECT_ID, location=LOCATION
    )
]

# add_base_tools=False: the agent gets no tools beyond the list passed here.
agent = CodeAgent(model=model, tools=tools, add_base_tools=False)

### Test the agent

After you assemble the agent, you are now able to test it.

In [None]:
# Smoke test: a greeting that should not need the math verifier.
response = agent.run("Hello! How are you?")

In [None]:
# Parse the agent's logs for the run above and print the execution report.
print(format_output_as_markdown(parse_smolagents_output_to_dictionary(agent, response)))

In [None]:
# A counting task that explicitly asks the agent to verify its answer.
response = agent.run(
    "Count the number of 'r' in the word Strawberry. Verify the answer"
)

In [None]:
# Parse the agent's logs for the run above and print the execution report.
print(format_output_as_markdown(parse_smolagents_output_to_dictionary(agent, response)))

## Evaluate the smolagent with Vertex AI Gen AI Evaluation

Building effective AI agents requires careful performance evaluation.  This involves two key practices: monitoring and observability.  Monitoring focuses on task-specific performance: how well an agent executes individual actions. Observability provides a broader view, assessing the agent's overall health and efficiency.  

The [Vertex AI Gen AI Evaluation service](https://cloud.google.com/blog/products/ai-machine-learning/introducing-agent-evaluation-in-vertex-ai-gen-ai-evaluation-service?e=48754805) streamlines both monitoring and observability, offering pre-built criteria and metrics applicable from prototyping to production.  This allows you to gain deep insights into agent performance, pinpoint areas for improvement, and optimize your AI solutions.  Explore the documentation for details on available evaluation tools.

### Prepare Agent Evaluation dataset

To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent as shown below.

In [None]:
# Evaluation dataset: one prompt per example and, at the same position, the
# reference trajectory for it. Each trajectory entry uses the same
# {"tool_name", "tool_input": {"arg_<n>": line}} layout that
# parse_smolagents_output_to_dictionary produces for the agent's tool calls.
eval_data = {
    "prompt": [
        "Count the number of 'r' in the word Strawberry. Verify the answer",
        "How many times does the digit '2' appear in the number 2,222,222? Verify your answer.",
        "Count the number of words with more than five letters in this sentence: `The quick brown fox jumps over the lazy dog.` Verify your answer.",
    ],
    "reference_trajectory": [
        [
            {
                "tool_name": "python_interpreter",
                "tool_input": {
                    "arg_0": "count = 0",
                    "arg_1": 'for letter in "Strawberry":',
                    "arg_2": "if letter == 'r':",
                    "arg_3": "count += 1",
                    "arg_4": "print(f\"There are {count} \\'r\\'s in the word Strawberry.\")",
                    "arg_5": "verification = math_verifier(content={\\'type\\': \\'string\\', \\'description\\': f\"There are {count} \\'r\\'s in the word Strawberry.\"})",
                    # NOTE(review): arg_6 and arg_7 are identical — confirm the
                    # duplicated final_answer line is intended.
                    "arg_6": "final_answer(verification)",
                    "arg_7": "final_answer(verification)",
                },
            }
        ],
        [
            {
                "tool_name": "python_interpreter",
                "tool_input": {
                    "arg_0": "count = 0",
                    "arg_1": "num_str = str(2222222)",
                    "arg_2": "for digit in num_str:",
                    "arg_3": "if digit == '2':",
                    "arg_4": "count += 1",
                    "arg_5": 'print(f"The digit 2 appears {count} times in the number 2,222,222.")',
                    "arg_6": "verification = math_verifier(content={'type': 'string', 'description': f\"The digit 2 appears {count} times in the number 2,222,222.\"})",
                    "arg_7": "final_answer(verification)",
                },
            },
        ],
        [
            {
                "tool_name": "python_interpreter",
                "tool_input": {
                    "arg_0": "count = 0",
                    "arg_1": 'sentence = "The quick brown fox jumps over the lazy dog."',
                    "arg_2": "words = sentence.split()",
                    "arg_3": "for word in words:",
                    "arg_4": "if len(word) > 5:",
                    "arg_5": "count += 1",
                    "arg_6": 'print(f"There are {count} words with more than five letters.")',
                    "arg_7": "verification = math_verifier(content={'type': 'string', 'description': f\"There are {count} words with more than five letters in the sentence.\"})",
                    "arg_8": "final_answer(verification)",
                },
            }
        ],
    ],
}

# Tabular view of the same data, used for display.
eval_sample_dataset = pd.DataFrame(eval_data)

Print some samples from the dataset.

In [None]:
# Show the first three examples (prompt and reference trajectory).
display_dataframe_rows(eval_sample_dataset, num_rows=3)

### Prepare an Agent function

In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation.

In [None]:
def agent_parsed_response(input: str) -> dict:
    """Run the agent on one prompt and return its output in parsed form.

    Args:
        input: The prompt forwarded to ``agent.run``.

    Returns:
        The value produced by ``parse_smolagents_output_to_dictionary`` for
        the agent and its run result (annotated as a dict).
    """
    # The parser is given the agent object as well as the raw run result.
    raw_answer = agent.run(input)
    return parse_smolagents_output_to_dictionary(agent, raw_answer)

### Run an evaluation task

Once you've assembled your evaluation dataset, the next step is to select the appropriate metrics for assessing your agent's performance.  A comprehensive overview of available metrics and their interpretations can be found in the [Evaluate Gen AI agents documentation](https://cloud.google.com/blog/products/ai-machine-learning/introducing-agent-evaluation-in-vertex-ai-gen-ai-evaluation-service?e=48754805).

With your dataset and chosen metrics in hand, you're ready to launch your first agent evaluation job on Vertex AI. To do so, create an `EvalTask` with your dataset and metrics, then call its `evaluate` method. Vertex AI Gen AI evaluation integrates with [Vertex AI Experiments](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments), the platform's managed experiment tracking service, and automatically logs your evaluation run as an experiment.


In [None]:
# Experiment and run names each get a get_id() suffix (get_id is defined
# elsewhere in the notebook).
EXPERIMENT_NAME = f"evaluate-smolagent-deepseek-{get_id()}"
EXPERIMENT_RUN_NAME = f"response-and-tools-{get_id()}"

# Metric names requested for this evaluation run; reused later for plotting.
response_tool_metrics = [
    "trajectory_exact_match",
    "trajectory_in_order_match",
    "coherence",
]

# NOTE(review): the raw `eval_data` dict is passed here rather than the
# `eval_sample_dataset` DataFrame built from it — confirm this is intended.
response_eval_tool_task = EvalTask(
    dataset=eval_data,
    metrics=response_tool_metrics,
    experiment=EXPERIMENT_NAME,
)

# `agent_parsed_response` is supplied as the runnable for the evaluation.
response_eval_tool_result = response_eval_tool_task.evaluate(
    experiment_run_name=EXPERIMENT_RUN_NAME,
    runnable=agent_parsed_response,
)

display_eval_report(response_eval_tool_result)

### Visualize evaluation results

Visualize evaluation result sample.

In [None]:
# Show 3 rows of the evaluation result's metrics table.
display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)

In [None]:
# Plot the "<metric>/mean" entry for every metric requested in the evaluation.
plot_bar_plot(
    response_eval_tool_result,
    title="Agent eval metrics",
    metrics=[f"{metric}/mean" for metric in response_tool_metrics],
)

## Deploy the agent on Vertex AI Reasoning Engine

Your agent prototype is running smoothly in Colab, but it's time to scale it for wider accessibility.

[Reasoning Engine on Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/reasoning-engine) provides a managed platform for creating and deploying advanced agent reasoning frameworks.  

This notebook's approach utilizes a custom application template within Reasoning Engine, which can be further extended with frameworks like smolagents.

Let's explore how to deploy our smolagents agent using Reasoning Engine on Vertex AI.


### Assemble the agent

Define a SmolAgent class designed to interact with a DeepSeek model deployed on Vertex AI.

The two main components for building a custom agent are the `set_up` and `query` methods:

- The `set_up` method instantiates the agent's core components: a VertexAIServerModel to connect to the deployed DeepSeek model, a DeepSeekMathVerifierTool for mathematical verification tasks, and a CodeAgent to orchestrate the model and tools.

- The `query` method provides a simple interface for sending input to the agent and receiving its response, effectively triggering the agent's execution.

To learn more about custom agents, see how to [customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).


In [None]:
class SmolAgent:
    """Custom agent that pairs a Vertex AI-served model with a math-verifier tool.

    Construction only records configuration. The model, tool and ``CodeAgent``
    objects are created later by ``set_up``; ``query`` forwards a prompt to the
    resulting agent.
    """

    def __init__(
        self,
        model_id: str,
        endpoint_id: str,
        tool_endpoint_id: str,
        project_id: str,
        location: str,
        **kwargs,
    ):
        """Store the settings that ``set_up`` will use.

        Args:
            model_id: Passed as ``model_id`` to ``VertexAIServerModel``.
            endpoint_id: Passed as ``endpoint_id`` to ``VertexAIServerModel``.
            tool_endpoint_id: Passed as ``endpoint_id`` to
                ``DeepSeekMathVerifierTool``.
            project_id: Project identifier given to both the model and the tool.
            location: Location given to both the model and the tool.
            **kwargs: Extra keyword arguments, forwarded unchanged to the
                model, the tool and the ``CodeAgent`` constructors.
        """
        # Project-level settings shared by the model and the tool.
        self.project_id = project_id
        self.location = location

        # Identifiers for the model endpoint and the tool endpoint.
        self.model_id = model_id
        self.endpoint_id = endpoint_id
        self.tool_endpoint_id = tool_endpoint_id

        # Agent options: base tools are always disabled for this agent.
        self.kwargs = kwargs
        self.add_base_tools = False

    def set_up(self) -> None:
        """Create the model, the verifier tool and the code agent, in that order."""
        extra_options = self.kwargs

        self.model = VertexAIServerModel(
            project_id=self.project_id,
            location=self.location,
            endpoint_id=self.endpoint_id,
            model_id=self.model_id,
            **extra_options,
        )

        # The verifier tool uses its own endpoint (``tool_endpoint_id``).
        math_verifier = DeepSeekMathVerifierTool(
            endpoint_id=self.tool_endpoint_id,
            project_id=self.project_id,
            location=self.location,
            **extra_options,
        )
        self.tools = [math_verifier]

        self.app = CodeAgent(
            tools=self.tools,
            model=self.model,
            add_base_tools=self.add_base_tools,
            **extra_options,
        )

    def query(self, input: str):
        """Run the agent on ``input`` and return the run result.

        ``self.app`` is only created by ``set_up``, so that method must have
        been called before querying.
        """
        agent_app = self.app
        return agent_app.run(input)

### Test the agent

Once the agent is assembled, test it locally to confirm that it behaves as expected.

In [None]:
# Build a local instance. The "google/gemini-2.0-flash" model id and the
# "openapi" endpoint id configure the agent's model; `endpoint_id` (defined
# earlier in the notebook) is the endpoint used by the verifier tool.
local_custom_agent = SmolAgent(
    model_id="google/gemini-2.0-flash",
    endpoint_id="openapi",
    tool_endpoint_id=endpoint_id,
    project_id=PROJECT_ID,
    location=LOCATION,
)
# set_up() must run before query(): it creates the model, tool and agent objects.
local_custom_agent.set_up()

In [None]:
# Quick local check with a plain greeting.
output = local_custom_agent.query(input="Hello! How are you?")
print(output)

In [None]:
# Local check with a counting question that also asks for verification.
output = local_custom_agent.query(
    input="Count the number of 'r' in the word Strawberry. Verify the answer"
)
print(output)

### Deploy the SmolAgent

Your `smol-agent` application is running smoothly locally—excellent!

Let's now deploy it to Reasoning Engine on Vertex AI. This deployment will make your application accessible remotely, opening up possibilities for integration with broader systems and use as a standalone service.

In [None]:
# Re-create the agent object without calling set_up(); this fresh instance is
# the one handed to Reasoning Engine below.
local_custom_agent = SmolAgent(
    model_id="google/gemini-2.0-flash",
    endpoint_id="openapi",
    tool_endpoint_id=endpoint_id,
    project_id=PROJECT_ID,
    location=LOCATION,
)

# Create the remote agent; `requirements` names the packages declared for it.
# Only cloudpickle is pinned to an exact version here.
remote_custom_agent = reasoning_engines.ReasoningEngine.create(
    local_custom_agent,
    requirements=[
        "google-cloud-aiplatform[reasoningengine]",
        "openai",
        "smolagents",
        "cloudpickle==3.0.0",
        "pydantic>=2.10",
        "requests",
    ],
)

### Call the agent

Now that the agent is deployed, let's call the agent to answer our math questions.

In [None]:
# Send the same counting question used in the local test to the deployed agent.
output = remote_custom_agent.query(
    input="Count the number of 'r' in the word Strawberry. Verify the answer"
)
print("Agent response:", output)

## Cleaning up

In [None]:
# Cleanup switches: every flag defaults to False, so nothing is deleted unless
# you set the corresponding flag to True before running this cell.
delete_bucket = False
delete_endpoint = False
delete_model = False
delete_remote_agent = False

# Each resource below is created elsewhere in the notebook.
if delete_bucket:
    # Shell command: recursively removes the contents at BUCKET_URI.
    ! gsutil rm -r $BUCKET_URI
if delete_endpoint:
    deepseek_endpoint.delete(force=True)
if delete_model:
    deepseek_model.delete()
if delete_remote_agent:
    remote_custom_agent.delete()