In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama Guard

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/instances">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_llama_guard_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_llama_guard_deployment.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates downloading and deploying [Llama Guard models](https://huggingface.co/meta-llama) with [vLLM](https://github.com/vllm-project/vllm) on GPU, and demonstrates using the Llama Guard model to safeguard LLM inputs and outputs with the Vertex Llama API service.

### Objective

- Download and deploy Llama Guard models with [vLLM](https://github.com/vllm-project/vllm) on GPU
- Use the Llama Guard models to safeguard LLM inputs and outputs with the Vertex Llama 3.1 API service
- Use the Llama Guard models to safeguard LLM vision inputs and outputs with the Vertex Llama 3.2 API service
- Use the Llama Guard models to safeguard LLM vision inputs and outputs with the Vertex Llama 4 API service

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Request for quota

# @markdown By default, the quota for A100_80GB and H100 deployment `Custom model serving per region` is 0. You need to request quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown For a better chance of obtaining resources, we recommend requesting A100_80GB quota in the regions `us-central1, us-east1`, and H100 quota in the regions `us-central1, us-west1`.

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Import the necessary packages

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.84.0'
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

import importlib
import os
import re
from typing import Tuple

from google.cloud import aiplatform

if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

LABEL = "vllm_gpu"
models, endpoints = {}, {}


# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

! gcloud config set project $PROJECT_ID
import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

# @markdown # Access Llama Guard models on Vertex AI
# @markdown The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
# @markdown Accept the model agreement to access the models:
# @markdown 1. Open the [Llama Guard model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-guard) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
# @markdown 2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
# @markdown 3. After accepting the agreement, a `gs://` URI containing Llama Guard pretrained and finetuned models will be shared.
# @markdown 4. Paste the URI in the `VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD` field below.
# @markdown 5. The Llama Guard models will be copied into `BUCKET_URI`.


VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD = ""  # @param {type:"string", isTemplate:true}

# Stop early, with instructions, when no URI has been pasted into the form.
assert (
    VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD
), "Click the agreement in Vertex AI Model Garden at https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-guard, and get the GCS path of Llama Guard model artifacts."

# The pasted value may contain extra text around the URI: keep only the token
# that starts at `gs://` and runs to the first space or the end of the string.
parsed_gcs_url = re.compile("gs://.*?(?=[ ]|$)").search(
    VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD
)
if parsed_gcs_url is not None:
    VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD = parsed_gcs_url.group()

# Without a match the value is left untouched, so verify the scheme explicitly.
assert VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD.startswith(
    "gs://"
), "VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD is expected to be a GCS URI and must start with `gs://`."

## Deploy Llama Guard

In [None]:
# @title Select the model variants

# @markdown Select one of the four model variations.

base_model_name = "Llama-Guard-4-12B"  # @param ["Llama-Guard-4-12B", "Llama-Guard-3-8B", "Llama-Guard-3-1B", "Llama-Guard-3-11B-Vision"] {allow-input: true, isTemplate: true}

# Identifiers derived from the selected variant: the artifact path under the
# shared bucket, the Hugging Face style id, and the Model Garden publisher name.
model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_GUARD, base_model_name)
hf_model_id = f"meta-llama/{base_model_name}"
version_id = base_model_name.lower()
PUBLISHER_MODEL_NAME = "publishers/meta/models/llama-guard@" + version_id

# The pre-built serving docker images.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250429_0916_RC01"

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
# Pick the recommended hardware by matching the size tag in the variant name.
if any(size_tag in base_model_name for size_tag in ("3-1B", "3-8B")):
    accelerator_type, machine_type = "NVIDIA_L4", "g2-standard-12"
    accelerator_count, max_num_seqs = 1, 256
elif any(size_tag in base_model_name for size_tag in ("3-11B", "4-12B")):
    accelerator_type, machine_type = "NVIDIA_TESLA_A100", "a2-highgpu-1g"
    accelerator_count, max_num_seqs = 1, 12
else:
    raise ValueError(f"Recommended GPU setting not found for: {base_model_name}.")

# Check the serving (not training) quota for the chosen accelerator before
# attempting a deployment.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

In [None]:
# @title [Option 1] Deploy with Model Garden SDK

# @markdown Deploy with Gen AI model-centric SDK. This section uploads the prebuilt model to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model. See [use open models with Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-open-models) for documentation on other use cases.
from vertexai.preview import model_garden

# Resolve the published Llama Guard version chosen in the selection cell.
model = model_garden.OpenModel(PUBLISHER_MODEL_NAME)

# Hardware and endpoint settings come from the selection cell.
# `accept_eula=True` accepts the End User License Agreement (EULA) on the model
# card before deploy. Otherwise, the deployment will be forbidden.
sdk_deploy_options = dict(
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
    accept_eula=True,
)
endpoints[LABEL] = model.deploy(**sdk_deploy_options)

In [None]:
# @title [Option 2] Deploy with customized configurations

# @markdown This section uploads Llama Guard models to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model.

# Passed to `deploy_model_vllm` below, which forwards it to the serving
# container as `--gpu-memory-utilization`.
gpu_memory_utilization = 0.9
# Passed to `deploy_model_vllm` below, which forwards it to the serving
# container as `--max-model-len`.
max_model_len = 4096


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
    enable_llama_tool_parser: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model and deploys it to a newly created endpoint.

    An endpoint is created first, then the model is uploaded with the vLLM
    serving container (`VLLM_DOCKER_URI`) and finally deployed to that endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `{model_name}-endpoint`.
        model_id: Value passed to the server as `--model`.
        publisher: Publisher segment of the Model Garden source model name.
        publisher_model_id: Model segment of the Model Garden source model name.
        base_model_id: Value of the `MODEL_ID` container environment variable.
            Falls back to `model_id` when empty.
        machine_type: Machine type used for the deployment.
        accelerator_type: Accelerator type used for the deployment.
        accelerator_count: Number of accelerators; also used as
            `--tensor-parallel-size`.
        gpu_memory_utilization: Value of `--gpu-memory-utilization`.
        max_model_len: Value of `--max-model-len`.
        dtype: Value of `--dtype`.
        enable_trust_remote_code: Adds `--trust-remote-code` when true.
        enforce_eager: Adds `--enforce-eager` when true.
        enable_lora: Adds `--enable-lora` when true.
        enable_chunked_prefill: Adds `--enable-chunked-prefill` when true.
        enable_prefix_cache: Adds `--enable-prefix-caching` when true.
        host_prefix_kv_cache_utilization_target: Forwarded as
            `--host-prefix-kv-cache-utilization-target` only when strictly
            between 0 and 1.
        max_loras: Value of `--max-loras`.
        max_cpu_loras: Value of `--max-cpu-loras`.
        use_dedicated_endpoint: Whether the endpoint is created as a dedicated
            endpoint.
        max_num_seqs: Value of `--max-num-seqs`.
        model_type: Forwarded as `--model-type` when set.
        enable_llama_tool_parser: Adds `--enable-auto-tool-choice` and
            `--tool-call-parser=vertex-llama-3` when true.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    base_model_id = base_model_id or model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    # Flags that are always present on the server command line.
    serve_command = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
    ]

    # Optional flags as (condition, flags) pairs, appended in this exact order.
    optional_flags = [
        (enable_trust_remote_code, ["--trust-remote-code"]),
        (enforce_eager, ["--enforce-eager"]),
        (enable_lora, ["--enable-lora"]),
        (enable_chunked_prefill, ["--enable-chunked-prefill"]),
        (enable_prefix_cache, ["--enable-prefix-caching"]),
        (
            0 < host_prefix_kv_cache_utilization_target < 1,
            [
                f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
            ],
        ),
        (model_type, [f"--model-type={model_type}"]),
        (
            enable_llama_tool_parser,
            ["--enable-auto-tool-choice", "--tool-call-parser=vertex-llama-3"],
        ),
    ]
    for is_enabled, flags in optional_flags:
        if is_enabled:
            serve_command.extend(flags)

    container_env = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        hf_token = HF_TOKEN
    except NameError:
        hf_token = None
    if hf_token:
        container_env["HF_TOKEN"] = hf_token

    source_model_name = f"publishers/{publisher}/models/{publisher_model_id}"
    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=serve_command,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=source_model_name,
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )

    # Labels recording which notebook and environment issued the deployment.
    deployment_labels = {
        "NOTEBOOK_NAME": "model_garden_llama_guard_deployment.ipynb",
        "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
    }
    uploaded_model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        system_labels=deployment_labels,
    )
    print("endpoint_name:", endpoint.name)

    return uploaded_model, endpoint


# Deploy the selected Llama Guard variant with the settings chosen above, then
# register the resulting model and endpoint under LABEL so that the prediction
# and clean-up cells can find them.
llama_guard_model, llama_guard_endpoint = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="llama-guard"),
    model_id=model_id,
    base_model_id=hf_model_id,
    publisher="meta",
    publisher_model_id="llama-guard",
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
    max_num_seqs=max_num_seqs,
    enforce_eager=False,
    enable_llama_tool_parser=False,
    use_dedicated_endpoint=use_dedicated_endpoint,
)
models[LABEL], endpoints[LABEL] = llama_guard_model, llama_guard_endpoint
# @markdown Click "Show Code" to see more details.

## Use the Llama Guard models to safeguard LLM inputs and outputs with the Vertex Llama 3.1 API service

We use [meta-llama/Llama-Guard-3-8B](https://huggingface.co/meta-llama/Llama-Guard-3-8B) to safeguard input and output conversations with the [Llama 3.1 405B Instruct model API service on Vertex](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-3.1-405b-instruct-maas).

Llama Guard 3 builds on the capabilities introduced with Llama Guard 2, adding three new categories: Defamation, Elections, and Code Interpreter Abuse. Additionally, this model is multilingual and introduces a new prompt format, making Llama Guard 3's prompt format consistent with Llama 3+ Instruct models.

This section references [LlamaGuard.ipynb](https://colab.research.google.com/drive/16s0tlCSEDtczjPzdIK3jq0Le5LlnSYGf?usp=sharing) from [https://huggingface.co/meta-llama/LlamaGuard-7b](https://huggingface.co/meta-llama/LlamaGuard-7b).

In [None]:
# Install or upgrade the OpenAI Python client; the next cell uses
# `openai.OpenAI` to call the Vertex Llama API service.
!pip install --upgrade --quiet openai

In [None]:
# `google.auth.transport.requests` is imported explicitly: importing the
# `google.auth` package alone does not guarantee the submodule is loaded.
import google.auth
import google.auth.transport.requests
import openai

# @markdown Set up the Llama 3.1 405B Instruct model API service.

# The OpenAI-compatible endpoint is regional, so the host name and the
# `locations/` path segment must name the same region. The API service host was
# previously fixed to us-central1 while the path used REGION, which disagreed
# whenever Llama Guard was deployed elsewhere. One constant now drives both.
LLAMA_3_1_API_REGION = "us-central1"

# Programmatically get an access token
creds, _ = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
# Note: the credential lives for 1 hour by default (https://cloud.google.com/docs/authentication/token-types#at-lifetime); after expiration, it must be refreshed.

client = openai.OpenAI(
    base_url=(
        f"https://{LLAMA_3_1_API_REGION}-aiplatform.googleapis.com/v1/"
        f"projects/{PROJECT_ID}/locations/{LLAMA_3_1_API_REGION}/endpoints/openapi"
    ),
    api_key=creds.token,
)
LLAMA3_405B_INSTRUCT = "meta/llama-3.1-405b-instruct-maas"

In [None]:
# @markdown Define input message in conversation and get output message from model.

message_role = "user"  # @param {type: "string"}
message_content = "What is a car?"  # @param {type: "string"}

# Turn 1: the single message supplied through the form above.
first_turn = {"role": message_role, "content": message_content}
messages = [first_turn]
print("Conversation [turn 1]:", messages)

# Ask the Llama 3.1 API service to answer the conversation so far.
response = client.chat.completions.create(
    messages=messages,
    model=LLAMA3_405B_INSTRUCT,
)
print("Response:", response)

# Turn 2: add the model's reply so the conversation can be classified as a whole.
model_reply = response.choices[0].message
messages.append({"role": model_reply.role, "content": model_reply.content})
print("Conversation [turn 2]:", messages)

In [None]:
# @markdown Use Llama Guard to classify the conversation: safe versus unsafe.
# @markdown Classification is performed on the last turn of the conversation.
# @markdown If the content is safe, the model will return `safe`. If the content is unsafe, the model will return `unsafe` and additionally the list of offending categories as a comma-separated list in a new line.
# @markdown Set `"@requestFormat": "chatCompletions"` to use the OpenAI chat completions format.

# One request carrying the whole conversation in chat-completions format.
guard_request = {"messages": messages, "@requestFormat": "chatCompletions"}
instances = [guard_request]

response = endpoints["vllm_gpu"].predict(
    use_dedicated_endpoint=use_dedicated_endpoint,
    instances=instances,
)

# The verdict is the message content of the first returned choice.
first_choice = response.predictions["choices"][0]
prediction = first_choice["message"]["content"]
print("Llama Guard prediction:", prediction)

## Use the Llama Guard models to safeguard LLM vision inputs and outputs with the Vertex Llama 3.2 API service

We use [meta-llama/Llama-Guard-3-11B-Vision](https://huggingface.co/meta-llama/Llama-Guard-3-11B-Vision) to safeguard input and output conversations with the [Llama 3.2 90B-Vision-Instruct model API service on Vertex](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-3.2-90b-vision-instruct-maas).

In [None]:
# Install or upgrade the OpenAI Python client; the next cell uses
# `openai.OpenAI` to call the Vertex Llama API service.
!pip install --upgrade --quiet openai

In [None]:
# `google.auth.transport.requests` is imported explicitly: importing the
# `google.auth` package alone does not guarantee the submodule is loaded.
import google.auth
import google.auth.transport.requests
import openai

# @markdown Set up the Llama 3.2 90B-Vision-Instruct model API service.

# The OpenAI-compatible endpoint is regional, so the host name and the
# `locations/` path segment must name the same region. The API service host was
# previously fixed to us-central1 while the path used REGION, which disagreed
# whenever Llama Guard was deployed elsewhere. One constant now drives both.
LLAMA_3_2_API_REGION = "us-central1"

# Programmatically get an access token
creds, _ = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
# Note: the credential lives for 1 hour by default (https://cloud.google.com/docs/authentication/token-types#at-lifetime); after expiration, it must be refreshed.

client = openai.OpenAI(
    base_url=(
        f"https://{LLAMA_3_2_API_REGION}-aiplatform.googleapis.com/v1/"
        f"projects/{PROJECT_ID}/locations/{LLAMA_3_2_API_REGION}/endpoints/openapi"
    ),
    api_key=creds.token,
)
LLAMA3_90B_VISION_INSTRUCT = "meta/llama-3.2-90b-vision-instruct-maas"

In [None]:
# @markdown Define input message in conversation and get output message from model.

user_image = "https://upload.wikimedia.org/wikipedia/commons/thumb/c/cb/The_Blue_Marble_%28remastered%29.jpg/580px-The_Blue_Marble_%28remastered%29.jpg"  # @param {type: "string"}
user_message = "What is in the image?"  # @param {type: "string"}

# Turn 1: one user message that pairs the image with the question about it.
image_part = {"type": "image_url", "image_url": {"url": user_image}}
text_part = {"type": "text", "text": user_message}
messages = [{"role": "user", "content": [image_part, text_part]}]

print("Conversation [turn 1]:", messages)

# Ask the Llama 3.2 vision API service to answer the conversation so far.
response = client.chat.completions.create(
    messages=messages,
    model=LLAMA3_90B_VISION_INSTRUCT,
)
print("Response:", response)

# Turn 2: add the model's reply so the conversation can be classified as a whole.
model_reply = response.choices[0].message
messages.append({"role": model_reply.role, "content": model_reply.content})

print("Conversation [turn 2]:", messages)

In [None]:
# @markdown Use Llama Guard to classify the conversation: safe versus unsafe.
# @markdown Classification is performed on the last turn of the conversation.
# @markdown If the content is safe, the model will return `safe`. If the content is unsafe, the model will return `unsafe` and additionally the list of offending categories as a comma-separated list in a new line.
# @markdown Set `"@requestFormat": "chatCompletions"` to use the OpenAI chat completions format.

# One request carrying the whole conversation in chat-completions format.
guard_request = {"messages": messages, "@requestFormat": "chatCompletions"}
instances = [guard_request]

response = endpoints["vllm_gpu"].predict(
    use_dedicated_endpoint=use_dedicated_endpoint,
    instances=instances,
)

# The verdict is the message content of the first returned choice.
first_choice = response.predictions["choices"][0]
prediction = first_choice["message"]["content"]
print("Llama Guard prediction:", prediction)

## Use the Llama Guard models to safeguard LLM vision inputs and outputs with the Vertex Llama 4 API service

We use [meta-llama/Llama-Guard-4-12B](https://huggingface.co/meta-llama/Llama-Guard-4-12B) to safeguard input and output conversations with the [Llama 4 model API service on Vertex](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-4-maverick-17b-128e-instruct-maas).

In [None]:
# Install or upgrade the OpenAI Python client; the next cell uses
# `openai.OpenAI` to call the Vertex Llama API service.
!pip install --upgrade --quiet openai

In [None]:
# `google.auth.transport.requests` is imported explicitly: importing the
# `google.auth` package alone does not guarantee the submodule is loaded.
import google.auth
import google.auth.transport.requests
import openai

# @markdown Set up the Llama 4 model API service.

# The OpenAI-compatible endpoint is regional, so the host name and the
# `locations/` path segment must name the same region. The API service host was
# previously fixed to us-east5 while the path used REGION, which disagreed
# whenever Llama Guard was deployed elsewhere. One constant now drives both.
LLAMA_4_API_REGION = "us-east5"

# Programmatically get an access token
creds, _ = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
# Note: the credential lives for 1 hour by default (https://cloud.google.com/docs/authentication/token-types#at-lifetime); after expiration, it must be refreshed.

client = openai.OpenAI(
    base_url=(
        f"https://{LLAMA_4_API_REGION}-aiplatform.googleapis.com/v1/"
        f"projects/{PROJECT_ID}/locations/{LLAMA_4_API_REGION}/endpoints/openapi"
    ),
    api_key=creds.token,
)
LLAMA4_MODEL_ID = "meta/llama-4-scout-17b-16e-instruct-maas"  # @param ["meta/llama-4-scout-17b-16e-instruct-maas", "meta/llama-4-maverick-17b-128e-instruct-maas"]

In [None]:
# @markdown Define input message in conversation and get output message from model.

user_image = "https://upload.wikimedia.org/wikipedia/commons/thumb/c/cb/The_Blue_Marble_%28remastered%29.jpg/580px-The_Blue_Marble_%28remastered%29.jpg"  # @param {type: "string"}
user_message = "What is in the image?"  # @param {type: "string"}

# Turn 1: one user message that pairs the image with the question about it.
image_part = {"type": "image_url", "image_url": {"url": user_image}}
text_part = {"type": "text", "text": user_message}
messages = [{"role": "user", "content": [image_part, text_part]}]

print("Conversation [turn 1]:", messages)

# Ask the selected Llama 4 API service to answer the conversation so far.
response = client.chat.completions.create(
    messages=messages,
    model=LLAMA4_MODEL_ID,
)
print("Response:", response)

# Turn 2: add the model's reply so the conversation can be classified as a whole.
model_reply = response.choices[0].message
messages.append({"role": model_reply.role, "content": model_reply.content})

print("Conversation [turn 2]:", messages)

In [None]:
# @markdown Use Llama Guard to classify the conversation: safe versus unsafe.
# @markdown Classification is performed on the last turn of the conversation.
# @markdown If the content is safe, the model will return `safe`. If the content is unsafe, the model will return `unsafe` and additionally the list of offending categories as a comma-separated list in a new line.
# @markdown Set `"@requestFormat": "chatCompletions"` to use the OpenAI chat completions format.

# One request carrying the whole conversation in chat-completions format.
guard_request = {"messages": messages, "@requestFormat": "chatCompletions"}
instances = [guard_request]

response = endpoints["vllm_gpu"].predict(
    use_dedicated_endpoint=use_dedicated_endpoint,
    instances=instances,
)

# The verdict is the message content of the first returned choice.
first_choice = response.predictions["choices"][0]
prediction = first_choice["message"]["content"]
print("Llama Guard prediction:", prediction)

## Clean up resources

In [None]:
# @title Delete the models and endpoints
# @markdown  Delete the experiment models and endpoints to release the resources
# @markdown  and avoid incurring unnecessary ongoing charges.

# Remove every tracked endpoint; `force=True` undeploys its models first.
for stale_endpoint in endpoints.values():
    stale_endpoint.delete(force=True)

# Then remove every tracked model.
for stale_model in models.values():
    stale_model.delete()