In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Nvidia Cosmos 1.0

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_nvidia_cosmos_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_nvidia_cosmos_deployment.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates deploying NVIDIA Cosmos world foundation models (WFMs) on Vertex AI for online prediction. The following models can be deployed:
  - [nvidia/Cosmos-1.0-Diffusion-7B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Text2World)
  - [nvidia/Cosmos-1.0-Diffusion-14B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Text2World)
  - [nvidia/Cosmos-1.0-Diffusion-7B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Video2World)
  - [nvidia/Cosmos-1.0-Diffusion-14B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Video2World)

### Objective

- Upload the model to [Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction).
- Deploy the model on [Endpoint](https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints).
- Run online predictions for `text-to-world` and `video-to-world`.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

import importlib
import os

from google.cloud import aiplatform
from IPython.display import HTML

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

! gcloud config set project $PROJECT_ID

models, endpoints = {}, {}

! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

In [None]:
# @title Deploy the [Text2World] model to Vertex for online predictions

# @markdown This section uploads the [Text2World] model to Model Registry and deploys it on the Endpoint with the specified accelerator.

# @markdown The deployment process takes approximately 15-30 minutes to complete.
# @markdown A valid HF_TOKEN is required for model deployment.
# @markdown Follow the instructions at [Hugging Face Token Guide](https://huggingface.co/docs/hub/en/security-tokens) to obtain your HF_TOKEN.
# @markdown Additionally, ensure you have access to the model by following the instructions on its Hugging Face model card page.

HF_TOKEN = ""  # @param {type:"string", isTemplate: true}
if not HF_TOKEN:
    # Stop here instead of only printing: the rest of this cell creates an
    # endpoint and starts a long deployment that needs the token.
    raise ValueError("Error: HF_TOKEN is required to deploy the model.")

# @markdown The inference timeout is set to 30 minutes, as the video generation process can take a long time.
INFERENCE_TIMEOUT_SECS = 1800
model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Text2World"  # @param ["nvidia/Cosmos-1.0-Diffusion-7B-Text2World", "nvidia/Cosmos-1.0-Diffusion-14B-Text2World"]
task = "text-to-world"

accelerator_type = "NVIDIA_H100_80GB"  # @param ["NVIDIA_H100_80GB", "NVIDIA_A100_80GB"]

# Machine type and GPU count for each accelerator offered in the dropdown.
machine_type_map = {
    "NVIDIA_A100_80GB": "a2-ultragpu-1g",
    "NVIDIA_H100_80GB": "a3-highgpu-2g",
}
accelerator_count_map = {
    "NVIDIA_A100_80GB": 1,
    "NVIDIA_H100_80GB": 2,
}

machine_type = machine_type_map[accelerator_type]
accelerator_count = accelerator_count_map[accelerator_type]


# The pre-built serving docker image. It contains serving scripts and models.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-cosmos:20250314"

def deploy_model(model_id, task, machine_type, accelerator_type, accelerator_count):
    """Create a Vertex AI Endpoint and deploy the specified model to the endpoint.

    Args:
        model_id: Hugging Face model id, passed to the serving container as
            `MODEL_ID` and used as the model display name.
        task: Task name passed to the serving container as `TASK`.
        machine_type: Machine type used for the deployment.
        accelerator_type: Accelerator type used for the quota check and the
            deployment.
        accelerator_count: Number of accelerators to attach.

    Returns:
        A `(model, endpoint)` tuple with the uploaded model and the endpoint
        it was deployed to.
    """
    common_util.check_quota(
        project_id=PROJECT_ID,
        region=REGION,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        is_for_training=False,
    )

    model_name = model_id

    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=True,
        sync=True,
        inference_timeout=INFERENCE_TIMEOUT_SECS,
    )
    serving_env = {
        "MODEL_ID": model_id,
        "TASK": task,
        "DEPLOY_SOURCE": "notebook",
        "HUGGING_FACE_HUB_TOKEN": HF_TOKEN,
        "OFFLOAD_NETWORK": "false",
        "OFFLOAD_TOKENIZER": "false",
        "OFFLOAD_TEXT_ENCODER_MODEL": "false",
        "OFFLOAD_GUARDRAIL_MODELS": "true",
        "OFFLOAD_PROMPT_UPSAMPLER": "true",
    }

    # Also offload the text encoder model for 14B models, to avoid CUDA OOM issue.
    # Python strings have no `.includes`; use the `in` operator for the check.
    if "14b" in model_id.lower():
        serving_env["OFFLOAD_TEXT_ENCODER_MODEL"] = "true"

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predict",
        serving_container_health_route="/health",
        serving_container_environment_variables=serving_env,
        model_garden_source_model_name="publishers/nvidia/models/cosmos",
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=SERVICE_ACCOUNT,
        system_labels={"NOTEBOOK_NAME": "model_garden_nvidia_cosmos_deployment.ipynb"},
    )
    return model, endpoint


deployed_model, deployed_endpoint = deploy_model(
    model_id=model_id,
    task=task,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)

# The "model"/"endpoint" keys always point at the latest deployment. Move any
# earlier deployment to a unique key first; the clean-up cell deletes every
# value in these dicts, so an overwritten entry would never be deleted.
for registry, key, resource in (
    (models, "model", deployed_model),
    (endpoints, "endpoint", deployed_endpoint),
):
    if key in registry:
        registry[f"{key}-{len(registry)}"] = registry[key]
    registry[key] = resource

print("endpoint_name:", endpoints["endpoint"].name)

In [None]:
# @title [Text2World] Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. The inference takes:

# @markdown - ~800s with 1 A100 80GB GPU.
# @markdown
# @markdown - ~420s with 2 H100 80GB GPU

# @markdown Example:
# @markdown ```json
# @markdown {
# @markdown   "instances": [
# @markdown     {
# @markdown       "text": "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves."
# @markdown     }
# @markdown   ],
# @markdown   "parameters": {
# @markdown     "negative_prompt": "",
# @markdown     "guidance": 7.0,
# @markdown     "num_steps": 30,
# @markdown     "height": 704,
# @markdown     "width": 1280,
# @markdown     "fps": 24,
# @markdown     "num_video_frames": 121,
# @markdown     "seed": 42
# @markdown   }
# @markdown }
# @markdown ```

# @markdown You can adjust the parameters below to use your own text prompt.
# @markdown The `negative_prompt` parameter is optional. If not specified, a default value will be used.
# @markdown You can find the default value here: [Inference Utils (Line 104)](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/diffusion/inference/inference_utils.py#L104).
# @markdown
# @markdown For inference tasks exceeding 10 minutes, we recommend using CURL for predictions. Refer to the following sections for detailed instructions.

text = "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves."  # @param {type: "string"}

# Request payload: one instance carrying the prompt, plus generation settings.
instances = [dict(text=text)]
parameters = dict(
    negative_prompt="",
    guidance=7.0,
    num_steps=30,
    height=704,
    width=1280,
    fps=24,
    num_video_frames=121,
    seed=42,
)

response = endpoints["endpoint"].predict(
    instances=instances,
    parameters=parameters,
    use_dedicated_endpoint=True,
)

# The "output" field is embedded below as a base64 data URI.
video_bytes = response.predictions[0]["output"]

# Assumes MP4. Change type if needed (e.g., video/webm)
video_html = (
    '\n<video width="1280" height="704" controls>\n'
    f'<source src="data:video/mp4;base64,{video_bytes}" type="video/mp4">\n'
    "Your browser does not support the video tag.\n"
    "</video>\n"
)

display(HTML(video_html))

In [None]:
# @title Deploy the [Video2World] model to Vertex for online predictions

# @markdown This section uploads the [Video2World] model to Model Registry and deploys it on the Endpoint with the specified accelerator.

# @markdown The deployment process takes approximately 15-30 minutes to complete.
# @markdown A valid HF_TOKEN is required for model deployment.
# @markdown Follow the instructions at [Hugging Face Token Guide](https://huggingface.co/docs/hub/en/security-tokens) to obtain your HF_TOKEN.
# @markdown Additionally, ensure you have access to the model by following the instructions on its Hugging Face model card page.

HF_TOKEN = ""  # @param {type:"string", isTemplate: true}
if not HF_TOKEN:
    # Stop here instead of only printing: the rest of this cell creates an
    # endpoint and starts a long deployment that needs the token.
    raise ValueError("Error: HF_TOKEN is required to deploy the model.")
# @markdown The inference timeout is set to 30 minutes, as the video generation process can take a long time.
INFERENCE_TIMEOUT_SECS = 1800

model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Video2World"  # @param ["nvidia/Cosmos-1.0-Diffusion-7B-Video2World", "nvidia/Cosmos-1.0-Diffusion-14B-Video2World"]
task = "video-to-world"

accelerator_type = "NVIDIA_H100_80GB"  # @param ["NVIDIA_H100_80GB", "NVIDIA_A100_80GB"]

# Machine type and GPU count for each accelerator offered in the dropdown.
machine_type_map = {
    "NVIDIA_A100_80GB": "a2-ultragpu-1g",
    "NVIDIA_H100_80GB": "a3-highgpu-2g",
}
accelerator_count_map = {
    "NVIDIA_A100_80GB": 1,
    "NVIDIA_H100_80GB": 2,
}

machine_type = machine_type_map[accelerator_type]
accelerator_count = accelerator_count_map[accelerator_type]


# The pre-built serving docker image. It contains serving scripts and models.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-cosmos:20250314"


def deploy_model(model_id, task, machine_type, accelerator_type, accelerator_count):
    """Create a Vertex AI Endpoint and deploy the specified model to the endpoint.

    Args:
        model_id: Hugging Face model id, passed to the serving container as
            `MODEL_ID` and used as the model display name.
        task: Task name passed to the serving container as `TASK`.
        machine_type: Machine type used for the deployment.
        accelerator_type: Accelerator type used for the quota check and the
            deployment.
        accelerator_count: Number of accelerators to attach.

    Returns:
        A `(model, endpoint)` tuple with the uploaded model and the endpoint
        it was deployed to.
    """
    common_util.check_quota(
        project_id=PROJECT_ID,
        region=REGION,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        is_for_training=False,
    )

    model_name = model_id

    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=True,
        sync=True,
        inference_timeout=INFERENCE_TIMEOUT_SECS,
    )
    serving_env = {
        "MODEL_ID": model_id,
        "TASK": task,
        "DEPLOY_SOURCE": "notebook",
        "HUGGING_FACE_HUB_TOKEN": HF_TOKEN,
        "OFFLOAD_NETWORK": "false",
        "OFFLOAD_TOKENIZER": "false",
        "OFFLOAD_TEXT_ENCODER_MODEL": "false",
        "OFFLOAD_GUARDRAIL_MODELS": "true",
        "OFFLOAD_PROMPT_UPSAMPLER": "true",
    }

    # Also offload the text encoder model for 14B models, to avoid CUDA OOM issue.
    # Python strings have no `.includes`; use the `in` operator for the check.
    if "14b" in model_id.lower():
        serving_env["OFFLOAD_TEXT_ENCODER_MODEL"] = "true"

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predict",
        serving_container_health_route="/health",
        serving_container_environment_variables=serving_env,
        model_garden_source_model_name="publishers/nvidia/models/cosmos",
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=SERVICE_ACCOUNT,
        system_labels={"NOTEBOOK_NAME": "model_garden_nvidia_cosmos_deployment.ipynb"},
    )
    return model, endpoint


deployed_model, deployed_endpoint = deploy_model(
    model_id=model_id,
    task=task,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)

# The "model"/"endpoint" keys always point at the latest deployment. Move any
# earlier deployment to a unique key first; the clean-up cell deletes every
# value in these dicts, so an overwritten entry would never be deleted.
for registry, key, resource in (
    (models, "model", deployed_model),
    (endpoints, "endpoint", deployed_endpoint),
):
    if key in registry:
        registry[f"{key}-{len(registry)}"] = registry[key]
    registry[key] = resource

print("endpoint_name:", endpoints["endpoint"].name)

In [None]:
# @title [Video2World] Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with an input image or video. The inference takes:

# @markdown - ~400s with 1 A100 GPU.
# @markdown
# @markdown - ~400s with 2 H100 GPU

# @markdown Example:
# @markdown ```json
# @markdown  {
# @markdown    "instances": [
# @markdown      {
# @markdown        "gcs_uri": "gs://vertex-model-garden-public-us/cosmos/video2world_input0.jpg",
# @markdown        "num_input_frames": 1
# @markdown      }
# @markdown    ],
# @markdown    "parameters": {
# @markdown      "negative_prompt": "",
# @markdown      "guidance": 7.0,
# @markdown      "num_steps": 25,
# @markdown      "height": 704,
# @markdown      "width": 1280,
# @markdown      "fps": 24,
# @markdown      "num_video_frames": 121,
# @markdown      "seed": 42
# @markdown    }
# @markdown  }
# @markdown ```

# @markdown You can adjust the parameters below to use your own video.
# @markdown The model also supports single-image input by setting `num_input_frames = 1`.
# @markdown Note that `num_input_frames` should match the actual number of frames in your video.
# @markdown The `negative_prompt` parameter is optional. If not specified, a default value will be used.
# @markdown You can find the default value here: [Inference Utils (Line 104)](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/diffusion/inference/inference_utils.py#L104).

# @markdown
# @markdown For inference tasks exceeding 10 minutes, we recommend using CURL for predictions. Refer to the following sections for detailed instructions.

gcs_uri = "gs://vertex-model-garden-public-us/cosmos/video2world_input0.jpg"  # @param {type: "string"}
num_input_frames = 1  # @param {type: "integer"}
negative_prompt = ""  # @param {type: "string"}

# Request payload: one instance naming the input file, plus generation settings.
instances = [dict(gcs_uri=gcs_uri, num_input_frames=num_input_frames)]
parameters = dict(
    negative_prompt=negative_prompt,
    guidance=7.0,
    num_steps=25,
    height=704,
    width=1280,
    fps=24,
    num_video_frames=121,
    seed=42,
)

response = endpoints["endpoint"].predict(
    instances=instances,
    parameters=parameters,
    use_dedicated_endpoint=True,
)

# The "output" field is embedded below as a base64 data URI.
video_bytes = response.predictions[0]["output"]

# Assumes MP4. Change type if needed (e.g., video/webm)
video_html = "\n".join(
    [
        "",
        '<video width="1280" height="704" controls>',
        f'<source src="data:video/mp4;base64,{video_bytes}" type="video/mp4">',
        "Your browser does not support the video tag.",
        "</video>",
        "",
    ]
)

display(HTML(video_html))

In [None]:
# @title Predict with CURL for long-running prediction tasks

# @markdown For inference tasks exceeding 10 minutes, we recommend using CURL for predictions.

# Export the values that the following %%bash cell reads from its environment.
# Note: PROJECT_ID carries the project *number*, not the project id string.
os.environ.update(
    ENDPOINT_ID=endpoints["endpoint"].name,
    PROJECT_ID=project_number,
    REGION=REGION,
)

In [None]:
%%bash

# Send the prediction request with curl; suited to long-running tasks (exceeding 10 minutes).
# ENDPOINT_ID, REGION and PROJECT_ID come from the environment.
PREDICT_PATH="v1/projects/${PROJECT_ID}/locations/${REGION}/endpoints/${ENDPOINT_ID}:predict"
ENDPOINT_URL="https://${ENDPOINT_ID}.${REGION}-${PROJECT_ID}.prediction.vertexai.goog/${PREDICT_PATH}"
TEXT="A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves."
DATA='{"instances": [{"text":"'${TEXT}'"}], "parameters": {"negative_prompt":"", "guidance":7.0,"num_steps":35,"height":704,"width":1280,"fps":24,"num_video_frames":121,"seed":42}}'

# The response body is saved to a file for later use.
curl \
  --request POST \
  --header "Authorization: Bearer $(gcloud auth print-access-token)" \
  --header "Content-Type: application/json" \
  --data "${DATA}" \
  "${ENDPOINT_URL}" > /content/t2w_response.json

In [None]:
import json

# Load the response body saved by the curl request.
with open("/content/t2w_response.json", "r") as f:
    response_data = json.load(f)

# The saved body may be an error response without a "predictions" key; show
# it in full instead of failing with a bare KeyError.
if "predictions" not in response_data:
    raise RuntimeError(f"Prediction request failed: {response_data}")

video_bytes = response_data["predictions"][0]["output"]
# Print only the payload size; the full base64 string would flood the cell output.
print(f"Received {len(video_bytes)} base64 characters of video data.")

video_html = f"""
<video width="1280" height="704" controls>
<source src="data:video/mp4;base64,{video_bytes}" type="video/mp4">
Your browser does not support the video tag.
</video>
"""  # Assumes MP4. Change type if needed (e.g., video/webm)

display(HTML(video_html))

In [None]:
# @title Clean up resources
# @markdown  Delete the experiment models and endpoints to release the resources
# @markdown  and avoid unnecessary ongoing charges.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)
# Forget the deleted endpoints so re-running this cell does not delete them again.
endpoints.clear()

# Delete models.
for model in models.values():
    model.delete()
# Same for the deleted models.
models.clear()