dags/inference/trtllm_bench_inference.py (28 lines of code) (raw):

# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """A DAG to run TensorRT-LLM inference benchmarks with nightly version.""" import datetime from airflow import models from dags import composer_env from dags.common.vm_resource import H200_INFERENCE_SUBNETWORKS, INFERENCE_NETWORKS, GpuVersion, Zone, ImageFamily, ImageProject, MachineVersion, Project from dags.inference.configs import trtllm_bench_inference_config # Run once a day at 4 am UTC (8 pm PST) SCHEDULED_TIME = "0 4 * * *" if composer_env.is_prod_env() else None with models.DAG( dag_id="trtllm_bench_inference", schedule=SCHEDULED_TIME, tags=["inference_team", "tensorrt_llm", "nightly", "benchmark"], start_date=datetime.datetime(2025, 1, 25), catchup=False, ) as dag: test_name_prefix = "trtllm_bench_inference" # Running on H200 GPU trtllm_bench_inference_config.get_trtllm_bench_config( machine_type=MachineVersion.A3_ULTRAGPU_8G, image_project=ImageProject.ML_IMAGES, image_family=ImageFamily.COMMON_CU124_DEBIAN_11, accelerator_type=GpuVersion.H200, count=8, gpu_zone=Zone.EUROPE_WEST1_B, time_out_in_min=1600, test_name=f"{test_name_prefix}-nightly-h200-8", project=Project.CLOUD_TPU_INFERENCE_TEST, network=INFERENCE_NETWORKS, subnetwork=H200_INFERENCE_SUBNETWORKS, existing_instance_name="yijiaj-a3u-test-h200x8", ).run()