dags/inference/trt_llm_mlperf_v41

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""DAGs to run TensorRT-LLM MLPerf Inference benchmarks on multiple GPUs."""

import datetime

from airflow import models

from dags import composer_env
from dags.common.vm_resource import (
    A100_INFERENCE_SUBNETWORKS,
    H100_INFERENCE_SUBNETWORKS,
    INFERENCE_NETWORKS,
    L4_INFERENCE_SUBNETWORKS,
    GpuVersion,
    ImageFamily,
    ImageProject,
    MachineVersion,
    Project,
    Zone,
)
from dags.inference.configs import trt_llm_mlperf_v41_config


def _by_scenario(offline, server=None):
    """Build the per-scenario mapping used for a single model.

    The same nesting is used both for the QPS tuples and for the parameter
    positions, so one helper covers both tables.

    Args:
        offline: Value stored under ``"Offline" -> "offline_expected_qps"``.
        server: Value stored under ``"Server" -> "server_target_qps"``. When
            ``None`` the model gets no ``"Server"`` entry at all.

    Returns:
        A dict with an ``"Offline"`` entry followed, optionally, by a
        ``"Server"`` entry.
    """
    scenarios = {"Offline": {"offline_expected_qps": offline}}
    # Compare against None explicitly so that a legitimate 0 is not dropped.
    if server is not None:
        scenarios["Server"] = {"server_target_qps": server}
    return scenarios


# Cron "1 3 * * *": run once a day at 03:01 UTC. Only the prod environment is
# scheduled; everywhere else the schedule is None.
SCHEDULED_TIME = "1 3 * * *" if composer_env.is_prod_env() else None


with models.DAG(
    dag_id="trt_llm_mlperf_v41",
    schedule=SCHEDULED_TIME,
    tags=[
        "inference_team",
        "tensorrt_llm",
        "mlperf",
        "nightly",
        "benchmark",
    ],
    start_date=datetime.datetime(2024, 9, 9),
    catchup=False,
) as dag:
    test_name_prefix = "tensorrt-llm-mlperf-v41-inference"
    config_ver = "default,high_accuracy"
    test_mode = "PerformanceOnly"

    # The *_model_parameters tables hold a pair of numbers per scenario and the
    # *_parameter_position tables hold one integer per scenario.
    # NOTE(review): presumably the pairs are (low, high) QPS bounds searched in
    # `binary_search_steps` steps and the positions locate the parameter in a
    # benchmark config file -- confirm against trt_llm_mlperf_v41_config.
    # Both tables may list more models than `model_name` selects for a run.

    # ---- G2 machines (L4 GPUs) ----
    g2_configs = {
        "model_name": "bert,3d-unet",
        "config_ver": config_ver,
        "test_mode": test_mode,
        "docker_config": "gs://yijiaj/mlperf/config.json",
        "models": "gs://yijiaj/mlperf/g2/models",
        "preprocessed_data": "gs://yijiaj/mlperf/g2/preprocessed_data",
    }
    g2_model_parameters = {
        "bert": _by_scenario((1000, 1200), (900, 1200)),
        "3d-unet": _by_scenario((1.3, 2.6)),
        "dlrm-v2": _by_scenario((3400, 3500), (3300, 3500)),
        "gptj": _by_scenario((1.3, 1.6), (0.88, 1)),
        "resnet50": _by_scenario((13000, 15000), (11532.8125, 11600)),
        "retinanet": _by_scenario((220, 230), (200, 220)),
    }
    g2_parameter_position = {
        "bert": _by_scenario(309, 278),
        "3d-unet": _by_scenario(55),
        "dlrm-v2": _by_scenario(233, 176),
        "gptj": _by_scenario(191, 158),
        "resnet50": _by_scenario(48, 52),
        "retinanet": _by_scenario(51, 57),
    }

    # ---- A2 machines (A100 GPUs) ----
    a2_configs = {
        "model_name": "bert,3d-unet",
        "config_ver": config_ver,
        "test_mode": test_mode,
        "docker_config": "gs://yijiaj/mlperf/config.json",
        "models": "gs://yijiaj/mlperf/a2/models",
        "preprocessed_data": "gs://yijiaj/mlperf/a2/preprocessed_data",
    }
    a2_model_parameters = {
        "bert": _by_scenario((27000, 27500), (25400, 25600)),
        "3d-unet": _by_scenario((30, 40)),
        "resnet50": _by_scenario((340000, 360000), (290000, 299000)),
        "retinanet": _by_scenario((5840, 5980), (5600, 5800)),
    }
    a2_parameter_position = {
        "bert": _by_scenario(411, 560),
        "3d-unet": _by_scenario(623),
        "resnet50": _by_scenario(456, 396),
        "retinanet": _by_scenario(269, 244),
    }

    # ---- A3 machines (H100 GPUs) ----
    a3_configs = {
        "model_name": "resnet50,retinanet,stable-diffusion-xl,llama2-70b,mixtral-8x7b",
        "config_ver": config_ver,
        "test_mode": test_mode,
        "docker_config": "gs://yijiaj/mlperf/config.json",
        "models": "gs://yijiaj/mlperf/a3/models",
        "preprocessed_data": "gs://yijiaj/mlperf/a3/preprocessed_data",
    }
    a3_model_parameters = {
        "bert": _by_scenario((75200, 76000), (56000, 60000)),
        "3d-unet": _by_scenario((54.4, 64)),
        "dlrm-v2": _by_scenario((616000, 620000), (458203.125, 510000)),
        "gptj": _by_scenario((288, 300), (279.36, 285)),
        "resnet50": _by_scenario((720000, 740000), (584000, 586000)),
        "retinanet": _by_scenario((13600, 14000), (12880, 13000)),
        "stable-diffusion-xl": _by_scenario((16, 18), (16.3, 18)),
        "llama2-70b": _by_scenario((80, 86), (75, 80)),
        "mixtral-8x7b": _by_scenario((368, 386), (345, 360)),
    }
    a3_parameter_position = {
        "bert": _by_scenario(196, 238),
        "3d-unet": _by_scenario(160),
        "dlrm-v2": _by_scenario(65, 65),
        "gptj": _by_scenario(48, 91),
        "resnet50": _by_scenario(84, 132),
        "retinanet": _by_scenario(139, 127),
        "stable-diffusion-xl": _by_scenario(55, 59),
        "llama2-70b": _by_scenario(75, 74),
        "mixtral-8x7b": _by_scenario(74, 64),
    }

    # Keyword arguments that are identical for every GPU flavour below.
    shared_launch_args = {
        "image_family": ImageFamily.COMMON_CU124_DEBIAN_11,
        "count": 8,
        "time_out_in_min": 1600,
        "project": Project.CLOUD_TPU_INFERENCE_TEST,
        "network": INFERENCE_NETWORKS,
        "binary_search_steps": 2,
    }

    # Running on A100 GPU
    trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config(
        machine_type=MachineVersion.A2_ULTRAGPU_8G,
        image_project=ImageProject.ML_IMAGES,
        accelerator_type=GpuVersion.A100_80G,
        gpu_zone=Zone.US_CENTRAL1_A,
        test_name=f"{test_name_prefix}-nightly-test-a100-8",
        subnetwork=A100_INFERENCE_SUBNETWORKS,
        benchmark_configs=a2_configs,
        model_parameters=a2_model_parameters,
        parameter_positions=a2_parameter_position,
        **shared_launch_args,
    ).run()

    # Running on L4 GPU
    # NOTE(review): the test name ends in "l4-1" although count=8 like the
    # other runs -- confirm the name is intentional before renaming it.
    trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config(
        machine_type=MachineVersion.G2_STAND_96,
        image_project=ImageProject.ML_IMAGES,
        accelerator_type=GpuVersion.L4,
        gpu_zone=Zone.US_CENTRAL1_C,
        test_name=f"{test_name_prefix}-nightly-test-l4-1",
        subnetwork=L4_INFERENCE_SUBNETWORKS,
        benchmark_configs=g2_configs,
        model_parameters=g2_model_parameters,
        parameter_positions=g2_parameter_position,
        **shared_launch_args,
    ).run()

    # Running on H100 GPU
    # NOTE(review): this run uses a different image project from the two
    # above -- confirm that is deliberate.
    trt_llm_mlperf_v41_config.get_trt_llm_mlperf_gpu_config(
        machine_type=MachineVersion.A3_HIGHGPU_8G,
        image_project=ImageProject.DEEP_LEARNING_PLATFORM_RELEASE,
        accelerator_type=GpuVersion.H100,
        gpu_zone=Zone.US_CENTRAL1_A,
        test_name=f"{test_name_prefix}-nightly-test-h100-8",
        subnetwork=H100_INFERENCE_SUBNETWORKS,
        benchmark_configs=a3_configs,
        model_parameters=a3_model_parameters,
        parameter_positions=a3_parameter_position,
        **shared_launch_args,
    ).run()

dags/inference/trt_llm_mlperf_v41_inference.py (395 lines of code) (raw):