"""A DAG to run vllm benchmarks with nightly version."""

import datetime

from airflow import models

from dags import composer_env
from dags.common.vm_resource import (
    A100_BM_SUBNETWORKS,
    AcceleratorType,
    BM_NETWORKS,
    GpuVersion,
    ImageFamily,
    ImageProject,
    MachineVersion,
    Project,
    RuntimeVersion,
    TpuVersion,
    V5_NETWORKS,
    V5E_SUBNETWORKS,
    Zone,
)
from dags.solutions_team.configs.vllm import vllm_benchmark_config

# Run once a day at 6 pm UTC (10 am PST)
SCHEDULED_TIME = "0 18 * * *" if composer_env.is_prod_env() else None
with models.DAG(
dag_id="solutionsteam_vllm_benchmark",
schedule=SCHEDULED_TIME,
tags=["solutions_team", "nightly", "supported", "xlml"],
start_date=datetime.datetime(2024, 1, 19),
catchup=False,
) as dag:
test_name_prefix = "solutionsteam-vllm-benchmark"
# Generate a test run id in the format YYYYMMDD-HHmm to group
# tests together.
now = datetime.datetime.now()
test_run_id = now.strftime("%Y%m%d-%H%M")
test_models = {
"llama3-8b": {
"accelerator_specs": [
(
AcceleratorType.GPU,
(MachineVersion.A2_HIGHGPU_1G, GpuVersion.A100, 1),
),
(AcceleratorType.TPU, (TpuVersion.V5E, 8)),
],
# TODO: Support other backends
"backend": ["vllm"],
"model_id": ["meta-llama/Meta-Llama-3.1-8B"],
"dataset": "ShareGPT_V3_unfiltered_cleaned_split.json",
"request_rates": "1,2,4,8,16,32",
"num_prompts": 1000,
"max_output_length": 1024,
"max_cache_length": 2048,
# TODO: Add support for quantization
"sharding_config": "default_shardings/llama.yaml",
},
}
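
  # Sweep every combination of backend, model id, and accelerator spec for
  # each model above; each combination becomes one benchmark task that
  # shares the same test_run_id.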
for model, sweep_model_configs in test_models.items():
for backend in sweep_model_configs["backend"]:
for model_id in sweep_model_configs["model_id"]:
for accelerator_type, accelerator_spec in sweep_model_configs[
"accelerator_specs"
]:
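          # Flatten the swept values into a single per-run config dict.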
          model_configs = {
              "backend": backend,
              "model_id": model_id,
              "dataset": sweep_model_configs["dataset"],
              "request_rates": sweep_model_configs["request_rates"],
              "num_prompts": sweep_model_configs["num_prompts"],
              "max_output_length": sweep_model_configs["max_output_length"],
              "max_cache_length": sweep_model_configs["max_cache_length"],
              "sharding_config": sweep_model_configs["sharding_config"],
          }
if accelerator_type == AcceleratorType.TPU:
project = Project.TPU_PROD_ENV_AUTOMATED
zone = Zone.US_EAST1_C
tpu_version, tpu_cores = accelerator_spec
runtime_version = RuntimeVersion.V2_ALPHA_TPUV5_LITE.value
network = V5_NETWORKS
subnetwork = V5E_SUBNETWORKS
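            # Build the TPU benchmark task and register it on the DAG.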
vllm_benchmark_config.get_tpu_vllm_gce_config(
tpu_version=tpu_version,
tpu_cores=tpu_cores,
tpu_zone=zone,
backend=backend,
runtime_version=runtime_version,
project=project,
time_out_in_min=120,
is_tpu_reserved=True,
test_name=f"{test_name_prefix}-tpu-nightly-{model}-{backend}",
test_run_id=test_run_id,
network=network,
subnetwork=subnetwork,
model_configs=model_configs,
            ).run()
elif accelerator_type == AcceleratorType.GPU:
project = Project.CLOUD_ML_BENCHMARKING
zone = Zone.US_WEST4_B
machine_version, gpu_version, count = accelerator_spec
image_project = ImageProject.DEEP_LEARNING_PLATFORM_RELEASE
image_family = ImageFamily.COMMON_CU124_DEBIAN_11
network = BM_NETWORKS
subnetwork = A100_BM_SUBNETWORKS
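            # Build the GPU benchmark task and register it on the DAG.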
vllm_benchmark_config.get_gpu_vllm_gce_config(
machine_version=machine_version,
image_project=image_project,
image_family=image_family,
gpu_version=gpu_version,
count=count,
backend=backend,
project=project,
gpu_zone=zone,
time_out_in_min=120,
test_name=f"{test_name_prefix}-gpu-nightly-{model}-{backend}",
test_run_id=test_run_id,
network=network,
subnetwork=subnetwork,
model_configs=model_configs,
reservation=True,
).run()