dags/pytorch_xla/pytorchxla-torchbench-release.py (112 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A DAG to run all TorchBench tests with nightly version."""
from airflow import models
import datetime
from dags import composer_env
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
import dags.common.vm_resource as resource
# schedule=None: Airflow does not schedule this DAG on its own; runs must be
# triggered explicitly.
SCHEDULED_TIME = None

with models.DAG(
    dag_id="pytorchxla-torchbench-release",
    schedule=SCHEDULED_TIME,
    tags=["pytorchxla", "release", "torchbench"],
    start_date=datetime.datetime(2024, 1, 1),
    catchup=False,
) as dag:
    # Filter value: "all" in the prod Composer environment, a single model
    # ("BERT_pytorch") everywhere else.
    model = "all" if composer_env.is_prod_env() else "BERT_pytorch"
    torchbench_extra_flags = [f"--filter={model}"]
    # Joined once; every target below receives the same flag string.
    extra_flags = " ".join(torchbench_extra_flags)
    # Release version under test, shared by every TPU and GPU target.
    test_version = config.VERSION.R2_7

    # NOTE(review): the three TPU configs below are built but `.run()` is never
    # called on them, unlike the GPU configs further down. Presumably this
    # means no TPU task is added to this DAG -- confirm whether that is
    # intentional before adding `.run()`.

    # Running on V4-8:
    config.get_torchbench_tpu_config(
        tpu_version=resource.TpuVersion.V4,
        tpu_cores=8,
        project=resource.Project.CLOUD_ML_BENCHMARKING,
        tpu_zone=resource.Zone.US_CENTRAL2_B,
        runtime_version=resource.RuntimeVersion.TPU_UBUNTU2204_BASE,
        network=resource.BM_NETWORKS,
        subnetwork=resource.V4_BM_SUBNETWORKS,
        test_version=test_version,
        model_name=model,
        time_out_in_min=1800,
        reserved=False,
        preemptible=True,
        extraFlags=extra_flags,
    )

    # Running on V5P
    config.get_torchbench_tpu_config(
        tpu_version=resource.TpuVersion.V5P,
        tpu_cores=8,
        project=resource.Project.TPU_PROD_ENV_AUTOMATED,
        tpu_zone=resource.Zone.US_EAST5_A,
        runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV5,
        network=resource.V5_NETWORKS,
        subnetwork=resource.V5P_SUBNETWORKS,
        time_out_in_min=1800,
        # Pass the release version explicitly, like every other target, so
        # V5P does not fall back to whatever default the config function uses.
        test_version=test_version,
        model_name=model,
        reserved=False,
        preemptible=False,
        extraFlags=extra_flags,
    )

    # Running on V5E
    config.get_torchbench_tpu_config(
        tpu_version=resource.TpuVersion.V5E,
        tpu_cores=4,
        project=resource.Project.CLOUD_ML_BENCHMARKING,
        tpu_zone=resource.Zone.US_WEST1_C,
        runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV5_LITE,
        network=resource.BM_NETWORKS,
        subnetwork=resource.V5E_BM_SUBNETWORKS,
        time_out_in_min=1600,
        test_version=test_version,
        model_name=model,
        reserved=False,
        preemptible=False,
        extraFlags=extra_flags,
    )

    # GPU targets differ only in machine type, accelerator type and
    # accelerator count; everything else (image, region, project, cluster,
    # timeout, flags) is shared. Order matches the original file:
    # V100, A100, H100, L4.
    gpu_targets = (
        (resource.MachineVersion.N1_STANDARD_16, resource.GpuVersion.V100, 2),
        (resource.MachineVersion.A2_HIGHGPU_1G, resource.GpuVersion.A100, 1),
        (resource.MachineVersion.A3_HIGHGPU_8G, resource.GpuVersion.H100, 8),
        (resource.MachineVersion.G2_STAND_16, resource.GpuVersion.L4, 1),
    )
    for gpu_machine_type, gpu_accelerator_type, gpu_count in gpu_targets:
        config.get_torchbench_gpu_gke_config(
            machine_type=gpu_machine_type,
            image_family=resource.ImageFamily.COMMON_CU124_DEBIAN_11,
            accelerator_type=gpu_accelerator_type,
            count=gpu_count,
            gpu_zone=resource.Region.US_CENTRAL1,
            test_version=test_version,
            project_name=resource.Project.CLOUD_ML_BENCHMARKING,
            cluster_name="benchmarking-gpu-uc1",
            model_name=model,
            time_out_in_min=1600,
            extraFlags=extra_flags,
        ).run()