dags/framework3p/microbenchmarks_dag.py (83 lines of code) (raw):
import datetime
from airflow import models
from dags import composer_env
from dags.common import test_owner, vm_resource
from dags.framework3p.configs.microbenchmarks_config import get_microbenchmark_config, get_microbenchmark_xpk_config
# Cron schedule handed to the DAG below: minute 0, hour 2, every day.
# Only the production Composer environment gets the schedule; elsewhere the
# value is None, so the DAG has no schedule of its own.
if composer_env.is_prod_env():
    SCHEDULED_TIME = "0 2 * * *"
else:
    SCHEDULED_TIME = None
with models.DAG(
    dag_id="framework_microbenchmark",
    schedule=SCHEDULED_TIME,
    tags=["framework_team", "microbenchmark", "xlml"],
    start_date=datetime.datetime(2024, 9, 11),
    catchup=False,
) as dag:
    # NOTE(review): the XPK-based configs at the bottom call `.run()` on the
    # returned object, while the `get_microbenchmark_config` results are used
    # exactly as returned. Presumably that helper already produces the runnable
    # task group -- confirm against microbenchmarks_config.

    # --- TPU v4 -------------------------------------------------------------
    # Both v4 tests share every setting except the core count.
    _v4_common = {
        "tpu_version": vm_resource.TpuVersion.V4,
        "tpu_zone": vm_resource.Zone.US_CENTRAL2_B,
        "time_out_in_min": 120,
        "runtime_version": vm_resource.RuntimeVersion.TPU_UBUNTU2204_BASE,
        "project": vm_resource.Project.CLOUD_ML_AUTO_SOLUTIONS,
    }
    microbenchmarks_v4_8 = get_microbenchmark_config(
        tpu_cores=8,
        **_v4_common,
    )
    microbenchmarks_v4_16 = get_microbenchmark_config(
        tpu_cores=16,
        **_v4_common,
    )

    # --- TPU v5p ------------------------------------------------------------
    # Both v5p tests share every setting except the core count.
    _v5p_common = {
        "tpu_version": vm_resource.TpuVersion.V5P,
        "tpu_zone": vm_resource.Zone.US_EAST5_A,
        "time_out_in_min": 60,
        "runtime_version": vm_resource.RuntimeVersion.V2_ALPHA_TPUV5,
        "project": vm_resource.Project.TPU_PROD_ENV_AUTOMATED,
        "network": vm_resource.V5_NETWORKS,
        "subnetwork": vm_resource.V5P_SUBNETWORKS,
    }
    microbenchmarks_v5p_8 = get_microbenchmark_config(
        tpu_cores=8,
        **_v5p_common,
    )
    microbenchmarks_v5p_256 = get_microbenchmark_config(
        tpu_cores=256,
        **_v5p_common,
    )

    # --- TPU v5e (queued-resource based) -------------------------------------
    # The v5e tests differ in core count, timeout and runtime version, so only
    # the placement/network settings are shared.
    _v5e_common = {
        "tpu_version": vm_resource.TpuVersion.V5E,
        "tpu_zone": vm_resource.Zone.US_EAST1_C,
        "project": vm_resource.Project.TPU_PROD_ENV_AUTOMATED,
        "network": vm_resource.V5_NETWORKS,
        "subnetwork": vm_resource.V5E_SUBNETWORKS,
    }
    microbenchmarks_v5e_4 = get_microbenchmark_config(
        tpu_cores=4,
        time_out_in_min=120,
        runtime_version=vm_resource.RuntimeVersion.V2_ALPHA_TPUV5_LITE,
        **_v5e_common,
    )
    # NOTE(review): unlike v5e-4, this test uses TPU_VM_TF_NIGHTLY_POD and a
    # 60-minute timeout -- confirm both differences are intentional.
    microbenchmarks_v5e_16 = get_microbenchmark_config(
        tpu_cores=16,
        time_out_in_min=60,
        runtime_version=vm_resource.RuntimeVersion.TPU_VM_TF_NIGHTLY_POD,
        **_v5e_common,
    )

    # --- 256-chip tests on XPK clusters --------------------------------------
    # Same timeout, docker image and owner; only the test name and the target
    # cluster change.
    _xpk_common = {
        "time_out_in_min": 60,
        "docker_image": vm_resource.DockerImage.MICROBENCH_NIGHTLY.value,
        "test_owner": test_owner.QINY_Y,
    }
    microbenchmarks_v5e_256 = get_microbenchmark_xpk_config(
        test_name="framework-microbenchmark-v5e-256",
        cluster=vm_resource.XpkClusters.TPU_V5E_256_CLUSTER,
        **_xpk_common,
    ).run()
    microbenchmarks_v6e_256 = get_microbenchmark_xpk_config(
        test_name="framework-microbenchmark-v6e-256",
        cluster=vm_resource.XpkClusters.TPU_V6E_256_MLPERF_CLUSTER,
        **_xpk_common,
    ).run()

    # Test dependencies: none are declared between the tests above, so they
    # are free to run in parallel.