dags/common/vm_resource.py (285 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""The file for common projects, zone, and runtime versions."""

import datetime
import enum

from xlml.apis.xpk_cluster_config import XpkClusterConfig


# Network / subnetwork resource paths under tpu-prod-env-automated.
V5_NETWORKS_PREFIX = "projects/tpu-prod-env-automated"
V5_NETWORKS = f"{V5_NETWORKS_PREFIX}/global/networks/mas-test"
V5E_SUBNETWORKS = f"{V5_NETWORKS_PREFIX}/regions/us-east1/subnetworks/mas-test"
V5P_SUBNETWORKS = f"{V5_NETWORKS_PREFIX}/regions/us-east5/subnetworks/mas-test"
V6E_SUBNETWORKS = (
    f"{V5_NETWORKS_PREFIX}/regions/us-central2/subnetworks/mas-test"
)
# TODO: Figure V6E_GCE_NETWORK and V6E_GCE_SUBNETWORK
V6E_GCE_NETWORK = "default"
V6E_GCE_SUBNETWORK = "default"

# Network / subnetwork resource paths under cloud-ml-benchmarking.
BM_NETWORKS_PREFIX_BENCHMARKING = "projects/cloud-ml-benchmarking"
BM_NETWORKS = f"{BM_NETWORKS_PREFIX_BENCHMARKING}/global/networks/mas-test"
A100_BM_SUBNETWORKS = "regions/us-west4/subnetworks/mas-test"
# NOTE(review): the three paths below are prefixed with BM_NETWORKS (which
# already ends in ".../global/networks/mas-test"), not with the bare project
# prefix -- confirm with callers that this is the intended shape.
V4_BM_SUBNETWORKS = f"{BM_NETWORKS}/regions/us-central2/subnetworks/mas-test"
V5E_BM_SUBNETWORKS = f"{BM_NETWORKS}/regions/us-west1/subnetworks/mas-test"
V5P_BM_SUBNETWORKS = f"{BM_NETWORKS}/regions/us-east5/subnetworks/mas-test"

# Network / subnetwork resource paths under cloud-tpu-inference-test.
# The *_INFERENCE_SUBNETWORKS values are relative ("regions/...") paths.
INFERENCE_NETWORK_PREFIX = "projects/cloud-tpu-inference-test"
INFERENCE_NETWORKS = f"{INFERENCE_NETWORK_PREFIX}/global/networks/mas-test"
# NOTE(review): region segment is us-central1 while the subnetwork name
# mentions us-east5 -- confirm this pairing is intentional.
US_EAST5_INFERENCE_SUBNETWORKS = (
    "regions/us-central1/subnetworks/mas-test-us-east5"
)
H100_INFERENCE_SUBNETWORKS = (
    "regions/us-central1/subnetworks/mas-test-us-central1"
)
A100_INFERENCE_SUBNETWORKS = (
    "regions/us-central1/subnetworks/mas-test-us-central1"
)
L4_INFERENCE_SUBNETWORKS = (
    "regions/us-central1/subnetworks/mas-test-us-central1"
)
# No trailing whitespace: a stray space inside the literal would become part
# of the resource path handed to downstream consumers.
H200_INFERENCE_SUBNETWORKS = (
    "regions/europe-west1/subnetworks/mas-test-europe-west1"
)

# Date stamps used as docker image tags. The clock is read exactly once, at
# import time, so every dated tag in DockerImage refers to the same day even
# if the import happens to straddle midnight.
_TODAY = datetime.datetime.today()
_DATE_TAG = _TODAY.strftime("%Y-%m-%d")
_DATE_TAG_COMPACT = _TODAY.strftime("%Y%m%d")


class Project(enum.Enum):
  """Common GCP projects."""

  CLOUD_ML_AUTO_SOLUTIONS = "cloud-ml-auto-solutions"
  CLOUD_ML_BENCHMARKING = "cloud-ml-benchmarking"
  TPU_PROD_ENV_MULTIPOD = "tpu-prod-env-multipod"
  TPU_PROD_ENV_AUTOMATED = "tpu-prod-env-automated"
  CLOUD_TPU_MULTIPOD_DEV = "cloud-tpu-multipod-dev"
  SUPERCOMPUTER_TESTING = "supercomputer-testing"
  CLOUD_TPU_INFERENCE_TEST = "cloud-tpu-inference-test"
  TPU_PROD_ENV_LARGE_ADHOC = "tpu-prod-env-large-adhoc"
  TPU_PROD_ENV_ONE_VM = "tpu-prod-env-one-vm"
  TPU_PROD_ENV_LARGE_CONT = "tpu-prod-env-large-cont"


class ImageProject(enum.Enum):
  """Common image projects for GPU."""

  DEEP_LEARNING_PLATFORM_RELEASE = "deeplearning-platform-release"
  ML_IMAGES = "ml-images"


class ImageFamily(enum.Enum):
  """Common image families for GPU."""

  COMMON_CU124_DEBIAN_11 = "common-cu124-debian-11"


class Region(enum.Enum):
  """Common GCP regions."""

  # used for GKE
  US_CENTRAL1 = "us-central1"


class Zone(enum.Enum):
  """Common GCP zones."""

  # reserved/on-demand v2-32 in cloud-ml-auto-solutions
  US_CENTRAL1_A = "us-central1-a"
  # on-demand v3-8 in cloud-ml-auto-solutions
  US_CENTRAL1_B = "us-central1-b"
  # reserved v4-8 & v4-32 in cloud-ml-auto-solutions
  US_CENTRAL2_B = "us-central2-b"
  # reserved/on-demand v2-8 in cloud-ml-auto-solutions
  # & reserved h100 in supercomputer-testing
  US_CENTRAL1_C = "us-central1-c"
  # committed resource for A100
  US_CENTRAL1_F = "us-central1-f"
  # reserved v5e in tpu-prod-env-automated
  US_EAST1_C = "us-east1-c"
  # reserved v3-8 & reserved/on-demand v3-32 in cloud-ml-auto-solutions
  US_EAST1_D = "us-east1-d"
  # reserved h100-mega in supercomputer-testing
  US_EAST4_A = "us-east4-a"
  # reserved v5p in tpu-prod-env-automated
  US_EAST5_A = "us-east5-a"
  # reserved v6e in tpu-prod-env-one-vm
  US_EAST5_B = "us-east5-b"
  # reserved v6e in tpu-prod-env-automated
  US_EAST5_C = "us-east5-c"
  # reserved v5e in tpu-prod-env-multipod
  US_WEST4_B = "us-west4-b"
  # reserved v5e in cloud-tpu-inference-test
  US_WEST1_C = "us-west1-c"
  # reserved a3+ cluster in supercomputer-testing
  AUSTRALIA_SOUTHEAST1_C = "australia-southeast1-c"
  # reserved H200 capacity in cloud-tpu-inference-test
  EUROPE_WEST1_B = "europe-west1-b"
  # reserved TRILLIUM capacity
  EUROPE_WEST4_A = "europe-west4-a"
  SOUTHAMERICA_WEST1_A = "southamerica-west1-a"
  # reserved v5e capacity in tpu-prod-env-multipod
  EUROPE_WEST4_B = "europe-west4-b"
  # reserved l4 in cloud-tpu-inference-test
  ASIA_EAST1_A = "asia-east1-a"
  ASIA_EAST1_C = "asia-east1-c"


class MachineVersion(enum.Enum):
  """Common machine types."""

  N1_STANDARD_8 = "n1-standard-8"
  N1_STANDARD_16 = "n1-standard-16"  # 60GB memory
  N1_STANDARD_32 = "n1-standard-32"
  A2_HIGHGPU_1G = "a2-highgpu-1g"
  A2_HIGHGPU_4G = "a2-highgpu-4g"
  A2_ULTRAGPU_1G = "a2-ultragpu-1g"
  A2_ULTRAGPU_2G = "a2-ultragpu-2g"
  A2_ULTRAGPU_4G = "a2-ultragpu-4g"
  A2_ULTRAGPU_8G = "a2-ultragpu-8g"
  A3_HIGHGPU_8G = "a3-highgpu-8g"
  A3_MEGAGPU_8G = "a3-megagpu-8g"
  A3_ULTRAGPU_8G = "a3-ultragpu-8g"
  G2_STAND_4 = "g2-standard-4"
  G2_STAND_16 = "g2-standard-16"  # 64GB memory
  G2_STAND_32 = "g2-standard-32"  # 128GB memory
  G2_STAND_48 = "g2-standard-48"  # 4 GPUs, 192GB memory
  G2_STAND_96 = "g2-standard-96"  # 8 GPUs, 384GB memory


class AcceleratorType(enum.Enum):
  """Common accelerator categories (CPU, GPU, TPU)."""

  CPU = "CPU"
  GPU = "GPU"
  TPU = "TPU"


class TpuVersion(enum.Enum):
  """Common TPU versions."""

  V2 = "2"
  V3 = "3"
  V4 = "4"
  V5E = "5litepod"
  V5P = "5p"
  TRILLIUM = "6e"


class GpuVersion(enum.Enum):
  """Common GPU versions."""

  L4 = "nvidia-l4"
  A100 = "nvidia-tesla-a100"
  A100_80G = "nvidia-a100-80gb"
  H100 = "nvidia-h100-80gb"
  # NOTE(review): confirm "nvidia-h200-80gb" is the accelerator type string
  # the target API expects for H200 -- value left unchanged here.
  H200 = "nvidia-h200-80gb"
  XPK_H100 = "h100-80gb-8"
  XPK_H100_MEGA = "h100-mega-80gb-8"
  V100 = "nvidia-tesla-v100"


class CpuVersion(enum.Enum):
  """Common CPU versions."""

  M1_MEGAMEM = "m1-megamem-96"
  N2_STANDARD = "n2-standard-64"


class RuntimeVersion(enum.Enum):
  """Common runtime versions."""

  TPU_VM_TF_NIGHTLY = "tpu-vm-tf-nightly"
  TPU_VM_TF_NIGHTLY_POD = "tpu-vm-tf-nightly-pod"
  TPU_VM_TF_STABLE_SE = "tpu-vm-tf-2.16.0-se"
  TPU_VM_TF_STABLE_POD_SE = "tpu-vm-tf-2.16.0-pod-se"
  TPU_VM_TF_STABLE_PJRT = "tpu-vm-tf-2.16.0-pjrt"
  TPU_VM_TF_STABLE_POD_PJRT = "tpu-vm-tf-2.16.0-pod-pjrt"
  TPU_VM_TF_V5P_ALPHA = "tpu-vm-tf-v5p-alpha-sc"
  TPU_UBUNTU2204_BASE = "tpu-ubuntu2204-base"
  TPU_VM_V4_BASE = "tpu-vm-v4-base"
  V2_ALPHA_TPUV5_LITE = "v2-alpha-tpuv5-lite"
  V2_ALPHA_TPUV5 = "v2-alpha-tpuv5"
  V2_ALPHA_TPUV6 = "v2-alpha-tpuv6e"


class XpkClusters:
  """Common XPK cluster configs.

  Each attribute is an XpkClusterConfig built from a cluster name, a device
  version enum member, a core count, and the string values of a Project and a
  Zone.
  """

  TPU_V4_8_MAS_CLUSTER = XpkClusterConfig(
      name="mas-v4-8",
      device_version=TpuVersion.V4,
      core_count=8,
      project=Project.CLOUD_ML_AUTO_SOLUTIONS.value,
      zone=Zone.US_CENTRAL2_B.value,
  )
  TPU_V4_8_MAXTEXT_CLUSTER = XpkClusterConfig(
      name="v4-8-maxtext",
      device_version=TpuVersion.V4,
      core_count=8,
      project=Project.TPU_PROD_ENV_MULTIPOD.value,
      zone=Zone.US_CENTRAL2_B.value,
  )
  TPU_V4_16_CLUSTER = XpkClusterConfig(
      name="v4-16-maxtext",
      device_version=TpuVersion.V4,
      core_count=16,
      project=Project.TPU_PROD_ENV_MULTIPOD.value,
      zone=Zone.US_CENTRAL2_B.value,
  )
  TPU_V4_128_CLUSTER = XpkClusterConfig(
      name="v4-128-bodaborg-us-central2-b",
      device_version=TpuVersion.V4,
      core_count=128,
      project=Project.CLOUD_TPU_MULTIPOD_DEV.value,
      zone=Zone.US_CENTRAL2_B.value,
  )
  TPU_V5P_8_CLUSTER = XpkClusterConfig(
      name="v5p-8-bodaborg-europe-west4-b",
      device_version=TpuVersion.V5P,
      core_count=8,
      project=Project.CLOUD_TPU_MULTIPOD_DEV.value,
      zone=Zone.EUROPE_WEST4_B.value,
  )
  TPU_V5E_256_CLUSTER = XpkClusterConfig(
      name="v5e-256-bodaborg-europe-west4",
      device_version=TpuVersion.V5E,
      core_count=256,
      project=Project.TPU_PROD_ENV_MULTIPOD.value,
      zone=Zone.EUROPE_WEST4_B.value,
  )
  TPU_V6E_256_CLUSTER = XpkClusterConfig(
      name="bodaborg-v6e-256",
      device_version=TpuVersion.TRILLIUM,
      core_count=256,
      project=Project.TPU_PROD_ENV_LARGE_ADHOC.value,
      zone=Zone.US_CENTRAL2_B.value,
  )
  TPU_V6E_256_MLPERF_CLUSTER = XpkClusterConfig(
      name="bodaborg-v6e-256-lcscld-c",
      device_version=TpuVersion.TRILLIUM,
      core_count=256,
      project=Project.TPU_PROD_ENV_ONE_VM.value,
      zone=Zone.SOUTHAMERICA_WEST1_A.value,
  )
  TPU_V6E_16_IN_MEM_CLUSTER = XpkClusterConfig(
      name="in-mem-airflow-v6e-16",
      device_version=TpuVersion.TRILLIUM,
      core_count=16,
      project=Project.TPU_PROD_ENV_ONE_VM.value,
      zone=Zone.US_EAST5_C.value,
  )
  GPU_A3_CLUSTER = XpkClusterConfig(
      name="ninacai-maxtext-a3",
      device_version=GpuVersion.XPK_H100,
      core_count=8,
      project=Project.SUPERCOMPUTER_TESTING.value,
      zone=Zone.US_EAST5_A.value,
  )
  GPU_A3PLUS_CLUSTER = XpkClusterConfig(
      name="a3plus-benchmark",
      device_version=GpuVersion.XPK_H100_MEGA,
      core_count=8,
      project=Project.SUPERCOMPUTER_TESTING.value,
      zone=Zone.AUSTRALIA_SOUTHEAST1_C.value,
  )
  CPU_M1_MEGAMEM_96_CLUSTER = XpkClusterConfig(
      name="m1-megamem-96-shared",
      device_version=CpuVersion.M1_MEGAMEM,
      core_count=96,
      project=Project.TPU_PROD_ENV_MULTIPOD.value,
      zone=Zone.US_CENTRAL1_B.value,
  )
  CPU_N2_STANDARD_64_CLUSTER = XpkClusterConfig(
      name="shared-n2-standard-64",
      device_version=CpuVersion.N2_STANDARD,
      core_count=64,
      project=Project.TPU_PROD_ENV_MULTIPOD.value,
      zone=Zone.US_CENTRAL1_B.value,
  )


class DockerImage(enum.Enum):
  """Common docker images.

  Members whose value ends in a date use the module-level date stamps, which
  are computed once when this module is imported; the tags are therefore
  fixed for the lifetime of the importing process.
  """

  XPK_JAX_TEST = "gcr.io/cloud-ml-auto-solutions/xpk_jax_test:latest"
  # The only image that uses the compact YYYYMMDD stamp.
  PYTORCH_NIGHTLY = (
      "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/"
      f"xla:nightly_3.10_tpuvm_{_DATE_TAG_COMPACT}"
  )
  AXLEARN_TPU_JAX_STABLE_STACK = (
      "us-docker.pkg.dev/tpu-prod-env-multipod/bite/tpu/axlearn:"
      f"{_DATE_TAG}"
  )
  AXLEARN_GPU_JAX_NIGHTLY = (
      "us-docker.pkg.dev/tpu-prod-env-multipod/bite/gpu/jax_nightly:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_TPU_JAX_STABLE_STACK = (
      "gcr.io/tpu-prod-env-multipod/maxtext_jax_stable_stack:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_TPU_STABLE_STACK_NIGHTLY_JAX = (
      "gcr.io/tpu-prod-env-multipod/maxtext_stable_stack_nightly_jax:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_TPU_JAX_STABLE_STACK_CANDIDATE = (
      "gcr.io/tpu-prod-env-multipod/maxtext_stable_stack_candidate:"
      f"{_DATE_TAG}"
  )
  MAXDIFFUSION_TPU_JAX_STABLE_STACK = (
      "gcr.io/tpu-prod-env-multipod/maxdiffusion_jax_stable_stack:"
      f"{_DATE_TAG}"
  )
  MAXDIFFUSION_TPU_STABLE_STACK_NIGHTLY_JAX = (
      "gcr.io/tpu-prod-env-multipod/maxdiffusion_jax_nightly:"
      f"{_DATE_TAG}"
  )
  MAXDIFFUSION_TPU_JAX_STABLE_STACK_CANDIDATE = (
      "gcr.io/tpu-prod-env-multipod/maxdiffusion_stable_stack_candidate:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_TPU_JAX_NIGHTLY = (
      "gcr.io/tpu-prod-env-multipod/maxtext_jax_nightly:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_GPU_JAX_PINNED = (
      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_pinned:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_GPU_JAX_STABLE_STACK = (
      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable_stack:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_GPU_JAX_STABLE = (
      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable:"
      f"{_DATE_TAG}"
  )
  MAXTEXT_GPU_STABLE_STACK_NIGHTLY_JAX = (
      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_stable_stack_nightly_jax:"
      f"{_DATE_TAG}"
  )
  CLOUD_HYBRIDSIM_NIGHTLY = (
      "us-docker.pkg.dev/cloud-tpu-v2-images-dev/hybridsim/cloud_hybridsim_gcloud_python:"
      f"{_DATE_TAG}"
  )
  MICROBENCH_NIGHTLY = (
      "gcr.io/tpu-prod-env-one-vm/microbenchmarks_runner:latest"
  )