ai-infrastructure/terraform-modules/gke-aiml/tpu_node_pools.tf (214 lines of code) (raw):

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
  # Lookup table keyed by TPU slice type. Each value is a positional tuple:
  #   [0] TPU topology string (passed to placement_policy.tpu_topology)
  #   [1] number of nodes (hosts) that make up one slice
  #   [2] accelerator type string (not referenced in this file)
  #   [3] machine type used for the node pool nodes
  #   [4] per-node chip count (not referenced in this file); for every row,
  #       product(topology) / [4] == [1]
  #   [5] single-host flag: true exactly for the rows whose node count is 1
  tpu_types = {
    v5litepod-1   = ["1x1", 1, "tpu-v5-lite-podslice", "ct5lp-hightpu-1t", 1, true]
    v5litepod-4   = ["2x2", 1, "tpu-v5-lite-podslice", "ct5lp-hightpu-4t", 4, true]
    v5litepod-8   = ["2x4", 1, "tpu-v5-lite-podslice", "ct5lp-hightpu-8t", 8, true]
    v5litepod-16  = ["4x4", 4, "tpu-v5-lite-podslice", "ct5lp-hightpu-4t", 4, false]
    v5litepod-32  = ["4x8", 8, "tpu-v5-lite-podslice", "ct5lp-hightpu-4t", 4, false]
    v5litepod-64  = ["8x8", 16, "tpu-v5-lite-podslice", "ct5lp-hightpu-4t", 4, false]
    v5litepod-128 = ["8x16", 32, "tpu-v5-lite-podslice", "ct5lp-hightpu-4t", 4, false]
    v5litepod-256 = ["16x16", 64, "tpu-v5-lite-podslice", "ct5lp-hightpu-4t", 4, false]
    v4-8          = ["2x2x1", 1, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, true]
    v4-16         = ["2x2x2", 2, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-32         = ["2x2x4", 4, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-64         = ["2x4x4", 8, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-128        = ["4x4x4", 16, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-256        = ["4x4x8", 32, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-512        = ["4x8x8", 64, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-1024       = ["8x8x8", 128, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-1536       = ["8x8x12", 192, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-2048       = ["8x8x16", 256, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v4-4096       = ["8x16x16", 512, "tpu-v4-podslice", "ct4p-hightpu-4t", 4, false]
    v5p-8         = ["2x2x1", 1, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, true]
    v5p-16        = ["2x2x2", 2, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-32        = ["2x2x4", 4, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-64        = ["2x4x4", 8, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-128       = ["4x4x4", 16, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-256       = ["4x4x8", 32, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-384       = ["4x4x12", 48, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-512       = ["4x8x8", 64, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-640       = ["4x4x20", 80, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-768       = ["4x8x12", 96, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-896       = ["4x4x28", 112, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1024      = ["8x8x8", 128, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1152      = ["4x12x12", 144, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1280      = ["4x8x20", 160, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1408      = ["4x4x44", 176, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1536      = ["8x8x12", 192, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1664      = ["4x4x52", 208, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1792      = ["4x8x28", 224, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-1920      = ["4x12x20", 240, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2048      = ["8x8x16", 256, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2176      = ["4x4x68", 272, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2304      = ["8x12x12", 288, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2432      = ["4x4x76", 304, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2560      = ["8x8x20", 320, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2688      = ["4x12x28", 336, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2816      = ["4x8x44", 352, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-2944      = ["4x4x92", 368, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    # Was "4x12x16" (768 chips => 192 hosts), which contradicts the 384-node
    # count of this row; 8x12x16 = 1536 chips / 4 per host = 384.
    v5p-3072  = ["8x12x16", 384, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-3200  = ["4x20x20", 400, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-3328  = ["4x8x52", 416, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-3456  = ["12x12x12", 432, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-3584  = ["8x8x28", 448, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-3712  = ["4x4x116", 464, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-3840  = ["8x12x20", 480, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-3968  = ["4x4x124", 496, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4096  = ["8x16x16", 512, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4224  = ["4x12x44", 528, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4352  = ["4x8x68", 544, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4480  = ["4x20x28", 560, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4608  = ["12x12x16", 576, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4736  = ["4x4x148", 592, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4864  = ["4x8x76", 608, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-4992  = ["4x12x52", 624, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-5120  = ["8x16x20", 640, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-5248  = ["4x4x164", 656, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-5376  = ["8x12x28", 672, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-5504  = ["4x4x172", 688, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-5632  = ["8x8x44", 704, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-5760  = ["12x12x20", 720, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-5888  = ["4x8x92", 736, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6016  = ["4x4x188", 752, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6144  = ["12x16x16", 768, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6272  = ["4x28x28", 784, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6400  = ["8x20x20", 800, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6528  = ["4x12x68", 816, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6656  = ["8x8x52", 832, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6784  = ["4x4x212", 848, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-6912  = ["12x12x24", 864, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7040  = ["4x20x44", 880, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7168  = ["8x16x28", 896, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7296  = ["4x12x76", 912, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7424  = ["4x8x116", 928, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7552  = ["4x4x236", 944, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7680  = ["12x16x20", 960, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7808  = ["4x4x244", 976, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-7936  = ["4x8x124", 992, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-8064  = ["12x12x28", 1008, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-8192  = ["16x16x16", 1024, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-8320  = ["4x20x52", 1040, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-8448  = ["8x12x44", 1056, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-8704  = ["8x8x68", 1088, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-8832  = ["4x12x92", 1104, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-8960  = ["8x20x28", 1120, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-9216  = ["12x16x24", 1152, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-9472  = ["4x8x148", 1184, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-9600  = ["12x20x20", 1200, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-9728  = ["8x8x76", 1216, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-9856  = ["4x28x44", 1232, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-9984  = ["8x12x52", 1248, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-10240 = ["16x16x20", 1280, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-10368 = ["12x12x36", 1296, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-10496 = ["4x8x164", 1312, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-10752 = ["12x16x28", 1344, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-10880 = ["4x20x68", 1360, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-11008 = ["4x8x172", 1376, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-11136 = ["4x12x116", 1392, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-11264 = ["8x16x44", 1408, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-11520 = ["12x20x24", 1440, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-11648 = ["4x28x52", 1456, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-11776 = ["8x8x92", 1472, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-11904 = ["4x12x124", 1488, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-12032 = ["4x8x188", 1504, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-12160 = ["4x20x76", 1520, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-12288 = ["16x16x24", 1536, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-13824 = ["12x24x24", 1728, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
    v5p-17920 = ["16x20x28", 2240, "tpu-v5p-slice", "ct5p-hightpu-4t", 4, false]
  }

  # Normalised per-pool settings consumed by the node pool resource below:
  # the caller's var.tpu_node_pools entry merged with the tpu_types row for
  # its tpu_type.
  tpu_node_pools = {
    for node_pool_name, node_pool in var.tpu_node_pools :
    node_pool_name => {
      machine_type = local.tpu_types[node_pool.tpu_type][3]
      disk_type    = node_pool.disk_type
      disk_size_gb = node_pool.disk_size_gb
      tpu_topology = local.tpu_types[node_pool.tpu_type][0]
      zones        = node_pool.zones

      # Column [5] is the *single-host* flag, so it has to be negated here.
      # Assigning it directly made `multihost` true for one-node slices and
      # attached the slice-sized autoscaling block to the wrong pools.
      multihost = !local.tpu_types[node_pool.tpu_type][5]

      min_node_count = node_pool.min_node_count
      max_node_count = node_pool.max_node_count
      node_count     = local.tpu_types[node_pool.tpu_type][1]
      gvnic          = node_pool.gvnic
      gke_version    = var.cluster_config.version

      # Autoscaled pools (min < max) start empty. Otherwise single-host pools
      # start at the caller's fixed size, and multi-host pools start at the
      # number of hosts the slice is made of.
      initial_node_count = (
        node_pool.min_node_count < node_pool.max_node_count ? 0 :
        local.tpu_types[node_pool.tpu_type][5] ? node_pool.max_node_count :
        local.tpu_types[node_pool.tpu_type][1]
      )
      autoscaling = node_pool.min_node_count < node_pool.max_node_count

      taints               = node_pool.taints
      labels               = node_pool.labels
      oauth_scopes         = node_pool.oauth_scopes
      reservation_affinity = node_pool.reservation_affinity
      spot                 = node_pool.spot
    }
  }
}

# One GKE node pool per entry of local.tpu_node_pools.
resource "google_container_node_pool" "tpu_node_pools" {
  for_each = local.tpu_node_pools

  provider           = google-beta
  project            = var.project_id
  cluster            = module.cluster.id
  name               = each.key
  node_locations     = each.value.zones
  initial_node_count = each.value.initial_node_count

  # Single-host pools: autoscale between the caller-supplied totals.
  dynamic "autoscaling" {
    for_each = each.value.autoscaling && !each.value.multihost ? [""] : []
    content {
      total_min_node_count = each.value.min_node_count
      total_max_node_count = each.value.max_node_count
      location_policy      = "ANY"
    }
  }

  # Multi-host pools: the upper bound is pinned to the number of hosts in
  # the slice rather than taken from the caller.
  dynamic "autoscaling" {
    for_each = each.value.autoscaling && each.value.multihost ? [""] : []
    content {
      max_node_count  = each.value.node_count
      location_policy = "ANY"
    }
  }

  node_config {
    machine_type    = each.value.machine_type
    service_account = local.node_pool_sa_email
    disk_type       = each.value.disk_type
    disk_size_gb    = each.value.disk_size_gb
    oauth_scopes    = each.value.oauth_scopes
    spot            = each.value.spot

    gvnic {
      enabled = each.value.gvnic
    }

    workload_metadata_config {
      mode = "GKE_METADATA"
    }

    dynamic "taint" {
      for_each = each.value.taints
      content {
        key = taint.key
        # The map key is already the taint key, so the taint value should
        # come from the entry's `value` attribute. The previous lookup
        # (`taint.value.key`) is kept as a fallback.
        # NOTE(review): the taints object schema is declared outside this
        # file - confirm it exposes `value` and then drop the try().
        value  = try(taint.value.value, taint.value.key)
        effect = taint.value.effect
      }
    }

    labels = each.value.labels

    dynamic "reservation_affinity" {
      for_each = each.value.reservation_affinity != null ? [""] : []
      content {
        consume_reservation_type = each.value.reservation_affinity.consume_reservation_type
        key                      = each.value.reservation_affinity.key
        values                   = each.value.reservation_affinity.values
      }
    }
  }

  # Compact placement with an explicit TPU topology is emitted for
  # multi-host slices only.
  dynamic "placement_policy" {
    for_each = each.value.multihost ? [1] : []
    content {
      type         = "COMPACT"
      tpu_topology = each.value.tpu_topology
    }
  }

  timeouts {
    create = "120m"
    update = "60m"
  }
}