# modules/compute/gke-node-pool/main.tf

/**
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

locals {
  # This label allows for billing report tracking based on module.
  labels = merge(var.labels, { ghpc_module = "gke-node-pool", ghpc_role = "compute" })
}

locals {
  upgrade_settings = {
    strategy        = var.upgrade_settings.strategy
    max_surge       = coalesce(var.upgrade_settings.max_surge, 0)
    max_unavailable = coalesce(var.upgrade_settings.max_unavailable, 1)
  }
}

module "gpu" {
  source = "../../internal/gpu-definition"

  machine_type      = var.machine_type
  guest_accelerator = var.guest_accelerator
}

locals {
  guest_accelerator = module.gpu.guest_accelerator

  has_gpu                  = length(local.guest_accelerator) > 0
  allocatable_gpu_per_node = local.has_gpu ? max(local.guest_accelerator[*].count...) : -1

  is_static_node_pool_with_gpus = var.static_node_count != null && local.allocatable_gpu_per_node != -1
  static_gpu_count              = local.is_static_node_pool_with_gpus ? var.static_node_count * local.allocatable_gpu_per_node : 0

  gpu_taint = local.has_gpu ? [{
    key    = "nvidia.com/gpu"
    value  = "present"
    effect = "NO_SCHEDULE"
  }] : []

  autoscale_set    = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000
  static_node_set  = var.static_node_count != null
  initial_node_set = try(var.initial_node_count > 0, false)

  module_unique_id = replace(lower(var.internal_ghpc_module_id), "/[^a-z0-9\\-]/", "")
}

locals {
  cluster_id_parts = split("/", var.cluster_id)
  cluster_name     = local.cluster_id_parts[5]
  cluster_location = local.cluster_id_parts[3]
}

data "google_container_cluster" "gke_cluster" {
  name     = local.cluster_name
  location = local.cluster_location
}
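
# Note: the cluster_id parsing above assumes the self-link-style cluster ID,
# e.g. (illustrative value only):
#   projects/my-project/locations/us-central1/clusters/my-cluster
# Splitting on "/" puts the location at index 3 and the cluster name at index 5,
# which is what local.cluster_location and local.cluster_name rely on.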
resource "google_container_node_pool" "node_pool" {
  provider = google-beta
  count    = max(var.num_node_pools, var.num_slices)

  name = (max(var.num_node_pools, var.num_slices) == 1
    ? coalesce(var.name, join("-", [var.machine_type, local.module_unique_id]))
    : join("-", [coalesce(var.name, join("-", [var.machine_type, local.module_unique_id])), count.index])
  )

  cluster        = var.cluster_id
  node_locations = var.zones

  node_count = var.static_node_count
  dynamic "autoscaling" {
    for_each = local.static_node_set ? [] : [1]
    content {
      total_min_node_count = var.autoscaling_total_min_nodes
      total_max_node_count = var.autoscaling_total_max_nodes
      location_policy      = "ANY"
    }
  }

  initial_node_count = var.initial_node_count

  max_pods_per_node = var.max_pods_per_node

  management {
    auto_repair  = var.auto_repair
    auto_upgrade = var.auto_upgrade
  }

  upgrade_settings {
    strategy        = local.upgrade_settings.strategy
    max_surge       = local.upgrade_settings.max_surge
    max_unavailable = local.upgrade_settings.max_unavailable
  }

  dynamic "placement_policy" {
    for_each = var.placement_policy.type != null ? [1] : []
    content {
      type         = var.placement_policy.type
      policy_name  = var.placement_policy.name
      tpu_topology = var.placement_policy.tpu_topology
    }
  }

  dynamic "queued_provisioning" {
    for_each = var.enable_queued_provisioning ? [1] : []
    content {
      enabled = true
    }
  }

  node_config {
    disk_size_gb    = var.disk_size_gb
    disk_type       = var.disk_type
    resource_labels = local.labels
    labels          = var.kubernetes_labels
    service_account = var.service_account_email
    oauth_scopes    = var.service_account_scopes
    machine_type    = var.machine_type
    spot            = var.spot
    image_type      = var.image_type

    dynamic "guest_accelerator" {
      for_each = local.guest_accelerator
      iterator = ga
      content {
        type               = coalesce(ga.value.type, try(local.generated_guest_accelerator[0].type, ""))
        count              = coalesce(try(ga.value.count, 0) > 0 ? ga.value.count : try(local.generated_guest_accelerator[0].count, "0"))
        gpu_partition_size = try(ga.value.gpu_partition_size, null)

        dynamic "gpu_driver_installation_config" {
          # If the user did not specify guest_accelerator settings, try() is needed to
          # fall back to a DEFAULT gpu_driver_version.
          for_each = try([ga.value.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }])
          iterator = gdic
          content {
            gpu_driver_version = gdic.value.gpu_driver_version
          }
        }

        dynamic "gpu_sharing_config" {
          for_each = try(ga.value.gpu_sharing_config == null, true) ? [] : [ga.value.gpu_sharing_config]
          iterator = gsc
          content {
            gpu_sharing_strategy       = gsc.value.gpu_sharing_strategy
            max_shared_clients_per_gpu = gsc.value.max_shared_clients_per_gpu
          }
        }
      }
    }

    dynamic "taint" {
      for_each = concat(var.taints, local.gpu_taint)
      content {
        key    = taint.value.key
        value  = taint.value.value
        effect = taint.value.effect
      }
    }

    dynamic "ephemeral_storage_local_ssd_config" {
      for_each = local.local_ssd_config.local_ssd_count_ephemeral_storage != null ? [1] : []
      content {
        local_ssd_count = local.local_ssd_config.local_ssd_count_ephemeral_storage
      }
    }

    dynamic "local_nvme_ssd_block_config" {
      for_each = local.local_ssd_config.local_ssd_count_nvme_block != null ? [1] : []
      content {
        local_ssd_count = local.local_ssd_config.local_ssd_count_nvme_block
      }
    }

    shielded_instance_config {
      enable_secure_boot          = var.enable_secure_boot
      enable_integrity_monitoring = true
    }

    dynamic "gcfs_config" {
      for_each = var.enable_gcfs ? [1] : []
      content {
        enabled = true
      }
    }

    gvnic {
      enabled = var.image_type == "COS_CONTAINERD"
    }

    dynamic "advanced_machine_features" {
      for_each = local.set_threads_per_core ? [1] : []
      content {
        threads_per_core = local.threads_per_core # relies on threads_per_core_calc.tf
      }
    }

    # Implied by Workload Identity.
    workload_metadata_config {
      mode = "GKE_METADATA"
    }
    # Implied by Workload Identity.
    metadata = {
      "disable-legacy-endpoints" = "true"
    }

    linux_node_config {
      sysctls = {
        "net.ipv4.tcp_rmem" = "4096 87380 16777216"
        "net.ipv4.tcp_wmem" = "4096 16384 16777216"
      }
    }

    reservation_affinity {
      consume_reservation_type = var.reservation_affinity.consume_reservation_type
      key                      = length(local.verified_specific_reservations) != 1 ? null : local.reservation_resource_api_label
      values = length(local.verified_specific_reservations) != 1 ? null : [
        for i, r in local.verified_specific_reservations :
        (length(local.input_reservation_suffixes[i]) > 0 ? format("%s%s", r.name, local.input_reservation_suffixes[i]) : "projects/${r.project}/reservations/${r.name}")
      ]
    }

    dynamic "host_maintenance_policy" {
      for_each = var.host_maintenance_interval != "" ? [1] : []
      content {
        maintenance_interval = var.host_maintenance_interval
      }
    }
  }
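
  # Note: when GPUs are attached, local.gpu_taint (combined with var.taints in
  # the taint blocks above) taints every node with
  # nvidia.com/gpu=present:NO_SCHEDULE, so GPU pods must carry a matching
  # toleration. A minimal sketch of such a toleration in a pod spec
  # (illustrative YAML, not part of this module):
  #   tolerations:
  #   - key: "nvidia.com/gpu"
  #     operator: "Equal"
  #     value: "present"
  #     effect: "NoSchedule"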
  network_config {
    dynamic "additional_node_network_configs" {
      for_each = var.additional_networks
      content {
        network    = additional_node_network_configs.value.network
        subnetwork = additional_node_network_configs.value.subnetwork
      }
    }
    enable_private_nodes = var.enable_private_nodes
  }

  timeouts {
    create = var.timeout_create
    update = var.timeout_update
  }

  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      initial_node_count,
      # Ignore local/ephemeral SSD configs as they are tied to machine types.
      node_config[0].ephemeral_storage_local_ssd_config,
      node_config[0].local_nvme_ssd_block_config,
    ]
    precondition {
      condition     = (var.max_pods_per_node == null) || (data.google_container_cluster.gke_cluster.networking_mode == "VPC_NATIVE")
      error_message = "max_pods_per_node does not work on `routes-based` clusters, which do not have IP aliasing enabled."
    }
    precondition {
      condition     = !local.static_node_set || !local.autoscale_set
      error_message = "static_node_count cannot be set with either autoscaling_total_min_nodes or autoscaling_total_max_nodes."
    }
    precondition {
      condition     = !local.static_node_set || !local.initial_node_set
      error_message = "initial_node_count cannot be set with static_node_count."
    }
    precondition {
      condition     = !local.initial_node_set || (coalesce(var.initial_node_count, 0) >= var.autoscaling_total_min_nodes && coalesce(var.initial_node_count, 0) <= var.autoscaling_total_max_nodes)
      error_message = "initial_node_count must be between autoscaling_total_min_nodes and autoscaling_total_max_nodes, inclusive."
    }
    precondition {
      condition     = !(coalesce(local.local_ssd_config.local_ssd_count_ephemeral_storage, 0) > 0 && coalesce(local.local_ssd_config.local_ssd_count_nvme_block, 0) > 0)
      error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value."
    }
    precondition {
      condition = (
        (var.reservation_affinity.consume_reservation_type != "SPECIFIC_RESERVATION" && local.input_specific_reservations_count == 0) ||
        (var.reservation_affinity.consume_reservation_type == "SPECIFIC_RESERVATION" && local.input_specific_reservations_count == 1)
      )
      error_message = <<-EOT
      When using NO_RESERVATION or ANY_RESERVATION as the `consume_reservation_type`, `specific_reservations` cannot be set.
      On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservations`.
      EOT
    }
    precondition {
      condition = (
        (local.input_specific_reservations_count == 0) ||
        (local.input_specific_reservations_count == 1 &&
          length(local.verified_specific_reservations) > 0 &&
        length(local.specific_reservation_requirement_violations) == 0)
      )
      error_message = <<-EOT
      Check if your reservation is configured correctly:
      - A reservation with the name must exist in the specified project and one of the specified zones
      - Its consumption type must be "specific"
      %{for property in local.specific_reservation_requirement_violations}
      - ${local.specific_reservation_requirement_violation_messages[property]}
      %{endfor}
      EOT
    }
    precondition {
      condition = (
        (local.input_specific_reservations_count == 0) ||
        (local.input_specific_reservations_count == 1 && length(local.input_reservation_suffixes) == 0) ||
        (local.input_specific_reservations_count == 1 && length(local.input_reservation_suffixes) > 0 && try(local.input_reservation_projects[0], var.project_id) == var.project_id)
      )
      error_message = "Shared extended reservations are not supported by GKE."
    }
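
    # Note: the preconditions below constrain var.upgrade_settings. A value
    # that satisfies all of them looks like the following (illustrative
    # numbers; max_surge and max_unavailable default to 0 and 1 via the
    # locals at the top of this file):
    #   upgrade_settings = {
    #     strategy        = "SURGE"
    #     max_surge       = 1
    #     max_unavailable = 0
    #   }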
    precondition {
      condition     = contains(["SURGE"], local.upgrade_settings.strategy)
      error_message = "Only the SURGE upgrade strategy is supported."
    }
    precondition {
      condition     = local.upgrade_settings.max_unavailable >= 0
      error_message = "max_unavailable must be set to 0 or greater."
    }
    precondition {
      condition     = local.upgrade_settings.max_surge >= 0
      error_message = "max_surge must be set to 0 or greater."
    }
    precondition {
      condition     = local.upgrade_settings.max_unavailable > 0 || local.upgrade_settings.max_surge > 0
      error_message = "At least one of max_unavailable or max_surge must be greater than 0."
    }
    precondition {
      condition     = var.placement_policy.type != "COMPACT" || (var.zones != null ? (length(var.zones) == 1) : false)
      error_message = "Compact placement is only available for node pools operating in a single zone."
    }
    precondition {
      condition     = var.placement_policy.type != "COMPACT" || local.upgrade_settings.strategy != "BLUE_GREEN"
      error_message = "Compact placement is not supported with blue-green upgrades."
    }
    precondition {
      condition     = !(var.enable_queued_provisioning == true && var.placement_policy.type == "COMPACT")
      error_message = "placement_policy cannot be COMPACT when enable_queued_provisioning is true."
    }
    precondition {
      condition     = !(var.enable_queued_provisioning == true && var.reservation_affinity.consume_reservation_type != "NO_RESERVATION")
      error_message = "reservation_affinity must be NO_RESERVATION when enable_queued_provisioning is true."
    }
    precondition {
      condition     = !(var.enable_queued_provisioning == true && var.autoscaling_total_min_nodes != 0)
      error_message = "autoscaling_total_min_nodes must be 0 when enable_queued_provisioning is true."
    }
    precondition {
      condition     = !(var.num_node_pools > 1 && var.num_slices > 1)
      error_message = "num_node_pools is for CPUs and GPUs, and num_slices is for TPUs; they cannot both be set at the same time when creating a group of identical node pools / slices."
    }
    precondition {
      condition     = !(var.num_node_pools == 0 && var.num_slices == 0)
      error_message = "Either num_node_pools (for CPUs and GPUs) or num_slices (for TPUs) must be set to a positive integer value."
    }
    precondition {
      condition     = !(var.num_node_pools < 0 || var.num_slices < 0)
      error_message = "num_node_pools and num_slices cannot be negative. Use a positive integer to set num_node_pools for CPUs and GPUs, or num_slices for TPUs."
    }
  }
}

locals {
  supported_machine_types_for_install_dependencies = ["a3-highgpu-8g", "a3-megagpu-8g"]
}

resource "null_resource" "install_dependencies" {
  count = var.run_workload_script && contains(local.supported_machine_types_for_install_dependencies, var.machine_type) ? 1 : 0
  provisioner "local-exec" {
    command = "pip3 install pyyaml"
  }
}

locals {
  gpu_direct_setting = lookup(local.gpu_direct_settings, var.machine_type, { gpu_direct_manifests = [], updated_workload_path = "", rxdm_version = "" })
}
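
# Note: the two null_resources below shell out to helper scripts shipped with
# this module. A roughly equivalent manual invocation, mirroring the local-exec
# commands below (hypothetical workload file and version shown):
#   python3 gpu-direct-workload/scripts/enable-tcpx-in-workload.py \
#     --file my-workload.yaml --rxdm <rxdm-version>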
# Execute the script that injects the RxDM sidecar into the workload to enable TCPX for a3-highgpu-8g VM workloads.
resource "null_resource" "enable_tcpx_in_workload" {
  count = var.run_workload_script && var.machine_type == "a3-highgpu-8g" ? 1 : 0
  triggers = {
    always_run = timestamp()
  }
  provisioner "local-exec" {
    command = "python3 ${path.module}/gpu-direct-workload/scripts/enable-tcpx-in-workload.py --file ${local.workload_path_tcpx} --rxdm ${local.gpu_direct_setting.rxdm_version}"
  }

  depends_on = [null_resource.install_dependencies]
}

# Execute the script that injects the RxDM sidecar into the workload to enable TCPXO for a3-megagpu-8g VM workloads.
resource "null_resource" "enable_tcpxo_in_workload" {
  count = var.run_workload_script && var.machine_type == "a3-megagpu-8g" ? 1 : 0
  triggers = {
    always_run = timestamp()
  }
  provisioner "local-exec" {
    command = "python3 ${path.module}/gpu-direct-workload/scripts/enable-tcpxo-in-workload.py --file ${local.workload_path_tcpxo} --rxdm ${local.gpu_direct_setting.rxdm_version}"
  }

  depends_on = [null_resource.install_dependencies]
}

# Apply the manifests that enable TCPX/TCPXO.
module "kubectl_apply" {
  source = "../../management/kubectl-apply"

  cluster_id = var.cluster_id
  project_id = var.project_id

  apply_manifests = flatten([
    for manifest in local.gpu_direct_setting.gpu_direct_manifests : [
      {
        source = manifest
      }
    ]
  ])
}
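
# Note: a minimal sketch of instantiating this module from another Terraform
# configuration, assuming a companion gke-cluster module that exports
# cluster_id; the source path and values are illustrative, and variables not
# shown keep their defaults:
#   module "gpu_node_pool" {
#     source       = "./modules/compute/gke-node-pool"
#     cluster_id   = module.gke_cluster.cluster_id
#     project_id   = var.project_id
#     machine_type = "a3-megagpu-8g"
#     zones        = ["us-central1-a"]
#   }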