platforms/gke-aiml/playground/container_cluster.tf (227 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
  # Cloud DNS zone name derived from the cluster name and the secondary range hash below.
  cluster_cloud_dns_zone_name = "gke-${local.cluster_name}-${local.cluster_secondary_range_hash}-dns"

  # The cluster is named after the shared unique identifier prefix.
  cluster_name = local.unique_identifier_prefix

  # Minimal roles for nodepool SA
  # https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster#use_least_privilege_sa
  cluster_sa_roles = [
    "roles/monitoring.viewer",
    "roles/monitoring.metricWriter",
    "roles/logging.logWriter",
    "roles/stackdriver.resourceMetadata.writer",
    "roles/autoscaling.metricsWriter",
    "roles/artifactregistry.reader",
    "roles/serviceusage.serviceUsageConsumer"
  ]

  # Last dash-separated token of the cluster's secondary (pod) range name,
  # as reported by the created cluster's ip_allocation_policy.
  cluster_secondary_range_hash = reverse(split("-", google_container_cluster.mlp.ip_allocation_policy[0].cluster_secondary_range_name))[0]
}

# Create dedicated service account for node pools
resource "google_service_account" "cluster" {
  project      = data.google_project.environment.project_id
  account_id   = "vm-${local.cluster_name}"
  display_name = "${local.cluster_name} Service Account"
  description  = "Terraform-managed service account for cluster ${local.cluster_name}"
}

# Bind minimum role list + additional roles to nodepool SA on project
resource "google_project_iam_member" "cluster_sa" {
  # One binding per role; toset() de-duplicates and keys each instance by role name.
  for_each = toset(local.cluster_sa_roles)

  project = data.google_project.environment.project_id
  member  = google_service_account.cluster.member
  role    = each.value
}

# GKE cluster for the playground environment.
resource "google_container_cluster" "mlp" {
  provider = google-beta

  # Identity and placement
  name     = local.cluster_name
  project  = data.google_project.environment.project_id
  location = var.region

  # Networking
  network           = google_compute_network.default.id
  subnetwork        = google_compute_subnetwork.default.id
  datapath_provider = "ADVANCED_DATAPATH"

  # Lifecycle and node hardening
  deletion_protection      = false
  enable_shielded_nodes    = true
  remove_default_node_pool = false

  # CSI drivers enabled on the cluster: Filestore, Cloud Storage FUSE and Persistent Disk.
  addons_config {
    gcp_filestore_csi_driver_config {
      enabled = true
    }

    gcs_fuse_csi_driver_config {
      enabled = true
    }

    gce_persistent_disk_csi_driver_config {
      enabled = true
    }
  }

  # Node auto-provisioning: defaults applied to auto-provisioned node pools plus
  # cluster-wide resource limits.
  cluster_autoscaling {
    autoscaling_profile = "OPTIMIZE_UTILIZATION"
    enabled             = true

    auto_provisioning_defaults {
      disk_type       = "pd-balanced"
      oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]
      service_account = google_service_account.cluster.email

      management {
        auto_repair  = true
        auto_upgrade = true
      }

      shielded_instance_config {
        enable_integrity_monitoring = true
        enable_secure_boot          = true
      }

      upgrade_settings {
        max_surge       = 0
        max_unavailable = 1
        strategy        = "SURGE"
      }
    }

    # CPU and memory limits (both minimum and maximum are set).
    resource_limits {
      resource_type = "cpu"
      minimum       = 4
      maximum       = 1024
    }

    resource_limits {
      resource_type = "memory"
      minimum       = 16
      maximum       = 4096
    }

    # Accelerator limits (maximum only), one block per GPU resource type.
    resource_limits {
      resource_type = "nvidia-a100-80gb"
      maximum       = 32
    }

    resource_limits {
      resource_type = "nvidia-l4"
      maximum       = 32
    }

    resource_limits {
      resource_type = "nvidia-tesla-t4"
      maximum       = 256
    }

    resource_limits {
      resource_type = "nvidia-tesla-a100"
      maximum       = 64
    }

    resource_limits {
      resource_type = "nvidia-tesla-k80"
      maximum       = 32
    }

    resource_limits {
      resource_type = "nvidia-tesla-p4"
      maximum       = 32
    }

    resource_limits {
      resource_type = "nvidia-tesla-p100"
      maximum       = 32
    }

    resource_limits {
      resource_type = "nvidia-tesla-v100"
      maximum       = 32
    }
  }

  # DNS-based control plane endpoint, with external traffic allowed.
  control_plane_endpoints_config {
    dns_endpoint_config {
      allow_external_traffic = true
    }
  }

  cost_management_config {
    enabled = true
  }

  dns_config {
    cluster_dns       = "CLOUD_DNS"
    cluster_dns_scope = "CLUSTER_SCOPE"
  }

  gateway_api_config {
    channel = "CHANNEL_STANDARD"
  }

  # Intentionally empty: no secondary ranges are specified here. The resulting
  # cluster_secondary_range_name is read back in the locals block at the top of this file.
  ip_allocation_policy {
  }

  logging_config {
    enable_components = [
      "APISERVER",
      "CONTROLLER_MANAGER",
      "SCHEDULER",
      "SYSTEM_COMPONENTS",
      "WORKLOADS"
    ]
  }

  # Only the VPC subnet CIDR is listed as an authorized network.
  master_authorized_networks_config {
    cidr_blocks {
      cidr_block   = var.subnet_ip_cidr_range
      display_name = "vpc-cidr"
    }
  }

  monitoring_config {
    advanced_datapath_observability_config {
      enable_metrics = true
      enable_relay   = false
    }

    enable_components = [
      "APISERVER",
      "CADVISOR",
      "CONTROLLER_MANAGER",
      "DAEMONSET",
      "DCGM",
      "DEPLOYMENT",
      "HPA",
      "KUBELET",
      "POD",
      "SCHEDULER",
      "STATEFULSET",
      "STORAGE",
      "SYSTEM_COMPONENTS"
    ]

    managed_prometheus {
      enabled = true
    }
  }

  # Inline "system" node pool. Later changes to node_pool are ignored via the
  # lifecycle block at the end of this resource.
  node_pool {
    initial_node_count = 1
    name               = "system"

    autoscaling {
      location_policy      = "BALANCED"
      total_max_node_count = 32
      total_min_node_count = 1
    }

    network_config {
      enable_private_nodes = true
    }

    node_config {
      machine_type    = "e2-standard-4"
      service_account = google_service_account.cluster.email
      oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

      gcfs_config {
        enabled = true
      }

      shielded_instance_config {
        enable_integrity_monitoring = true
        enable_secure_boot          = true
      }
    }
  }

  node_pool_defaults {
    node_config_defaults {
      gcfs_config {
        enabled = true
      }
    }
  }

  # Private nodes and private endpoint, with a fixed /28 for the control plane.
  private_cluster_config {
    enable_private_nodes    = true
    enable_private_endpoint = true
    master_ipv4_cidr_block  = "172.16.0.32/28"
  }

  release_channel {
    channel = "RAPID"
  }

  secret_manager_config {
    enabled = true
  }

  security_posture_config {
    mode               = "BASIC"
    vulnerability_mode = "VULNERABILITY_ENTERPRISE"
  }

  workload_identity_config {
    workload_pool = "${data.google_project.environment.project_id}.svc.id.goog"
  }

  # Ignore drift on the inline node_pool definition after the cluster is created.
  lifecycle {
    ignore_changes = [
      node_pool
    ]
  }
}