# terraform/modules/gke-standard/main.tf
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
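# Project hosting the environment; its project number is needed for the GKE service agent binding below.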
data "google_project" "environment" {
project_id = var.project_id
}
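# Regions available to the project; the region's position in this list seeds the control-plane CIDR below.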
data "google_compute_regions" "available" {
project = data.google_project.environment.project_id
}
# Get available zones for the region
data "google_compute_zones" "available" {
project = data.google_project.environment.project_id
region = var.region
}
# Random zone selection
resource "random_shuffle" "zone" {
input = data.google_compute_zones.available.names
result_count = 3
}
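# Despite its name, this resolves the latest GKE version matching var.min_master_version for var.region.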
data "google_container_engine_versions" "central1b" {
provider = google-beta
location = var.region
version_prefix = var.min_master_version
}
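# Regional GKE Standard cluster for the risk-research environment.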
resource "google_container_cluster" "risk-research" {
deletion_protection = false
provider = google-beta
name = var.cluster_name
project = var.project_id
location = var.region
datapath_provider = var.datapath_provider
  node_locations = random_shuffle.zone.result
depends_on = [google_kms_crypto_key_iam_member.gke_crypto_key]
min_master_version = data.google_container_engine_versions.central1b.latest_master_version
  # A large initial node count prompts GKE to provision a larger control plane up front; the default node pool is removed right after creation.
initial_node_count = var.scaled_control_plane ? 700 : 1
remove_default_node_pool = true
control_plane_endpoints_config {
dns_endpoint_config {
allow_external_traffic = true
}
}
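  # Configuration for the short-lived default node pool, which is removed right after cluster creation.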
node_config {
service_account = var.cluster_service_account.email
shielded_instance_config {
enable_secure_boot = var.enable_secure_boot
enable_integrity_monitoring = var.enable_shielded_nodes
}
machine_type = "e2-standard-2"
preemptible = false
}
network = var.network
subnetwork = var.subnet
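  # Application-layer secrets encryption with the customer-managed KMS key defined below.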
database_encryption {
state = "ENCRYPTED"
key_name = google_kms_crypto_key.gke-key.id
}
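  # Private nodes; the control plane gets a dedicated /28 derived from the region and cluster index.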
private_cluster_config {
enable_private_nodes = true
enable_private_endpoint = var.enable_private_endpoint
    # Carve a unique /28 control-plane block per region and cluster index out of 100.64.0.0/16
    master_ipv4_cidr_block = cidrsubnet("100.64.0.0/16", 12, index(data.google_compute_regions.available.names, var.region) * 4 + var.cluster_index)
    # Allow access to the control-plane endpoint from any region, not just the cluster's own
master_global_access_config {
enabled = true
}
}
# Custom maintenance window
maintenance_policy {
recurring_window {
start_time = var.maintenance_start_time
end_time = var.maintenance_end_time
recurrence = var.maintenance_recurrence
}
}
enable_intranode_visibility = var.enable_intranode_visibility
enable_cilium_clusterwide_network_policy = var.enable_cilium_clusterwide_network_policy
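  # Metrics for system components, the control plane, workload state, kubelet, and cAdvisor; Managed Service for Prometheus is enabled.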
monitoring_config {
# Only enable advanced datapath observability when ADVANCED_DATAPATH is selected
dynamic "advanced_datapath_observability_config" {
for_each = var.datapath_provider == "ADVANCED_DATAPATH" ? [1] : []
content {
enable_metrics = var.enable_advanced_datapath_observability_metrics
enable_relay = var.enable_advanced_datapath_observability_relay
}
}
enable_components = [
"SYSTEM_COMPONENTS",
"STORAGE",
"POD",
"DEPLOYMENT",
"STATEFULSET",
"DAEMONSET",
"HPA",
"CADVISOR",
"KUBELET",
"APISERVER",
"SCHEDULER",
"CONTROLLER_MANAGER"
]
managed_prometheus {
enabled = true
}
}
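  # Ship system-component, control-plane, and workload logs to Cloud Logging.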
logging_config {
enable_components = [
"SYSTEM_COMPONENTS",
"APISERVER",
"CONTROLLER_MANAGER",
"SCHEDULER",
"WORKLOADS"
]
}
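  # VPC-native (alias IP) networking using the supplied secondary ranges for Pods and Services.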
ip_allocation_policy {
stack_type = "IPV4"
services_secondary_range_name = var.ip_range_services
cluster_secondary_range_name = var.ip_range_pods
}
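  # Bind the cluster to the project's Workload Identity pool when enabled.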
workload_identity_config {
workload_pool = var.enable_workload_identity ? "${var.project_id}.svc.id.goog" : null
}
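  # Defaults for every node pool: high-throughput logging agent and image streaming (GCFS).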
node_pool_defaults {
node_config_defaults {
logging_variant = "MAX_THROUGHPUT"
gcfs_config {
enabled = true
}
}
}
# Support for mTLS
mesh_certificates {
enable_certificates = var.enable_mesh_certificates
}
dns_config {
cluster_dns = "CLOUD_DNS"
cluster_dns_scope = "CLUSTER_SCOPE"
}
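  # Storage CSI drivers (Filestore, Cloud Storage FUSE, Persistent Disk, Parallelstore) and NodeLocal DNSCache.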
addons_config {
gcp_filestore_csi_driver_config {
enabled = var.enable_csi_filestore
}
gcs_fuse_csi_driver_config {
enabled = var.enable_csi_gcs_fuse
}
gce_persistent_disk_csi_driver_config {
enabled = true
}
dns_cache_config {
enabled = true
}
parallelstore_csi_driver_config {
enabled = var.enable_csi_parallelstore
}
}
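  # Node auto-provisioning tuned for utilization, with ceilings on total CPU, memory, and GPU accelerators.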
cluster_autoscaling {
enabled = true
autoscaling_profile = "OPTIMIZE_UTILIZATION"
resource_limits {
resource_type = "cpu"
minimum = 4
maximum = var.cluster_max_cpus
}
resource_limits {
resource_type = "memory"
minimum = 16
maximum = var.cluster_max_memory
}
resource_limits {
resource_type = "nvidia-a100-80gb"
maximum = 30
}
resource_limits {
resource_type = "nvidia-l4"
maximum = 30
}
resource_limits {
resource_type = "nvidia-tesla-t4"
maximum = 300
}
resource_limits {
resource_type = "nvidia-tesla-a100"
maximum = 50
}
resource_limits {
resource_type = "nvidia-tesla-k80"
maximum = 30
}
resource_limits {
resource_type = "nvidia-tesla-p4"
maximum = 30
}
resource_limits {
resource_type = "nvidia-tesla-p100"
maximum = 30
}
resource_limits {
resource_type = "nvidia-tesla-v100"
maximum = 30
}
auto_provisioning_defaults {
management {
auto_repair = true
auto_upgrade = true
}
shielded_instance_config {
enable_integrity_monitoring = true
enable_secure_boot = true
}
upgrade_settings {
strategy = "SURGE"
max_surge = 1
max_unavailable = 0
}
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform"
]
service_account = var.cluster_service_account.email
}
}
release_channel {
channel = var.release_channel
}
secret_manager_config {
enabled = true
}
pod_autoscaling {
hpa_profile = "PERFORMANCE"
}
lifecycle {
    # The default node pool is removed after creation, so node_config will drift; ignore it (and the maintenance window) to avoid spurious diffs.
ignore_changes = [
node_config,
maintenance_policy
]
}
}
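# Optional autoscaled on-demand node pool, created when var.create_ondemand_nodepool is true.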
resource "google_container_node_pool" "primary_ondemand_nodes" {
count = var.create_ondemand_nodepool ? 1 : 0
name = "ondemand-node-1"
provider = google-beta
project = var.project_id
location = var.region
cluster = google_container_cluster.risk-research.name
  node_locations = random_shuffle.zone.result
autoscaling {
location_policy = "ANY"
total_min_node_count = var.min_nodes_ondemand
total_max_node_count = var.max_nodes_ondemand
}
management {
auto_repair = true
auto_upgrade = true
}
upgrade_settings {
max_surge = 1
max_unavailable = 0
strategy = "SURGE"
}
node_config {
logging_variant = "MAX_THROUGHPUT"
shielded_instance_config {
enable_integrity_monitoring = var.enable_shielded_nodes
enable_secure_boot = var.enable_secure_boot
}
preemptible = false
machine_type = var.node_machine_type_ondemand
    labels = {
      "resource-model" = "n2"
      "resource-type"  = "cpu"
      "billing-type"   = "on-demand"
    }
gvnic {
enabled = true
}
# Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles.
service_account = var.cluster_service_account.email
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform"
]
}
lifecycle {
ignore_changes = [
node_config,
]
}
}
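# Optional autoscaled Spot node pool, tainted so only workloads tolerating spot-capacity are scheduled here.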
resource "google_container_node_pool" "primary_spot_nodes" {
count = var.create_spot_nodepool ? 1 : 0
name = "spot-nodes-1"
provider = google-beta
project = var.project_id
location = var.region
cluster = google_container_cluster.risk-research.name
  node_locations = random_shuffle.zone.result
initial_node_count = 5
autoscaling {
location_policy = "ANY"
total_min_node_count = var.min_nodes_spot
total_max_node_count = var.max_nodes_spot
}
management {
auto_repair = true
auto_upgrade = true
}
upgrade_settings {
max_surge = 1
max_unavailable = 0
strategy = "SURGE"
}
node_config {
logging_variant = "MAX_THROUGHPUT"
shielded_instance_config {
enable_integrity_monitoring = var.enable_shielded_nodes
enable_secure_boot = var.enable_secure_boot
}
    # Spot VMs (the successor to legacy preemptible VMs), matching this pool's spot labels and taint
    spot = true
machine_type = var.node_machine_type_spot
    labels = {
      "resource-model"                 = "n2"
      "resource-type"                  = "cpu"
      "billing-type"                   = "spot"
      "cloud.google.com/compute-class" = "spot-capacity"
    }
taint {
key = "cloud.google.com/compute-class"
value = "spot-capacity"
effect = "NO_SCHEDULE"
}
gvnic {
enabled = true
}
# Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles.
service_account = var.cluster_service_account.email
oauth_scopes = [
"https://www.googleapis.com/auth/cloud-platform"
]
}
lifecycle {
ignore_changes = [
node_config,
initial_node_count
]
}
}
# KMS for Encryption
resource "random_string" "random" {
length = 5
special = true
override_special = "_-"
}
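# Key rings cannot be deleted, so the random suffix avoids name collisions when a cluster is recreated.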
resource "google_kms_key_ring" "gke-keyring" {
name = "${var.cluster_name}-${random_string.random.id}"
project = data.google_project.environment.project_id
location = var.region
}
resource "google_kms_crypto_key" "gke-key" {
name = "${var.cluster_name}-key"
key_ring = google_kms_key_ring.gke-keyring.id
rotation_period = "7776000s"
purpose = "ENCRYPT_DECRYPT"
}
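# Allow the GKE service agent to use the key for database (etcd) encryption.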
resource "google_kms_crypto_key_iam_member" "gke_crypto_key" {
crypto_key_id = google_kms_crypto_key.gke-key.id
role = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
member = "serviceAccount:service-${data.google_project.environment.number}@container-engine-robot.iam.gserviceaccount.com"
}