infrastructure/main.tf (169 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################################################
#### PLATFORM
#######################################################
## GPU locations where L4 & T4 are supported.
locals {
gpu_l4_t4_location = {
asia-east1 = "asia-east1-a,asia-east1-c"
asia-northeast1 = "asia-northeast1-a,asia-northeast1-c"
asia-northeast3 = "asia-northeast3-b"
asia-south1 = "asia-south1-a,asia-south1-b"
asia-southeast1 = "asia-southeast1-a,asia-southeast1-b,asia-southeast1-c"
europe-west1 = "europe-west1-b,europe-west1-c"
europe-west2 = "europe-west2-a,europe-west2-b"
europe-west3 = "europe-west3-b"
europe-west4 = "europe-west4-a,europe-west4-b,europe-west4-c"
us-central1 = "us-central1-a,us-central1-b,us-central1-c"
us-east1 = "us-east1-c,us-east1-d"
us-east4 = "us-east4-a,us-east4-c"
us-west1 = "us-west1-a,us-west1-b"
us-west4 = "us-west4-a"
}
}
data "google_compute_network" "existing-network" {
count = var.create_network ? 0 : 1
name = var.network_name
project = var.project_id
}
data "google_compute_subnetwork" "subnetwork" {
count = var.create_network ? 0 : 1
name = var.subnetwork_name
region = var.subnetwork_region
project = var.project_id
}
module "custom-network" {
source = "../modules/gcp-network"
count = var.create_network ? 1 : 0
project_id = var.project_id
network_name = var.network_name
create_psa = true
subnets = [
{
subnet_name = var.subnetwork_name
subnet_ip = var.subnetwork_cidr
subnet_region = var.subnetwork_region
subnet_private_access = var.subnetwork_private_access
description = var.subnetwork_description
}
]
}
locals {
network_name = var.create_network ? module.custom-network[0].network_name : var.network_name
subnetwork_name = var.create_network ? module.custom-network[0].subnets_names[0] : var.subnetwork_name
subnetwork_cidr = var.create_network ? module.custom-network[0].subnets_ips[0] : data.google_compute_subnetwork.subnetwork[0].ip_cidr_range
region = length(split("-", var.cluster_location)) == 2 ? var.cluster_location : ""
regional = local.region != "" ? true : false
# zone needs to be set even for regional clusters, otherwise this module picks random zones that don't have GPU availability:
# https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/blob/af354afdf13b336014cefbfe8f848e52c17d4415/main.tf#L46
zone = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : split(",", local.gpu_l4_t4_location[local.region])
# Update gpu_pools with node_locations according to region and zone gpu availibility, if not provided
gpu_pools = [for elm in var.gpu_pools : (local.regional && contains(keys(local.gpu_l4_t4_location), local.region) && elm["node_locations"] == "") ? merge(elm, { "node_locations" : local.gpu_l4_t4_location[local.region] }) : elm]
}
## create public GKE standard
module "public-gke-standard-cluster" {
count = var.create_cluster && !var.private_cluster && !var.autopilot_cluster ? 1 : 0
source = "../modules/gke-standard-public-cluster"
project_id = var.project_id
## network values
network_name = local.network_name
subnetwork_name = local.subnetwork_name
## gke variables
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
gcs_fuse_csi_driver = var.gcs_fuse_csi_driver
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection
## pools config variables
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = local.gpu_pools
enable_tpu = var.enable_tpu
tpu_pools = var.tpu_pools
all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
all_node_pools_labels = var.all_node_pools_labels
all_node_pools_metadata = var.all_node_pools_metadata
all_node_pools_tags = var.all_node_pools_tags
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]
}
## create public GKE autopilot
module "public-gke-autopilot-cluster" {
count = var.create_cluster && !var.private_cluster && var.autopilot_cluster ? 1 : 0
source = "../modules/gke-autopilot-public-cluster"
project_id = var.project_id
## network values
network_name = local.network_name
subnetwork_name = local.subnetwork_name
## gke variables
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]
}
## create private GKE standard
module "private-gke-standard-cluster" {
count = var.create_cluster && var.private_cluster && !var.autopilot_cluster ? 1 : 0
source = "../modules/gke-standard-private-cluster"
project_id = var.project_id
## network values
network_name = local.network_name
subnetwork_name = local.subnetwork_name
## gke variables
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
gcs_fuse_csi_driver = var.gcs_fuse_csi_driver
deletion_protection = var.deletion_protection
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
ray_addon_enabled = var.ray_addon_enabled
## pools config variables
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = local.gpu_pools
enable_tpu = var.enable_tpu
tpu_pools = var.tpu_pools
all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
all_node_pools_labels = var.all_node_pools_labels
all_node_pools_metadata = var.all_node_pools_metadata
all_node_pools_tags = var.all_node_pools_tags
depends_on = [module.custom-network]
}
## create private GKE autopilot
module "private-gke-autopilot-cluster" {
count = var.create_cluster && var.private_cluster && var.autopilot_cluster ? 1 : 0
source = "../modules/gke-autopilot-private-cluster"
project_id = var.project_id
## network values
network_name = local.network_name
subnetwork_name = local.subnetwork_name
## gke variables
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]
}
## configure cloud NAT for private GKE
module "cloud-nat" {
source = "terraform-google-modules/cloud-nat/google"
version = "5.0.0"
count = var.create_network && var.private_cluster ? 1 : 0
region = local.region
project_id = var.project_id
create_router = true
router = "${var.network_name}-router"
name = "cloud-nat-${var.network_name}-router"
network = module.custom-network[0].network_name
}