ai-infrastructure/tpu-training-on-gke/environment/1-base-infrastructure/variables.tf (258 lines of code) (raw):

# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. variable "project_id" { description = "The GCP project ID" type = string nullable = false } variable "region" { description = "The region for a GKE cluster and a GCS bucket" type = string nullable = false } variable "deletion_protection" { description = "Prevent Terraform from destroying data storage resources (storage buckets, GKE clusters). When this field is set, a terraform destroy or terraform apply that would delete data storage resources will fail." type = bool default = true nullable = false } variable "prefix" { description = "Prefix used for resource names." type = string default = "" nullable = false } variable "node_pool_sa" { description = "The config for a node pool service account" type = object({ name = string description = string roles = list(string) }) default = { name = "gke-node-pool-sa" description = "GKE node pool service account" roles = [ "storage.objectAdmin", "logging.logWriter", "aiplatform.user", "artifactregistry.reader", ] } } variable "create_artifact_registry" { description = "Whether to create an Artifact Registry" type = bool default = true } variable "registry_config" { description = "The configs for Artifact registry" type = object({ name = string location = string }) default = { name = "ml-images" location = "us" } } variable "gcs_configs" { description = "GCS storage configs" type = map(map(any)) default = { "artifact-repository" = {} } } variable "cluster_config" { description = "GKE cluster configs" type = map(any) default = { name = "gke-ml-cluster" workloads_namespace = "training" } } variable "cpu_node_pools" { description = "Configurations for a CPU node pool" type = map(object({ zones = list(string) min_node_count = optional(number, 3) max_node_count = optional(number, 5) machine_type = optional(string, "n1-standard-16") disk_size_gb = optional(number, 200) labels = optional(map(string), {}) })) nullable = false } variable "tpu_node_pools" { description = "Configurations for a TPU node pools" type = map(object({ zones = list(string) tpu_type = string disk_size_gb = optional(number, 200) autoscaling = optional(bool, false) spot = optional(bool, false) })) validation { condition = alltrue([ for tpu_type in [for name, node_pool in var.tpu_node_pools : node_pool.tpu_type] : contains( [ "v5litepod-4", "v5litepod-16", "v5litepod-32", "v5litepod-64", "v5litepod-128", "v5litepod-256", "v4-8", "v4-16", "v4-32", "v4-64", "v4-128", "v4-256", "v4-512", "v4-1024", "v4-1536", "v4-2048", "v4-4096", "v5p-8", "v5p-16", "v5p-32", "v5p-64", "v5p-128", "v5p-256", "v5p-384", "v5p-512", "v5p-640", "v5p-768", "v5p-896", "v5p-1024", "v5p-1152", "v5p-1280", "v5p-1408", "v5p-1536", "v5p-1664", "v5p-1792", "v5p-1920", "v5p-2048", "v5p-2176", "v5p-2304", "v5p-2432", "v5p-2560", "v5p-2688", "v5p-2816", "v5p-2944", "v5p-3072", "v5p-3200", "v5p-3328", "v5p-3456", "v5p-3584", "v5p-3712", "v5p-3840", "v5p-3968", "v5p-4096", "v5p-4224", "v5p-4352", "v5p-4480", "v5p-4608", "v5p-4736", "v5p-4864", "v5p-4992", "v5p-5120", "v5p-5248", "v5p-5376", "v5p-5504", "v5p-5632", "v5p-5760", "v5p-5888", "v5p-6016", "v5p-6144", "v5p-6272", "v5p-6400", "v5p-6528", "v5p-6656", "v5p-6784", "v5p-6912", "v5p-7040", "v5p-7168", "v5p-7296", "v5p-7424", "v5p-7552", "v5p-7680", "v5p-7808", "v5p-7936", "v5p-8064", "v5p-8192", "v5p-8320", "v5p-8448", "v5p-8704", "v5p-8832", "v5p-8960", "v5p-9216", "v5p-9472", "v5p-9600", "v5p-9728", "v5p-9856", "v5p-9984", "v5p-10240", "v5p-10368", "v5p-10496", "v5p-10752", "v5p-10880", "v5p-11008", "v5p-11136", "v5p-11264", "v5p-11520", "v5p-11648", "v5p-11776", "v5p-11904", "v5p-12032", "v5p-12160", "v5p-12288", "v5p-13824", "v5p-17920", ], tpu_type )]) error_message = "Unsupported TPU type" } nullable = false } variable "vpc_config" { description = "VPC configuration" type = object({ network_name = string subnet_name = string pods_ip_cidr_range = string services_ip_cidr_range = string }) default = { network_name = "gke-cluster-network" subnet_name = "gke-cluster-subnet" pods_ip_cidr_range = "192.168.64.0/18" services_ip_cidr_range = "192.168.128.0/20" } } variable "tensorboard_config" { description = "Tensorboard instance configuration" type = object({ name = optional(string, "TPU Training") description = optional(string, "TPU on GKE training monitoring") region = string }) nullable = true default = null } variable "automation" { description = "Automation configs" type = object({ outputs_bucket = string }) default = { outputs_bucket = null } nullable = false } variable "env_name" { description = "The name of the folder in the automation bucket where auto.tfvars, setting files, etc will be stored." type = string nullable = false default = "environment" }