infrastructure/variables.tf (247 lines of code) (raw):

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Required: no default is declared, so a value must always be supplied.
variable "project_id" {
  type        = string
  description = "GCP project id"
}

variable "region" {
  type        = string
  description = "GCP project region or zone"
  default     = "us-central1"
}

## network variables

# Required (no default).
variable "create_network" {
  type = bool
}

# Required (no default).
variable "network_name" {
  type = string
}

# Required (no default).
variable "subnetwork_name" {
  type = string
}

variable "subnetwork_cidr" {
  type    = string
  default = "10.128.0.0/20"
}

variable "subnetwork_region" {
  type    = string
  default = "us-central1"
}

# Declared as a string carrying "true" rather than as a bool.
# NOTE(review): presumably forwarded to an input that expects a string flag —
# confirm against the consumer before changing the type.
variable "subnetwork_private_access" {
  type    = string
  default = "true"
}

variable "subnetwork_description" {
  type    = string
  default = ""
}

# Map whose values are lists of { range_name, ip_cidr_range } objects; empty by
# default.
# NOTE(review): the map keys are presumably subnetwork names — confirm against
# the consumer.
variable "network_secondary_ranges" {
  type = map(list(object({
    range_name    = string
    ip_cidr_range = string
  })))
  default = {}
}

## GKE variables

variable "create_cluster" {
  type    = bool
  default = true
}

variable "private_cluster" {
  type    = bool
  default = true
}

# Required (no default).
variable "autopilot_cluster" {
  type = bool
}

variable "cluster_regional" {
  type    = bool
  default = true
}

# Required (no default).
variable "cluster_name" {
  type = string
}

# Typed map(string) so it agrees with all_node_pools_labels, which carries the
# same default. Primitive values (numbers, bools) supplied by existing callers
# are still accepted and converted to strings.
variable "cluster_labels" {
  type        = map(string)
  description = "GKE cluster labels"
  default = {
    "created-by" = "ai-on-gke"
  }
}

variable "kubernetes_version" {
  type    = string
  default = "1.30"
}

variable "release_channel" {
  type    = string
  default = "REGULAR"
}

# Required (no default).
variable "cluster_location" {
  type = string
}

variable "ip_range_pods" {
  type    = string
  default = ""
}

variable "ip_range_services" {
  type    = string
  default = ""
}
variable "monitoring_enable_managed_prometheus" {
  type    = bool
  default = true
}

variable "gcs_fuse_csi_driver" {
  type    = bool
  default = true
}

variable "deletion_protection" {
  type    = bool
  default = false
}

variable "ray_addon_enabled" {
  description = "Set to true to enable ray addon"
  type        = bool
  default     = true
}

# List of { cidr_block, display_name } objects; empty by default.
# display_name is optional with no fallback, so it is null when omitted.
variable "master_authorized_networks" {
  type = list(object({
    cidr_block   = string
    display_name = optional(string)
  }))
  default = []
}

variable "master_ipv4_cidr_block" {
  type    = string
  default = ""
}

## settings named all_node_pools_*

variable "all_node_pools_oauth_scopes" {
  type = list(string)
  default = [
    "https://www.googleapis.com/auth/logging.write",
    "https://www.googleapis.com/auth/monitoring",
    "https://www.googleapis.com/auth/devstorage.read_only",
    "https://www.googleapis.com/auth/trace.append",
    "https://www.googleapis.com/auth/service.management.readonly",
    "https://www.googleapis.com/auth/servicecontrol",
  ]
}

variable "all_node_pools_labels" {
  type = map(string)
  default = {
    "created-by" = "ai-on-gke"
  }
}

variable "all_node_pools_metadata" {
  type = map(string)
  default = {
    disable-legacy-endpoints = "true"
  }
}

variable "all_node_pools_tags" {
  type = list(string)
  default = [
    "gke-node",
    "ai-on-gke",
  ]
}

## accelerator toggles

variable "enable_tpu" {
  description = "Set to true to create TPU node pool"
  type        = bool
  default     = false
}

variable "enable_gpu" {
  description = "Set to true to create GPU node pool"
  type        = bool
  default     = true
}

## node pool definitions

# Each entry must set name and machine_type; every other attribute falls back
# to the optional() default shown in the type constraint.
variable "cpu_pools" {
  type = list(object({
    name                   = string
    machine_type           = string
    node_locations         = optional(string, "")
    autoscaling            = optional(bool, false)
    min_count              = optional(number, 1)
    max_count              = optional(number, 3)
    local_ssd_count        = optional(number, 0)
    spot                   = optional(bool, false)
    disk_size_gb           = optional(number, 100)
    disk_type              = optional(string, "pd-standard")
    image_type             = optional(string, "COS_CONTAINERD")
    enable_gcfs            = optional(bool, false)
    enable_gvnic           = optional(bool, false)
    logging_variant        = optional(string, "DEFAULT")
    auto_repair            = optional(bool, true)
    auto_upgrade           = optional(bool, true)
    create_service_account = optional(bool, true)
    preemptible            = optional(bool, false)
    initial_node_count     = optional(number, 1)
    accelerator_count      = optional(number, 0)
    queued_provisioning    = optional(bool, false)
  }))

  # The default pool restates min_count, max_count, disk_size_gb and disk_type
  # at their optional() defaults; autoscaling is the only value that differs.
  default = [
    {
      name         = "cpu-pool"
      machine_type = "n1-standard-16"
      autoscaling  = true
      min_count    = 1
      max_count    = 3
      disk_size_gb = 100
      disk_type    = "pd-standard"
    },
  ]
}

# Same schema as cpu_pools plus accelerator_type and gpu_driver_version.
variable "gpu_pools" {
  type = list(object({
    name                   = string
    machine_type           = string
    node_locations         = optional(string, "")
    autoscaling            = optional(bool, false)
    min_count              = optional(number, 1)
    max_count              = optional(number, 3)
    local_ssd_count        = optional(number, 0)
    spot                   = optional(bool, false)
    disk_size_gb           = optional(number, 100)
    disk_type              = optional(string, "pd-standard")
    image_type             = optional(string, "COS_CONTAINERD")
    enable_gcfs            = optional(bool, false)
    enable_gvnic           = optional(bool, false)
    logging_variant        = optional(string, "DEFAULT")
    auto_repair            = optional(bool, true)
    auto_upgrade           = optional(bool, true)
    create_service_account = optional(bool, true)
    preemptible            = optional(bool, false)
    initial_node_count     = optional(number, 1)
    accelerator_count      = optional(number, 0)
    accelerator_type       = optional(string, "nvidia-tesla-t4")
    gpu_driver_version     = optional(string, "DEFAULT")
    queued_provisioning    = optional(bool, false)
  }))

  # The default pool overrides autoscaling (true) and accelerator_count (2);
  # the remaining values restate the optional() defaults.
  default = [
    {
      name               = "gpu-pool"
      machine_type       = "n1-standard-16"
      autoscaling        = true
      min_count          = 1
      max_count          = 3
      disk_size_gb       = 100
      disk_type          = "pd-standard"
      accelerator_count  = 2
      accelerator_type   = "nvidia-tesla-t4"
      gpu_driver_version = "DEFAULT"
    },
  ]
}

# Unlike cpu_pools and gpu_pools, node_locations is a required attribute here,
# and there is no gpu_driver_version attribute. No pools are defined by default.
# NOTE(review): accelerator_type defaults to "nvidia-tesla-t4", the same value
# as in gpu_pools — this looks like a carry-over from the GPU schema; confirm
# whether any consumer reads it for TPU pools before changing it.
variable "tpu_pools" {
  type = list(object({
    name                   = string
    machine_type           = string
    node_locations         = string
    autoscaling            = optional(bool, false)
    min_count              = optional(number, 1)
    max_count              = optional(number, 3)
    local_ssd_count        = optional(number, 0)
    spot                   = optional(bool, false)
    disk_size_gb           = optional(number, 100)
    disk_type              = optional(string, "pd-standard")
    image_type             = optional(string, "COS_CONTAINERD")
    enable_gcfs            = optional(bool, false)
    enable_gvnic           = optional(bool, false)
    logging_variant        = optional(string, "DEFAULT")
    auto_repair            = optional(bool, true)
    auto_upgrade           = optional(bool, true)
    create_service_account = optional(bool, true)
    preemptible            = optional(bool, false)
    initial_node_count     = optional(number, 1)
    accelerator_count      = optional(number, 0)
    accelerator_type       = optional(string, "nvidia-tesla-t4")
    queued_provisioning    = optional(bool, false)
  }))
  default = []
}