infra/terraform/modules/a3/cluster/gke/variables.tf (174 lines of code) (raw):
/*
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
# Size, in GB, of the disk attached to each node. Defaults to 200 and may not be null.
variable "disk_size_gb" {
  description = <<-EOT
    Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB. Defaults to 200GB.
    Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--disk-size).
  EOT
  type        = number
  default     = 200
  nullable    = false

  # Enforce the documented 10GB minimum when the variable is evaluated,
  # rather than relying on a later failure.
  validation {
    condition     = var.disk_size_gb >= 10
    error_message = "The disk_size_gb value must be at least 10 (GB)."
  }
}
# Type of the disk attached to each node. Defaults to "pd-ssd" and may not be null.
variable "disk_type" {
  # The description previously claimed the default was 'pd-standard', which
  # contradicted the declared default below.
  description = <<-EOT
    Type of the disk attached to each node. The default disk type is 'pd-ssd'.
    Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]`
    Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--disk-type).
  EOT
  type        = string
  default     = "pd-ssd"
  nullable    = false
}
# Boolean switch for the GPU usage dashboards; on by default, null is rejected.
variable "enable_gke_dashboard" {
  type        = bool
  nullable    = false
  default     = true
  description = <<-EOT
    Flag to enable GPU usage dashboards for the GKE cluster.
  EOT
}
# Minimum GKE master version. Defaults to null.
variable "gke_version" {
  # The "Related docs" anchors previously pointed at the cluster *name*
  # argument/flag; they now point at the version argument/flag.
  description = <<-EOT
    The GKE version to be used as the minimum version of the master. Defaults to the latest master version.
    More details can be found [here](https://cloud.google.com/kubernetes-engine/versioning#specifying_cluster_version)
    Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#min_master_version), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--cluster-version).
  EOT
  type        = string
  default     = null
}
# Frequency of planned host maintenance events. Only "PERIODIC" (the default)
# or null passes validation.
variable "host_maintenance_interval" {
  description = "Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for host_maintenance_interval. This enables using stable fleet VM."
  type        = string
  default     = "PERIODIC"

  validation {
    # A null value skips the membership check; any non-null value must be
    # in the allow-list.
    condition = var.host_maintenance_interval != null ? contains(
      ["PERIODIC"],
      var.host_maintenance_interval,
    ) : true
    error_message = "'PERIODIC' is the only supported value for host_maintenance_interval."
  }
}
# Kubernetes Service Account settings (name + namespace). The variable is
# nullable; per its description, null disables KSA setup.
variable "ksa" {
  type = object({
    namespace = string
    name      = string
  })

  default = {
    namespace = "default"
    name      = "aiinfra-gke-sa"
  }

  description = <<-EOT
    The configuration for setting up Kubernetes Service Account (KSA) after GKE
    cluster is created. Disable by setting to null.
    - `name`: The KSA name to be used for Pods
    - `namespace`: The KSA namespace to be used for Pods
    Related Docs: [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity)
  EOT
}
# Optional reference to a pre-existing network/subnetwork pair; null by default.
variable "network_existing" {
  type = object({
    subnetwork_name = string
    network_name    = string
  })
  default     = null
  description = "Existing network to attach to nic0. Setting to null will create a new network for it."
}
# Node pool definitions for the cluster. The default is an empty list, which
# the first validation rejects, so callers must supply at least one pool.
variable "node_pools" {
  description = <<-EOT
    The list of node pools for the GKE cluster.
    - `zone`: The zone in which the node pool's nodes should be located. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_locations)
    - `node_count`: The number of nodes per node pool. This field can be used to update the number of nodes per node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_count)
    - `machine_type`: (Optional) The machine type for the node pool. Only supported machine types are 'a3-highgpu-8g' and 'a2-highgpu-1g'. Defaults to 'a3-highgpu-8g'. [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#machine_type)
    - `compact_placement_policy`: (Optional) The object for superblock level compact placement policy for the instances. Currently only 1 resource policy is supported. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#policy_name)
      - `new_policy`: (Optional) Flag for creating a new resource policy. Defaults to false.
      - `existing_policy_name`: (Optional) The existing resource policy. Cannot be combined with `new_policy = true`.
      - `specific_reservation`: (Optional) The specific reservation to use. Cannot be combined with `new_policy = true`.
  EOT

  type = list(object({
    zone         = string,
    node_count   = number,
    machine_type = optional(string, "a3-highgpu-8g"),
    compact_placement_policy = optional(object({
      new_policy           = optional(bool, false)
      existing_policy_name = optional(string)
      specific_reservation = optional(string)
    }))
  }))
  default  = []
  nullable = false

  # At least one node pool is required.
  validation {
    condition     = length(var.node_pools) != 0
    error_message = "must be non-empty list"
  }

  # For every pool that sets a compact_placement_policy, `new_policy` must be
  # true exactly when neither `existing_policy_name` nor `specific_reservation`
  # is given (an exclusive-or between "create new" and "reuse existing").
  # Pools without a policy pass.
  validation {
    condition = alltrue([
      for rp in var.node_pools[*].compact_placement_policy
      : rp != null ? (
        rp.new_policy != (rp.existing_policy_name != null || rp.specific_reservation != null)
      ) : true
    ])
    error_message = "Each compact_placement_policy must either set `new_policy` to true or specify `existing_policy_name` and/or `specific_reservation`, but not both."
  }
}
# Settings for the cluster's default node pool: a single object, not a list.
# The declared default is zero nodes with a null zone and machine type.
variable "default_node_pool" {
  description = <<-EOT
    The configuration of the default node pool for the GKE cluster.
    - `zone`: The zone in which the default node pool's nodes should be located. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_locations)
    - `node_count`: The number of nodes in the default node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_count)
    - `machine_type`: The machine type for the default node pool. [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#machine_type)
  EOT

  type = object({
    zone         = string,
    node_count   = number,
    machine_type = string
  })
  default = {
    zone         = null,
    node_count   = 0,
    machine_type = null
  }
  nullable = false

  # NOTE(review): this validation was already disabled in the original. The
  # variable is a single object rather than a list, so a length check may not
  # be the intended test — confirm before re-enabling.
  # validation {
  #   condition     = length(var.default_node_pool) == 1
  #   error_message = "must have one default pool"
  # }
}
# Optional service account for the node VMs; null by default.
variable "node_service_account" {
  type        = string
  default     = null
  description = <<-EOT
    The service account to be used by the Node VMs. If not specified, the "default" service account is used.
    Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#nested_node_config), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--service-account).
  EOT
}
# Required input: no default is declared and null is rejected.
variable "project_id" {
  type        = string
  nullable    = false
  description = "GCP Project ID to which the cluster will be deployed."
}
# Required input: no default is declared and null is rejected.
variable "region" {
  type        = string
  nullable    = false
  description = "The region in which the cluster master will be created. The cluster will be a regional cluster with multiple masters spread across zones in the region, and with default node locations in those zones as well."
}
# Required input: no default is declared and null is rejected.
variable "zone" {
  type        = string
  nullable    = false
  description = "The zone within a region in which the cluster master will be created. The cluster will be a zonal cluster with default node locations in this zone as well."
}
# Chooses between a zonal and a regional cluster. Defaults to true; null is rejected.
variable "is_zonal" {
  type        = bool
  description = "Flag to determine if the cluster will be a zonal or regional cluster."
  nullable    = false
  default     = true
}
# Required input: no default is declared and null is rejected.
variable "resource_prefix" {
  type        = string
  nullable    = false
  description = "Arbitrary string with which all names of newly created resources will be prefixed."
}