community/modules/internal/slurm-gcp/nodeset_tpu/variables.tf (137 lines of code) (raw):
/**
* Copyright (C) SchedMD LLC.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
# Name of the Slurm nodeset (required).
# The validation below accepts 1 to 15 characters: a lowercase letter first,
# followed by up to 14 lowercase letters or digits.
variable "nodeset_name" {
  type        = string
  description = "Name of Slurm nodeset."

  validation {
    error_message = "Variable 'nodeset_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,14})$'."
    condition = can(
      regex("^[a-z](?:[a-z0-9]{0,14})$", var.nodeset_name)
    )
  }
}
# Optional node type used as the basis for the VM configuration.
# Defaults to null; per the description it can be omitted when
# accelerator_config is supplied instead.
variable "node_type" {
  type        = string
  description = "Specify a node type to base the vm configuration upon it. Not needed if you use accelerator_config"
  default     = null
}
# TPU accelerator configuration for the nodeset.
#
#   topology - chip topology string such as "2x2" or "4x4x4"; "" means unset.
#   version  - TPU version, matched case-insensitively against V2/V3/V4;
#              "" means unset.
#
# Both fields default to "" and an empty value skips the matching validation.
variable "accelerator_config" {
  description = "Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details."
  type = object({
    topology = string
    version  = string
  })
  default = {
    topology = ""
    version  = ""
  }

  validation {
    # upper() makes the comparison case-insensitive ("v4" and "V4" both pass).
    condition     = var.accelerator_config.version == "" ? true : contains(["V2", "V3", "V4"], upper(var.accelerator_config.version))
    error_message = "accelerator_config.version must be one of [\"V2\", \"V3\", \"V4\"]"
  }

  validation {
    # Two or three 'x'-separated positive integers. Each dimension may have
    # several digits (e.g. 16x16, 8x16x16) but must not start with 0.
    # The earlier single-digit pattern rejected larger valid slice topologies.
    condition     = var.accelerator_config.topology == "" ? true : can(regex("^[1-9][0-9]*x[1-9][0-9]*(x[1-9][0-9]*)?$", var.accelerator_config.topology))
    error_message = "accelerator_config.topology must be a valid topology, like 2x2 4x4x4 4x2x4 etc..."
  }
}
# Container image to run on the TPU VMs.
# The declared default is an empty string. The description names the image
# used in that case; the substitution itself presumably happens in the
# consuming code, which is not visible in this file.
variable "docker_image" {
  type        = string
  description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-9-tf-<var.tf_version>"
  default     = ""
}
# Tensorflow version for the nodeset (required, no default).
variable "tf_version" {
  type        = string
  description = "Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details."
}
# Zone in which the nodes are created (required).
# coalesce() raises an error when its only argument is null or "", and can()
# turns that error into false, so both values fail validation.
variable "zone" {
  type        = string
  description = "Nodes will only be created in this zone. Check https://cloud.google.com/tpu/docs/regions-zones to get zones with TPU-vm in it."

  validation {
    error_message = "Zone cannot be null or empty."
    condition     = can(coalesce(var.zone))
  }
}
# Whether the TPU VMs in this nodeset are preemptible. Off by default.
variable "preemptible" {
  type        = bool
  description = "Specify whether TPU-vms in this nodeset are preemtible, see https://cloud.google.com/tpu/docs/preemptible for details."
  default     = false
}
# Whether the TPU VMs in this nodeset are created under a reservation.
# Off by default.
variable "reserved" {
  type        = bool
  description = "Specify whether TPU-vms in this nodeset are created under a reservation."
  default     = false
}
# Suspend behaviour for TPU VMs. Per the description: true stops the VM on
# suspend, false deletes it. Defaults to true.
variable "preserve_tpu" {
  type        = bool
  description = "Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted"
  default     = true
}
# Number of statically created nodes. Defaults to 0.
# Terraform's `number` type also admits fractional values, so the count is
# validated as both non-negative and whole.
variable "node_count_static" {
  description = "Number of nodes to be statically created."
  type        = number
  default     = 0

  validation {
    condition     = var.node_count_static >= 0
    error_message = "Value must be >= 0."
  }

  validation {
    # floor(x) == x holds only for whole numbers; rejects e.g. 1.5.
    condition     = floor(var.node_count_static) == var.node_count_static
    error_message = "Value must be a whole number."
  }
}
# Upper bound on dynamically created nodes. Defaults to 0.
# Terraform's `number` type also admits fractional values, so the count is
# validated as both non-negative and whole.
variable "node_count_dynamic_max" {
  description = "Maximum number of nodes allowed in this partition to be created dynamically."
  type        = number
  default     = 0

  validation {
    condition     = var.node_count_dynamic_max >= 0
    error_message = "Value must be >= 0."
  }

  validation {
    # floor(x) == x holds only for whole numbers; rejects e.g. 2.5.
    condition     = floor(var.node_count_dynamic_max) == var.node_count_dynamic_max
    error_message = "Value must be a whole number."
  }
}
# Toggle for an IP address giving Internet access. Off by default.
variable "enable_public_ip" {
  type        = bool
  description = "Enables IP address to access the Internet."
  default     = false
}
# Data disks to include in the TPU node, as a list of strings.
# Defaults to an empty list.
variable "data_disks" {
  description = "The data disks to include in the TPU node"
  type        = list(string)
  default     = []
}
# Name of the subnetwork the nodeset's TPU VMs attach to (required, no default).
variable "subnetwork" {
  type        = string
  description = "The name of the subnetwork to attach the TPU-vm of this nodeset to."
}
# Service account (email plus a set of scopes) to attach to the TPU VM.
# Defaults to null; per the description, the default service account and
# scopes are used in that case.
# The heredoc body and its closing marker stay at column 0 so the rendered
# description text is unchanged.
variable "service_account" {
  description = <<EOD
Service account to attach to the TPU-vm.
If none is given, the default service account and scopes will be used.
EOD

  type = object({
    email  = string
    scopes = set(string)
  })

  default = null
}
# Project ID in which resources are created (required, no default).
variable "project_id" {
  description = "Project ID to create resources in."
  type        = string
}
# Network-attached storage mounts to configure on the nodes.
# Each entry is an object of five string attributes. Defaults to an empty list.
variable "network_storage" {
  type = list(object({
    server_ip     = string
    remote_mount  = string
    local_mount   = string
    fs_type       = string
    mount_options = string
  }))
  description = "An array of network attached storage mounts to be configured on nodes."
  default     = []
}