community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf (275 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
variable "partition_name" {
description = "The name of the slurm partition."
type = string
validation {
condition = can(regex("^[a-z](?:[a-z0-9]*)$", var.partition_name))
error_message = "Variable 'partition_name' must be a match of regex '^[a-z](?:[a-z0-9]*)$'."
}
}
variable "partition_conf" {
description = <<-EOD
Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
EOD
type = map(string)
default = {}
}
variable "is_default" {
description = <<-EOD
Sets this partition as the default partition by updating the partition_conf.
If "Default" is already set in partition_conf, this variable will have no effect.
EOD
type = bool
default = false
}
variable "exclusive" {
description = <<-EOD
Exclusive job access to nodes. When set to true nodes execute single job and are deleted
after job exits. If set to false, multiple jobs can be scheduled on one node.
EOD
type = bool
default = true
}
variable "nodeset" {
description = <<-EOD
A list of nodesets.
For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset
EOD
type = list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
disk_resource_manager_tags = optional(map(string), {})
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_resource_manager_tags = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
use_bulk_insert = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
advanced_machine_features = object({
enable_nested_virtualization = optional(bool)
threads_per_core = optional(number)
turbo_mode = optional(string)
visible_core_count = optional(number)
performance_monitoring_unit = optional(string)
enable_uefi_networking = optional(bool)
})
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
resource_manager_tags = optional(map(string), {})
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
future_reservation = string
startup_script = optional(list(object({
filename = string
content = string })), [])
zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
default = []
validation {
condition = length(distinct(var.nodeset[*].nodeset_name)) == length(var.nodeset)
error_message = "All nodesets must have a unique name."
}
}
variable "nodeset_tpu" {
description = "Define TPU nodesets, as a list."
type = list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
default = []
validation {
condition = length(distinct([for x in var.nodeset_tpu : x.nodeset_name])) == length(var.nodeset_tpu)
error_message = "All TPU nodesets must have a unique name."
}
}
variable "nodeset_dyn" {
description = "Defines dynamic nodesets, as a list."
type = list(object({
nodeset_name = string
nodeset_feature = string
}))
default = []
validation {
condition = length(distinct([for x in var.nodeset_dyn : x.nodeset_name])) == length(var.nodeset_dyn)
error_message = "All dynamic nodesets must have a unique name."
}
}
variable "resume_timeout" {
description = <<-EOD
Maximum time permitted (in seconds) between when a node resume request is issued and when the node is actually available for use.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'ResumeTimeout' in partition_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout_1 for details.
EOD
type = number
default = 300
validation {
condition = var.resume_timeout == null ? true : var.resume_timeout > 0
error_message = "Value must be > 0."
}
}
variable "suspend_time" {
description = <<-EOD
Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend.
NOTE 2: if `var.exclusive` is set to true (default), nodes are deleted immediately after job finishes.
EOD
type = number
default = 300
validation {
condition = var.suspend_time >= -1
error_message = "Value must be >= -1."
}
}
variable "suspend_timeout" {
description = <<-EOD
Maximum time permitted (in seconds) between when a node suspend request is issued and when the node is shutdown.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'SuspendTimeout' in partition_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout_1 for details.
EOD
type = number
default = null
validation {
condition = var.suspend_timeout == null ? true : var.suspend_timeout > 0
error_message = "Value must be > 0."
}
}
# tflint-ignore: terraform_unused_declarations
variable "network_storage" {
description = "DEPRECATED"
type = list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
default = []
validation {
condition = length(var.network_storage) == 0
error_message = <<-EOD
network_storage in partition module is deprecated and should not be set.
To add network storage to compute nodes, use network_storage of nodeset module instead.
EOD
}
}