community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf (645 lines of code) (raw):

/**
 * Copyright (C) SchedMD LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

###########
# GENERAL #
###########

variable "project_id" {
  type        = string
  description = "Project ID to create resources in."
}

variable "deployment_name" {
  description = "Name of the deployment."
  type        = string
}

variable "slurm_cluster_name" {
  type        = string
  description = <<-EOD
    Cluster name, used for resource naming and slurm accounting.
    If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters).
  EOD
  default     = null

  validation {
    # null is accepted so that the caller can fall back to a derived name.
    condition     = var.slurm_cluster_name == null || can(regex("^[a-z](?:[a-z0-9]{0,9})$", var.slurm_cluster_name))
    error_message = "Variable 'slurm_cluster_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,9})$'."
  }
}

variable "region" {
  type        = string
  description = "The default region to place resources in."
}

variable "zone" {
  type        = string
  description = <<EOD
Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region.
EOD
  default     = null
}

##########
# BUCKET #
##########

variable "create_bucket" {
  description = <<-EOD
    Create GCS bucket instead of using an existing one.
  EOD
  type        = bool
  default     = true
}

variable "bucket_name" {
  description = <<-EOD
    Name of GCS bucket.
    Ignored when 'create_bucket' is true.
  EOD
  type        = string
  default     = null
}

variable "bucket_dir" {
  description = "Bucket directory for cluster files to be put into. If not specified, then one will be chosen based on slurm_cluster_name."
  type        = string
  default     = null
}

#####################
# CONTROLLER: CLOUD
# See variables_controller_instance.tf for the controller instance variables.
#####################

#########
# LOGIN #
#########

variable "login_nodes" {
  description = "List of slurm login instance definitions."
  type = list(object({
    group_name = string
    access_config = optional(list(object({
      nat_ip       = string
      network_tier = string
    })))
    additional_disks = optional(list(object({
      disk_name                  = optional(string)
      device_name                = optional(string)
      disk_size_gb               = optional(number)
      disk_type                  = optional(string)
      disk_labels                = optional(map(string), {})
      auto_delete                = optional(bool, true)
      boot                       = optional(bool, false)
      disk_resource_manager_tags = optional(map(string), {})
    })), [])
    additional_networks = optional(list(object({
      access_config = optional(list(object({
        nat_ip       = string
        network_tier = string
      })), [])
      alias_ip_range = optional(list(object({
        ip_cidr_range         = string
        subnetwork_range_name = string
      })), [])
      ipv6_access_config = optional(list(object({
        network_tier = string
      })), [])
      network            = optional(string)
      network_ip         = optional(string, "")
      nic_type           = optional(string)
      queue_count        = optional(number)
      stack_type         = optional(string)
      subnetwork         = optional(string)
      subnetwork_project = optional(string)
    })), [])
    bandwidth_tier             = optional(string, "platform_default")
    can_ip_forward             = optional(bool, false)
    disk_auto_delete           = optional(bool, true)
    disk_labels                = optional(map(string), {})
    disk_resource_manager_tags = optional(map(string), {})
    disk_size_gb               = optional(number)
    # No default here: the previous default ("n1-standard-1") is a machine type,
    # not a disk type. Leaving it unset matches the `nodeset` object below.
    disk_type              = optional(string)
    enable_confidential_vm = optional(bool, false)
    enable_oslogin         = optional(bool, true)
    enable_shielded_vm     = optional(bool, false)
    gpu = optional(object({
      count = number
      type  = string
    }))
    labels       = optional(map(string), {})
    machine_type = optional(string)
    advanced_machine_features = object({
      enable_nested_virtualization = optional(bool)
      threads_per_core             = optional(number)
      turbo_mode                   = optional(string)
      visible_core_count           = optional(number)
      performance_monitoring_unit  = optional(string)
      enable_uefi_networking       = optional(bool)
    })
    metadata              = optional(map(string), {})
    min_cpu_platform      = optional(string)
    num_instances         = optional(number, 1)
    on_host_maintenance   = optional(string)
    preemptible           = optional(bool, false)
    region                = optional(string)
    resource_manager_tags = optional(map(string), {})
    service_account = optional(object({
      email  = optional(string)
      scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
    }))
    shielded_instance_config = optional(object({
      enable_integrity_monitoring = optional(bool, true)
      enable_secure_boot          = optional(bool, true)
      enable_vtpm                 = optional(bool, true)
    }))
    source_image_family  = optional(string)
    source_image_project = optional(string)
    source_image         = optional(string)
    static_ips           = optional(list(string), [])
    subnetwork           = string
    spot                 = optional(bool, false)
    tags                 = optional(list(string), [])
    zone                 = optional(string)
    termination_action   = optional(string)
  }))
  default = []

  validation {
    # Group names must be unique across the list.
    condition     = length(distinct([for x in var.login_nodes : x.group_name])) == length(var.login_nodes)
    error_message = "All login_nodes must have a unique group name."
  }
}

############
# NODESETS #
############

variable "nodeset" {
  description = "Define nodesets, as a list."
  # TODO: remove optional & defaults from fields, since they SHOULD be properly set by nodeset module and not here.
  type = list(object({
    node_count_static      = optional(number, 0)
    node_count_dynamic_max = optional(number, 1)
    node_conf              = optional(map(string), {})
    nodeset_name           = string
    additional_disks = optional(list(object({
      disk_name                  = optional(string)
      device_name                = optional(string)
      disk_size_gb               = optional(number)
      disk_type                  = optional(string)
      disk_labels                = optional(map(string), {})
      auto_delete                = optional(bool, true)
      boot                       = optional(bool, false)
      disk_resource_manager_tags = optional(map(string), {})
    })), [])
    bandwidth_tier                   = optional(string, "platform_default")
    can_ip_forward                   = optional(bool, false)
    disk_auto_delete                 = optional(bool, true)
    disk_labels                      = optional(map(string), {})
    disk_resource_manager_tags       = optional(map(string), {})
    disk_size_gb                     = optional(number)
    disk_type                        = optional(string)
    enable_confidential_vm           = optional(bool, false)
    enable_placement                 = optional(bool, false)
    placement_max_distance           = optional(number, null)
    enable_oslogin                   = optional(bool, true)
    enable_shielded_vm               = optional(bool, false)
    enable_maintenance_reservation   = optional(bool, false)
    enable_opportunistic_maintenance = optional(bool, false)
    gpu = optional(object({
      count = number
      type  = string
    }))
    dws_flex = object({
      enabled          = bool
      max_run_duration = number
      use_job_duration = bool
      use_bulk_insert  = bool
    })
    labels       = optional(map(string), {})
    machine_type = optional(string)
    advanced_machine_features = object({
      enable_nested_virtualization = optional(bool)
      threads_per_core             = optional(number)
      turbo_mode                   = optional(string)
      visible_core_count           = optional(number)
      performance_monitoring_unit  = optional(string)
      enable_uefi_networking       = optional(bool)
    })
    maintenance_interval     = optional(string)
    instance_properties_json = string
    metadata                 = optional(map(string), {})
    min_cpu_platform         = optional(string)
    network_tier             = optional(string, "STANDARD")
    network_storage = optional(list(object({
      server_ip             = string
      remote_mount          = string
      local_mount           = string
      fs_type               = string
      mount_options         = string
      client_install_runner = optional(map(string))
      mount_runner          = optional(map(string))
    })), [])
    on_host_maintenance   = optional(string)
    preemptible           = optional(bool, false)
    region                = optional(string)
    resource_manager_tags = optional(map(string), {})
    service_account = optional(object({
      email  = optional(string)
      scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
    }))
    shielded_instance_config = optional(object({
      enable_integrity_monitoring = optional(bool, true)
      enable_secure_boot          = optional(bool, true)
      enable_vtpm                 = optional(bool, true)
    }))
    source_image_family  = optional(string)
    source_image_project = optional(string)
    source_image         = optional(string)
    subnetwork_self_link = string
    additional_networks = optional(list(object({
      network            = string
      subnetwork         = string
      subnetwork_project = string
      network_ip         = string
      nic_type           = string
      stack_type         = string
      queue_count        = number
      access_config = list(object({
        nat_ip       = string
        network_tier = string
      }))
      ipv6_access_config = list(object({
        network_tier = string
      }))
      alias_ip_range = list(object({
        ip_cidr_range         = string
        subnetwork_range_name = string
      }))
    })))
    access_config = optional(list(object({
      nat_ip       = string
      network_tier = string
    })))
    spot               = optional(bool, false)
    tags               = optional(list(string), [])
    termination_action = optional(string)
    reservation_name   = optional(string)
    future_reservation = string
    startup_script = optional(list(object({
      filename = string
      content  = string
    })), [])

    zone_target_shape = string
    zone_policy_allow = set(string)
    zone_policy_deny  = set(string)
  }))
  default = []
}

variable "nodeset_tpu" {
  description = "Define TPU nodesets, as a list."
  type = list(object({
    node_count_static      = optional(number, 0)
    node_count_dynamic_max = optional(number, 5)
    nodeset_name           = string
    enable_public_ip       = optional(bool, false)
    node_type              = string
    accelerator_config = optional(object({
      topology = string
      version  = string
      }), {
      topology = ""
      version  = ""
    })
    tf_version   = string
    preemptible  = optional(bool, false)
    preserve_tpu = optional(bool, false)
    zone         = string
    data_disks   = optional(list(string), [])
    docker_image = optional(string, "")
    network_storage = optional(list(object({
      server_ip             = string
      remote_mount          = string
      local_mount           = string
      fs_type               = string
      mount_options         = string
      client_install_runner = optional(map(string))
      mount_runner          = optional(map(string))
    })), [])
    subnetwork = string
    service_account = optional(object({
      email  = optional(string)
      scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
    }))
    project_id = string
    # NOTE(review): declared as string but defaulted with a bool literal; Terraform
    # converts the default to the string "false". Confirm whether `bool` was
    # intended before changing the type, since consumers receive a string today.
    reserved = optional(string, false)
  }))
  default = []
}

variable "nodeset_dyn" {
  description = "Defines dynamic nodesets, as a list."
  type = list(object({
    nodeset_name    = string
    nodeset_feature = string
  }))
  default = []
}

#############
# PARTITION #
#############

variable "partitions" {
  description = <<EOD
Cluster partitions as a list. See module slurm_partition.
EOD
  type = list(object({
    partition_name        = string
    partition_conf        = optional(map(string), {})
    partition_nodeset     = optional(list(string), [])
    partition_nodeset_dyn = optional(list(string), [])
    partition_nodeset_tpu = optional(list(string), [])
    enable_job_exclusive  = optional(bool, false)
  }))
  default = []

  validation {
    # Partition names must be unique across the list.
    condition     = length(distinct([for x in var.partitions : x.partition_name])) == length(var.partitions)
    error_message = "All partitions must have a unique partition_name."
  }
}

#########
# SLURM #
#########

variable "controller_state_disk" {
  description = <<EOD
A disk that will be attached to the controller instance template to save state of slurm. The disk is created and used by default.
To disable this feature, set this variable to null.

NOTE: This will not save the contents at /opt/apps and /home. To preserve those, they must be saved externally.
EOD
  type = object({
    type = string
    size = number
  })

  default = {
    type = "pd-ssd"
    size = 50
  }
}

variable "enable_debug_logging" {
  type        = bool
  description = "Enables debug logging mode."
  default     = false
}

variable "extra_logging_flags" {
  type        = map(bool)
  description = "The only available flag is `trace_api`"
  default     = {}
}

variable "enable_cleanup_compute" {
  description = <<EOD
Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes will be destroyed.
EOD
  type        = bool
  default     = true
}

variable "enable_bigquery_load" {
  description = <<EOD
Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API.
EOD
  type        = bool
  default     = false
}

variable "cloud_parameters" {
  description = "cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters)"
  type = object({
    no_comma_params      = optional(bool, false)
    private_data         = optional(list(string))
    scheduler_parameters = optional(list(string))
    resume_rate          = optional(number)
    resume_timeout       = optional(number)
    suspend_rate         = optional(number)
    suspend_timeout      = optional(number)
    topology_plugin      = optional(string)
    topology_param       = optional(string)
    tree_width           = optional(number)
  })
  default  = {}
  nullable = false
}

variable "enable_default_mounts" {
  description = <<-EOD
    Enable default global network storage from the controller
    - /home
    - /apps
    Warning: If these are disabled, the slurm etc and munge dirs must be added
    manually, or some other mechanism must be used to synchronize the slurm conf
    files and the munge key across the cluster.
    EOD
  type        = bool
  default     = true
}

variable "network_storage" {
  description = "An array of network attached storage mounts to be configured on all instances."
  type = list(object({
    server_ip             = string
    remote_mount          = string
    local_mount           = string
    fs_type               = string
    mount_options         = string
    client_install_runner = optional(map(string))
    mount_runner          = optional(map(string))
  }))
  default = []
}

variable "login_network_storage" {
  description = "An array of network attached storage mounts to be configured on all login nodes."
  type = list(object({
    server_ip     = string
    remote_mount  = string
    local_mount   = string
    fs_type       = string
    mount_options = string
  }))
  default = []
}

variable "slurmdbd_conf_tpl" {
  description = "Slurm slurmdbd.conf template file path."
  type        = string
  default     = null
}

variable "slurm_conf_tpl" {
  description = "Slurm slurm.conf template file path."
  type        = string
  default     = null
}

variable "cgroup_conf_tpl" {
  description = "Slurm cgroup.conf template file path."
  type        = string
  default     = null
}

variable "controller_startup_script" {
  description = "Startup script used by the controller VM."
  type        = string
  default     = "# no-op"
}

variable "controller_startup_scripts_timeout" {
  description = <<EOD
The timeout (seconds) applied to each script in controller_startup_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled.
EOD
  type        = number
  default     = 300
}

variable "login_startup_script" {
  description = "Startup script used by the login VMs."
  type        = string
  default     = "# no-op"
}

variable "login_startup_scripts_timeout" {
  description = <<EOD
The timeout (seconds) applied to each script in login_startup_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled.
EOD
  type        = number
  default     = 300
}

variable "compute_startup_scripts_timeout" {
  description = <<EOD
The timeout (seconds) applied to each startup script in compute nodes. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled.
EOD
  type        = number
  default     = 300
}

variable "enable_chs_gpu_health_check_prolog" {
  description = <<EOD
Enable a Cluster Health Scanner (CHS) GPU health check that slurmd executes as a prolog script whenever it is asked to run a job step from a new job allocation. Compute nodes that fail GPU health check during prolog will be marked as drained. Find more details at:
https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/docs/CHS-Slurm.md
EOD
  type        = bool
  default     = false
  nullable    = false
}

variable "enable_chs_gpu_health_check_epilog" {
  description = <<EOD
Enable a Cluster Health Scanner (CHS) GPU health check that slurmd executes as an epilog script after completing a job step from a new job allocation.
Compute nodes that fail GPU health check during epilog will be marked as drained. Find more details at:
https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/docs/CHS-Slurm.md
EOD
  type        = bool
  default     = false
  nullable    = false
}

variable "prolog_scripts" {
  description = <<EOD
List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog.
EOD
  type = list(object({
    filename = string
    content  = optional(string)
    source   = optional(string)
  }))
  default = []

  validation {
    # Exactly one of `content` / `source` must be set on every script.
    condition = alltrue([
      for script in var.prolog_scripts :
      (script.content != null && script.source == null) ||
      (script.content == null && script.source != null)
    ])
    error_message = "Either 'content' or 'source' must be defined, but not both."
  }
}

variable "epilog_scripts" {
  description = <<EOD
List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog.
EOD
  type = list(object({
    filename = string
    content  = optional(string)
    source   = optional(string)
  }))
  default = []

  validation {
    # Exactly one of `content` / `source` must be set on every script.
    condition = alltrue([
      for script in var.epilog_scripts :
      (script.content != null && script.source == null) ||
      (script.content == null && script.source != null)
    ])
    error_message = "Either 'content' or 'source' must be defined, but not both."
  }
}

variable "enable_external_prolog_epilog" {
  description = <<EOD
Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/tools/prologs-epilogs/README.md
EOD
  type        = bool
  default     = null
}

variable "cloudsql" {
  description = <<EOD
Use this database instead of the one on the controller.
  server_ip : Address of the database server.
  user      : The user to access the database as.
  password  : The password, given the user, to access the given database. (sensitive)
  db_name   : The database to access.
  user_managed_replication : The list of location and (optional) kms_key_name for secret
EOD
  type = object({
    server_ip = string
    user      = string
    password  = string # sensitive
    db_name   = string
    user_managed_replication = optional(list(object({
      location     = string
      kms_key_name = optional(string)
    })), [])
  })
  default   = null
  sensitive = true
}

variable "universe_domain" {
  description = "Domain address for alternate API universe"
  type        = string
  default     = "googleapis.com"
  nullable    = false
}

variable "endpoint_versions" {
  description = "Version of the API to use (The compute service is the only API currently supported)"
  type = object({
    compute = string
  })
  default = {
    compute = "beta"
  }
  nullable = false
}

variable "gcloud_path_override" {
  description = "Directory of the gcloud executable to be used during cleanup"
  type        = string
  default     = ""
  nullable    = false
}

# DEPRECATED VARIABLES
# Each variable below is kept only so that setting it fails validation with a
# migration hint; any non-null value is rejected.

variable "enable_devel" { # tflint-ignore: terraform_unused_declarations
  description = "DEPRECATED: `enable_devel` is always on."
  type        = bool
  default     = null
  validation {
    condition     = var.enable_devel == null
    error_message = "DEPRECATED: It is always on, remove `enable_devel` variable."
  }
}

variable "disable_default_mounts" { # tflint-ignore: terraform_unused_declarations
  description = "DEPRECATED: Use `enable_default_mounts` instead."
  type        = bool
  default     = null
  validation {
    condition     = var.disable_default_mounts == null
    error_message = "DEPRECATED: Use `enable_default_mounts` instead."
  }
}

variable "enable_slurm_gcp_plugins" { # tflint-ignore: terraform_unused_declarations
  description = <<EOD
DEPRECATED: Slurm GCP plugins have been deprecated.
Instead of 'max_hops' plugin please use the 'placement_max_distance' nodeset property.
Instead of 'enable_vpmu' plugin please use 'advanced_machine_features.performance_monitoring_unit' nodeset property.
EOD
  type        = any
  default     = null
  validation {
    condition     = var.enable_slurm_gcp_plugins == null
    error_message = <<EOD
DEPRECATED: Slurm GCP plugins have been deprecated.
Instead of 'max_hops' plugin please use the 'placement_max_distance' nodeset property.
Instead of 'enable_vpmu' plugin please use 'advanced_machine_features.performance_monitoring_unit' nodeset property.
EOD
  }
}

variable "compute_startup_script" { # tflint-ignore: terraform_unused_declarations
  description = <<EOD
DEPRECATED: `compute_startup_script` has been deprecated.
Use `startup_script` of nodeset module instead.
EOD
  type        = any
  default     = null
  validation {
    condition     = var.compute_startup_script == null
    error_message = <<EOD
DEPRECATED: `compute_startup_script` has been deprecated.
Use `startup_script` of nodeset module instead.
EOD
  }
}