community/modules/internal/slurm-gcp/nodeset_tpu/main.tf
/**
 * Copyright (C) SchedMD LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
###########
# NODESET #
###########
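# Slurm node hardware profiles for the TPU host VMs, keyed by an approximate
# memory (GB) / vCPU shape. The values are slurm.conf node parameters
# (RealMemory is in MB). node_conf_mappings selects a profile for each
# supported TPU generation.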
locals {
  node_conf_hw = {
    Mem334CPU96 = {
      CPUs           = 96
      Boards         = 1
      Sockets        = 2
      CoresPerSocket = 24
      ThreadsPerCore = 2
      RealMemory     = 307200
    }
    Mem400CPU240 = {
      CPUs           = 240
      Boards         = 1
      Sockets        = 2
      CoresPerSocket = 60
      ThreadsPerCore = 2
      RealMemory     = 400000
    }
  }
  node_conf_mappings = {
    "v2" = local.node_conf_hw.Mem334CPU96
    "v3" = local.node_conf_hw.Mem334CPU96
    "v4" = local.node_conf_hw.Mem400CPU240
  }
  simple_nodes = ["v2-8", "v3-8", "v4-8"]
}
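# Derived values: the region is taken from the zone, the TPU family (v2/v3/v4)
# comes from accelerator_config.version when set and otherwise from the prefix
# of node_type, and the nodeset_tpu object assembles the full nodeset
# definition.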
locals {
  snetwork = data.google_compute_subnetwork.nodeset_subnetwork.name
  region   = join("-", slice(split("-", var.zone), 0, 2))
  tpu_fam  = var.accelerator_config.version != "" ? lower(var.accelerator_config.version) : split("-", var.node_type)[0]

  # If a subnetwork is specified but it does not have private_ip_google_access
  # enabled, the TPU nodes need public IPs. If no subnetwork is specified, the
  # default subnetwork is used, which also lacks private_ip_google_access, so
  # public IPs are needed in that case as well.
  pub_need = !data.google_compute_subnetwork.nodeset_subnetwork.private_ip_google_access

  # preemptible and preserve_tpu are only honored for the node types listed in simple_nodes.
  can_preempt = var.node_type != null ? contains(local.simple_nodes, var.node_type) : false

  nodeset_tpu = {
    nodeset_name           = var.nodeset_name
    node_conf              = local.node_conf_mappings[local.tpu_fam]
    node_type              = var.node_type
    accelerator_config     = var.accelerator_config
    tf_version             = var.tf_version
    preemptible            = local.can_preempt ? var.preemptible : false
    reserved               = var.reserved
    node_count_dynamic_max = var.node_count_dynamic_max
    node_count_static      = var.node_count_static
    enable_public_ip       = var.enable_public_ip
    zone                   = var.zone
    service_account        = var.service_account != null ? var.service_account : local.service_account
    preserve_tpu           = local.can_preempt ? var.preserve_tpu : false
    data_disks             = var.data_disks
    docker_image           = var.docker_image != "" ? var.docker_image : "us-docker.pkg.dev/schedmd-slurm-public/tpu/slurm-gcp-6-9:tf-${var.tf_version}"
    subnetwork             = local.snetwork
    network_storage        = var.network_storage
  }

  service_account = {
    email  = try(var.service_account.email, null)
    scopes = try(var.service_account.scopes, ["https://www.googleapis.com/auth/cloud-platform"])
  }
}
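# Resolve the subnetwork. If var.subnetwork looks like a full resource URL
# (it contains both /projects/ and /regions/ segments), it is looked up by
# self_link; otherwise it is looked up by name in the derived region and
# project.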
data "google_compute_subnetwork" "nodeset_subnetwork" {
name = var.subnetwork
region = local.region
project = var.project_id
self_link = (
length(regexall("/projects/([^/]*)", var.subnetwork)) > 0
&& length(regexall("/regions/([^/]*)", var.subnetwork)) > 0
? var.subnetwork
: null
)
}
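# Validation-only resource: the trigger changes whenever the nodeset definition
# changes, and the lifecycle preconditions reject invalid input combinations.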
resource "null_resource" "nodeset_tpu" {
triggers = {
nodeset = sha256(jsonencode(local.nodeset_tpu))
}
lifecycle {
precondition {
condition = sum([var.node_count_dynamic_max, var.node_count_static]) > 0
error_message = "Sum of node_count_dynamic_max and node_count_static must be > 0."
}
precondition {
condition = !(var.preemptible && var.reserved)
error_message = "Nodeset cannot be preemptible and reserved at the same time."
}
precondition {
condition = !(var.subnetwork == null && !var.enable_public_ip)
error_message = "Using the default subnetwork for the TPU nodeset requires enable_public_ip set to true."
}
precondition {
condition = !(var.subnetwork != null && (local.pub_need && !var.enable_public_ip))
error_message = "The subnetwork specified does not have Private Google Access enabled. This is required when enable_public_ip is set to false."
}
precondition {
condition = !(var.node_type == null && (var.accelerator_config.topology == "" && var.accelerator_config.version == ""))
error_message = "Either a node type or an accelerator_config must be provided."
}
}
}