a3/terraform/modules/cluster/slurm/main.tf (338 lines of code) (raw):
/**
* Copyright 2022 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
# Values derived from the input variables and shared by the module blocks in
# this file.
locals {
  # Partition names, in the order they appear in var.compute_partitions.
  partition_names = var.compute_partitions[*].partition_name

  # Per-partition settings keyed by partition name. Optional disk and image
  # attributes fall back to the defaults written here when they are absent,
  # null or (for strings) empty.
  compute_partitions = {
    for p in var.compute_partitions : p.partition_name => {
      partition_name         = p.partition_name
      node_count_static      = p.node_count_static
      node_count_dynamic_max = p.node_count_dynamic_max

      zone = p.zone
      # The region is the first two dash-separated fields of the zone.
      region = join("-", slice(split("-", p.zone), 0, 2))

      disk_size_gb = coalesce(try(p.disk_size_gb, null), 128)
      disk_type    = coalesce(try(p.disk_type, null), "pd-ssd")
      machine_image = coalesce(try(p.machine_image, null), {
        project = "schedmd-slurm-public"
        family  = "schedmd-v5-slurm-22-05-8-ubuntu-2004-lts"
        name    = null
      })

      # One shell runner for the inline script and one for the script file;
      # each is emitted only when its input is neither null nor "".
      startup_runners = concat(
        (p.startup_script != null && p.startup_script != "") ? [{
          type        = "shell"
          destination = "/tmp/startup_script.sh"
          content     = p.startup_script
        }] : [],
        (p.startup_script_file != null && p.startup_script_file != "") ? [{
          type        = "shell"
          destination = "/tmp/startup_script_file.sh"
          source      = p.startup_script_file
        }] : [],
      )
    }
  }

  # Name, zone and region of the first partition in var.compute_partitions.
  # The zone and region serve as fallbacks for the controller and login
  # settings below.
  zeroeth_partition_name   = var.compute_partitions[0].partition_name
  zeroeth_partition_zone   = local.compute_partitions[local.zeroeth_partition_name].zone
  zeroeth_partition_region = join("-", slice(split("-", local.zeroeth_partition_zone), 0, 2))

  # Controller settings. Every lookup into var.controller_var is wrapped in
  # try() so that a missing attribute (or a null variable) selects the default.
  controller_var = {
    machine_type = coalesce(try(var.controller_var.machine_type, null), "c2-standard-4")
    disk_size_gb = coalesce(try(var.controller_var.disk_size_gb, null), 50)
    disk_type    = coalesce(try(var.controller_var.disk_type, null), "pd-ssd")
    machine_image = coalesce(try(var.controller_var.machine_image, null), {
      project = "schedmd-slurm-public"
      family  = "schedmd-v5-slurm-22-05-8-ubuntu-2004-lts"
      name    = null
    })

    # Zone and region default to those of the first partition.
    zone = coalesce(try(var.controller_var.zone, null), local.zeroeth_partition_zone)
    region = coalesce(
      try(join("-", slice(split("-", var.controller_var.zone), 0, 2)), null),
      local.zeroeth_partition_region,
    )

    # A runner is emitted only when the script input can be read and is
    # neither null nor "".
    startup_runners = concat(
      try(
        var.controller_var.startup_script != null && var.controller_var.startup_script != "",
        false,
        ) ? [{
          type        = "shell"
          destination = "/tmp/startup_script.sh"
          content     = var.controller_var.startup_script
      }] : [],
      try(
        var.controller_var.startup_script_file != null && var.controller_var.startup_script_file != "",
        false,
        ) ? [{
          type        = "shell"
          destination = "/tmp/startup_script_file.sh"
          source      = var.controller_var.startup_script_file
      }] : [],
    )
  }

  # Login-node settings, resolved the same way as the controller settings but
  # with their own machine type and disk type defaults.
  login_var = {
    machine_type = coalesce(try(var.login_var.machine_type, null), "n2-standard-2")
    disk_size_gb = coalesce(try(var.login_var.disk_size_gb, null), 50)
    disk_type    = coalesce(try(var.login_var.disk_type, null), "pd-standard")
    machine_image = coalesce(try(var.login_var.machine_image, null), {
      project = "schedmd-slurm-public"
      family  = "schedmd-v5-slurm-22-05-8-ubuntu-2004-lts"
      name    = null
    })

    # Zone and region default to those of the first partition.
    zone = coalesce(try(var.login_var.zone, null), local.zeroeth_partition_zone)
    region = coalesce(
      try(join("-", slice(split("-", var.login_var.zone), 0, 2)), null),
      local.zeroeth_partition_region,
    )

    # A runner is emitted only when the script input can be read and is
    # neither null nor "".
    startup_runners = concat(
      try(
        var.login_var.startup_script != null && var.login_var.startup_script != "",
        false,
        ) ? [{
          type        = "shell"
          destination = "/tmp/startup_script.sh"
          content     = var.login_var.startup_script
      }] : [],
      try(
        var.login_var.startup_script_file != null && var.login_var.startup_script_file != "",
        false,
        ) ? [{
          type        = "shell"
          destination = "/tmp/startup_script_file.sh"
          source      = var.login_var.startup_script_file
      }] : [],
    )
  }

  # Instance template URLs, built from the template names exported by the
  # instance_template modules in this file.
  _instance_template_prefix = "https://www.googleapis.com/compute/beta/projects/${var.project_id}/global/instanceTemplates"

  compute_instance_templates = {
    for pname in local.partition_names :
    pname => "${local._instance_template_prefix}/${module.compute_instance_templates[pname].name}"
  }
  controller_instance_template = "${local._instance_template_prefix}/${module.controller_instance_template.name}"
  login_instance_template      = "${local._instance_template_prefix}/${module.login_instance_template.name}"
}
# Network for the cluster, from the shared ../../common/network module. It is
# configured for the region of the first compute partition and receives
# var.network_existing as its nic0_existing input.
module "network" {
  source = "../../common/network"

  project_id      = var.project_id
  region          = local.zeroeth_partition_region
  resource_prefix = var.resource_prefix
  nic0_existing   = var.network_existing
}
# One pre-existing-network-storage module instance per entry in
# var.gcsfuse_existing, each configured with fs_type "gcsfuse". The startup
# modules in this file read the client_install_runner and mount_runner outputs.
module "gcsfuse" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/pre-existing-network-storage//?ref=v1.17.0"
  count  = length(var.gcsfuse_existing)

  fs_type       = "gcsfuse"
  remote_mount  = var.gcsfuse_existing[count.index].remote_mount
  local_mount   = var.gcsfuse_existing[count.index].local_mount
  mount_options = "defaults,_netdev,implicit_dirs,allow_other"
}
# One filestore module instance per entry in var.filestore_new. Every instance
# uses the zone and region of the first compute partition and the first
# network ID exported by module.network; share names are numbered by index.
module "filestore" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore//?ref=v1.17.0"
  count  = length(var.filestore_new)

  project_id      = var.project_id
  deployment_name = var.resource_prefix
  labels          = merge(var.labels, { ghpc_role = "file-system" })

  region     = local.zeroeth_partition_region
  zone       = local.zeroeth_partition_zone
  network_id = module.network.network_ids[0]

  filestore_share_name = "nfsshare_${count.index}"
  filestore_tier       = var.filestore_new[count.index].filestore_tier
  size_gb              = var.filestore_new[count.index].size_gb
  local_mount          = var.filestore_new[count.index].local_mount

  depends_on = [module.network]
}
# One startup-script module instance per compute partition. The runner list is
# the gcsfuse runners, then the filestore runners, then the partition's own
# startup runners from local.compute_partitions.
module "compute_startups" {
  source   = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script/?ref=v1.17.0"
  for_each = toset(local.partition_names)

  project_id      = var.project_id
  deployment_name = var.resource_prefix
  region          = local.compute_partitions[each.key].region
  labels          = merge(var.labels, { ghpc_role = "scripts" })
  gcs_bucket_path = var.startup_script_gcs_bucket_path

  runners = concat(
    module.gcsfuse[*].client_install_runner,
    module.gcsfuse[*].mount_runner,
    module.filestore[*].install_nfs_client_runner,
    module.filestore[*].mount_runner,
    local.compute_partitions[each.key].startup_runners,
  )

  depends_on = [module.gcsfuse, module.filestore]
}
# Startup-script module instance for the controller. The runner list is the
# gcsfuse runners, then the filestore runners, then the controller's own
# startup runners from local.controller_var.
module "controller_startup" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script/?ref=v1.17.0"

  project_id      = var.project_id
  deployment_name = var.resource_prefix
  region          = local.controller_var.region
  labels          = merge(var.labels, { ghpc_role = "scripts" })
  gcs_bucket_path = var.startup_script_gcs_bucket_path

  runners = concat(
    module.gcsfuse[*].client_install_runner,
    module.gcsfuse[*].mount_runner,
    module.filestore[*].install_nfs_client_runner,
    module.filestore[*].mount_runner,
    local.controller_var.startup_runners,
  )

  depends_on = [module.gcsfuse, module.filestore]
}
# Startup-script module instance for the login node. The runner list is the
# gcsfuse runners, then the filestore runners, then the login node's own
# startup runners from local.login_var.
module "login_startup" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script/?ref=v1.17.0"

  project_id      = var.project_id
  deployment_name = var.resource_prefix
  region          = local.login_var.region
  labels          = merge(var.labels, { ghpc_role = "scripts" })
  gcs_bucket_path = var.startup_script_gcs_bucket_path

  runners = concat(
    module.gcsfuse[*].client_install_runner,
    module.gcsfuse[*].mount_runner,
    module.filestore[*].install_nfs_client_runner,
    module.filestore[*].mount_runner,
    local.login_var.startup_runners,
  )

  depends_on = [module.gcsfuse, module.filestore]
}
# One instance template per compute partition, from the shared
# ../../common/instance_template module. The machine type is fixed to
# "a3-highgpu-8g"; disk, image and region come from local.compute_partitions.
module "compute_instance_templates" {
  source   = "../../common/instance_template"
  for_each = toset(local.partition_names)

  project_id        = var.project_id
  region            = local.compute_partitions[each.key].region
  resource_prefix   = "${var.resource_prefix}-compute-${each.key}"
  use_static_naming = true
  labels            = var.labels
  service_account   = var.service_account

  machine_type         = "a3-highgpu-8g"
  machine_image        = local.compute_partitions[each.key].machine_image
  disk_size_gb         = local.compute_partitions[each.key].disk_size_gb
  disk_type            = local.compute_partitions[each.key].disk_type
  maintenance_interval = null
  metadata             = null
  startup_script       = module.compute_startups[each.key].startup_script

  network_self_links    = module.network.network_self_links
  subnetwork_self_links = module.network.subnetwork_self_links

  depends_on = [module.network]
}
# Instance template for the controller, from the shared
# ../../common/instance_template module. Machine, disk, image and region
# settings come from local.controller_var.
module "controller_instance_template" {
  source = "../../common/instance_template"

  project_id        = var.project_id
  region            = local.controller_var.region
  resource_prefix   = "${var.resource_prefix}-controller"
  use_static_naming = true
  labels            = var.labels
  service_account   = var.service_account

  machine_type         = local.controller_var.machine_type
  machine_image        = local.controller_var.machine_image
  disk_size_gb         = local.controller_var.disk_size_gb
  disk_type            = local.controller_var.disk_type
  maintenance_interval = null
  metadata             = null
  startup_script       = module.controller_startup.startup_script

  network_self_links    = module.network.network_self_links
  subnetwork_self_links = module.network.subnetwork_self_links

  depends_on = [module.network]
}
# Instance template for the login node, from the shared
# ../../common/instance_template module. Machine, disk, image and region
# settings come from local.login_var.
module "login_instance_template" {
  source = "../../common/instance_template"

  project_id        = var.project_id
  region            = local.login_var.region
  resource_prefix   = "${var.resource_prefix}-login"
  use_static_naming = true
  labels            = var.labels
  service_account   = var.service_account

  machine_type         = local.login_var.machine_type
  machine_image        = local.login_var.machine_image
  disk_size_gb         = local.login_var.disk_size_gb
  disk_type            = local.login_var.disk_type
  maintenance_interval = null
  metadata             = null
  # No startup script is set on this template.
  startup_script = null

  network_self_links    = module.network.network_self_links
  subnetwork_self_links = module.network.subnetwork_self_links

  depends_on = [module.network]
}
# One node group per compute partition. Each group uses that partition's
# instance template URL and the service account exported by the matching
# instance template module; node counts come from local.compute_partitions.
module "compute_node_groups" {
  source   = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v5-node-group//?ref=v1.17.0"
  for_each = toset(local.partition_names)

  project_id = var.project_id
  labels     = merge(var.labels, { ghpc_role = "compute" })

  instance_template = local.compute_instance_templates[each.key]
  service_account   = module.compute_instance_templates[each.key].service_account

  node_count_static      = local.compute_partitions[each.key].node_count_static
  node_count_dynamic_max = local.compute_partitions[each.key].node_count_dynamic_max
}
# One partition module instance per compute partition, each wrapping the
# single node group created for it. The first name in local.partition_names is
# flagged as the default partition, and enable_placement is set to false.
module "compute_partitions" {
  source   = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v5-partition//?ref=v1.17.0"
  for_each = toset(local.partition_names)

  project_id      = var.project_id
  deployment_name = var.resource_prefix

  partition_name   = each.key
  is_default       = each.key == local.partition_names[0]
  enable_placement = false
  node_groups      = [module.compute_node_groups[each.key].node_groups]
  startup_script   = module.compute_startups[each.key].startup_script

  region               = local.compute_partitions[each.key].region
  zone                 = local.compute_partitions[each.key].zone
  subnetwork_project   = var.project_id
  subnetwork_self_link = module.network.subnetwork_self_links[0]

  depends_on = [module.network]
}
# Controller module instance. It is built from the controller instance
# template and receives every compute partition, in the order of
# local.partition_names.
module "controller" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v5-controller//?ref=v1.17.0"

  project_id             = var.project_id
  deployment_name        = var.resource_prefix
  labels                 = merge(var.labels, { ghpc_role = "scheduler" })
  enable_cleanup_compute = var.enable_cleanup_compute

  instance_template         = local.controller_instance_template
  service_account           = module.controller_instance_template.service_account
  controller_startup_script = module.controller_startup.startup_script
  partition                 = [for name in local.partition_names : module.compute_partitions[name].partition]

  region = local.controller_var.region
  # Pass through a zone requested in var.controller_var. Previously the zone
  # was only used to derive the region, so the requested zone itself was never
  # handed to this module. coalesce() fails on a null or empty value and try()
  # turns that failure into null, which leaves the zone unset exactly as
  # before when the caller did not ask for one.
  zone                 = try(coalesce(var.controller_var.zone), null)
  subnetwork_self_link = module.network.subnetwork_self_links[0]

  depends_on = [module.network]
}
# Login module instance. It is built from the login instance template and is
# given the controller's instance ID.
module "login" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v5-login//?ref=v1.17.0"

  project_id      = var.project_id
  deployment_name = var.resource_prefix
  labels          = merge(var.labels, { ghpc_role = "scheduler" })

  controller_instance_id = module.controller.controller_instance_id
  instance_template      = local.login_instance_template
  service_account        = module.login_instance_template.service_account
  startup_script         = module.login_startup.startup_script

  region = local.login_var.region
  # Pass through a zone requested in var.login_var. Previously the zone was
  # only used to derive the region, so the requested zone itself was never
  # handed to this module. coalesce() fails on a null or empty value and try()
  # turns that failure into null, which leaves the zone unset exactly as
  # before when the caller did not ask for one.
  zone                 = try(coalesce(var.login_var.zone), null)
  subnetwork_self_link = module.network.subnetwork_self_links[0]

  depends_on = [module.network]
}