community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf:

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# BUCKET
locals {
  synt_suffix       = substr(md5("${local.controller_project_id}${var.deployment_name}"), 0, 5)
  synth_bucket_name = "${local.slurm_cluster_name}${local.synt_suffix}"

  bucket_name = var.create_bucket ? module.bucket[0].name : var.bucket_name
}

module "bucket" {
  source  = "terraform-google-modules/cloud-storage/google"
  version = "~> 6.1"

  count = var.create_bucket ? 1 : 0

  location   = var.region
  names      = [local.synth_bucket_name]
  prefix     = "slurm"
  project_id = local.controller_project_id

  force_destroy = {
    (local.synth_bucket_name) = true
  }

  labels = merge(local.labels, {
    slurm_cluster_name = local.slurm_cluster_name
  })
}

# BUCKET IAMs
locals {
  compute_sa     = toset(flatten([for x in module.slurm_nodeset_template : x.service_account]))
  compute_tpu_sa = toset(flatten([for x in module.slurm_nodeset_tpu : x.service_account]))
  login_sa       = toset(flatten([for x in module.login : x.service_account]))

  viewers = toset(flatten([
    "serviceAccount:${module.slurm_controller_template.service_account.email}",
    formatlist("serviceAccount:%s", [for x in local.compute_sa : x.email]),
    formatlist("serviceAccount:%s", [for x in local.compute_tpu_sa : x.email if x.email != null]),
    formatlist("serviceAccount:%s", [for x in local.login_sa : x.email]),
  ]))
}

resource "google_storage_bucket_iam_member" "viewers" {
  for_each = local.viewers
  bucket   = local.bucket_name
  role     = "roles/storage.objectViewer"
  member   = each.value
}

resource "google_storage_bucket_iam_member" "legacy_readers" {
  for_each = local.viewers
  bucket   = local.bucket_name
  role     = "roles/storage.legacyBucketReader"
  member   = each.value
}

locals {
  daos_ns = [for ns in var.network_storage : ns if ns.fs_type == "daos"]

  daos_client_install_runners = [
    for ns in local.daos_ns : ns.client_install_runner if ns.client_install_runner != null
  ]

  daos_mount_runners = [
    for ns in local.daos_ns : ns.mount_runner if ns.mount_runner != null
  ]

  daos_network_storage_runners = concat(
    local.daos_client_install_runners,
    local.daos_mount_runners,
  )

  daos_install_mount_script = {
    filename = "ghpc_daos_mount.sh"
    content  = length(local.daos_ns) > 0 ? module.daos_network_storage_scripts[0].startup_script : ""
  }

  common_scripts = length(local.daos_ns) > 0 ? [local.daos_install_mount_script] : []
}

# SLURM FILES
locals {
  ghpc_startup_script_controller = concat(
    local.common_scripts,
    [{
      filename = "ghpc_startup.sh"
      content  = var.controller_startup_script
    }])

  controller_state_disk = {
    device_name : try(google_compute_disk.controller_disk[0].name, null)
  }

  nodeset_startup_scripts = { for k, v in local.nodeset_map : k => concat(local.common_scripts, v.startup_script) }
}

module "daos_network_storage_scripts" {
  count = length(local.daos_ns) > 0 ? 1 : 0

  source          = "../../../../modules/scripts/startup-script"
  labels          = local.labels
  project_id      = var.project_id
  deployment_name = var.deployment_name
  region          = var.region
  runners         = local.daos_network_storage_runners
}

module "slurm_files" {
  source = "./modules/slurm_files"

  project_id         = var.project_id
  slurm_cluster_name = local.slurm_cluster_name
  bucket_dir         = var.bucket_dir
  bucket_name        = local.bucket_name

  controller_network_attachment = var.controller_network_attachment

  slurmdbd_conf_tpl = var.slurmdbd_conf_tpl
  slurm_conf_tpl    = var.slurm_conf_tpl
  cgroup_conf_tpl   = var.cgroup_conf_tpl
  cloud_parameters  = var.cloud_parameters
  cloudsql_secret = try(
    one(google_secret_manager_secret_version.cloudsql_version[*].id),
    null)

  controller_startup_scripts         = local.ghpc_startup_script_controller
  controller_startup_scripts_timeout = var.controller_startup_scripts_timeout
  nodeset_startup_scripts            = local.nodeset_startup_scripts
  compute_startup_scripts_timeout    = var.compute_startup_scripts_timeout

  controller_state_disk = local.controller_state_disk

  enable_debug_logging               = var.enable_debug_logging
  extra_logging_flags                = var.extra_logging_flags
  enable_bigquery_load               = var.enable_bigquery_load
  enable_external_prolog_epilog      = var.enable_external_prolog_epilog
  enable_chs_gpu_health_check_prolog = var.enable_chs_gpu_health_check_prolog
  enable_chs_gpu_health_check_epilog = var.enable_chs_gpu_health_check_epilog
  epilog_scripts                     = var.epilog_scripts
  prolog_scripts                     = var.prolog_scripts

  disable_default_mounts = !var.enable_default_mounts
  network_storage = [for storage in var.network_storage : {
    server_ip     = storage.server_ip,
    remote_mount  = storage.remote_mount,
    local_mount   = storage.local_mount,
    fs_type       = storage.fs_type,
    mount_options = storage.mount_options
  } if storage.fs_type != "daos"]

  nodeset     = local.nodesets
  nodeset_dyn = values(local.nodeset_dyn_map) # Use legacy format for now
  nodeset_tpu = values(module.slurm_nodeset_tpu)[*]

  depends_on = [module.bucket]

  # Providers
  endpoint_versions = var.endpoint_versions
}
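The bucket-name synthesis at the top of the file (an md5 of the controller project ID concatenated with the deployment name, truncated to five characters and appended to the cluster name) can be exercised on its own. The sketch below reproduces that logic with made-up values; the literals "example-project", "hpc-slurm", and "hpcslurm" are illustrative assumptions, not values from any real deployment.

# Standalone sketch of the bucket-name synthesis used above.
# The three literal values are assumptions for illustration only.
locals {
  example_controller_project_id = "example-project"
  example_deployment_name       = "hpc-slurm"
  example_slurm_cluster_name    = "hpcslurm"

  example_synt_suffix       = substr(md5("${local.example_controller_project_id}${local.example_deployment_name}"), 0, 5)
  example_synth_bucket_name = "${local.example_slurm_cluster_name}${local.example_synt_suffix}"
}

output "example_bucket_name" {
  # With prefix = "slurm" set on the cloud-storage module, the bucket it
  # creates ends up named roughly "slurm-${local.example_synth_bucket_name}".
  value = local.example_synth_bucket_name
}

Because the suffix is derived only from the project ID and deployment name, the synthesized name is stable across plans for a given deployment, which is what lets the optional module "bucket" and the downstream IAM bindings agree on the same bucket.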