community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
  # Group nodesets by name with the "..." grouping-mode operator, then keep
  # only the first definition for each name (see the sketch below this block).
  nodeset_map_ell = { for x in var.nodeset : x.nodeset_name => x... }
  nodeset_map     = { for k, vs in local.nodeset_map_ell : k => vs[0] }

  nodeset_tpu_map_ell = { for x in var.nodeset_tpu : x.nodeset_name => x... }
  nodeset_tpu_map     = { for k, vs in local.nodeset_tpu_map_ell : k => vs[0] }

  nodeset_dyn_map_ell = { for x in var.nodeset_dyn : x.nodeset_name => x... }
  nodeset_dyn_map     = { for k, vs in local.nodeset_dyn_map_ell : k => vs[0] }

  no_reservation_affinity = { type : "NO_RESERVATION" }
}
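# A minimal sketch (illustrative only; the nodeset names and machine types
# below are hypothetical) of how the grouping above deduplicates entries that
# share a nodeset_name:
#
#   var.nodeset = [
#     { nodeset_name = "n1", machine_type = "c2-standard-4" },
#     { nodeset_name = "n1", machine_type = "n2-standard-8" },
#   ]
#
#   # the "x..." grouping mode collects every value that shares a key:
#   nodeset_map_ell = {
#     n1 = [
#       { nodeset_name = "n1", machine_type = "c2-standard-4" },
#       { nodeset_name = "n1", machine_type = "n2-standard-8" },
#     ]
#   }
#
#   # vs[0] then keeps only the first definition per name:
#   nodeset_map = {
#     n1 = { nodeset_name = "n1", machine_type = "c2-standard-4" }
#   }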
# NODESET
module "slurm_nodeset_template" {
  source   = "../../internal/slurm-gcp/instance_template"
  for_each = local.nodeset_map

  project_id          = var.project_id
  slurm_cluster_name  = local.slurm_cluster_name
  slurm_instance_role = "compute"
  slurm_bucket_path   = module.slurm_files.slurm_bucket_path

  additional_disks           = each.value.additional_disks
  bandwidth_tier             = each.value.bandwidth_tier
  can_ip_forward             = each.value.can_ip_forward
  advanced_machine_features  = each.value.advanced_machine_features
  disk_auto_delete           = each.value.disk_auto_delete
  disk_labels                = each.value.disk_labels
  disk_resource_manager_tags = each.value.disk_resource_manager_tags
  disk_size_gb               = each.value.disk_size_gb
  disk_type                  = each.value.disk_type
  enable_confidential_vm     = each.value.enable_confidential_vm
  enable_oslogin             = each.value.enable_oslogin
  enable_shielded_vm         = each.value.enable_shielded_vm
  gpu                        = each.value.gpu
  labels                     = merge(each.value.labels, { slurm_nodeset = each.value.nodeset_name })
  machine_type               = each.value.machine_type
  metadata                   = merge(each.value.metadata, local.universe_domain)
  min_cpu_platform           = each.value.min_cpu_platform
  name_prefix                = each.value.nodeset_name
  on_host_maintenance        = each.value.on_host_maintenance
  preemptible                = each.value.preemptible
  resource_manager_tags      = each.value.resource_manager_tags
  spot                       = each.value.spot
  termination_action         = each.value.termination_action
  service_account            = each.value.service_account
  shielded_instance_config   = each.value.shielded_instance_config
  source_image_family        = each.value.source_image_family
  source_image_project       = each.value.source_image_project
  source_image               = each.value.source_image
  subnetwork                 = each.value.subnetwork_self_link
  additional_networks        = each.value.additional_networks
  access_config              = each.value.access_config
  tags                       = concat([local.slurm_cluster_name], each.value.tags)

  # DWS Flex (without bulk insert) runs instances under the FLEX_START
  # provisioning model with a bounded run duration and no reservation affinity.
  max_run_duration     = (each.value.dws_flex.enabled && !each.value.dws_flex.use_bulk_insert) ? each.value.dws_flex.max_run_duration : null
  provisioning_model   = (each.value.dws_flex.enabled && !each.value.dws_flex.use_bulk_insert) ? "FLEX_START" : null
  reservation_affinity = (each.value.dws_flex.enabled && !each.value.dws_flex.use_bulk_insert) ? local.no_reservation_affinity : null
}

module "nodeset_cleanup" {
  source   = "./modules/cleanup_compute"
  for_each = local.nodeset_map

  nodeset                = each.value
  project_id             = var.project_id
  slurm_cluster_name     = local.slurm_cluster_name
  enable_cleanup_compute = var.enable_cleanup_compute
  universe_domain        = var.universe_domain
  endpoint_versions      = var.endpoint_versions
  gcloud_path_override   = var.gcloud_path_override

  nodeset_template = module.slurm_nodeset_template[each.value.nodeset_name].self_link
}

locals {
  nodesets = [for name, ns in local.nodeset_map : {
    nodeset_name                     = ns.nodeset_name
    node_conf                        = ns.node_conf
    dws_flex                         = ns.dws_flex
    instance_template                = module.slurm_nodeset_template[ns.nodeset_name].self_link
    node_count_dynamic_max           = ns.node_count_dynamic_max
    node_count_static                = ns.node_count_static
    subnetwork                       = ns.subnetwork_self_link
    reservation_name                 = ns.reservation_name
    future_reservation               = ns.future_reservation
    maintenance_interval             = ns.maintenance_interval
    instance_properties_json         = ns.instance_properties_json
    enable_placement                 = ns.enable_placement
    placement_max_distance           = ns.placement_max_distance
    network_storage                  = ns.network_storage
    zone_target_shape                = ns.zone_target_shape
    zone_policy_allow                = ns.zone_policy_allow
    zone_policy_deny                 = ns.zone_policy_deny
    enable_maintenance_reservation   = ns.enable_maintenance_reservation
    enable_opportunistic_maintenance = ns.enable_opportunistic_maintenance
  }]
}

# NODESET TPU
module "slurm_nodeset_tpu" {
  source   = "../../internal/slurm-gcp/nodeset_tpu"
  for_each = local.nodeset_tpu_map

  project_id             = var.project_id
  node_count_dynamic_max = each.value.node_count_dynamic_max
  node_count_static      = each.value.node_count_static
  nodeset_name           = each.value.nodeset_name
  zone                   = each.value.zone
  node_type              = each.value.node_type
  accelerator_config     = each.value.accelerator_config
  tf_version             = each.value.tf_version
  preemptible            = each.value.preemptible
  preserve_tpu           = each.value.preserve_tpu
  enable_public_ip       = each.value.enable_public_ip
  service_account        = each.value.service_account
  data_disks             = each.value.data_disks
  docker_image           = each.value.docker_image
  subnetwork             = each.value.subnetwork
}

module "nodeset_cleanup_tpu" {
  source   = "./modules/cleanup_tpu"
  for_each = local.nodeset_tpu_map

  nodeset = {
    nodeset_name = each.value.nodeset_name
    zone         = each.value.zone
  }

  project_id             = var.project_id
  slurm_cluster_name     = local.slurm_cluster_name
  enable_cleanup_compute = var.enable_cleanup_compute
  universe_domain        = var.universe_domain
  endpoint_versions      = var.endpoint_versions
  gcloud_path_override   = var.gcloud_path_override

  depends_on = [
    # Depend on controller network, as a best effort to avoid
    # subnetwork resourceInUseByAnotherResource error
    var.subnetwork_self_link
  ]
}

# Writes one YAML object per partition to the cluster bucket.
# NOTE: the "parition" spelling is kept as-is; renaming the resource would
# change its state address, which the moved block below depends on.
resource "google_storage_bucket_object" "parition_config" {
  for_each = { for p in var.partitions : p.partition_name => p }

  bucket  = module.slurm_files.bucket_name
  name    = "${module.slurm_files.bucket_dir}/partition_configs/${each.key}.yaml"
  content = yamlencode(each.value)
}

moved {
  from = module.slurm_files.google_storage_bucket_object.parition_config
  to   = google_storage_bucket_object.parition_config
}
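# A minimal sketch of the object written by the resource above, assuming a
# partition named "compute" (the name and any fields other than
# partition_name are hypothetical). The object would land at:
#
#   <bucket_dir>/partition_configs/compute.yaml
#
# with content produced by yamlencode(each.value), along the lines of:
#
#   "partition_name": "compute"
#   "partition_conf": {}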