community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf (163 lines of code) (raw):

# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. locals { # This label allows for billing report tracking based on module. labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute" }) } module "gpu" { source = "../../../../modules/internal/gpu-definition" machine_type = var.machine_type guest_accelerator = var.guest_accelerator } locals { guest_accelerator = module.gpu.guest_accelerator disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, var.metadata ) name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) additional_disks = [ for ad in var.additional_disks : { disk_name = ad.disk_name device_name = ad.device_name disk_type = ad.disk_type disk_size_gb = ad.disk_size_gb disk_labels = merge(ad.disk_labels, local.labels) auto_delete = ad.auto_delete boot = ad.boot disk_resource_manager_tags = ad.disk_resource_manager_tags } ] public_access_config = var.enable_public_ips ? [{ nat_ip = null, network_tier = null }] : [] access_config = length(var.access_config) == 0 ? local.public_access_config : var.access_config service_account = { email = var.service_account_email scopes = var.service_account_scopes } ghpc_startup_script = [{ filename = "ghpc_nodeset_startup.sh" content = var.startup_script }] termination_action = (var.dws_flex.enabled && !var.dws_flex.use_bulk_insert) ? "DELETE" : try(var.spot_instance_config.termination_action, null) nodeset = { node_count_static = var.node_count_static node_count_dynamic_max = var.node_count_dynamic_max node_conf = var.node_conf nodeset_name = local.name dws_flex = var.dws_flex disk_auto_delete = var.disk_auto_delete disk_labels = merge(local.labels, var.disk_labels) disk_size_gb = var.disk_size_gb disk_type = var.disk_type disk_resource_manager_tags = var.disk_resource_manager_tags additional_disks = local.additional_disks bandwidth_tier = var.bandwidth_tier can_ip_forward = var.can_ip_forward enable_confidential_vm = var.enable_confidential_vm enable_placement = var.enable_placement placement_max_distance = var.placement_max_distance enable_oslogin = var.enable_oslogin enable_shielded_vm = var.enable_shielded_vm gpu = one(local.guest_accelerator) labels = local.labels machine_type = terraform_data.machine_type_zone_validation.output advanced_machine_features = var.advanced_machine_features metadata = local.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance preemptible = var.preemptible region = var.region resource_manager_tags = var.resource_manager_tags service_account = local.service_account shielded_instance_config = var.shielded_instance_config source_image_family = local.source_image_family # requires source_image_logic.tf source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf subnetwork_self_link = var.subnetwork_self_link additional_networks = var.additional_networks access_config = local.access_config tags = var.tags spot = var.enable_spot_vm termination_action = local.termination_action reservation_name = local.reservation_name future_reservation = local.future_reservation maintenance_interval = var.maintenance_interval instance_properties_json = jsonencode(var.instance_properties) zone_target_shape = var.zone_target_shape zone_policy_allow = local.zones zone_policy_deny = local.zones_deny startup_script = local.ghpc_startup_script network_storage = var.network_storage enable_maintenance_reservation = var.enable_maintenance_reservation enable_opportunistic_maintenance = var.enable_opportunistic_maintenance } } locals { zones = setunion(var.zones, [var.zone]) zones_deny = setsubtract(data.google_compute_zones.available.names, local.zones) } data "google_compute_zones" "available" { project = var.project_id region = var.region lifecycle { postcondition { condition = length(setsubtract(local.zones, self.names)) == 0 error_message = <<-EOD Invalid zones=${jsonencode(setsubtract(local.zones, self.names))} Available zones=${jsonencode(self.names)} EOD } } } locals { res_match = regex("^(?P<whole>(?P<prefix>projects/(?P<project>[a-z0-9-]+)/reservations/)?(?P<name>[a-z0-9-]+)(?P<suffix>/reservationBlocks/[a-z0-9-]+)?)?$", var.reservation_name) res_short_name = local.res_match.name res_project = coalesce(local.res_match.project, var.project_id) res_prefix = coalesce(local.res_match.prefix, "projects/${local.res_project}/reservations/") res_suffix = local.res_match.suffix == null ? "" : local.res_match.suffix reservation_name = local.res_match.whole == null ? "" : "${local.res_prefix}${local.res_short_name}${local.res_suffix}" } locals { fr_match = regex("^(?P<whole>projects/(?P<project>[a-z0-9-]+)/zones/(?P<zone>[a-z0-9-]+)/futureReservations/)?(?P<name>[a-z0-9-]+)?$", var.future_reservation) fr_name = local.fr_match.name fr_project = coalesce(local.fr_match.project, var.project_id) fr_zone = coalesce(local.fr_match.zone, var.zone) future_reservation = var.future_reservation == "" ? "" : "projects/${local.fr_project}/zones/${local.fr_zone}/futureReservations/${local.fr_name}" } # tflint-ignore: terraform_unused_declarations data "google_compute_reservation" "reservation" { count = length(local.reservation_name) > 0 ? 1 : 0 name = local.res_short_name project = local.res_project zone = var.zone lifecycle { postcondition { condition = self.self_link != null error_message = "Couldn't find the reservation ${var.reservation_name}" } postcondition { condition = coalesce(self.specific_reservation_required, true) error_message = <<EOT your reservation has to be specific, see https://cloud.google.com/compute/docs/instances/reservations-overview#how-reservations-work for more information. if it's intentionally automatic, don't specify it in the blueprint. EOT } # TODO: wait for https://github.com/hashicorp/terraform-provider-google/issues/18248 # Add a validation that if reservation.project != var.project_id it should be a shared reservation } } data "google_compute_machine_types" "machine_types_by_zone" { for_each = local.zones project = var.project_id filter = format("name = \"%s\"", var.machine_type) zone = each.value } locals { machine_types_by_zone = data.google_compute_machine_types.machine_types_by_zone zones_with_machine_type = [for k, v in local.machine_types_by_zone : k if length(v.machine_types) > 0] } resource "terraform_data" "machine_type_zone_validation" { input = var.machine_type lifecycle { precondition { condition = length(local.zones_with_machine_type) > 0 error_message = <<-EOT machine type ${var.machine_type} is not available in any of the zones ${jsonencode(local.zones)}". To list zones in which it is available, run: gcloud compute machine-types list --filter="name=${var.machine_type}" EOT } } }