# community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Resolve the guest accelerator (GPU) configuration implied by the
# machine type and any explicitly requested accelerators.
module "gpu" {
  source = "../../../../modules/internal/gpu-definition"

  machine_type      = var.machine_type
  guest_accelerator = var.guest_accelerator
}
locals {
  # Project hosting the controller VM; defaults to the deployment project.
  controller_project_id = coalesce(var.controller_project_id, var.project_id)

  # Normalize user-supplied extra disks, folding the module-wide labels
  # into each disk's own label set.
  additional_disks = [
    for disk in var.additional_disks : {
      disk_name                  = disk.disk_name
      device_name                = disk.device_name
      disk_type                  = disk.disk_type
      disk_size_gb               = disk.disk_size_gb
      disk_labels                = merge(disk.disk_labels, local.labels)
      auto_delete                = disk.auto_delete
      boot                       = disk.boot
      disk_resource_manager_tags = disk.disk_resource_manager_tags
    }
  ]

  # Optional attachment of the controller state disk; never auto-deleted so
  # controller state survives instance re-creation.
  state_disk = var.controller_state_disk == null ? [] : [{
    source      = google_compute_disk.controller_disk[0].name
    device_name = google_compute_disk.controller_disk[0].name
    disk_labels = null
    auto_delete = false
    boot        = false
  }]

  # Synthesized email of the controller project's default compute
  # service account, used when no explicit account is supplied.
  synth_def_sa_email = "${data.google_project.controller_project.number}-compute@developer.gserviceaccount.com"

  service_account = {
    email  = coalesce(var.service_account_email, local.synth_def_sa_email)
    scopes = var.service_account_scopes
  }

  # Opt the VM out of automatic OS updates unless explicitly allowed.
  disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" }

  # NOTE: later maps win on key collisions — user-supplied metadata
  # overrides the auto-update opt-out, and the universe domain overrides both.
  metadata = merge(
    local.disable_automatic_updates_metadata,
    var.metadata,
    local.universe_domain
  )
}
# Look up the controller project to obtain its numeric project number,
# which is needed to synthesize the default compute service account email.
data "google_project" "controller_project" {
  project_id = local.controller_project_id
}
# Persistent disk holding Slurm controller state; only created when a
# state disk is configured.
resource "google_compute_disk" "controller_disk" {
  count = var.controller_state_disk == null ? 0 : 1

  project = local.controller_project_id
  zone    = var.zone
  name    = "${local.slurm_cluster_name}-controller-save"
  type    = var.controller_state_disk.type
  size    = var.controller_state_disk.size
}
# INSTANCE TEMPLATE
module "slurm_controller_template" {
  source = "../../internal/slurm-gcp/instance_template"

  # Identity & placement
  project_id          = local.controller_project_id
  region              = var.region
  slurm_cluster_name  = local.slurm_cluster_name
  slurm_instance_role = "controller"
  slurm_bucket_path   = module.slurm_files.slurm_bucket_path
  labels              = local.labels

  # Machine shape & scheduling
  machine_type              = var.machine_type
  min_cpu_platform          = var.min_cpu_platform
  gpu                       = one(module.gpu.guest_accelerator)
  advanced_machine_features = var.advanced_machine_features
  preemptible               = var.preemptible
  on_host_maintenance       = var.on_host_maintenance

  # Boot disk and additional disks (includes the optional state disk)
  disk_auto_delete           = var.disk_auto_delete
  disk_labels                = merge(var.disk_labels, local.labels)
  disk_size_gb               = var.disk_size_gb
  disk_type                  = var.disk_type
  disk_resource_manager_tags = var.disk_resource_manager_tags
  additional_disks           = concat(local.additional_disks, local.state_disk)

  # Image selection — locals come from source_image_logic.tf
  source_image_family  = local.source_image_family
  source_image_project = local.source_image_project_normalized
  source_image         = local.source_image

  # Networking
  subnetwork     = var.subnetwork_self_link
  bandwidth_tier = var.bandwidth_tier
  can_ip_forward = var.can_ip_forward
  tags           = concat([local.slurm_cluster_name], var.tags)

  # Security & identity
  service_account          = local.service_account
  enable_confidential_vm   = var.enable_confidential_vm
  enable_oslogin           = var.enable_oslogin
  enable_shielded_vm       = var.enable_shielded_vm
  shielded_instance_config = var.shielded_instance_config
  resource_manager_tags    = var.resource_manager_tags

  metadata = local.metadata

  # termination_action = TODO: add support for termination_action (?)
}
# INSTANCE
resource "google_compute_instance_from_template" "controller" {
  provider = google-beta

  name                      = "${local.slurm_cluster_name}-controller"
  project                   = local.controller_project_id
  zone                      = var.zone
  source_instance_template  = module.slurm_controller_template.self_link
  allow_stopping_for_update = true

  # NICs are declared here rather than in the template because the optional
  # static IP must be bound on the instance itself. This block is always the
  # primary interface (nic0).
  network_interface {
    subnetwork = var.subnetwork_self_link
    network_ip = length(var.static_ips) > 0 ? var.static_ips[0] : ""

    # Attach an ephemeral external IP only when public IPs are enabled.
    dynamic "access_config" {
      for_each = var.enable_controller_public_ips ? ["unit"] : []
      content {
        nat_ip       = null
        network_tier = null
      }
    }
  }

  # Optional secondary interface backed by a network attachment.
  dynamic "network_interface" {
    for_each = var.controller_network_attachment == null ? [] : [1]
    content {
      network_attachment = var.controller_network_attachment
    }
  }
}
# State migration: the controller VM was previously managed by the
# slurm_controller_instance wrapper module. This mapping prevents existing
# deployments from destroying and re-creating the instance on upgrade.
moved {
  from = module.slurm_controller_instance.google_compute_instance_from_template.slurm_instance[0]
  to = google_compute_instance_from_template.controller
}
# SECRETS: CLOUDSQL
# Holds the CloudSQL connection configuration so cluster nodes can retrieve
# it from Secret Manager at runtime. Created only when CloudSQL is in use.
resource "google_secret_manager_secret" "cloudsql" {
  count = var.cloudsql == null ? 0 : 1

  secret_id = "${local.slurm_cluster_name}-slurm-secret-cloudsql"

  labels = {
    slurm_cluster_name = local.slurm_cluster_name
  }

  # Exactly one replication mode is rendered: "auto" when no replica
  # locations were supplied, "user_managed" otherwise.
  replication {
    dynamic "auto" {
      for_each = length(var.cloudsql.user_managed_replication) > 0 ? [] : [1]
      content {}
    }
    dynamic "user_managed" {
      for_each = length(var.cloudsql.user_managed_replication) > 0 ? [1] : []
      content {
        dynamic "replicas" {
          # NOTE(review): nonsensitive() implies var.cloudsql is marked
          # sensitive and replica locations are safe to expose — confirm.
          for_each = nonsensitive(var.cloudsql.user_managed_replication)
          content {
            location = replicas.value.location
            dynamic "customer_managed_encryption" {
              # compact() drops a null/empty kms_key_name, omitting CMEK.
              for_each = compact([replicas.value.kms_key_name])
              content {
                kms_key_name = customer_managed_encryption.value
              }
            }
          }
        }
      }
    }
  }
}
# Current value of the CloudSQL secret: the full cloudsql variable as JSON.
resource "google_secret_manager_secret_version" "cloudsql_version" {
  count = var.cloudsql == null ? 0 : 1

  secret      = google_secret_manager_secret.cloudsql[0].id
  secret_data = jsonencode(var.cloudsql)
}
# Grant the controller's service account read access to the CloudSQL secret.
resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" {
  count = var.cloudsql == null ? 0 : 1

  secret_id = google_secret_manager_secret.cloudsql[0].id
  role      = "roles/secretmanager.secretAccessor"
  member    = "serviceAccount:${local.service_account.email}"
}