a2/terraform/modules/cluster/mig/main.tf (136 lines of code) (raw):

/**
  * Copyright 2022 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

locals {
  # Optional startup runners, concatenated in a fixed order:
  #   1. ops agent enablement   (when var.enable_ops_agent is true)
  #   2. ray enablement         (when var.enable_ray is true)
  #   3. inline startup script  (when var.startup_script is non-null and non-empty)
  #   4. startup script file    (when var.startup_script_file is non-null and non-empty)
  # Each disabled option contributes an empty list.
  startup_runners = concat(
    var.enable_ops_agent ? [
      {
        source      = "${path.module}/../../../../../scripts/enable_ops_agent.sh"
        destination = "/tmp/enable_ops_agent.sh"
        type        = "shell"
      },
    ] : [],

    var.enable_ray ? [
      {
        source      = "${path.module}/../../../../../scripts/enable_ray.sh"
        destination = "/tmp/enable_ray.sh"
        type        = "shell"
        # NOTE(review): the meaning of these positional arguments is defined by
        # scripts/enable_ray.sh, which is not visible here -- confirm there.
        args = "1.12.1 26379 8"
      },
    ] : [],

    !(var.startup_script == null || var.startup_script == "") ? [
      {
        content     = var.startup_script
        destination = "/tmp/startup_script.sh"
        type        = "shell"
      },
    ] : [],

    !(var.startup_script_file == null || var.startup_script_file == "") ? [
      {
        source      = var.startup_script_file
        destination = "/tmp/startup_script_file.sh"
        type        = "shell"
      },
    ] : [],
  )
}

# Monitoring dashboard; instantiated only when the ops agent is enabled.
module "dashboard" {
  source = "../../common/dashboard"
  count  = var.enable_ops_agent ? 1 : 0

  project_id      = var.project_id
  resource_prefix = var.resource_prefix

  enable_gce_gke_gpu_utilization_widgets = false
  enable_nvidia_dcgm_widgets             = true
  enable_nvidia_nvml_widgets             = true
}

# Network for the cluster; var.network_existing is forwarded as nic_existing.
module "network" {
  source = "../../common/network"

  project_id      = var.project_id
  region          = var.region
  resource_prefix = var.resource_prefix
  nic_existing    = var.network_existing
}

# One instance per entry of var.gcsfuse_existing.
module "gcsfuse" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/pre-existing-network-storage//?ref=v1.17.0"
  count  = length(var.gcsfuse_existing)

  fs_type       = "gcsfuse"
  remote_mount  = var.gcsfuse_existing[count.index].remote_mount
  local_mount   = var.gcsfuse_existing[count.index].local_mount
  mount_options = "defaults,_netdev,implicit_dirs,allow_other"
}

# One instance per entry of var.filestore_new, attached to the cluster network.
module "filestore" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore//?ref=v1.17.0"
  count  = length(var.filestore_new)

  project_id      = var.project_id
  region          = var.region
  zone            = var.filestore_new[count.index].zone
  deployment_name = var.resource_prefix
  network_id      = module.network.network_id

  # Share name is suffixed with the entry index.
  filestore_share_name = "nfsshare_${count.index}"
  filestore_tier       = var.filestore_new[count.index].filestore_tier
  size_gb              = var.filestore_new[count.index].size_gb
  local_mount          = var.filestore_new[count.index].local_mount

  labels = merge(
    var.labels,
    { ghpc_role = "file-system" },
  )
}

# Combined startup script. Runner order: gcsfuse client install, gcsfuse mount,
# NFS client install, filestore mount, then the optional runners from
# local.startup_runners.
module "startup" {
  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script/?ref=v1.17.0"

  project_id      = var.project_id
  region          = var.region
  deployment_name = var.resource_prefix
  gcs_bucket_path = var.startup_script_gcs_bucket_path

  labels = merge(
    var.labels,
    { ghpc_role = "scripts" },
  )

  runners = concat(
    module.gcsfuse[*].client_install_runner,
    module.gcsfuse[*].mount_runner,
    module.filestore[*].install_nfs_client_runner,
    module.filestore[*].mount_runner,
    local.startup_runners,
  )
}

# One instance template per entry of var.instance_groups. Only machine_type
# varies per group; every other input is shared across groups.
module "compute_instance_template" {
  source = "../../common/instance_template"
  count  = length(var.instance_groups)

  project_id      = var.project_id
  region          = var.region
  resource_prefix = var.resource_prefix

  machine_type                 = var.instance_groups[count.index].machine_type
  machine_image                = var.machine_image
  disk_type                    = var.disk_type
  disk_size_gb                 = var.disk_size_gb
  maintenance_interval         = null
  use_compact_placement_policy = var.use_compact_placement_policy

  network_self_link    = module.network.network_self_link
  subnetwork_self_link = module.network.subnetwork_self_link

  service_account = var.service_account
  metadata        = var.metadata
  startup_script  = module.startup.startup_script

  labels = merge(
    var.labels,
    { ghpc_role = "compute" },
  )
}

# One zonal managed instance group per entry of var.instance_groups, each
# using the instance template with the same index.
resource "google_compute_instance_group_manager" "mig" {
  provider = google-beta
  count    = length(var.instance_groups)

  project            = var.project_id
  zone               = var.instance_groups[count.index].zone
  name               = "${var.resource_prefix}-${count.index}"
  base_instance_name = "${var.resource_prefix}-${count.index}"
  target_size        = var.instance_groups[count.index].target_size
  wait_for_instances = var.wait_for_instances

  version {
    name              = "default"
    instance_template = module.compute_instance_template[count.index].id
  }

  # NOTE(review): max_surge_fixed is left unset while replacement_method is
  # "RECREATE" -- confirm the provider/API default is acceptable for this
  # combination.
  update_policy {
    type                  = "PROACTIVE"
    minimal_action        = "RESTART"
    replacement_method    = "RECREATE" # Instance name will be preserved
    max_unavailable_fixed = 1
  }

  timeouts {
    create = "30m"
    update = "30m"
  }
}