best-practices/gke-batch-refarch/gke/main.tf (257 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Per-team Kubernetes namespace names, mirrored from the input variables so
# the rest of this file can refer to them as local.team_<x>_namespace.
locals {
  team_a_namespace = var.team_a_namespace
  team_b_namespace = var.team_b_namespace
  team_c_namespace = var.team_c_namespace
  team_d_namespace = var.team_d_namespace
}
# Configuration of the active Google provider client. It is not referenced by
# any other block visible in this file.
data "google_client_config" "default" {
}
# Looks up the existing "batch-dev" GKE cluster that the node pools in this
# file attach to.
data "google_container_cluster" "gke_cluster" {
  # Pin the lookup to the same project as the node pools that attach to this
  # cluster, instead of relying on the provider's default project.
  project  = var.project_id
  name     = "batch-dev"
  location = var.region
}
# Metadata for the target project identified by var.project_id.
data "google_project" "project" {
  project_id = var.project_id
}
# Zonal Compute Engine reservation for GPU-equipped instances. It is only
# consumed by instances that target it by name
# (specific_reservation_required = true).
resource "google_compute_reservation" "machine_reservation" {
  name    = "machine-reservation"
  project = var.project_id
  zone    = var.zone

  specific_reservation_required = true

  specific_reservation {
    # Number of instances held by the reservation.
    count = var.machine_reservation_count

    instance_properties {
      machine_type = var.machine_type

      guest_accelerators {
        accelerator_type  = var.accelerator
        accelerator_count = var.accelerator_count
      }
    }
  }
}
# Node pool that consumes the GPU reservation. It is pinned to the
# reservation's zone and sized with the same count variable as the reservation.
resource "google_container_node_pool" "reserved_np" {
  name           = "reserved-np"
  project        = var.project_id
  cluster        = data.google_container_cluster.gke_cluster.name
  location       = var.region
  node_locations = [var.zone]
  node_count     = var.machine_reservation_count

  node_config {
    machine_type = var.machine_type

    # One taint block per entry in var.reserved_taints.
    dynamic "taint" {
      for_each = var.reserved_taints
      content {
        key    = taint.value.key
        value  = taint.value.taint_value
        effect = taint.value.effect
      }
    }

    labels = {
      "resource-type" = "reservation"
    }

    guest_accelerator {
      type  = var.accelerator
      count = var.accelerator_count
    }

    reservation_affinity {
      consume_reservation_type = "SPECIFIC_RESERVATION"
      key                      = "compute.googleapis.com/reservation-name"
      # Reference the reservation resource rather than repeating its name as a
      # literal: this creates an implicit dependency so the reservation exists
      # before the node pool is created, and keeps the two names in sync.
      values = [google_compute_reservation.machine_reservation.name]
    }

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Do not plan changes for label or taint drift on the node config.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
# Autoscaled on-demand GPU node pool: spill-over capacity for high priority
# workloads that do not fit on the reserved node pool.
resource "google_container_node_pool" "ondemand_np" {
  project  = var.project_id
  name     = "ondemand-np"
  cluster  = data.google_container_cluster.gke_cluster.name
  location = var.region

  # NOTE(review): assumes var.region has zones with suffixes -a, -b and -c;
  # confirm for the regions this is deployed to.
  node_locations = [
    "${var.region}-a",
    "${var.region}-b",
    "${var.region}-c",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 24
    location_policy      = "ANY"
  }

  node_config {
    machine_type = var.machine_type

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]

    labels = {
      "resource-type" = "ondemand"
    }

    guest_accelerator {
      type  = var.accelerator
      count = var.accelerator_count
    }

    # One taint block per entry in var.ondemand_taints.
    dynamic "taint" {
      for_each = var.ondemand_taints
      content {
        key    = taint.value.key
        value  = taint.value.taint_value
        effect = taint.value.effect
      }
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Do not plan changes for label or taint drift on the node config.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
# Autoscaled Spot GPU node pool: spill-over capacity for low priority
# workloads that do not fit on the reserved node pool.
resource "google_container_node_pool" "spot_np" {
  project  = var.project_id
  name     = "spot-np"
  cluster  = data.google_container_cluster.gke_cluster.name
  location = var.region

  # NOTE(review): assumes var.region has zones with suffixes -a, -b and -c;
  # confirm for the regions this is deployed to.
  node_locations = [
    "${var.region}-a",
    "${var.region}-b",
    "${var.region}-c",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 36
    location_policy      = "ANY"
  }

  node_config {
    machine_type = var.machine_type
    # Provision the nodes as Spot VMs.
    spot = true

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]

    labels = {
      "resource-type" = "spot"
    }

    guest_accelerator {
      type  = var.accelerator
      count = var.accelerator_count
    }

    # One taint block per entry in var.spot_taints.
    dynamic "taint" {
      for_each = var.spot_taints
      content {
        key    = taint.value.key
        value  = taint.value.taint_value
        effect = taint.value.effect
      }
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Do not plan changes for label or taint drift on the node config.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
# Workload Identity for team-a
# Google service account used by team-a workloads.
resource "google_service_account" "wi_team_a" {
  # Create the account in the same project that its IAM role grants target,
  # rather than in the provider's default project.
  project      = var.project_id
  account_id   = "wi-team-a"
  display_name = "team-a Service Account"
}
# Grants roles/monitoring.metricWriter on the project to the team-a service account.
resource "google_project_iam_member" "wi_team_a_monitoring_metricwriter" {
  project = var.project_id
  role    = "roles/monitoring.metricWriter"
  member  = google_service_account.wi_team_a.member
}
# Grants roles/logging.logWriter on the project to the team-a service account.
resource "google_project_iam_member" "wi_team_a_logging_logwriter" {
  project = var.project_id
  role    = "roles/logging.logWriter"
  member  = google_service_account.wi_team_a.member
}
# Grants roles/storage.admin on the project to the team-a service account.
# NOTE(review): this is a project-wide admin role; confirm that a narrower
# storage role would not be sufficient.
resource "google_project_iam_member" "wi_team_a_storage_admin" {
  project = var.project_id
  role    = "roles/storage.admin"
  member  = google_service_account.wi_team_a.member
}
# Lets the Kubernetes service account "<team-a namespace>-ksa" in the team-a
# namespace use the team-a Google service account via Workload Identity.
resource "google_service_account_iam_binding" "wi_team_a_iam_wi_user" {
  role               = "roles/iam.workloadIdentityUser"
  service_account_id = google_service_account.wi_team_a.name
  members = [
    "serviceAccount:${var.project_id}.svc.id.goog[${local.team_a_namespace}/${local.team_a_namespace}-ksa]",
  ]
}
# Workload Identity for team-b
# Google service account used by team-b workloads.
resource "google_service_account" "wi_team_b" {
  # Create the account in the same project that its IAM role grants target,
  # rather than in the provider's default project.
  project      = var.project_id
  account_id   = "wi-team-b"
  display_name = "team-b Service Account"
}
# Grants roles/monitoring.metricWriter on the project to the team-b service account.
resource "google_project_iam_member" "wi_team_b_monitoring_metricwriter" {
  project = var.project_id
  role    = "roles/monitoring.metricWriter"
  member  = google_service_account.wi_team_b.member
}
# Grants roles/logging.logWriter on the project to the team-b service account.
resource "google_project_iam_member" "wi_team_b_logging_logwriter" {
  project = var.project_id
  role    = "roles/logging.logWriter"
  member  = google_service_account.wi_team_b.member
}
# Grants roles/storage.admin on the project to the team-b service account.
# NOTE(review): this is a project-wide admin role; confirm that a narrower
# storage role would not be sufficient.
resource "google_project_iam_member" "wi_team_b_storage_admin" {
  project = var.project_id
  role    = "roles/storage.admin"
  member  = google_service_account.wi_team_b.member
}
# Lets the Kubernetes service account "<team-b namespace>-ksa" in the team-b
# namespace use the team-b Google service account via Workload Identity.
resource "google_service_account_iam_binding" "wi_team_b_iam_wi_user" {
  role               = "roles/iam.workloadIdentityUser"
  service_account_id = google_service_account.wi_team_b.name
  members = [
    "serviceAccount:${var.project_id}.svc.id.goog[${local.team_b_namespace}/${local.team_b_namespace}-ksa]",
  ]
}
# Workload Identity for team-c
# Google service account used by team-c workloads.
resource "google_service_account" "wi_team_c" {
  # Create the account in the same project that its IAM role grants target,
  # rather than in the provider's default project.
  project      = var.project_id
  account_id   = "wi-team-c"
  display_name = "team-c Service Account"
}
# Grants roles/monitoring.metricWriter on the project to the team-c service account.
resource "google_project_iam_member" "wi_team_c_monitoring_metricwriter" {
  project = var.project_id
  role    = "roles/monitoring.metricWriter"
  member  = google_service_account.wi_team_c.member
}
# Grants roles/logging.logWriter on the project to the team-c service account.
resource "google_project_iam_member" "wi_team_c_logging_logwriter" {
  project = var.project_id
  role    = "roles/logging.logWriter"
  member  = google_service_account.wi_team_c.member
}
# Grants roles/storage.admin on the project to the team-c service account.
# NOTE(review): this is a project-wide admin role; confirm that a narrower
# storage role would not be sufficient.
resource "google_project_iam_member" "wi_team_c_storage_admin" {
  project = var.project_id
  role    = "roles/storage.admin"
  member  = google_service_account.wi_team_c.member
}
# Lets the Kubernetes service account "<team-c namespace>-ksa" in the team-c
# namespace use the team-c Google service account via Workload Identity.
resource "google_service_account_iam_binding" "wi_team_c_iam_wi_user" {
  role               = "roles/iam.workloadIdentityUser"
  service_account_id = google_service_account.wi_team_c.name
  members = [
    "serviceAccount:${var.project_id}.svc.id.goog[${local.team_c_namespace}/${local.team_c_namespace}-ksa]",
  ]
}
# Workload Identity for team-d
# Google service account used by team-d workloads.
resource "google_service_account" "wi_team_d" {
  # Create the account in the same project that its IAM role grants target,
  # rather than in the provider's default project.
  project      = var.project_id
  account_id   = "wi-team-d"
  display_name = "team-d Service Account"
}
# Grants roles/monitoring.metricWriter on the project to the team-d service account.
resource "google_project_iam_member" "wi_team_d_monitoring_metricwriter" {
  project = var.project_id
  role    = "roles/monitoring.metricWriter"
  member  = google_service_account.wi_team_d.member
}
# Grants roles/logging.logWriter on the project to the team-d service account.
resource "google_project_iam_member" "wi_team_d_logging_logwriter" {
  project = var.project_id
  role    = "roles/logging.logWriter"
  member  = google_service_account.wi_team_d.member
}
# Grants roles/storage.admin on the project to the team-d service account.
# NOTE(review): this is a project-wide admin role; confirm that a narrower
# storage role would not be sufficient.
resource "google_project_iam_member" "wi_team_d_storage_admin" {
  project = var.project_id
  role    = "roles/storage.admin"
  member  = google_service_account.wi_team_d.member
}
# Lets the Kubernetes service account "<team-d namespace>-ksa" in the team-d
# namespace use the team-d Google service account via Workload Identity.
resource "google_service_account_iam_binding" "wi_team_d_iam_wi_user" {
  role               = "roles/iam.workloadIdentityUser"
  service_account_id = google_service_account.wi_team_d.name
  members = [
    "serviceAccount:${var.project_id}.svc.id.goog[${local.team_d_namespace}/${local.team_d_namespace}-ksa]",
  ]
}