platforms/gke-aiml/playground/container_node_pool.tf (1,204 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################################################################################################################
# CPU
# Available zones: https://cloud.google.com/compute/docs/regions-zones#available
#######################################################################################################################
# Node pool "cpu-n4s8": n4-standard-8 nodes carrying the
# on-demand=true:NO_SCHEDULE taint, autoscaled between 1 and 32 nodes in total
# with the BALANCED location policy.
resource "google_container_node_pool" "cpu_n4s8" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "cpu-n4s8"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = ["us-central1-a", "us-central1-c"]

  # Starting size; later changes to it are ignored (see lifecycle below).
  initial_node_count = 1

  autoscaling {
    total_min_node_count = 1
    total_max_node_count = 32
    location_policy      = "BALANCED"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "n4-standard-8"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "n4"
      "resource-type"  = "cpu"
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      initial_node_count,
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "cpu-n4s8-spot": n4-standard-8 nodes with spot = true, carrying the
# spot=true:NO_SCHEDULE taint, autoscaled between 0 and 32 nodes in total with
# the BALANCED location policy.
resource "google_container_node_pool" "cpu_n4s8_spot" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "cpu-n4s8-spot"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = ["us-central1-a", "us-central1-c"]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 32
    location_policy      = "BALANCED"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "n4-standard-8"
    spot            = true
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "n4"
      "resource-type"  = "cpu"
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "spot"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  # initial_node_count is listed although this block does not set it.
  lifecycle {
    ignore_changes = [
      initial_node_count,
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
#######################################################################################################################
# GPU
# Available zones: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-table
#######################################################################################################################
###################################################################################################
# A100 x 2
###################################################################################################
# Node pool "gpu-a100x2-a2h2": a2-highgpu-2g nodes with 2 x nvidia-tesla-a100
# accelerators, no reservation consumption, on-demand=true:NO_SCHEDULE taint.
# Autoscaled from 0 up to 1000 nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_a100x2_a2h2" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x2-a2h2"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-2g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "40GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-a100x2-a2h2-dws": a2-highgpu-2g nodes with 2 x
# nvidia-tesla-a100 accelerators and queued provisioning enabled. No
# reservation consumption; autoscaled from 0 up to 1000 nodes in total with the
# ANY location policy.
resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x2-a2h2-dws"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-2g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "40GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    # NOTE(review): this pool uses the same "on-demand" taint key as the
    # non-queued pool - confirm that is intended.
    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  queued_provisioning {
    enabled = true
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-a100x2-a2h2-res": a2-highgpu-2g nodes with 2 x
# nvidia-tesla-a100 accelerators, reservation affinity ANY_RESERVATION and the
# reservation=true:NO_SCHEDULE taint. Autoscaled from 0 up to 1000 nodes in
# total with the ANY location policy.
resource "google_container_node_pool" "gpu_a100x2_a2h2_res" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x2-a2h2-res"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-2g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "40GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "ANY_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "reservation"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-a100x2-a2h2-spot": a2-highgpu-2g nodes with spot = true and
# 2 x nvidia-tesla-a100 accelerators, no reservation consumption,
# spot=true:NO_SCHEDULE taint. Autoscaled from 0 up to 1000 nodes in total with
# the ANY location policy.
resource "google_container_node_pool" "gpu_a100x2_a2h2_spot" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x2-a2h2-spot"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-2g"
    spot            = true
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "40GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "spot"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###################################################################################################
# A100 x 8
###################################################################################################
# Node pool "gpu-a100x8-a2h8": a2-highgpu-8g nodes with 8 x nvidia-tesla-a100
# accelerators, no reservation consumption, on-demand=true:NO_SCHEDULE taint.
# Autoscaled from 0 up to 1000 nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_a100x8_a2h8" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x8-a2h8"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-8g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    # NOTE(review): resource-variant is "320GB" here but "40GB" on the
    # a2-highgpu-2g pools - confirm which convention is intended.
    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "320GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-a100x8-a2h8-dws": a2-highgpu-8g nodes with 8 x
# nvidia-tesla-a100 accelerators and queued provisioning enabled. No
# reservation consumption; autoscaled from 0 up to 1000 nodes in total with the
# ANY location policy.
resource "google_container_node_pool" "gpu_a100x8_a2h8_dws" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x8-a2h8-dws"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-8g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    # NOTE(review): resource-variant is "320GB" here but "40GB" on the
    # a2-highgpu-2g pools - confirm which convention is intended.
    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "320GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    # NOTE(review): this pool uses the same "on-demand" taint key as the
    # non-queued pool - confirm that is intended.
    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  queued_provisioning {
    enabled = true
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-a100x8-a2h8-res": a2-highgpu-8g nodes with 8 x
# nvidia-tesla-a100 accelerators, reservation affinity ANY_RESERVATION and the
# reservation=true:NO_SCHEDULE taint. Autoscaled from 0 up to 1000 nodes in
# total with the ANY location policy.
resource "google_container_node_pool" "gpu_a100x8_a2h8_res" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x8-a2h8-res"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-8g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    # NOTE(review): resource-variant is "320GB" here but "40GB" on the
    # a2-highgpu-2g pools - confirm which convention is intended.
    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "320GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "ANY_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "reservation"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-a100x8-a2h8-spot": a2-highgpu-8g nodes with spot = true and
# 8 x nvidia-tesla-a100 accelerators, no reservation consumption,
# spot=true:NO_SCHEDULE taint. Autoscaled from 0 up to 1000 nodes in total with
# the ANY location policy.
resource "google_container_node_pool" "gpu_a100x8_a2h8_spot" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-a100x8-a2h8-spot"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
    "us-central1-f",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a2-highgpu-8g"
    spot            = true
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    # NOTE(review): resource-variant is "320GB" here but "40GB" on the
    # a2-highgpu-2g pools - confirm which convention is intended.
    labels = {
      "resource-model"   = "a100"
      "resource-type"    = "gpu"
      "resource-variant" = "320GB"
    }

    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "spot"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###################################################################################################
# H100 x 8
###################################################################################################
# Node pool "gpu-h100x8-a3h8": a3-highgpu-8g nodes with 8 x nvidia-h100-80gb
# accelerators and 16 local SSDs for ephemeral storage. No reservation
# consumption, on-demand=true:NO_SCHEDULE taint. Autoscaled from 0 up to 1000
# nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_h100x8_a3h8" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-h100x8-a3h8"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = ["us-central1-a", "us-central1-c"]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a3-highgpu-8g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "h100"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-h100-80gb"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    ephemeral_storage_local_ssd_config {
      local_ssd_count = 16
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-h100x8-a3h8-dws": a3-highgpu-8g nodes with 8 x
# nvidia-h100-80gb accelerators, 16 local SSDs for ephemeral storage and queued
# provisioning enabled. No reservation consumption; autoscaled from 0 up to
# 1000 nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-h100x8-a3h8-dws"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = ["us-central1-a", "us-central1-c"]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a3-highgpu-8g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "h100"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-h100-80gb"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    ephemeral_storage_local_ssd_config {
      local_ssd_count = 16
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    # NOTE(review): this pool uses the same "on-demand" taint key as the
    # non-queued pool - confirm that is intended.
    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  queued_provisioning {
    enabled = true
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-h100x8-a3h8-res": a3-highgpu-8g nodes with 8 x
# nvidia-h100-80gb accelerators and 16 local SSDs for ephemeral storage.
# Reservation affinity ANY_RESERVATION, reservation=true:NO_SCHEDULE taint.
# Autoscaled from 0 up to 1000 nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-h100x8-a3h8-res"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = ["us-central1-a", "us-central1-c"]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a3-highgpu-8g"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "h100"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-h100-80gb"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    ephemeral_storage_local_ssd_config {
      local_ssd_count = 16
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "ANY_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "reservation"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-h100x8-a3h8-spot": a3-highgpu-8g nodes with spot = true,
# 8 x nvidia-h100-80gb accelerators and 16 local SSDs for ephemeral storage.
# No reservation consumption, spot=true:NO_SCHEDULE taint. Autoscaled from 0 up
# to 1000 nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-h100x8-a3h8-spot"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = ["us-central1-a", "us-central1-c"]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "a3-highgpu-8g"
    spot            = true
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "h100"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-h100-80gb"
      count = 8

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    ephemeral_storage_local_ssd_config {
      local_ssd_count = 16
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "spot"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###################################################################################################
# L4 x 2
###################################################################################################
# Node pool "gpu-l4x2-g2s24": g2-standard-24 nodes with 2 x nvidia-l4
# accelerators, no reservation consumption, on-demand=true:NO_SCHEDULE taint.
# Autoscaled from 0 up to 1000 nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_l4x2_g2s24" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-l4x2-g2s24"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "g2-standard-24"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "l4"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-l4"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-l4x2-g2s24-dws": g2-standard-24 nodes with 2 x nvidia-l4
# accelerators and queued provisioning enabled. No reservation consumption;
# autoscaled from 0 up to 1000 nodes in total with the ANY location policy.
resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-l4x2-g2s24-dws"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "g2-standard-24"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "l4"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-l4"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    # NOTE(review): this pool uses the same "on-demand" taint key as the
    # non-queued pool - confirm that is intended.
    taint {
      key    = "on-demand"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  queued_provisioning {
    enabled = true
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-l4x2-g2s24-res": g2-standard-24 nodes with 2 x nvidia-l4
# accelerators, reservation affinity ANY_RESERVATION and the
# reservation=true:NO_SCHEDULE taint. Autoscaled from 0 up to 1000 nodes in
# total with the ANY location policy.
resource "google_container_node_pool" "gpu_l4x2_g2s24_res" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-l4x2-g2s24-res"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "g2-standard-24"
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "l4"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-l4"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "ANY_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "reservation"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
###############################################################################
# Node pool "gpu-l4x2-g2s24-spot": g2-standard-24 nodes with spot = true and
# 2 x nvidia-l4 accelerators, no reservation consumption,
# spot=true:NO_SCHEDULE taint. Autoscaled from 0 up to 1000 nodes in total with
# the ANY location policy.
resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" {
  # Create the pool only after the GKE Hub membership resource exists.
  depends_on = [google_gke_hub_membership.cluster]

  # Identity and placement
  name     = "gpu-l4x2-g2s24-spot"
  project  = data.google_project.environment.project_id
  cluster  = google_container_cluster.mlp.name
  location = var.region
  # NOTE(review): zones are hard-coded to us-central1 while location comes from
  # var.region - confirm the two always agree.
  node_locations = [
    "us-central1-a",
    "us-central1-b",
    "us-central1-c",
  ]

  autoscaling {
    total_min_node_count = 0
    total_max_node_count = 1000
    location_policy      = "ANY"
  }

  network_config {
    enable_private_nodes = true
  }

  node_config {
    machine_type    = "g2-standard-24"
    spot            = true
    service_account = google_service_account.cluster.email
    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]

    labels = {
      "resource-model" = "l4"
      "resource-type"  = "gpu"
    }

    guest_accelerator {
      type  = "nvidia-l4"
      count = 2

      # Driver version is supplied by var.gpu_driver_version.
      gpu_driver_installation_config {
        gpu_driver_version = var.gpu_driver_version
      }
    }

    gvnic {
      enabled = true
    }

    # GCFS (image streaming)
    gcfs_config {
      enabled = true
    }

    reservation_affinity {
      consume_reservation_type = "NO_RESERVATION"
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }

    taint {
      key    = "spot"
      value  = true
      effect = "NO_SCHEDULE"
    }
  }

  timeouts {
    create = "30m"
    update = "20m"
  }

  # Terraform does not plan changes for the attributes listed here.
  lifecycle {
    ignore_changes = [
      node_config[0].labels,
      node_config[0].taint,
    ]
  }
}
#######################################################################################################################
# TPU
# Available zones: https://cloud.google.com/tpu/docs/regions-zones
#######################################################################################################################