# terraform-provision-infra/modules/nonagones/gcp-res/main.tf (179 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Providers this module needs, each pinned to one exact release.
terraform {
  required_providers {
    random = {
      source  = "hashicorp/random"
      version = "3.5.1"
    }

    google = {
      source  = "hashicorp/google"
      version = "4.63.1"
    }
  }
}
# Default project and region for google resources that do not set their own.
provider "google" {
  region  = var.region
  project = var.project_id
}
# Random 4-byte value; its hex form is embedded in the names of the resources
# in this file so that separate deployments do not share names.
resource "random_id" "tf_subfix" {
  byte_length = 4
}
# Enable every API listed in var.gcp_service_list on the target project.
resource "google_project_service" "gcp_services" {
  # One instance per distinct service name.
  for_each = toset(var.gcp_service_list)

  service = each.value
  project = var.project_id

  # Do not disable the API (or its dependents) when this resource is destroyed.
  disable_on_destroy         = false
  disable_dependent_services = false
}
# VPC network; subnets are declared explicitly instead of being auto-created.
resource "google_compute_network" "vpc" {
  name    = "tf-gen-vpc-${random_id.tf_subfix.hex}"
  project = var.project_id

  auto_create_subnetworks = false

  # Wait for the project APIs to be enabled before creating the network.
  depends_on = [google_project_service.gcp_services]
}
# Subnet in var.region, attached to the VPC declared in this module.
resource "google_compute_subnetwork" "subnet" {
  name          = "tf-gen-subnet-${random_id.tf_subfix.hex}"
  network       = google_compute_network.vpc.name
  region        = var.region
  ip_cidr_range = "10.0.0.0/16"
}
# Cloud Router on the VPC, in the same region as the subnet; the NAT gateway
# in this module is attached to it.
resource "google_compute_router" "router" {
  name    = "tf-gen-router-${var.region}-${random_id.tf_subfix.hex}"
  network = google_compute_network.vpc.id
  region  = google_compute_subnetwork.subnet.region
}
# Two reserved regional addresses, consumed by the NAT gateway as its NAT IPs.
resource "google_compute_address" "address" {
  count = 2

  # count.index (0 or 1) keeps the two address names distinct.
  name   = "tf-gen-nat-${random_id.tf_subfix.hex}-ip-${count.index}"
  region = google_compute_subnetwork.subnet.region

  # Wait for the project APIs to be enabled before reserving addresses.
  depends_on = [google_project_service.gcp_services]
}
# Cloud NAT gateway on the router, covering all IP ranges of all subnetworks.
resource "google_compute_router_nat" "nat" {
  name   = "tf-gen-${var.region}-nat-gw-${random_id.tf_subfix.hex}"
  region = google_compute_router.router.region
  router = google_compute_router.router.name

  source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES"

  # Use only the addresses reserved in this module rather than auto-allocated ones.
  nat_ip_allocate_option = "MANUAL_ONLY"
  nat_ips                = google_compute_address.address[*].self_link
}
# GKE cluster located at var.filestore_zone, attached to this module's VPC and
# subnet. Its default node pool is kept, with one initial node.
resource "google_container_cluster" "gke" {
  name       = "tf-gen-cluster-${random_id.tf_subfix.hex}"
  location   = var.filestore_zone
  network    = google_compute_network.vpc.name
  subnetwork = google_compute_subnetwork.subnet.name

  # Default node pool: keep it, start with a single node.
  initial_node_count       = 1
  remove_default_node_pool = false
  enable_shielded_nodes    = true

  node_config {
    shielded_instance_config {
      enable_integrity_monitoring = true
      enable_secure_boot          = true
    }
  }

  # Private nodes; the control plane is given this /28 range.
  private_cluster_config {
    master_ipv4_cidr_block = "192.168.1.0/28"
    enable_private_nodes   = true
  }

  # Left empty on purpose: no explicit secondary ranges are specified here.
  ip_allocation_policy {}

  release_channel {
    channel = "STABLE"
  }

  maintenance_policy {
    daily_maintenance_window {
      start_time = "03:00"
    }
  }

  logging_config {
    enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS", "APISERVER", "SCHEDULER", "CONTROLLER_MANAGER"]
  }

  monitoring_config {
    enable_components = ["SYSTEM_COMPONENTS", "APISERVER", "SCHEDULER", "CONTROLLER_MANAGER"]

    managed_prometheus {
      enabled = true
    }
  }

  addons_config {
    dns_cache_config {
      enabled = true
    }

    gce_persistent_disk_csi_driver_config {
      enabled = true
    }

    gcp_filestore_csi_driver_config {
      enabled = true
    }

    # These two add-ons use the inverted "disabled" flag; false means enabled.
    horizontal_pod_autoscaling {
      disabled = false
    }

    http_load_balancing {
      disabled = false
    }
  }

  # Once created, Terraform ignores changes to every attribute of this cluster.
  lifecycle {
    ignore_changes = all
  }
}
# Separately managed GPU node pool for the GKE cluster declared in this module.
# Nodes are Spot VMs with one time-shared accelerator each; the pool autoscales
# between 1 and 10 nodes.
resource "google_container_node_pool" "separately_gpu_nodepool" {
  name    = "${var.accelerator_type}-nodepool"
  cluster = google_container_cluster.gke.name

  # A node pool has to be created in its cluster's location. Read it from the
  # cluster itself instead of a second variable (var.cluster_location) that
  # can silently disagree with the value the cluster was created with.
  location = google_container_cluster.gke.location

  # Starting size only. The provider documentation says node_count should not
  # be used alongside autoscaling, so the initial size is given here and the
  # autoscaler owns the node count afterwards.
  initial_node_count = var.gke_num_nodes

  autoscaling {
    min_node_count = 1
    max_node_count = 10
  }

  node_config {
    machine_type = var.node_machine_type
    image_type   = "COS_CONTAINERD"
    spot         = true

    disk_type    = "pd-balanced"
    disk_size_gb = 100

    oauth_scopes = [
      "https://www.googleapis.com/auth/cloud-platform"
    ]

    labels = {
      Terraform   = "true"
      Environment = "dev"
    }

    tags = ["gpu-node", "gke-sd"]

    metadata = {
      disable-legacy-endpoints = "true"
    }

    gcfs_config {
      enabled = true
    }

    # One accelerator per node, time-shared by up to two clients.
    guest_accelerator {
      type  = var.accelerator_type
      count = 1

      gpu_sharing_config {
        gpu_sharing_strategy       = "TIME_SHARING"
        max_shared_clients_per_gpu = 2
      }
    }

    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }
  }

  # Once created, Terraform ignores changes to every attribute of this pool.
  lifecycle {
    ignore_changes = all
  }
}
# Filestore instance in var.filestore_zone exposing one 1024 GB share ("vol1")
# on the module's VPC.
resource "google_filestore_instance" "instance" {
  name     = "nfs-store-${random_id.tf_subfix.hex}"
  tier     = "BASIC_HDD"
  location = var.filestore_zone

  networks {
    modes   = ["MODE_IPV4"]
    network = google_compute_network.vpc.name
  }

  file_shares {
    name        = "vol1"
    capacity_gb = 1024
  }
}
# Artifact Registry repository in Docker format, created in var.region.
resource "google_artifact_registry_repository" "sd_repo" {
  repository_id = "${random_id.tf_subfix.hex}-stable-diffusion-repository"
  format        = "DOCKER"
  location      = var.region
  description   = "stable diffusion repository"

  # Wait for the project APIs to be enabled before creating the repository.
  depends_on = [google_project_service.gcp_services]
}