terraform-provision-infra/modules/agones/gcp-res/main.tf (308 lines of code) (raw):

# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. terraform { required_providers { google = { source = "hashicorp/google" version = "4.63.1" } random = { source = "hashicorp/random" version = "3.5.1" } archive = { source = "hashicorp/archive" version = "2.3.0" } } } provider "google" { project = var.project_id region = var.region } resource "random_id" "tf_subfix" { byte_length = 4 } # Enable related service resource "google_project_service" "gcp_services" { for_each = toset(var.gcp_service_list) project = var.project_id service = each.key disable_dependent_services = false disable_on_destroy = false } data "google_compute_default_service_account" "default" { depends_on = [google_project_service.gcp_services] } data "archive_file" "lambda_my_function" { type = "zip" source_dir = var.cloudfunctions_source_code_path output_file_mode = "0666" output_path = "./cloud_function.zip" } # VPC resource "google_compute_network" "vpc" { project = var.project_id name = "tf-gen-vpc-${random_id.tf_subfix.hex}" auto_create_subnetworks = "false" depends_on = [google_project_service.gcp_services] } # Subnet resource "google_compute_subnetwork" "subnet" { name = "tf-gen-subnet-${random_id.tf_subfix.hex}" region = var.region network = google_compute_network.vpc.name ip_cidr_range = "10.10.0.0/16" } # Cloud Router resource "google_compute_router" "router" { name = "tf-gen-router-${var.region}-${random_id.tf_subfix.hex}" region = google_compute_subnetwork.subnet.region network = google_compute_network.vpc.id } # NAT IP resource "google_compute_address" "address" { count = 2 name = "nat-${random_id.tf_subfix.hex}-ip-${count.index}" region = google_compute_subnetwork.subnet.region depends_on = [google_project_service.gcp_services] } resource "google_compute_global_address" "webui_addr" { name = "sd-webui-ingress-${random_id.tf_subfix.hex}" depends_on = [google_project_service.gcp_services] } # NAT Gateway resource "google_compute_router_nat" "nat" { name = "tf-gen-${var.region}-nat-gw" router = google_compute_router.router.name region = google_compute_router.router.region nat_ip_allocate_option = "MANUAL_ONLY" nat_ips = google_compute_address.address.*.self_link source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES" } # GKE cluster resource "google_container_cluster" "gke" { name = "tf-gen-gke-${random_id.tf_subfix.hex}" location = var.cluster_location remove_default_node_pool = false enable_shielded_nodes = true initial_node_count = 1 network = google_compute_network.vpc.name subnetwork = google_compute_subnetwork.subnet.name private_cluster_config { enable_private_nodes = true master_ipv4_cidr_block = "192.168.1.0/28" } ip_allocation_policy { } monitoring_config { enable_components = ["SYSTEM_COMPONENTS", "APISERVER", "SCHEDULER", "CONTROLLER_MANAGER"] managed_prometheus { enabled = true } } logging_config { enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS", "APISERVER", "SCHEDULER", "CONTROLLER_MANAGER"] } release_channel { channel = "STABLE" } maintenance_policy { daily_maintenance_window { start_time = "03:00" } } addons_config { http_load_balancing { disabled = false } horizontal_pod_autoscaling { disabled = false } gcp_filestore_csi_driver_config { enabled = true } gce_persistent_disk_csi_driver_config { enabled = true } dns_cache_config { enabled = true } } node_config { shielded_instance_config { enable_secure_boot = true enable_integrity_monitoring = true } } lifecycle { ignore_changes = all } } # Separately Managed Node Pool resource "google_container_node_pool" "gpu_nodepool" { name = "${var.accelerator_type}-nodepool" location = var.cluster_location cluster = google_container_cluster.gke.name autoscaling { min_node_count = 1 max_node_count = 10 } node_count = var.gke_num_nodes node_config { oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" ] labels = { Terraform = "true" Environment = "dev" } preemptible = true machine_type = var.node_machine_type image_type = "COS_CONTAINERD" gcfs_config { enabled = true } guest_accelerator { type = var.accelerator_type count = 1 gpu_sharing_config { gpu_sharing_strategy = "TIME_SHARING" max_shared_clients_per_gpu = 2 } } disk_type = "pd-balanced" disk_size_gb = 100 tags = ["gpu-node", "gke-sd"] metadata = { disable-legacy-endpoints = "true" } shielded_instance_config { enable_secure_boot = true enable_integrity_monitoring = true } } lifecycle { ignore_changes = all } } #agones firewall resource "google_compute_firewall" "agones" { depends_on = [google_container_cluster.gke] name = "allow-agones-${random_id.tf_subfix.hex}" network = google_compute_network.vpc.name project = var.project_id allow { protocol = "tcp" ports = ["443", "8080", "8081"] } source_ranges = ["0.0.0.0/0"] } # Filestore resource "google_filestore_instance" "instance" { name = "nfs-store-${random_id.tf_subfix.hex}" location = var.filestore_zone tier = "BASIC_HDD" file_shares { capacity_gb = 1024 name = "vol1" } networks { network = google_compute_network.vpc.name modes = ["MODE_IPV4"] } } #Artifact Registry resource "google_artifact_registry_repository" "sd_repo" { location = var.region repository_id = "sd-repository-${random_id.tf_subfix.hex}" description = "stable diffusion repository" format = "DOCKER" depends_on = [google_project_service.gcp_services] } # Redis cache resource "google_redis_instance" "cache" { region = var.region name = "sd-agones-cache-${random_id.tf_subfix.hex}" tier = "BASIC" memory_size_gb = 1 authorized_network = google_compute_network.vpc.id redis_version = "REDIS_6_X" display_name = "Stable Diffusion Agones Cache Instance" connect_mode = "DIRECT_PEERING" maintenance_policy { weekly_maintenance_window { day = "SUNDAY" start_time { hours = 0 minutes = 30 seconds = 0 nanos = 0 } } } } # vpc_connector_for_function resource "google_vpc_access_connector" "connector" { name = "vpc-con-${random_id.tf_subfix.hex}" ip_cidr_range = "192.168.240.16/28" network = google_compute_network.vpc.name } # function_source_gcs_bucket resource "google_storage_bucket" "bucket" { name = "cloud-function-source-${random_id.tf_subfix.hex}" project = var.project_id location = var.region force_destroy = true storage_class = "COLDLINE" uniform_bucket_level_access = true depends_on = [google_project_service.gcp_services] } # function_source_zip resource "google_storage_bucket_object" "archive" { name = "cloud_function.zip" bucket = google_storage_bucket.bucket.name source = "./cloud_function.zip" } resource "google_cloudfunctions_function" "function" { name = "redis-http-${random_id.tf_subfix.hex}" description = "agones gpu pod recycle function" runtime = "python310" trigger_http = true region = var.region ingress_settings = "ALLOW_INTERNAL_AND_GCLB" vpc_connector = google_vpc_access_connector.connector.name vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" entry_point = "redis_http" environment_variables = { REDIS_HOST = google_redis_instance.cache.host TIME_INTERVAL = 900 } available_memory_mb = 128 source_archive_bucket = google_storage_bucket.bucket.name source_archive_object = google_storage_bucket_object.archive.name timeout = 60 depends_on = [google_storage_bucket_object.archive] } resource "google_cloudfunctions_function_iam_member" "invoker" { project = google_cloudfunctions_function.function.project region = google_cloudfunctions_function.function.region cloud_function = google_cloudfunctions_function.function.name role = "roles/cloudfunctions.invoker" member = "serviceAccount:${data.google_compute_default_service_account.default.email}" depends_on = [google_project_service.gcp_services] } resource "google_cloud_scheduler_job" "job" { name = "sd-agones-cruiser-${random_id.tf_subfix.hex}" description = "cloud function http schedule job" region = var.region schedule = "*/5 * * * *" http_target { http_method = "GET" uri = google_cloudfunctions_function.function.https_trigger_url oidc_token { service_account_email = data.google_compute_default_service_account.default.email } } depends_on = [google_project_service.gcp_services] } resource "google_dns_managed_zone" "private_zone" { name = "private-zone-${random_id.tf_subfix.hex}" dns_name = "private.domain." description = "Example private DNS zone" visibility = "private" private_visibility_config { networks { network_url = google_compute_network.vpc.id } } depends_on = [google_project_service.gcp_services] } resource "google_dns_record_set" "redis_a" { name = "redis.${google_dns_managed_zone.private_zone.dns_name}" managed_zone = google_dns_managed_zone.private_zone.name type = "A" ttl = 300 rrdatas = [google_redis_instance.cache.host] }