terraform-provision-infra/modules/agones/k8s-res/main.tf (270 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "4.63.1"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "2.20.0"
}
http = {
source = "hashicorp/http"
version = "3.3.0"
}
}
}
data "google_client_config" "default" {}
data "google_container_cluster" "my_cluster" {
name = var.gke_cluster_name
location = var.gke_cluster_location
project = var.project_id
}
provider "kubernetes" {
host = "https://${data.google_container_cluster.my_cluster.endpoint}"
token = data.google_client_config.default.access_token
cluster_ca_certificate = base64decode(
data.google_container_cluster.my_cluster.master_auth[0].cluster_ca_certificate,
)
experiments {
manifest_resource = true
}
}
resource "kubernetes_storage_class" "nfs" {
metadata {
name = "filestore"
}
reclaim_policy = "Retain"
storage_provisioner = "nfs"
}
resource "kubernetes_persistent_volume_v1" "nfs_pv" {
metadata {
name = "filestore-nfs-pv"
}
spec {
capacity = {
storage = "1Ti"
}
storage_class_name = kubernetes_storage_class.nfs.metadata[0].name
access_modes = ["ReadWriteMany"]
persistent_volume_source {
nfs {
path = "/vol1"
server = var.google_filestore_reserved_ip_range
}
}
}
}
resource "kubernetes_persistent_volume_claim_v1" "nfs_pvc" {
metadata {
name = "vol1"
}
spec {
access_modes = ["ReadWriteMany"]
storage_class_name = kubernetes_storage_class.nfs.metadata[0].name
volume_name = kubernetes_persistent_volume_v1.nfs_pv.metadata.0.name
resources {
requests = {
storage = "1Ti"
}
}
}
}
resource "kubernetes_secret" "iap_client_secret" {
metadata {
name = "iap-secret"
}
data = {
client_id = var.oauth_client_id
client_secret = var.oauth_client_secret
}
}
resource "kubernetes_deployment" "nginx" {
metadata {
name = "stable-diffusion-nginx-deployment"
labels = {
app = "stable-diffusion-nginx"
}
}
spec {
replicas = 2
selector {
match_labels = {
app = "stable-diffusion-nginx"
}
}
template {
metadata {
labels = {
app = "stable-diffusion-nginx"
}
}
spec {
container {
image = var.nginx_image_url
name = "stable-diffusion-nginx"
port {
container_port = 8080
}
}
node_selector = {
"cloud.google.com/gke-nodepool" = var.gke_cluster_nodepool
}
}
}
}
}
data "http" "gpu_driver_file" {
url = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml"
}
resource "kubernetes_manifest" "gpu_driver" {
manifest = yamldecode(data.http.gpu_driver_file.response_body)
}
resource "kubernetes_manifest" "webui_fleet" {
manifest = yamldecode(<<-EOF
apiVersion: "agones.dev/v1"
kind: Fleet
metadata:
name: sd-agones-fleet
namespace: default
spec:
replicas: 1
template:
spec:
container: simple-game-server
ports:
- name: default
container: simple-game-server
containerPort: 7654
- name: sd
container: stable-diffusion-webui
containerPort: 7860
protocol: TCP
template:
spec:
containers:
- name: simple-game-server
image: "${var.game_server_image_url}"
resources:
requests:
memory: "64Mi"
cpu: "20m"
limits:
memory: "64Mi"
cpu: "20m"
- name: stable-diffusion-webui
image: "${var.webui_image_url}"
command: ["/bin/sh", "start.sh"]
volumeMounts:
- mountPath: /stable-diffusion-webui/models
name: stable-diffusion-storage
subPath: models
- mountPath: /result
name: stable-diffusion-storage
subPath: result
resources:
limits:
nvidia.com/gpu: "1"
volumes:
- name: stable-diffusion-storage
persistentVolumeClaim:
claimName: vol1
EOF
)
}
resource "kubernetes_manifest" "webui_fleet_autoscaler" {
manifest = yamldecode(<<-EOF
apiVersion: "autoscaling.agones.dev/v1"
kind: FleetAutoscaler
metadata:
name: fleet-autoscaler-policy
namespace: default
spec:
fleetName: sd-agones-fleet
policy:
type: Buffer
buffer:
bufferSize: 1
minReplicas: 1
maxReplicas: 20
sync:
type: FixedInterval
fixedInterval:
seconds: 30
EOF
)
depends_on = [kubernetes_manifest.webui_fleet]
}
resource "kubernetes_manifest" "webui_backend_config" {
manifest = yamldecode(<<-EOF
apiVersion: "autoscaling.agones.dev/v1"
apiVersion: cloud.google.com/v1
kind: BackendConfig
metadata:
name: config-default
namespace: default
spec:
timeoutSec: 900
iap:
enabled: true
oauthclientCredentials:
secretName: iap-secret
EOF
)
}
resource "kubernetes_manifest" "webui_cert" {
manifest = yamldecode(<<-EOF
apiVersion: networking.gke.io/v1
kind: ManagedCertificate
metadata:
name: managed-cert
namespace: default
spec:
domains:
- "${var.sd_webui_domain}"
EOF
)
}
resource "kubernetes_manifest" "webui_svc" {
manifest = yamldecode(<<-EOF
apiVersion: v1
kind: Service
metadata:
name: stable-diffusion-nginx-service
namespace: default
annotations:
cloud.google.com/neg: '{"ingress": true}' # Creates a NEG after an Ingress is created
beta.cloud.google.com/backend-config: '{"default": "config-default"}'
labels:
app: stable-diffusion-nginx
spec:
ports:
- protocol: TCP
port: 8080
targetPort: 8080
selector:
app: stable-diffusion-nginx
type: ClusterIP
EOF
)
}
resource "kubernetes_manifest" "webui_ingress" {
manifest = yamldecode(<<-EOF
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: sd-agones-ingress
namespace: default
annotations:
kubernetes.io/ingress.global-static-ip-name: ${var.webui_address_name}
networking.gke.io/managed-certificates: managed-cert
kubernetes.io/ingress.class: "gce"
spec:
defaultBackend:
service:
name: stable-diffusion-nginx-service # Name of the Service targeted by the Ingress
port:
number: 8080 # Should match the port used by the Service
EOF
)
}