# modules/kuberay-cluster/main.tf

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Grant the cluster's Google service account object admin on the data bucket
# (read/write access for the Ray workload via GCS Fuse).
resource "google_storage_bucket_iam_member" "gcs-bucket-iam" {
  bucket = var.gcs_bucket
  role   = "roles/storage.objectAdmin"
  member = "serviceAccount:${var.google_service_account}@${var.project_id}.iam.gserviceaccount.com"
}

locals {
  # Drop null entries so the chart only receives explicitly set fields.
  security_context = { for k, v in var.security_context : k => v if v != null }

  cloudsql_instance_connection_name = format("%s:%s:%s", var.project_id, var.db_region, var.cloudsql_instance_name)

  # var.additional_labels is a comma-separated list of key=value pairs,
  # e.g. "created-by=ai-on-gke,env=dev".
  additional_labels = tomap({
    for item in split(",", var.additional_labels) :
    split("=", item)[0] => split("=", item)[1]
  })
}

# Deploy the KubeRay ray-cluster Helm chart, rendering values.yaml with
# project-specific settings.
resource "helm_release" "ray-cluster" {
  name             = var.name
  repository       = "https://ray-project.github.io/kuberay-helm/"
  chart            = "ray-cluster"
  namespace        = var.namespace
  create_namespace = true
  version          = "1.0.0"
  values = [
    templatefile("${path.module}/values.yaml", {
      gcs_bucket                        = var.gcs_bucket
      k8s_service_account               = var.google_service_account
      additional_labels                 = local.additional_labels
      grafana_host                      = var.grafana_host
      security_context                  = local.security_context
      secret_name                       = var.db_secret_name
      cloudsql_instance_connection_name = local.cloudsql_instance_connection_name
      image                             = var.use_custom_image ? "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/ray-image" : "rayproject/ray"
      image_tag                         = var.enable_gpu ? "2.9.3-py310-gpu" : var.use_custom_image ? "2.9.3-py310-gpu" : "2.9.3-py310"
      resource_requests = var.enable_gpu ? {
        "cpu"               = "8"
        "memory"            = "32G"
        "ephemeral-storage" = "20Gi"
        "nvidia.com/gpu"    = "1"
        } : {
        "cpu"               = "8"
        "memory"            = "32G"
        "ephemeral-storage" = "20Gi"
      }
      annotations = {
        "gke-gcsfuse/volumes" : "true"
        "gke-gcsfuse/cpu-limit" : "2"
        "gke-gcsfuse/memory-limit" : "8Gi"
        "gke-gcsfuse/ephemeral-storage-limit" : "20Gi"
      }
      # Node selectors depend on cluster mode (Autopilot vs. Standard) and
      # accelerator choice (GPU, TPU, or CPU-only).
      node_selectors = var.autopilot_cluster ? var.enable_gpu ? {
        "cloud.google.com/compute-class" : "Accelerator"
        "cloud.google.com/gke-accelerator" : "nvidia-l4"
        "cloud.google.com/gke-ephemeral-storage-local-ssd" : "true"
        "iam.gke.io/gke-metadata-server-enabled" : "true"
        } : {
        "cloud.google.com/compute-class" : "Performance"
        "cloud.google.com/machine-family" : "c3"
        "cloud.google.com/gke-ephemeral-storage-local-ssd" : "true"
        "iam.gke.io/gke-metadata-server-enabled" : "true"
      } : var.enable_gpu ? {
        "iam.gke.io/gke-metadata-server-enabled" : "true"
        "cloud.google.com/gke-accelerator" : "nvidia-l4"
      } : var.enable_tpu ? {
        "iam.gke.io/gke-metadata-server-enabled" : "true"
        "cloud.google.com/gke-tpu-accelerator" : "tpu-v4-podslice"
        "cloud.google.com/gke-tpu-topology" : "2x2x1"
        "cloud.google.com/gke-placement-group" : "tpu-pool"
      } : {
        "iam.gke.io/gke-metadata-server-enabled" : "true"
      }
    })
  ]
}

# Look up the head service created by the chart; its name is reused below
# for the IAP backend service.
data "kubernetes_service" "head-svc" {
  metadata {
    name      = "${helm_release.ray-cluster.name}-kuberay-head-svc"
    namespace = var.namespace
  }
  depends_on = [helm_release.ray-cluster]
}

# Allow ingress to the kuberay head from the allowed namespaces
resource "kubernetes_network_policy" "kuberay-head-namespace-network-policy" {
  count = var.disable_network_policy ? 0 : 1
  metadata {
    name      = "terraform-kuberay-head-namespace-network-policy"
    namespace = var.namespace
  }
  spec {
    pod_selector {
      match_labels = {
        "ray.io/node-type" : "head"
      }
    }
    ingress {
      # Ray job submission and dashboard
      ports {
        port     = "8265"
        protocol = "TCP"
      }
      # Ray client
      ports {
        port     = "10001"
        protocol = "TCP"
      }
      from {
        namespace_selector {
          match_expressions {
            key      = "kubernetes.io/metadata.name"
            operator = "In"
            values   = var.network_policy_allow_namespaces
          }
        }
      }
    }
    policy_types = ["Ingress"]
  }
}

# Allow ingress to the kuberay head from outside the cluster (allowed CIDR ranges)
resource "kubernetes_network_policy" "kuberay-head-cidr-network-policy" {
  count = var.network_policy_allow_cidr != "" && !var.disable_network_policy ? 1 : 0
  metadata {
    name      = "terraform-kuberay-head-cidr-network-policy"
    namespace = var.namespace
  }
  spec {
    pod_selector {
      match_labels = {
        "ray.io/node-type" : "head"
      }
    }
    ingress {
      # Ray job submission and dashboard
      ports {
        port     = "8265"
        protocol = "TCP"
      }
      # Ray client
      ports {
        port     = "10001"
        protocol = "TCP"
      }
      from {
        ip_block {
          cidr = var.network_policy_allow_cidr
        }
      }
    }
    policy_types = ["Ingress"]
  }
}

# Allow all traffic from the same namespace and from the Google Managed
# Prometheus (GMP) namespaces
resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {
  count = var.disable_network_policy ? 0 : 1
  metadata {
    name      = "terraform-kuberay-allow-cluster-network-policy"
    namespace = var.namespace
  }
  spec {
    pod_selector {
      match_labels = {
        "ray.io/is-ray-node" : "yes"
      }
    }
    ingress {
      # All TCP ports
      ports {
        protocol = "TCP"
      }
      from {
        namespace_selector {
          match_expressions {
            key      = "kubernetes.io/metadata.name"
            operator = "In"
            values   = [var.namespace, "gke-gmp-system", "gmp-system"]
          }
        }
      }
    }
    policy_types = ["Ingress"]
  }
}

# IAP Section: Creates the GKE components for securing the Ray dashboard
module "iap_auth" {
  count  = var.add_auth ? 1 : 0
  source = "../../modules/iap"

  project_id               = var.project_id
  namespace                = var.namespace
  support_email            = var.support_email
  app_name                 = "ray-dashboard"
  create_brand             = var.create_brand
  k8s_ingress_name         = var.k8s_ingress_name
  k8s_managed_cert_name    = var.k8s_managed_cert_name
  k8s_iap_secret_name      = var.k8s_iap_secret_name
  k8s_backend_config_name  = var.k8s_backend_config_name
  k8s_backend_service_name = "${helm_release.ray-cluster.name}-kuberay-head-svc"
  k8s_backend_service_port = var.k8s_backend_service_port
  client_id                = var.client_id
  client_secret            = var.client_secret
  domain                   = var.domain
  members_allowlist        = var.members_allowlist
  depends_on = [
    helm_release.ray-cluster,
    data.kubernetes_service.head-svc,
  ]
}
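
# ---------------------------------------------------------------------------
# Usage (illustrative): a minimal sketch of how this module might be invoked
# from a root configuration. All values below are hypothetical placeholders,
# not defaults shipped with the module; consult the module's variables file
# for the full input list, types, and defaults. Kept as a comment so this
# file remains valid HCL.
#
# module "kuberay_cluster" {
#   source                 = "./modules/kuberay-cluster"
#   name                   = "example-ray-cluster"   # hypothetical
#   project_id             = "my-gcp-project"        # hypothetical
#   namespace              = "ray"
#   gcs_bucket             = "my-ray-bucket"         # hypothetical
#   google_service_account = "ray-sa"                # hypothetical
#   autopilot_cluster      = true
#   enable_gpu             = false
#   # Parsed by locals.additional_labels: comma-separated key=value pairs.
#   additional_labels      = "created-by=ai-on-gke,env=dev"
# }
# ---------------------------------------------------------------------------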