applications/ray/main.tf (177 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################################################
#### APPLICATIONS
#######################################################
# Google provider, scoped to the project this application is deployed into.
provider "google" {
  project = var.project_id
}

# Time provider; declared without any explicit configuration.
provider "time" {}
# Client configuration of the active Google provider. Its access token is
# passed to the aliased Kubernetes and Helm providers in this file.
data "google_client_config" "default" {}

# Project lookup. The project number is used when building the Connect
# Gateway host URL for private clusters.
data "google_project" "project" {
  project_id = var.project_id
}
## Enable the GCP project service APIs required by this application
module "project-services" {
  source  = "terraform-google-modules/project-factory/google//modules/project_services"
  version = "~> 14.5"

  project_id = var.project_id

  # Both "disable on destroy" switches are turned off.
  disable_dependent_services  = false
  disable_services_on_destroy = false

  # APIs to activate on the project.
  activate_apis = flatten([
    "autoscaling.googleapis.com",
    "cloudbuild.googleapis.com",
    "cloudresourcemanager.googleapis.com",
    "compute.googleapis.com",
    "config.googleapis.com",
    "connectgateway.googleapis.com",
    "container.googleapis.com",
    "containerfilesystem.googleapis.com",
    "dns.googleapis.com",
    "gkehub.googleapis.com",
    "iamcredentials.googleapis.com",
    "logging.googleapis.com",
    "monitoring.googleapis.com",
    "pubsub.googleapis.com",
    "servicenetworking.googleapis.com",
    "serviceusage.googleapis.com",
    "sourcerepo.googleapis.com",
    "iap.googleapis.com"
  ])
}
# Cluster infrastructure from the shared infrastructure module. Instantiated
# only when var.create_cluster is true; otherwise the existing cluster is
# read through the google_container_cluster data source in this file.
module "infra" {
  count      = var.create_cluster ? 1 : 0
  source     = "../../infrastructure"
  depends_on = [module.project-services]

  # Cluster identity and mode
  project_id        = var.project_id
  cluster_name      = local.cluster_name
  cluster_location  = var.cluster_location
  autopilot_cluster = var.autopilot_cluster
  private_cluster   = var.private_cluster
  ray_addon_enabled = true

  # Networking: no network is created; the names below are passed as-is.
  create_network  = false
  network_name    = "default"
  subnetwork_name = "default"

  # Node pools
  cpu_pools  = var.cpu_pools
  enable_gpu = var.enable_gpu
  gpu_pools  = var.gpu_pools
}
# Lookup of an existing cluster. Read only when var.create_cluster is false,
# i.e. when module.infra is not instantiated.
data "google_container_cluster" "default" {
  count      = var.create_cluster ? 0 : 1
  depends_on = [module.project-services]

  name     = var.cluster_name
  location = var.cluster_location
}
locals {
  # Resource names. When goog_cm_deployment_name is non-empty it is prepended
  # (with a "-" separator) to the configured name.
  cluster_name                      = var.goog_cm_deployment_name == "" ? var.cluster_name : "${var.goog_cm_deployment_name}-${var.cluster_name}"
  kubernetes_namespace              = var.goog_cm_deployment_name == "" ? var.kubernetes_namespace : "${var.goog_cm_deployment_name}-${var.kubernetes_namespace}"
  workload_identity_service_account = var.goog_cm_deployment_name == "" ? var.workload_identity_service_account : "${var.goog_cm_deployment_name}-${var.workload_identity_service_account}"

  # The membership id falls back to the cluster name when none is supplied.
  cluster_membership_id = var.cluster_membership_id != "" ? var.cluster_membership_id : local.cluster_name

  # Cluster attributes: taken from module.infra (or the input variables) when
  # the cluster is created here, otherwise read from the existing cluster.
  endpoint         = "https://${var.create_cluster ? module.infra[0].endpoint : data.google_container_cluster.default[0].endpoint}"
  ca_certificate   = base64decode(var.create_cluster ? module.infra[0].ca_certificate : data.google_container_cluster.default[0].master_auth[0].cluster_ca_certificate)
  private_cluster  = var.create_cluster ? var.private_cluster : data.google_container_cluster.default[0].private_cluster_config[0].enable_private_endpoint
  enable_autopilot = var.create_cluster ? var.autopilot_cluster : data.google_container_cluster.default[0].enable_autopilot
  enable_tpu       = var.create_cluster ? var.enable_tpu : data.google_container_cluster.default[0].enable_tpu

  # API server address used by the Kubernetes/Helm providers: the Connect
  # Gateway membership URL for private clusters, the cluster endpoint otherwise.
  host = local.private_cluster ? "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}" : local.endpoint

  # Cloud Console link to the Ray head service.
  ray_cluster_default_uri = "https://console.cloud.google.com/kubernetes/service/${var.cluster_location}/${local.cluster_name}/${local.kubernetes_namespace}/${var.ray_cluster_name}-kuberay-head-svc/overview?project=${var.project_id}"
}
# Kubernetes provider (alias "ray") pointed at the target cluster.
provider "kubernetes" {
  alias = "ray"

  host  = local.host
  token = data.google_client_config.default.access_token

  # The cluster CA is supplied only for non-private clusters; private
  # clusters get an empty string.
  cluster_ca_certificate = local.private_cluster ? "" : local.ca_certificate

  # For private clusters only, add an exec credential block that invokes
  # gke-gcloud-auth-plugin.
  dynamic "exec" {
    for_each = local.private_cluster ? [1] : []

    content {
      command     = "gke-gcloud-auth-plugin"
      api_version = "client.authentication.k8s.io/v1beta1"
    }
  }
}
# Helm provider (alias "ray"); its kubernetes connection settings mirror the
# aliased Kubernetes provider defined in this file.
provider "helm" {
  alias = "ray"

  kubernetes {
    host  = local.host
    token = data.google_client_config.default.access_token

    # The cluster CA is supplied only for non-private clusters; private
    # clusters get an empty string.
    cluster_ca_certificate = local.private_cluster ? "" : local.ca_certificate

    # For private clusters only, add an exec credential block that invokes
    # gke-gcloud-auth-plugin.
    dynamic "exec" {
      for_each = local.private_cluster ? [1] : []

      content {
        command     = "gke-gcloud-auth-plugin"
        api_version = "client.authentication.k8s.io/v1beta1"
      }
    }
  }
}
# Namespace for the Ray workloads, managed through the shared
# kubernetes-namespace module using the aliased Helm provider.
module "namespace" {
  source    = "../../modules/kubernetes-namespace"
  providers = { helm = helm.ray }

  namespace        = local.kubernetes_namespace
  create_namespace = true
}
# Workload Identity service account for KubeRay in the Ray namespace.
module "kuberay-workload-identity" {
  source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
  # Pinned to a previous version: the then-current version (30.1.0) showed
  # inconsistent behaviour with workload identity service accounts.
  version    = "30.0.0"
  providers  = { kubernetes = kubernetes.ray }
  depends_on = [module.namespace]

  project_id = var.project_id
  namespace  = local.kubernetes_namespace
  name       = local.workload_identity_service_account

  # An existing GCP service account is reused unless var.create_service_account is true.
  use_existing_gcp_sa             = !var.create_service_account
  automount_service_account_token = true

  # Project roles passed to the module for the service account.
  roles = ["roles/cloudsql.client", "roles/monitoring.viewer"]
}
# Monitoring stack for KubeRay. Instantiated only when var.create_ray_cluster
# is true, matching the condition used for module.kuberay-cluster.
module "kuberay-monitoring" {
  count      = var.create_ray_cluster ? 1 : 0
  source     = "../../modules/kuberay-monitoring"
  providers  = { helm = helm.ray, kubernetes = kubernetes.ray }
  depends_on = [module.kuberay-workload-identity]

  project_id = var.project_id

  # Use the resolved Autopilot flag rather than var.autopilot_cluster: when an
  # existing cluster is used (create_cluster = false) the flag is read from the
  # cluster itself, which keeps this module consistent with
  # module.kuberay-cluster.
  autopilot_cluster = local.enable_autopilot

  namespace        = local.kubernetes_namespace
  create_namespace = true

  enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
  k8s_service_account             = local.workload_identity_service_account
}
# GCS bucket via the shared gcs module; instantiated only when
# var.create_gcs_bucket is true.
module "gcs" {
  count  = var.create_gcs_bucket ? 1 : 0
  source = "../../modules/gcs"

  project_id  = var.project_id
  bucket_name = var.gcs_bucket
}
# Ray cluster deployed through the shared kuberay-cluster module; instantiated
# only when var.create_ray_cluster is true.
module "kuberay-cluster" {
  count      = var.create_ray_cluster == true ? 1 : 0
  source     = "../../modules/kuberay-cluster"
  providers  = { helm = helm.ray, kubernetes = kubernetes.ray }
  depends_on = [module.gcs, module.kuberay-workload-identity]

  # Identity and placement
  project_id             = var.project_id
  name                   = var.ray_cluster_name
  namespace              = local.kubernetes_namespace
  google_service_account = local.workload_identity_service_account
  additional_labels      = var.additional_labels
  gcs_bucket             = var.gcs_bucket

  # Cluster mode and accelerators
  autopilot_cluster = local.enable_autopilot
  enable_gpu        = var.enable_gpu
  enable_tpu        = local.enable_tpu

  # Grafana URI from the monitoring module when Grafana is enabled on the
  # Ray dashboard, otherwise an empty string.
  grafana_host = var.enable_grafana_on_ray_dashboard ? module.kuberay-monitoring[0].grafana_uri : ""

  # Network policy
  disable_network_policy    = var.disable_ray_cluster_network_policy
  network_policy_allow_cidr = var.kuberay_network_policy_allow_cidr

  # IAP Auth parameters
  add_auth                 = var.ray_dashboard_add_auth
  create_brand             = var.create_brand
  support_email            = var.support_email
  client_id                = var.ray_dashboard_client_id
  client_secret            = var.ray_dashboard_client_secret
  domain                   = var.ray_dashboard_domain
  k8s_ingress_name         = var.ray_dashboard_k8s_ingress_name
  k8s_iap_secret_name      = var.ray_dashboard_k8s_iap_secret_name
  k8s_managed_cert_name    = var.ray_dashboard_k8s_managed_cert_name
  k8s_backend_config_name  = var.ray_dashboard_k8s_backend_config_name
  k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port

  # The allowlist arrives as a comma-separated string; an empty string maps
  # to an empty list.
  members_allowlist = var.ray_dashboard_members_allowlist == "" ? [] : split(",", var.ray_dashboard_members_allowlist)
}
# Apply resource quotas to the Ray namespace so that workloads in it do not
# over-utilize cluster resources. Skipped when var.disable_resource_quotas is true.
resource "kubernetes_resource_quota" "ray_namespace_resource_quota" {
  count      = var.disable_resource_quotas ? 0 : 1
  provider   = kubernetes.ray
  depends_on = [module.namespace]

  metadata {
    namespace = local.kubernetes_namespace
    name      = "ray-resource-quota"
  }

  spec {
    # Hard limits are taken directly from var.resource_quotas.
    hard = var.resource_quotas
  }
}