applications/ray/variables.tf (243 lines of code) (raw):
# # Copyright 2023 Google LLC
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
# # you may not use this file except in compliance with the License.
# # You may obtain a copy of the License at
# #
# # http://www.apache.org/licenses/LICENSE-2.0
# #
# # Unless required by applicable law or agreed to in writing, software
# # distributed under the License is distributed on an "AS IS" BASIS,
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
variable "project_id" {
type = string
description = "GCP project id"
}
variable "cluster_name" {
type = string
}
variable "cluster_location" {
type = string
}
variable "cluster_membership_id" {
type = string
description = "require to use connectgateway for private clusters, default: cluster_name"
default = ""
}
variable "ray_version" {
type = string
default = "v2.9.3"
}
variable "kubernetes_namespace" {
type = string
description = "Kubernetes namespace where resources are deployed"
default = "ai-on-gke"
}
variable "enable_grafana_on_ray_dashboard" {
type = bool
description = "Add option to enable or disable grafana for the ray dashboard. Enabling requires anonymous access."
default = false
}
variable "create_gcs_bucket" {
type = bool
default = false
description = "Enable flag to create gcs_bucket"
}
variable "gcs_bucket" {
type = string
description = "The GCS bucket to store data for the Ray cluster."
}
variable "create_service_account" {
type = bool
description = "Creates a google IAM service account & k8s service account & configures workload identity"
default = true
}
variable "workload_identity_service_account" {
type = string
description = "Google Cloud IAM service account for authenticating with GCP services for GCS"
default = "ray-sa"
}
variable "create_ray_cluster" {
type = bool
default = true
description = "Create a Ray cluster"
}
variable "ray_cluster_name" {
type = string
default = "ray-cluster"
}
variable "enable_gpu" {
type = bool
default = false
}
variable "enable_tpu" {
type = bool
default = false
}
## GKE variables
variable "create_cluster" {
type = bool
default = false
}
variable "private_cluster" {
type = bool
default = false
}
variable "autopilot_cluster" {
type = bool
default = true
}
variable "cpu_pools" {
type = list(object({
name = string
machine_type = string
node_locations = optional(string, "")
autoscaling = optional(bool, false)
min_count = optional(number, 1)
max_count = optional(number, 3)
local_ssd_count = optional(number, 0)
spot = optional(bool, false)
disk_size_gb = optional(number, 100)
disk_type = optional(string, "pd-standard")
image_type = optional(string, "COS_CONTAINERD")
enable_gcfs = optional(bool, false)
enable_gvnic = optional(bool, false)
logging_variant = optional(string, "DEFAULT")
auto_repair = optional(bool, true)
auto_upgrade = optional(bool, true)
create_service_account = optional(bool, true)
preemptible = optional(bool, false)
initial_node_count = optional(number, 1)
accelerator_count = optional(number, 0)
}))
default = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
enable_gcfs = true
disk_size_gb = 100
disk_type = "pd-standard"
}]
}
variable "gpu_pools" {
type = list(object({
name = string
machine_type = string
node_locations = optional(string, "")
autoscaling = optional(bool, false)
min_count = optional(number, 1)
max_count = optional(number, 3)
local_ssd_count = optional(number, 0)
spot = optional(bool, false)
disk_size_gb = optional(number, 100)
disk_type = optional(string, "pd-standard")
image_type = optional(string, "COS_CONTAINERD")
enable_gcfs = optional(bool, false)
enable_gvnic = optional(bool, false)
logging_variant = optional(string, "DEFAULT")
auto_repair = optional(bool, true)
auto_upgrade = optional(bool, true)
create_service_account = optional(bool, true)
preemptible = optional(bool, false)
initial_node_count = optional(number, 1)
accelerator_count = optional(number, 0)
accelerator_type = optional(string, "nvidia-tesla-t4")
gpu_driver_version = optional(string, "DEFAULT")
}))
default = [{
name = "gpu-pool-l4"
machine_type = "g2-standard-24"
autoscaling = true
min_count = 0
max_count = 3
disk_size_gb = 100
disk_type = "pd-balanced"
enable_gcfs = true
accelerator_count = 2
accelerator_type = "nvidia-l4"
gpu_driver_version = "DEFAULT"
}]
}
variable "goog_cm_deployment_name" {
type = string
default = ""
}
# Ray-dashboard IAP settings
variable "create_brand" {
type = bool
description = "Create Brand OAuth Screen"
default = false
}
variable "ray_dashboard_add_auth" {
type = bool
description = "Enable iap authentication on frontend"
default = true
}
variable "ray_dashboard_k8s_ingress_name" {
type = string
default = "ray-dashboard-ingress"
}
variable "ray_dashboard_k8s_managed_cert_name" {
type = string
description = "Name for frontend managed certificate"
default = "ray-dashboard-managed-cert"
}
variable "ray_dashboard_k8s_iap_secret_name" {
type = string
default = "ray-dashboard-secret"
}
variable "ray_dashboard_k8s_backend_config_name" {
type = string
description = "Name of the Backend Config on GCP"
default = "ray-dashboard-iap-config"
}
variable "ray_dashboard_k8s_backend_service_port" {
type = number
description = "Name of the K8s Backend Service Port"
default = 8265
}
variable "ray_dashboard_domain" {
type = string
description = "Domain used for SSL certificate."
default = ""
}
variable "support_email" {
type = string
description = "Email for users to contact with questions about their consent"
default = "<email>"
}
variable "ray_dashboard_client_id" {
type = string
description = "Client ID used for enabling IAP"
default = ""
}
variable "ray_dashboard_client_secret" {
type = string
description = "Client secret used for enabling IAP"
default = ""
sensitive = false
}
variable "ray_dashboard_members_allowlist" {
type = string
default = ""
## keeping it string type to support single field input for marketplace UI.
}
# These default resource quotas are set intentionally high as an example that won't be limiting for most Ray clusters.
# Consult https://kubernetes.io/docs/concepts/policy/resource-quotas/ for additional quotas that may be set.
variable "resource_quotas" {
description = "Kubernetes ResourceQuota object to attach to the Ray cluster's namespace"
type = map(string)
default = {
cpu = "1000"
memory = "10Ti"
"requests.nvidia.com/gpu" = "100"
"requests.google.com/tpu" = "100"
}
}
variable "disable_resource_quotas" {
description = "Set to true to remove resource quotas from your Ray clusters. Not recommended"
type = bool
default = false
}
# This is a list of CIDR ranges allowed to access a Ray cluster's job submission API and Dashboard.
#
# Example:
# kuberay_network_policy_allow_cidr = "10.0.0.0/8"
#
variable "kuberay_network_policy_allow_cidr" {
description = "List of CIDRs that are allowed to access this Ray cluster's job submission and dashboard port."
type = string
default = ""
}
variable "disable_ray_cluster_network_policy" {
description = "Disables Kubernetes Network Policy for Ray Clusters for this demo. Defaulting to 'true' aka disabled pending fixes to the kuberay-monitoring module. This should be defaulted to false."
type = bool
default = false
}
variable "additional_labels" {
// string is used instead of map(string) since blueprint metadata does not support maps.
type = string
description = "Additional labels to add to Kubernetes resources."
default = "created-by=ai-on-gke,ai.gke.io=ray"
}