modules/kuberay-cluster/values.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for ray-cluster.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# The KubeRay community welcomes PRs to expose additional configuration
# in this Helm chart.

image:
  # Replace this with your own image if needed.
  repository: ${image}
  tag: ${image_tag}
  pullPolicy: IfNotPresent

nameOverride: "kuberay"
fullnameOverride: ""

imagePullSecrets: []
  # - name: an-existing-secret

head:
  groupName: headgroup
  # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
  # Ray autoscaler integration is supported only for Ray versions >= 1.11.0.
  # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
  enableInTreeAutoscaling: true
  # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
  # The example configuration shown below represents the DEFAULT values.
  # autoscalerOptions:
  #   upscalingMode: Default
  #   idleTimeoutSeconds: 60
  #   securityContext: {}
  #   env: []
  #   envFrom: []
  #   # resources specifies optional resource request and limit overrides for the autoscaler container.
  #   # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
  #   resources:
  #     limits:
  #       cpu: "500m"
  #       memory: "512Mi"
  #     requests:
  #       cpu: "500m"
  #       memory: "512Mi"
  labels:
    cloud.google.com/gke-ray-node-type: head
    ${indent(4, chomp(yamlencode(additional_labels)))}
  serviceAccountName: ${k8s_service_account}
  rayStartParams:
    dashboard-host: '0.0.0.0'
    block: 'true'
  # containerEnv specifies environment variables for the Ray container;
  # follows the standard K8s container env schema.
  containerEnv:
    - name: RAY_memory_monitor_refresh_ms
      value: "0"
    - name: RAY_GRAFANA_IFRAME_HOST
      value: http://${grafana_host}
    - name: RAY_GRAFANA_HOST
      value: http://grafana:80
    - name: RAY_PROMETHEUS_HOST
      value: http://frontend:9090
    - name: CLOUDSQL_INSTANCE_CONNECTION_NAME
      value: ${cloudsql_instance_connection_name}
  envFrom: []
    # - secretRef:
    #     name: my-env-secret
  # ports optionally allows specifying ports for the Ray container.
  # ports: []
  # Resource requests and limits for the Ray head container.
  # Modify as needed for your application.
  # Note that the resources in this example are much too small for production;
  # we don't recommend allocating less than 8G memory for a Ray pod in production.
  # Ray pods should be sized to take up entire K8s nodes when possible.
  # Always set CPU and memory limits for Ray pods.
  # It is usually best to set requests equal to limits.
  # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
  # for further guidance.
  resources:
    limits:
      cpu: "4"
      # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head.
      # Ray recommends at least 8G memory for production workloads.
      memory: "8G"
      # The sum of ephemeral storage requests must be at most 10Gi on the Autopilot default class.
      # This includes ray-head, the gcsfuse sidecar, fluent-bit, and the Ray autoscaler sidecar, which requests 1Gi by default.
      ephemeral-storage: 3Gi
    requests:
      cpu: "4"
      memory: "8G"
      ephemeral-storage: 3Gi
  annotations:
    gke-gcsfuse/volumes: "true"
    gke-gcsfuse/cpu-limit: "1"
    gke-gcsfuse/memory-limit: 2Gi
    gke-gcsfuse/ephemeral-storage-limit: 3Gi
  nodeSelector:
    iam.gke.io/gke-metadata-server-enabled: "true"
  tolerations: []
  affinity: {}
  # Ray container security context.
  securityContext:
    ${indent(4, chomp(yamlencode(security_context)))}
  volumes:
    - name: gcs-fuse-csi-ephemeral
      csi:
        driver: gcsfuse.csi.storage.gke.io
        # readOnly: true
        volumeAttributes:
          bucketName: ${gcs_bucket}
          mountOptions: "implicit-dirs,uid=1000,gid=100"
    - name: secret-volume
      secret:
        secretName: ${secret_name}
        optional: true
  # Ray writes logs to /tmp/ray/session_latest/logs
  volumeMounts:
    - name: gcs-fuse-csi-ephemeral
      mountPath: /data
    - name: secret-volume
      mountPath: /etc/secret-volume
      readOnly: true

worker:
  # If you want to disable the default workergroup,
  # uncomment the line below.
  # disabled: true
  groupName: workerGroup
  replicas: 0
  minReplicas: 0
  maxReplicas: 5
  type: worker
  labels:
    cloud.google.com/gke-ray-node-type: worker
    ${indent(4, chomp(yamlencode(additional_labels)))}
  serviceAccountName: ${k8s_service_account}
  rayStartParams:
    block: 'true'
  initContainerImage: 'busybox:1.28'  # Enable users to specify the image for the init container. Users can pull the busybox image from their private repositories.
  # Security context for the init container.
  initContainerSecurityContext: {}
  # containerEnv specifies environment variables for the Ray container;
  # follows the standard K8s container env schema.
  containerEnv:
    - name: CLOUDSQL_INSTANCE_CONNECTION_NAME
      value: ${cloudsql_instance_connection_name}
  envFrom: []
    # - secretRef:
    #     name: my-env-secret
  # ports optionally allows specifying ports for the Ray container.
  # ports: []
  # Resource requests and limits for the Ray worker containers.
  # Modify as needed for your application.
  # Note that the resources in this example are much too small for production;
  # we don't recommend allocating less than 8G memory for a Ray pod in production.
  # Ray pods should be sized to take up entire K8s nodes when possible.
  # Always set CPU and memory limits for Ray pods.
  # It is usually best to set requests equal to limits.
  # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
  # for further guidance.
  resources:
    requests:
      ${indent(6, chomp(yamlencode(resource_requests)))}
    limits:
      ${indent(6, chomp(yamlencode(resource_requests)))}
  annotations:
    ${indent(4, chomp(yamlencode(annotations)))}
  nodeSelector:
    ${indent(4, chomp(yamlencode(node_selectors)))}
  tolerations: []
  affinity: {}
  securityContext:
    ${indent(4, chomp(yamlencode(security_context)))}
  volumes:
    - name: gcs-fuse-csi-ephemeral
      csi:
        driver: gcsfuse.csi.storage.gke.io
        # readOnly: true
        volumeAttributes:
          bucketName: ${gcs_bucket}
          mountOptions: "implicit-dirs,uid=1000,gid=100"
    - name: secret-volume
      secret:
        secretName: ${secret_name}
        optional: true
  # Ray writes logs to /tmp/ray/session_latest/logs
  volumeMounts:
    - name: gcs-fuse-csi-ephemeral
      mountPath: /data
    - name: secret-volume
      mountPath: /etc/secret-volume
      readOnly: true

# The map's key is used as the groupName.
# For example, the key smallGroup in the map below
# will be used as the groupName.
additionalWorkerGroups:
  smallGroup:
    # Disabled by default
    disabled: true

service:
  type: ClusterIP
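Note that the ${...} expressions above (and functions such as indent, chomp, and yamlencode) are Terraform template syntax, not Helm templating: the file is rendered by Terraform before the resulting YAML is passed to the KubeRay ray-cluster chart. The sketch below shows one way a wrapping Terraform module might render this file with templatefile() and feed it to a helm_release; the resource name, chart source, namespace, and var.* inputs are illustrative assumptions, not part of this module.

# Sketch only: renders the templated values.yaml and installs the chart with the
# result. Release name, repository URL, and input variable names are assumptions.
resource "helm_release" "ray_cluster" {
  name       = "ray-cluster"
  repository = "https://ray-project.github.io/kuberay-helm/"  # assumed chart source
  chart      = "ray-cluster"
  namespace  = var.namespace  # hypothetical input variable

  values = [
    # Every key referenced by a ${...} expression in values.yaml must be supplied here.
    templatefile("${path.module}/values.yaml", {
      image                             = var.image
      image_tag                         = var.image_tag
      additional_labels                 = var.additional_labels
      k8s_service_account               = var.k8s_service_account
      grafana_host                      = var.grafana_host
      cloudsql_instance_connection_name = var.cloudsql_instance_connection_name
      gcs_bucket                        = var.gcs_bucket
      secret_name                       = var.secret_name
      security_context                  = var.security_context
      resource_requests                 = var.resource_requests
      annotations                       = var.annotations
      node_selectors                    = var.node_selectors
    })
  ]
}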