ai-ml/gke-ray/rayserve/llm/model-multiplexing/ray-service.yaml (90 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RayService that serves two LLMs (Gemma 7B and Llama 3 8B) behind a single
# Ray Serve deployment using model multiplexing, on GKE with L4 GPUs.
# [START gke_ai_ml_gke_ray_rayserve_llm_model_multiplexing]
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: model-multiplexing
spec:
  # serveConfigV2 is an opaque string passed to Ray Serve; the block scalar
  # below is itself YAML parsed by Ray, not by Kubernetes.
  serveConfigV2: |
    applications:
    - name: llm
      route_prefix: /
      import_path: ai-ml.gke-ray.rayserve.llm.model-multiplexing.serve:multi_model
      deployments:
      - name: MultiModelDeployment
        num_replicas: 1
        ray_actor_options:
          num_cpus: 2
          # NOTE: num_gpus is set automatically based on TENSOR_PARALLELISM
      runtime_env:
        working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip"
        pip: ["vllm==0.4.3"]
        env_vars:
          MODEL_1_ID: "google/gemma-7b-it"
          MODEL_1_TENSOR_PARALLELISM: "2"
          MODEL_2_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
          MODEL_2_TENSOR_PARALLELISM: "2"
  rayClusterConfig:
    headGroupSpec:
      rayStartParams:
        # Expose the Ray dashboard on all interfaces so it is reachable
        # through the head service.
        dashboard-host: "0.0.0.0"
      template:
        metadata:
          labels:
            ai.gke.io: rayserve
        spec:
          containers:
          - name: ray-head
            image: rayproject/ray-ml:2.9.0-py310
            resources:
              limits:
                cpu: "2"
                memory: "8Gi"
                ephemeral-storage: "10Gi"
              requests:
                cpu: "2"
                memory: "8Gi"
                ephemeral-storage: "10Gi"
            ports:
            - containerPort: 6379
              name: gcs-server
            - containerPort: 8265
              name: dashboard
            - containerPort: 10001
              name: client
            - containerPort: 8000
              name: serve
            env:
            # Hugging Face token for pulling the gated model weights;
            # sourced from a pre-created Kubernetes Secret.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
    workerGroupSpecs:
    - replicas: 2
      minReplicas: 0
      maxReplicas: 4
      groupName: gpu-group
      rayStartParams: {}
      template:
        metadata:
          labels:
            ai.gke.io: rayserve
        spec:
          containers:
          - name: llm
            image: rayproject/ray-ml:2.9.0-py310
            env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
            resources:
              # 2 GPUs per worker matches the TENSOR_PARALLELISM of "2"
              # configured for each model in serveConfigV2.
              limits:
                cpu: "20"
                memory: "40Gi"
                nvidia.com/gpu: "2"
              requests:
                cpu: "20"
                memory: "40Gi"
                nvidia.com/gpu: "2"
          nodeSelector:
            cloud.google.com/gke-accelerator: nvidia-l4
# [END gke_ai_ml_gke_ray_rayserve_llm_model_multiplexing]