# Source: ai-ml/gke-ray/rayserve/llm/model-composition/ray-service.tpu-v5e-singlehost.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RayService "vllm-tpu": serves the `multi_model` Ray Serve application on a
# Ray cluster made of one head pod (no TPU resources requested) and a TPU
# worker group (8 x google.com/tpu per pod).
#
# NOTE(review): every `$NAME` token ($ASSIST_MODEL_ID, $SUMMARIZER_MODEL_ID,
# $KSA_NAME, $VLLM_IMAGE, $GSBUCKET) is a placeholder; presumably it is
# substituted (e.g. with envsubst) before the manifest is applied -- confirm
# against the accompanying tutorial.

# [START gke_ai_ml_gke_ray_rayserve_llm_model_composition_v5e_singlehost_tpu]
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm-tpu
spec:
  # Embedded Ray Serve config (a YAML document stored as a string).
  # `runtime_env` is an application-level field, i.e. a sibling of
  # `deployments`, not a property of an individual deployment.
  # TPU_CHIPS / TPU_HEADS presumably mirror the worker group below
  # (2 replicas x 8 google.com/tpu = 16 chips on 2 hosts); keep them in sync
  # if the worker group is resized -- verify against serve_tpu.py.
  serveConfigV2: |
    applications:
      - name: llm
        route_prefix: /
        import_path: ai-ml.gke-ray.rayserve.llm.model-composition.serve_tpu:multi_model
        deployments:
          - name: MultiModelDeployment
            num_replicas: 1
        runtime_env:
          working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip"
          env_vars:
            ASSIST_MODEL_ID: "$ASSIST_MODEL_ID"
            SUMMARIZER_MODEL_ID: "$SUMMARIZER_MODEL_ID"
            TPU_CHIPS: "16"
            TPU_HEADS: "2"
  rayClusterConfig:
    headGroupSpec:
      rayStartParams: {}
      template:
        metadata:
          # Cloud Storage FUSE CSI sidecar settings.
          # NOTE(review): "0" is understood to remove the corresponding
          # sidecar resource limit -- confirm against the GKE driver docs.
          annotations:
            gke-gcsfuse/volumes: "true"
            gke-gcsfuse/cpu-limit: "0"
            gke-gcsfuse/memory-limit: "0"
            gke-gcsfuse/ephemeral-storage-limit: "0"
        spec:
          serviceAccountName: $KSA_NAME
          containers:
            - name: ray-head
              image: $VLLM_IMAGE
              # Requests equal limits for both CPU and memory.
              resources:
                limits:
                  cpu: "2"
                  memory: 8G
                requests:
                  cpu: "2"
                  memory: 8G
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              env:
                # Hugging Face token read from the `hf-secret` Secret.
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-secret
                      key: hf_api_token
                # Same path as the Cloud Storage FUSE mount below.
                - name: VLLM_XLA_CACHE_PATH
                  value: "/data"
              volumeMounts:
                - name: gcs-fuse-csi-ephemeral
                  mountPath: /data
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            # NOTE(review): not mounted by any container in this pod spec;
            # presumably consumed by the injected gcsfuse sidecar as its
            # cache volume -- confirm.
            - name: gke-gcsfuse-cache
              emptyDir:
                medium: Memory
            # Memory-backed emptyDir mounted at /dev/shm.
            - name: dshm
              emptyDir:
                medium: Memory
            # Bucket $GSBUCKET exposed through the Cloud Storage FUSE CSI
            # driver.
            - name: gcs-fuse-csi-ephemeral
              csi:
                driver: gcsfuse.csi.storage.gke.io
                volumeAttributes:
                  bucketName: $GSBUCKET
                  mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
    workerGroupSpecs:
      # TPU worker group: 2 replicas of a single-host (numOfHosts: 1) group.
      - replicas: 2
        minReplicas: 1
        maxReplicas: 2
        numOfHosts: 1
        groupName: tpu-group
        rayStartParams: {}
        template:
          metadata:
            # Same Cloud Storage FUSE sidecar annotations as the head pod.
            annotations:
              gke-gcsfuse/volumes: "true"
              gke-gcsfuse/cpu-limit: "0"
              gke-gcsfuse/memory-limit: "0"
              gke-gcsfuse/ephemeral-storage-limit: "0"
          spec:
            serviceAccountName: $KSA_NAME
            containers:
              - name: llm
                image: $VLLM_IMAGE
                env:
                  # Hugging Face token read from the `hf-secret` Secret.
                  - name: HUGGING_FACE_HUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-secret
                        key: hf_api_token
                  # Same path as the Cloud Storage FUSE mount below.
                  - name: VLLM_XLA_CACHE_PATH
                    value: "/data"
                # Requests equal limits; each worker pod asks for 8 TPU
                # chips, matching the 2x4 topology in the nodeSelector.
                resources:
                  limits:
                    cpu: "100"
                    google.com/tpu: "8"
                    ephemeral-storage: 40G
                    memory: 200G
                  requests:
                    cpu: "100"
                    google.com/tpu: "8"
                    ephemeral-storage: 40G
                    memory: 200G
                volumeMounts:
                  - name: gcs-fuse-csi-ephemeral
                    mountPath: /data
                  - name: dshm
                    mountPath: /dev/shm
            volumes:
              # NOTE(review): not mounted by the `llm` container; presumably
              # used by the injected gcsfuse sidecar as its cache -- confirm.
              - name: gke-gcsfuse-cache
                emptyDir:
                  medium: Memory
              # Memory-backed emptyDir mounted at /dev/shm.
              - name: dshm
                emptyDir:
                  medium: Memory
              # Bucket $GSBUCKET exposed through the Cloud Storage FUSE CSI
              # driver.
              - name: gcs-fuse-csi-ephemeral
                csi:
                  driver: gcsfuse.csi.storage.gke.io
                  volumeAttributes:
                    bucketName: $GSBUCKET
                    mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
            # Pin worker pods to nodes labelled with this TPU accelerator
            # type and topology.
            nodeSelector:
              cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
              cloud.google.com/gke-tpu-topology: 2x4
# [END gke_ai_ml_gke_ray_rayserve_llm_model_composition_v5e_singlehost_tpu]