ai-ml/gke-ray/rayserve/llm/model-composition/ray-service.tpu-v6e-singlehost.yaml (136 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# [START gke_ai_ml_gke_ray_rayserve_llm_model_composition_v6e_singlehost_tpu]
# RayService "vllm-tpu": one Ray Serve application ("llm") whose entry point
# is serve_tpu:multi_model, backed by a Ray cluster made of a CPU-only head
# pod and a group of TPU v6e (2x4 topology) worker pods.
#
# Values that start with "$" ($KSA_NAME, $VLLM_IMAGE, $GSBUCKET,
# $ASSIST_MODEL_ID, $SUMMARIZER_MODEL_ID) are placeholders; presumably they
# are substituted (e.g. with envsubst) before the manifest is applied. They
# are quoted so an empty or number-like substitution still yields a string.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm-tpu
spec:
  # Ray Serve config, passed to Ray as one literal string. Do not add YAML
  # comments inside the block scalar: they would become part of the string.
  # NOTE(review): TPU_CHIPS "16" / TPU_HEADS "2" appear to mirror the worker
  # group below (2 replicas x 8 google.com/tpu each) - confirm against
  # serve_tpu.py and keep them in sync when resizing the worker group.
  serveConfigV2: |
    applications:
      - name: llm
        route_prefix: /
        import_path: ai-ml.gke-ray.rayserve.llm.model-composition.serve_tpu:multi_model
        deployments:
          - name: MultiModelDeployment
            num_replicas: 1
        runtime_env:
          working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip"
          env_vars:
            ASSIST_MODEL_ID: "$ASSIST_MODEL_ID"
            SUMMARIZER_MODEL_ID: "$SUMMARIZER_MODEL_ID"
            TPU_CHIPS: "16"
            TPU_HEADS: "2"
  rayClusterConfig:
    # Head pod: requests CPU and memory only, no TPU resources.
    headGroupSpec:
      rayStartParams: {}
      template:
        metadata:
          annotations:
            # Cloud Storage FUSE CSI sidecar settings.
            # NOTE(review): per the GKE CSI driver docs, "0" is understood to
            # remove the sidecar's resource limit - confirm for your cluster.
            gke-gcsfuse/volumes: "true"
            gke-gcsfuse/cpu-limit: "0"
            gke-gcsfuse/memory-limit: "0"
            gke-gcsfuse/ephemeral-storage-limit: "0"
        spec:
          serviceAccountName: "$KSA_NAME"
          containers:
            - name: ray-head
              image: "$VLLM_IMAGE"
              resources:
                limits:
                  cpu: "2"
                  memory: 8G
                requests:
                  cpu: "2"
                  memory: 8G
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              env:
                # Hugging Face token is read from the "hf-secret" Secret.
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-secret
                      key: hf_api_token
                # Points at the bucket-backed mount declared below (/data).
                - name: VLLM_XLA_CACHE_PATH
                  value: "/data"
              volumeMounts:
                - name: gcs-fuse-csi-ephemeral
                  mountPath: /data
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            # Memory-backed emptyDir; the name matches the volume the GCS
            # FUSE sidecar is documented to use for its file cache (verify).
            - name: gke-gcsfuse-cache
              emptyDir:
                medium: Memory
            # Memory-backed emptyDir mounted at /dev/shm.
            - name: dshm
              emptyDir:
                medium: Memory
            # Bucket $GSBUCKET mounted through the Cloud Storage FUSE driver.
            - name: gcs-fuse-csi-ephemeral
              csi:
                driver: gcsfuse.csi.storage.gke.io
                volumeAttributes:
                  bucketName: "$GSBUCKET"
                  mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
    workerGroupSpecs:
      # TPU workers: each replica is a single host (numOfHosts: 1) that
      # requests 8 google.com/tpu.
      # NOTE(review): minReplicas/maxReplicas only take effect if autoscaling
      # is enabled for the cluster, which this manifest does not do - confirm
      # whether running with fewer than 2 replicas is actually supported.
      - replicas: 2
        minReplicas: 1
        maxReplicas: 2
        numOfHosts: 1
        groupName: tpu-group
        rayStartParams: {}
        template:
          metadata:
            annotations:
              # Same Cloud Storage FUSE sidecar settings as the head pod.
              gke-gcsfuse/volumes: "true"
              gke-gcsfuse/cpu-limit: "0"
              gke-gcsfuse/memory-limit: "0"
              gke-gcsfuse/ephemeral-storage-limit: "0"
          spec:
            serviceAccountName: "$KSA_NAME"
            containers:
              - name: llm
                image: "$VLLM_IMAGE"
                env:
                  # Hugging Face token is read from the "hf-secret" Secret.
                  - name: HUGGING_FACE_HUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-secret
                        key: hf_api_token
                  # Same bucket-backed path as on the head pod.
                  - name: VLLM_XLA_CACHE_PATH
                    value: "/data"
                resources:
                  # Requests equal limits for every resource.
                  limits:
                    cpu: "100"
                    google.com/tpu: "8"
                    ephemeral-storage: 40G
                    memory: 200G
                  requests:
                    cpu: "100"
                    google.com/tpu: "8"
                    ephemeral-storage: 40G
                    memory: 200G
                volumeMounts:
                  - name: gcs-fuse-csi-ephemeral
                    mountPath: /data
                  - name: dshm
                    mountPath: /dev/shm
            volumes:
              # Same three volumes as the head pod.
              - name: gke-gcsfuse-cache
                emptyDir:
                  medium: Memory
              - name: dshm
                emptyDir:
                  medium: Memory
              - name: gcs-fuse-csi-ephemeral
                csi:
                  driver: gcsfuse.csi.storage.gke.io
                  volumeAttributes:
                    bucketName: "$GSBUCKET"
                    mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
            # Schedule only onto TPU v6e nodes with a 2x4 topology.
            nodeSelector:
              cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
              cloud.google.com/gke-tpu-topology: 2x4
# [END gke_ai_ml_gke_ray_rayserve_llm_model_composition_v6e_singlehost_tpu]