# ai-ml/vllm-tpu/vllm-llama3-70b.yaml (88 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# [START gke_ai_ml_vllm_tpu_vllm_llama3_70b]
# Deployment running a single vLLM OpenAI-compatible API server replica on a
# TPU node, with a Cloud Storage bucket mounted at /data through the GCS FUSE
# CSI driver.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-tpu
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-tpu
  template:
    metadata:
      labels:
        app: vllm-tpu
      annotations:
        # Settings for the GKE Cloud Storage FUSE CSI integration.
        # NOTE(review): "0" is understood to mean "no limit" for the injected
        # sidecar - confirm against the GKE gcsfuse CSI driver documentation.
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/cpu-limit: "0"
        gke-gcsfuse/memory-limit: "0"
        gke-gcsfuse/ephemeral-storage-limit: "0"
    spec:
      # KSA_NAME looks like a placeholder: substitute the Kubernetes service
      # account name before applying this manifest.
      serviceAccountName: KSA_NAME
      # Schedule onto a TPU v6e slice node pool with a 2x4 topology; the
      # container below requests 8 TPUs to match.
      nodeSelector:
        cloud.google.com/gke-tpu-topology: 2x4
        cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
      containers:
        - name: vllm-tpu
          # Image is pinned to a specific tag rather than a floating one.
          image: docker.io/vllm/vllm-tpu:73aa7041bfee43581314e6f34e9a657137ecc092
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - --host=0.0.0.0
            - --port=8000
            # Kept equal to the google.com/tpu limit below.
            - --tensor-parallel-size=8
            - --max-model-len=4096
            - --model=meta-llama/Llama-3.1-70B
            # Model files are downloaded into the bucket-backed /data mount.
            - --download-dir=/data
            - --max-num-batched-tokens=512
            - --max-num-seqs=128
          env:
            # Hugging Face token read from the hf-secret Secret, key
            # hf_api_token; that Secret must exist in the same namespace.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
            # Points at the same bucket-backed /data mount as --download-dir.
            - name: VLLM_XLA_CACHE_PATH
              value: "/data"
            - name: VLLM_USE_V1
              value: "1"
          ports:
            - containerPort: 8000
          resources:
            limits:
              google.com/tpu: 8
          # HTTP check against the server's /health endpoint on the API port
          # (--port=8000). Unlike a bare TCP connect, this only reports the
          # Pod ready once the HTTP server is actually answering requests.
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 15
            periodSeconds: 10
          volumeMounts:
            - name: gcs-fuse-csi-ephemeral
              mountPath: /data
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # NOTE(review): not mounted by the container above; presumably picked
        # up by name by the injected gcsfuse sidecar - confirm against the GKE
        # gcsfuse CSI driver documentation.
        - name: gke-gcsfuse-cache
          emptyDir:
            medium: Memory
        # Memory-backed emptyDir mounted at /dev/shm.
        - name: dshm
          emptyDir:
            medium: Memory
        - name: gcs-fuse-csi-ephemeral
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              # GSBUCKET looks like a placeholder: substitute the Cloud
              # Storage bucket name before applying this manifest.
              bucketName: GSBUCKET
              # Single comma-separated option string; it must stay on one
              # line because folding would insert spaces into the value.
              mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
---
# Service fronting the vLLM API server Pods.
apiVersion: v1
kind: Service
metadata:
  name: vllm-service
spec:
  # NOTE(review): LoadBalancer normally provisions an externally reachable
  # address - confirm that exposing the inference API this way is intended.
  type: LoadBalancer
  # Selects Pods carrying the label applied by the vllm-tpu Deployment's
  # Pod template.
  selector:
    app: vllm-tpu
  ports:
    # Service port 8000 forwards to port 8000 on the selected Pods.
    - name: http
      port: 8000
      targetPort: 8000
      protocol: TCP
# [END gke_ai_ml_vllm_tpu_vllm_llama3_70b]