# use-cases/model-fine-tuning-pipeline/model-eval/manifests/deployment-l4.yaml
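#
# Deploys a vLLM OpenAI-compatible inference server across two NVIDIA L4 GPUs,
# serving model weights mounted from a Cloud Storage bucket via the GCS FUSE CSI
# driver. The V_MODEL_PATH, V_IMAGE_URL, V_KSA, and V_BUCKET values look like
# template placeholders that are substituted at deploy time.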
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-openai-l4
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-openai-l4
  template:
    metadata:
      labels:
        app: vllm-openai-l4
        ml-platform: vllm-openai
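      # Required so GKE injects the Cloud Storage FUSE CSI sidecar that mounts the model bucket.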
      annotations:
        gke-gcsfuse/volumes: "true"
    spec:
      containers:
      - name: inference-server
        args:
        - --model=$(MODEL)
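        # Shard the model across both requested L4 GPUs.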
        - --tensor-parallel-size=2
        env:
        - name: MODEL
          value: V_MODEL_PATH
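        # Select vLLM's FlashInfer attention kernel backend.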
        - name: VLLM_ATTENTION_BACKEND
          value: FLASHINFER
        image: V_IMAGE_URL
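        # Probes vLLM's /health endpoint; the long initial delay leaves time for the
        # model weights to load from the bucket before the Pod is marked ready.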
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 240
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
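        # GPU count must match --tensor-parallel-size; for extended resources such as
        # nvidia.com/gpu, requests must equal limits.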
        resources:
          requests:
            cpu: "2"
            memory: "25Gi"
            ephemeral-storage: "25Gi"
            nvidia.com/gpu: "2"
          limits:
            cpu: "2"
            memory: "25Gi"
            ephemeral-storage: "25Gi"
            nvidia.com/gpu: "2"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - name: gcs-fuse-csi-ephemeral
          mountPath: /model-data
          readOnly: true
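      # Schedule only onto nodes with NVIDIA L4 accelerators.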
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      serviceAccountName: V_KSA
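      # Tolerate GKE's standard GPU taint, plus what appears to be a cluster-specific
      # taint on the on-demand node pool, so the Pod can land on those nodes.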
      tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
      - key: "on-demand"
        value: "true"
        operator: "Equal"
        effect: "NoSchedule"
      volumes:
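      # Memory-backed /dev/shm; tensor-parallel workers typically communicate through
      # shared memory, and the kernel default of 64Mi is too small.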
      - name: dshm
        emptyDir:
          medium: Memory
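      # Ephemeral Cloud Storage FUSE volume for the model weights. The file cache and
      # parallel downloads speed up the initial weight load; the "-1" values remove the
      # size and TTL limits on the metadata caches.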
      - name: gcs-fuse-csi-ephemeral
        csi:
          driver: gcsfuse.csi.storage.gke.io
          volumeAttributes:
            bucketName: V_BUCKET
            mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1"
            fileCacheCapacity: "20Gi"
            fileCacheForRangeRead: "true"
            metadataStatCacheCapacity: "-1"
            metadataTypeCacheCapacity: "-1"
            metadataCacheTTLSeconds: "-1"
---
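# ClusterIP Service exposing the vLLM OpenAI-compatible API on port 8000 inside the cluster.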
apiVersion: v1
kind: Service
metadata:
  name: vllm-openai-l4
spec:
  selector:
    app: vllm-openai-l4
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
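# Quick smoke test (assumes kubectl access to the cluster; names match this manifest):
#   kubectl port-forward svc/vllm-openai-l4 8000:8000
#   curl http://localhost:8000/v1/models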