# use-cases/inferencing/cost-optimization/gcsfuse/manifests/model-deployment-tuned-a100-dws.yaml
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deployment that serves a model from a Cloud Storage bucket mounted through
# the GCS FUSE CSI driver. `${...}` tokens are placeholders that must be
# substituted (e.g. with envsubst) before this manifest is applied;
# `$(MODEL)` is expanded by Kubernetes from the container's env.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-openai-gcs-tuned-llama33-70b-a100
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-openai-gcs-tuned-llama33-70b-a100
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: vllm-openai-gcs-tuned-llama33-70b-a100
      annotations:
        # GCS FUSE CSI sidecar settings.
        # NOTE(review): "0" presumably lifts the sidecar's resource limits
        # rather than setting them to zero - confirm against the GKE docs.
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/cpu-limit: "0"
        gke-gcsfuse/memory-limit: "0"
        gke-gcsfuse/ephemeral-storage-limit: "0"
        # Ties this pod to the queued ProvisioningRequest named below; that
        # request object is not defined in this file.
        cluster-autoscaler.kubernetes.io/consume-provisioning-request: a100-storage-benchmark-tuned
        cluster-autoscaler.kubernetes.io/provisioning-class-name: "queued-provisioning.gke.io"
    spec:
      containers:
        # Warm-up sidecar: reads every *safetensors file under the mount with
        # up to 15 parallel `dd` processes, discarding the bytes, then idles
        # with `sleep infinity` so the container does not exit.
        - name: fetch-safetensors
          image: busybox
          command: ["/bin/sh", "-c"]
          args:
            - |
              echo "########### $(date) - Starting parallel-fetch-safetensors"
              find /gcs/${MODEL_NAME}/${MODEL_VERSION}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null'
              echo "########### $(date) - Finished parallel-fetch-safetensors"
              sleep infinity
          volumeMounts:
            - name: gcsfuse
              mountPath: /gcs/${MODEL_NAME}/${MODEL_VERSION}/
              readOnly: true
        # Inference server; loads the model from the same read-only mount.
        - name: inference-server
          args:
            - --model=$(MODEL)
            # Matches the 8 GPUs requested in `resources` below.
            - --tensor-parallel-size=8
            - --trust-remote-code
          env:
            - name: MODEL
              value: /gcs/${MODEL_NAME}/${MODEL_VERSION}
          image: ${VLLM_IMAGE_NAME}
          imagePullPolicy: Always
          # Polls /health on port 8000 every second after a 60 s initial
          # delay; the pod is marked unready after 3 consecutive failures.
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 60
            periodSeconds: 1
            successThreshold: 1
            timeoutSeconds: 1
          # Requests equal limits for every resource.
          resources:
            requests:
              cpu: "64"
              memory: "192G"
              nvidia.com/gpu: "8"
            limits:
              cpu: "64"
              memory: "192G"
              nvidia.com/gpu: "8"
          volumeMounts:
            # Memory-backed emptyDir mounted as shared memory.
            - mountPath: /dev/shm
              name: dshm
            - name: gcsfuse
              mountPath: /gcs/${MODEL_NAME}/${MODEL_VERSION}/
              readOnly: true
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-a100
      serviceAccountName: ${MLP_STORAGE_BENCHMARKING_KSA}
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
        - key: "on-demand"
          value: "true"
          operator: "Equal"
          effect: "NoSchedule"
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        # Bucket mount. `only-dir` restricts the mount to the model/version
        # prefix, which is why mountPath above ends in that same prefix.
        # mountOptions must stay a single comma-separated string.
        - name: gcsfuse
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: ${MLP_STORAGE_BENCHMARK_HIERARCHICAL_BUCKET}
              mountOptions: "metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,file-system:kernel-list-cache-ttl-secs:-1,only-dir:${MODEL_NAME}/${MODEL_VERSION}/"
              skipCSIBucketAccessCheck: "true"
        # Memory-backed emptyDirs that no container in this pod spec mounts.
        # NOTE(review): presumably picked up by name by the injected GCS FUSE
        # sidecar for its cache/tmp/buffer directories - confirm.
        - name: gke-gcsfuse-cache
          emptyDir:
            medium: Memory
        - name: gke-gcsfuse-tmp
          emptyDir:
            medium: Memory
        - name: gke-gcsfuse-buffer
          emptyDir:
            medium: Memory
---
# Cluster-internal Service in front of the pods labelled
# app=vllm-openai-gcs-tuned-llama33-70b-a100; forwards TCP 8000 to
# port 8000 on the selected pods.
apiVersion: v1
kind: Service
metadata:
  name: vllm-openai-gcs-tuned-llama33-70b-a100
spec:
  selector:
    app: vllm-openai-gcs-tuned-llama33-70b-a100
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000