use-cases/inferencing/cost-optimization/gcsfuse/manifests/model-deployment-tuned-a100-dws.yaml (116 lines of code) (raw):

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Single-replica vLLM Deployment that serves a model read through a Cloud
# Storage FUSE CSI volume, plus the ClusterIP Service in front of it.
# Placeholders written as dollar-brace variables are expected to be filled in
# by a templating step before this manifest is applied.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-openai-gcs-tuned-llama33-70b-a100
spec:
  replicas: 1
  # Recreate: the old pod is removed before the replacement is created.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: vllm-openai-gcs-tuned-llama33-70b-a100
  template:
    metadata:
      labels:
        app: vllm-openai-gcs-tuned-llama33-70b-a100
      annotations:
        # Cloud Storage FUSE sidecar settings.
        # NOTE(review): a limit of "0" is understood to lift the sidecar's
        # resource limit - confirm against the GKE CSI driver documentation.
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/cpu-limit: "0"
        gke-gcsfuse/memory-limit: "0"
        gke-gcsfuse/ephemeral-storage-limit: "0"
        # Ties this pod to the named ProvisioningRequest (queued provisioning).
        cluster-autoscaler.kubernetes.io/consume-provisioning-request: a100-storage-benchmark-tuned
        cluster-autoscaler.kubernetes.io/provisioning-class-name: queued-provisioning.gke.io
    spec:
      serviceAccountName: ${MLP_STORAGE_BENCHMARKING_KSA}
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-a100
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: on-demand
          operator: Equal
          value: "true"
          effect: NoSchedule
      containers:
        # Companion container: reads every *safetensors file under the mount
        # with up to 15 parallel dd processes, discarding the bytes, then
        # sleeps forever so the container keeps running.
        # NOTE(review): presumably this warms the gcsfuse file cache ahead of
        # the inference server - confirm intent.
        - name: fetch-safetensors
          image: busybox
          command:
            - /bin/sh
            - -c
          args:
            - |
              echo "########### $(date) - Starting parallel-fetch-safetensors"
              find /gcs/${MODEL_NAME}/${MODEL_VERSION}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null'
              echo "########### $(date) - Finished parallel-fetch-safetensors"
              sleep infinity
          volumeMounts:
            - name: gcsfuse
              mountPath: /gcs/${MODEL_NAME}/${MODEL_VERSION}/
              readOnly: true
        # vLLM server: loads the model from the read-only gcsfuse mount and
        # shards it across 8 GPUs (tensor parallelism).
        - name: inference-server
          image: ${VLLM_IMAGE_NAME}
          imagePullPolicy: Always
          env:
            - name: MODEL
              value: /gcs/${MODEL_NAME}/${MODEL_VERSION}
          args:
            # $(MODEL) is expanded by Kubernetes from the env entry above.
            - --model=$(MODEL)
            - --tensor-parallel-size=8
            - --trust-remote-code
          # First check after 60s, then polls /health on port 8000 every
          # second with a 1s timeout.
          readinessProbe:
            httpGet:
              scheme: HTTP
              path: /health
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 1
            timeoutSeconds: 1
            successThreshold: 1
            failureThreshold: 3
          # Requests equal limits for every resource.
          resources:
            requests:
              cpu: "64"
              memory: "192G"
              nvidia.com/gpu: "8"
            limits:
              cpu: "64"
              memory: "192G"
              nvidia.com/gpu: "8"
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: gcsfuse
              mountPath: /gcs/${MODEL_NAME}/${MODEL_VERSION}/
              readOnly: true
      volumes:
        # Memory-backed /dev/shm for the inference server.
        - name: dshm
          emptyDir:
            medium: Memory
        # Bucket mount, restricted to the model/version directory via only-dir.
        # NOTE(review): the -1 values are understood to mean "no size limit /
        # never expire" in gcsfuse - confirm against the gcsfuse documentation.
        - name: gcsfuse
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: ${MLP_STORAGE_BENCHMARK_HIERARCHICAL_BUCKET}
              mountOptions: "metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,file-system:kernel-list-cache-ttl-secs:-1,only-dir:${MODEL_NAME}/${MODEL_VERSION}/"
              skipCSIBucketAccessCheck: "true"
        # Memory-backed volumes using the gke-gcsfuse-* names.
        # NOTE(review): these names look like the sidecar's cache, tmp and
        # buffer volumes being overridden - confirm against the CSI driver docs.
        - name: gke-gcsfuse-cache
          emptyDir:
            medium: Memory
        - name: gke-gcsfuse-tmp
          emptyDir:
            medium: Memory
        - name: gke-gcsfuse-buffer
          emptyDir:
            medium: Memory
---
# Cluster-internal endpoint for the vLLM pods on TCP port 8000.
apiVersion: v1
kind: Service
metadata:
  name: vllm-openai-gcs-tuned-llama33-70b-a100
spec:
  type: ClusterIP
  selector:
    app: vllm-openai-gcs-tuned-llama33-70b-a100
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000