use-cases/model-fine-tuning-pipeline/model-eval/manifests/deployment-l4.yaml:

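# Serves a fine-tuned model for evaluation with vLLM's OpenAI-compatible
# server on GKE nodes with two NVIDIA L4 GPUs, loading model weights from a
# Cloud Storage bucket through the GCS FUSE CSI driver. The V_MODEL_PATH,
# V_IMAGE_URL, V_KSA, and V_BUCKET tokens appear to be placeholders that are
# substituted with environment-specific values before the manifest is applied.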
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-openai-l4
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-openai-l4
  template:
    metadata:
      labels:
        app: vllm-openai-l4
        ml-platform: vllm-openai
      annotations:
        gke-gcsfuse/volumes: "true"
    spec:
      containers:
      - name: inference-server
        args:
        - --model=$(MODEL)
        - --tensor-parallel-size=2
        env:
        - name: MODEL
          value: V_MODEL_PATH
        - name: VLLM_ATTENTION_BACKEND
          value: FLASHINFER
        image: V_IMAGE_URL
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 240
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          requests:
            cpu: "2"
            memory: "25Gi"
            ephemeral-storage: "25Gi"
            nvidia.com/gpu: "2"
          limits:
            cpu: "2"
            memory: "25Gi"
            ephemeral-storage: "25Gi"
            nvidia.com/gpu: "2"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - name: gcs-fuse-csi-ephemeral
          mountPath: /model-data
          readOnly: true
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      serviceAccountName: V_KSA
      tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
      - key: "on-demand"
        value: "true"
        operator: "Equal"
        effect: "NoSchedule"
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: gcs-fuse-csi-ephemeral
        csi:
          driver: gcsfuse.csi.storage.gke.io
          volumeAttributes:
            bucketName: V_BUCKET
            mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1"
            fileCacheCapacity: "20Gi"
            fileCacheForRangeRead: "true"
            metadataStatCacheCapacity: "-1"
            metadataTypeCacheCapacity: "-1"
            metadataCacheTTLSeconds: "-1"
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-openai-l4
spec:
  selector:
    app: vllm-openai-l4
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
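# In-cluster clients can reach the server through the ClusterIP Service at
# http://vllm-openai-l4:8000 (same-namespace DNS). Assuming the image runs the
# standard vLLM OpenAI-compatible server, it exposes endpoints such as
# /v1/models and /v1/chat/completions alongside the /health path that the
# readiness probe above checks. A quick smoke test once the pod is Ready
# (a sketch; namespace and port mapping are assumptions):
#
#   kubectl port-forward svc/vllm-openai-l4 8000:8000
#   curl http://localhost:8000/v1/models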