ai-ml/llm-serving-gemma/vllm/vllm-2-9b-it.yaml (68 lines of code) (raw):

# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # [START gke_ai_ml_llm_serving_gemma_vllm_2_9b_it_deployment] apiVersion: apps/v1 kind: Deployment metadata: name: vllm-gemma-deployment spec: replicas: 1 selector: matchLabels: app: gemma-server template: metadata: labels: app: gemma-server ai.gke.io/model: gemma-2-9b-it ai.gke.io/inference-server: vllm examples.ai.gke.io/source: user-guide spec: containers: - name: inference-server image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250310_0916_RC00 resources: requests: cpu: "4" memory: "30Gi" ephemeral-storage: "30Gi" nvidia.com/gpu: "2" limits: cpu: "4" memory: "30Gi" ephemeral-storage: "30Gi" nvidia.com/gpu: "2" command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] args: - --model=$(MODEL_ID) - --tensor-parallel-size=2 - --host=0.0.0.0 - --port=8000 env: - name: MODEL_ID value: google/gemma-2-9b-it - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: name: hf-secret key: hf_api_token volumeMounts: - mountPath: /dev/shm name: dshm volumes: - name: dshm emptyDir: medium: Memory nodeSelector: cloud.google.com/gke-accelerator: nvidia-l4 cloud.google.com/gke-gpu-driver-version: latest --- apiVersion: v1 kind: Service metadata: name: llm-service spec: selector: app: gemma-server type: ClusterIP ports: - protocol: TCP port: 8000 targetPort: 8000 # [END gke_ai_ml_llm_serving_gemma_vllm_2_9b_it_deployment]