# fleet-charts/gemma-server/templates/deployment.yaml

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Patterned on:
# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-1b-it.yaml
# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-4b-it.yaml
# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-12b-it.yaml
# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-27b-it.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-gemma-3-{{ .Values.weight }}
  labels:
    app: gemma-server
spec:
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: "gemma-3-{{ .Values.weight }}-it"
        ai.gke.io/inference-server: vllm
    spec:
      containers:
      {{- if eq .Values.weight "1b" }}
      - name: inference-server
        image: "{{ .Values.image }}"
        resources:
          requests:
            cpu: "2"
            memory: "10Gi"
            ephemeral-storage: "10Gi"
            nvidia.com/gpu: "1"
          limits:
            cpu: "2"
            memory: "10Gi"
            ephemeral-storage: "10Gi"
            nvidia.com/gpu: "1"
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL_ID)
        - --tensor-parallel-size=1
        - --host=0.0.0.0
        - --port=8000
      {{- else if eq .Values.weight "4b" }}
      - name: inference-server
        image: "{{ .Values.image }}"
        resources:
          requests:
            cpu: "2"
            memory: "20Gi"
            ephemeral-storage: "20Gi"
            nvidia.com/gpu: "1"
          limits:
            cpu: "2"
            memory: "20Gi"
            ephemeral-storage: "20Gi"
            nvidia.com/gpu: "1"
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL_ID)
        - --tensor-parallel-size=1
        - --host=0.0.0.0
        - --port=8000
        - --max-model-len=32768
        - --max-num-seqs=4
      {{- else if eq .Values.weight "12b" }}
      - name: inference-server
        image: "{{ .Values.image }}"
        resources:
          requests:
            cpu: "4"
            memory: "32Gi"
            ephemeral-storage: "32Gi"
            nvidia.com/gpu: "2"
          limits:
            cpu: "4"
            memory: "32Gi"
            ephemeral-storage: "32Gi"
            nvidia.com/gpu: "2"
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL_ID)
        - --tensor-parallel-size=2
        - --host=0.0.0.0
        - --port=8000
        - --max-model-len=16384
        - --max-num-seqs=4
      {{- else if eq .Values.weight "27b" }}
      - name: inference-server
        image: "{{ .Values.image }}"
        resources:
          requests:
            cpu: "10"
            memory: "128Gi"
            ephemeral-storage: "120Gi"
            nvidia.com/gpu: "1"
          limits:
            cpu: "10"
            memory: "128Gi"
            ephemeral-storage: "120Gi"
            nvidia.com/gpu: "1"
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL_ID)
        - --tensor-parallel-size=1
        - --host=0.0.0.0
        - --port=8000
        - --swap-space=16
        - --gpu-memory-utilization=0.95
        - --max-model-len=32768
        - --max-num-seqs=4
      {{- end }}
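        # The Hugging Face token below is read from a pre-existing Secret.
        # One way to create it (illustrative; the name "hf-secret" and key
        # "hf_api_token" are what this template expects):
        #
        #   kubectl create secret generic hf-secret \
        #     --from-literal=hf_api_token=<your-token>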
        env:
        - name: MODEL_ID
          value: "google/gemma-3-{{ .Values.weight }}-it"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: "{{ .Values.accelerator }}"
        cloud.google.com/gke-gpu-driver-version: latest
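
# A minimal values.yaml this template assumes (key names are taken from the
# .Values references above; the example values are illustrative, not shipped
# defaults):
#
#   weight: "4b"                        # one of "1b", "4b", "12b", "27b"
#   image: "vllm/vllm-openai:latest"    # hypothetical vLLM serving image
#   accelerator: "nvidia-l4"            # GKE accelerator node-label value
#
# Once the pod is Ready, vLLM's OpenAI-compatible server listens on port 8000;
# a quick smoke test via port-forward (deployment name shown for weight "4b"):
#
#   kubectl port-forward deployment/vllm-gemma-3-4b 8000:8000
#   curl http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "google/gemma-3-4b-it", "prompt": "Hello", "max_tokens": 16}'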