# Source: ai-ml/llm-serving-optimum-tpu/optimum-tpu-gemma-2b-2x4.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# [START gke_ai_ml_llm_serving_optimum_tpu_gemma_2b_2x4]
# Deployment that runs one replica of the tgi-tpu container serving
# google/gemma-2b on nodes labelled as a 2x4 tpu-v5-lite-podslice.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-tpu
spec:
  replicas: 1
  selector:
    matchLabels:
      app: tgi-tpu
  template:
    metadata:
      labels:
        # Must stay equal to spec.selector.matchLabels above and to the
        # selector of the Service defined later in this file.
        app: tgi-tpu
    spec:
      # Restrict scheduling to nodes carrying both TPU labels.
      nodeSelector:
        cloud.google.com/gke-tpu-topology: 2x4
        cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
      containers:
        - name: tgi-tpu
          # REGION_NAME and PROJECT_ID are placeholders; substitute real
          # values before applying this manifest.
          image: REGION_NAME-docker.pkg.dev/PROJECT_ID/optimum-tpu/tgi-tpu:latest
          args:
            - --model-id=google/gemma-2b
            - --max-concurrent-requests=4
            # The input limit is one token below the total-token limit.
            - --max-input-length=8191
            - --max-total-tokens=8192
            - --max-batch-prefill-tokens=32768
            - --max-batch-size=16
          # NOTE(review): privileged mode grants broad host access; confirm
          # whether the target GKE version still requires it for TPU access.
          securityContext:
            privileged: true
          env:
            # Hugging Face token read from key hf_api_token of Secret
            # hf-secret, which must exist in the same namespace.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
          ports:
            - containerPort: 80
          resources:
            limits:
              # Presumably all chips of the 2x4 topology (2 * 4 = 8) — verify
              # against the node pool's machine type.
              google.com/tpu: 8
          # Keep the pod out of the Service endpoints until /health answers,
          # so requests are not routed to it before the server is up. Without
          # this probe the pod counts as Ready as soon as the container starts.
          readinessProbe:
            httpGet:
              path: /health
              port: 80
            periodSeconds: 10
          # The first liveness check is delayed by 300 s and then repeated
          # every 120 s, so a slow start is not treated as a failure.
          livenessProbe:
            httpGet:
              path: /health
              port: 80
            initialDelaySeconds: 300
            periodSeconds: 120
---
# Service named "service" that forwards TCP port 8080 to port 80 on pods
# labelled app=tgi-tpu. spec.type is not set, so the API server default applies.
apiVersion: v1
kind: Service
metadata:
  name: service
spec:
  ports:
    - name: http
      port: 8080
      # Same port number as the containerPort declared by the tgi-tpu
      # container.
      targetPort: 80
      protocol: TCP
  selector:
    app: tgi-tpu
# [END gke_ai_ml_llm_serving_optimum_tpu_gemma_2b_2x4]