ai-ml/llm-serving-optimum-tpu/optimum-tpu-gemma-2b-2x4.yaml

# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # [START gke_ai_ml_llm_serving_optimum_tpu_gemma_2b_2x4] apiVersion: apps/v1 kind: Deployment metadata: name: tgi-tpu spec: replicas: 1 selector: matchLabels: app: tgi-tpu template: metadata: labels: app: tgi-tpu spec: nodeSelector: cloud.google.com/gke-tpu-topology: 2x4 cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice containers: - name: tgi-tpu image: REGION_NAME-docker.pkg.dev/PROJECT_ID/optimum-tpu/tgi-tpu:latest args: - --model-id=google/gemma-2b - --max-concurrent-requests=4 - --max-input-length=8191 - --max-total-tokens=8192 - --max-batch-prefill-tokens=32768 - --max-batch-size=16 securityContext: privileged: true env: - name: HF_TOKEN valueFrom: secretKeyRef: name: hf-secret key: hf_api_token ports: - containerPort: 80 resources: limits: google.com/tpu: 8 livenessProbe: httpGet: path: /health port: 80 initialDelaySeconds: 300 periodSeconds: 120 --- apiVersion: v1 kind: Service metadata: name: service spec: selector: app: tgi-tpu ports: - name: http protocol: TCP port: 8080 targetPort: 80 # [END gke_ai_ml_llm_serving_optimum_tpu_gemma_2b_2x4]

ai-ml/llm-serving-optimum-tpu/optimum-tpu-gemma-2b-2x4.yaml (59 lines of code) (raw):