ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_aiml_llm_multi_gpus_llama3_70b_inference]
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llm
  template:
    metadata:
      labels:
        app: llm
    spec:
      containers:
      - name: llm
        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-1.ubuntu2204.py310
        resources:
          requests:
            cpu: "10"
            memory: "60Gi"
            nvidia.com/gpu: "2"
          limits:
            cpu: "10"
            memory: "60Gi"
            nvidia.com/gpu: "2"
        env:
        - name: MODEL_ID
          value: meta-llama/Meta-Llama-3-70B-Instruct
        - name: NUM_SHARD
          value: "2"
        - name: MAX_INPUT_TOKENS
          value: "2048"
        - name: PORT
          value: "8080"
        - name: QUANTIZE
          value: bitsandbytes-nf4
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: l4-demo
              key: HUGGING_FACE_TOKEN
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        # mountPath is set to /tmp because that is where the HUGGINGFACE_HUB_CACHE
        # environment variable points in the TGI DLCs, rather than the default /data
        # used by the upstream TGI image; i.e. this is where the model downloaded
        # from the Hub is stored.
        - mountPath: /tmp
          name: ephemeral-volume
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: ephemeral-volume
        ephemeral:
          volumeClaimTemplate:
            metadata:
              labels:
                type: ephemeral
            spec:
              accessModes: ["ReadWriteOnce"]
              storageClassName: "premium-rwo"
              resources:
                requests:
                  storage: 150Gi
      nodeSelector:
        cloud.google.com/gke-accelerator: "nvidia-l4"
        cloud.google.com/gke-spot: "true"
# [END gke_aiml_llm_multi_gpus_llama3_70b_inference]
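---
# Not part of the original sample: a minimal sketch of the Secret the Deployment
# above expects. The container reads HUGGING_FACE_HUB_TOKEN via a secretKeyRef to a
# Secret named "l4-demo" with key "HUGGING_FACE_TOKEN"; one way to provide it is an
# Opaque Secret like this. The token value below is a placeholder, not a real token.
apiVersion: v1
kind: Secret
metadata:
  name: l4-demo
type: Opaque
stringData:
  HUGGING_FACE_TOKEN: hf_your_token_here  # placeholder; substitute your own Hub token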
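---
# Not part of the original sample: a hypothetical ClusterIP Service exposing the TGI
# server on the port the container listens on (PORT=8080 above), so in-cluster
# clients could reach it at llm-service:8080. The name "llm-service" is an assumption.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: llm
  ports:
  - protocol: TCP
    port: 8080
    targetPort: 8080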