k8s/llama3-70b.yaml (69 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deployment: one replica of the vLLM OpenAI-compatible API server. The model
# to serve is taken from the MODEL_ID environment variable defined below.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-openai-llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openai-llama-server
  template:
    metadata:
      labels:
        # Must match both spec.selector above and the Service selector below.
        app: openai-llama-server
    spec:
      containers:
        - name: inference-server
          image: vllm/vllm-openai:v0.6.1
          # Requests equal limits for every resource.
          resources:
            requests:
              cpu: "20"
              memory: "80Gi"
              ephemeral-storage: "100Gi"
              nvidia.com/gpu: "2"
            limits:
              cpu: "20"
              memory: "80Gi"
              ephemeral-storage: "100Gi"
              nvidia.com/gpu: "2"
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            # $(MODEL_ID) is expanded by Kubernetes from the env entry below.
            - --model=$(MODEL_ID)
            # Kept equal to the nvidia.com/gpu count requested above.
            - --tensor-parallel-size=2
            - --max-model-len=128000
            - --kv-cache-dtype=fp8
          env:
            - name: MODEL_ID
              value: neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
            - name: VLLM_WORKER_MULTIPROC_METHOD
              value: spawn
            - name: VLLM_ATTENTION_BACKEND
              value: FLASHINFER
            # Token is read from the pre-existing Secret "hf-secret"; it is
            # never stored in this manifest.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
          # No --port argument is passed, so this relies on the server's
          # default port; 8000 matches the Service targetPort below.
          ports:
            - name: http
              containerPort: 8000
          # Keep the pod out of the Service endpoints until the server answers
          # on /health. A readiness probe only gates traffic and never
          # restarts the container, so a slow model load is not interrupted.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm in the container.
        - name: dshm
          emptyDir:
            medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-h100-80gb
        cloud.google.com/gke-gpu-driver-version: latest
---
# Service: cluster-internal endpoint that forwards TCP port 8000 to the pods
# labelled app=openai-llama-server.
apiVersion: v1
kind: Service
metadata:
  name: openai-llm-service
spec:
  selector:
    app: openai-llama-server
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000