ai-ml/llm-multihost-gpus/vllm-llama3-405b-A3.yaml

# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # [START gke_ai_ml_llm_serving_multihost_gpus_vllm_llama3_405b_a4] apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet metadata: name: vllm spec: replicas: 1 leaderWorkerTemplate: size: 2 restartPolicy: RecreateGroupOnPodRestart leaderTemplate: metadata: labels: role: leader spec: nodeSelector: cloud.google.com/gke-accelerator: nvidia-h100-80gb containers: - name: vllm-leader image: docker.io/vllm/vllm-openai:latest env: - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: name: hf-secret key: hf_api_token command: - sh - -c - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2" resources: limits: nvidia.com/gpu: "8" ports: - containerPort: 8080 readinessProbe: tcpSocket: port: 8080 initialDelaySeconds: 15 periodSeconds: 10 volumeMounts: - mountPath: /dev/shm name: dshm volumes: - name: dshm emptyDir: medium: Memory sizeLimit: 15Gi workerTemplate: spec: nodeSelector: cloud.google.com/gke-accelerator: nvidia-h100-80gb containers: - name: vllm-worker image: docker.io/vllm/vllm-openai:latest command: - sh - -c - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" resources: limits: nvidia.com/gpu: "8" env: - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: name: hf-secret key: hf_api_token volumeMounts: - mountPath: /dev/shm name: dshm volumes: - name: dshm emptyDir: medium: Memory sizeLimit: 15Gi --- apiVersion: v1 kind: Service metadata: name: vllm-leader spec: ports: - name: http port: 8080 protocol: TCP targetPort: 8080 selector: leaderworkerset.sigs.k8s.io/name: vllm role: leader type: ClusterIP # [END gke_ai_ml_llm_serving_multihost_gpus_vllm_llama3_405b_a4]

ai-ml/llm-multihost-gpus/vllm-llama3-405b-A3.yaml (91 lines of code) (raw):