# manifests/inference.yaml

# * Copyright 2022 Google LLC
# *
# * Licensed under the Apache License, Version 2.0 (the "License");
# * you may not use this file except in compliance with the License.
# * You may obtain a copy of the License at
# *
# *      http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.

# Deployment that runs a single vLLM server pod in the "airflow" namespace.
# The pod serves the model stored under /modeldata/fine_tuned_model, which is
# mounted read-only from a Cloud Storage bucket through the GCS FUSE CSI driver.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inference-deployment
  namespace: airflow
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above.
        app: gemma-server
        ai.gke.io/model: gemma-2-9b-it
        ai.gke.io/inference-server: vllm
      annotations:
        # Annotation used by GKE for pods that mount GCS FUSE CSI volumes
        # (see the "gcs-fuse-csi-ephemeral" volume below).
        gke-gcsfuse/volumes: "true"
    spec:
      serviceAccountName: airflow-mlops-sa
      tolerations:
        # Tolerate any NoSchedule taint keyed "nvidia.com/gpu".
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
        # Tolerate the NoSchedule taint "on-demand=true".
        - key: "on-demand"
          value: "true"
          operator: "Equal"
          effect: "NoSchedule"
      containers:
        - name: inference-server
          # NOTE(review): "latest" is a floating tag; consider pinning a
          # specific image version so rollouts are reproducible.
          image: vllm/vllm-openai:latest
          ports:
            - containerPort: 8000
          resources:
            # The GPU count matches --tensor-parallel-size=2 in args below;
            # keep the two values in sync.
            requests:
              nvidia.com/gpu: "2"
            limits:
              nvidia.com/gpu: "2"
          command: ["/bin/sh", "-c"]
          args:
            # Folded scalar: the lines below are joined with single spaces into
            # one shell command line (followed by a single trailing newline).
            # Keep every line at the same indentation so the folding applies.
            - >
              python3 -m vllm.entrypoints.api_server
              --model=/modeldata/fine_tuned_model
              --tokenizer=/modeldata/fine_tuned_model
              --tensor-parallel-size=2
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - name: gcs-fuse-csi-ephemeral
              mountPath: /modeldata
              readOnly: true
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm in the container.
        - name: dshm
          emptyDir:
            medium: Memory
        # Cloud Storage bucket exposed through the GCS FUSE CSI driver.
        - name: gcs-fuse-csi-ephemeral
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              # NOTE(review): BUCKET_DATA_NAME looks like a placeholder to be
              # replaced with the real bucket name before applying - confirm.
              bucketName: BUCKET_DATA_NAME
              mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1"
              fileCacheCapacity: "20Gi"
              fileCacheForRangeRead: "true"
              metadataStatCacheCapacity: "-1"
              metadataTypeCacheCapacity: "-1"
              metadataCacheTTLSeconds: "-1"
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4

# manifests/inference.yaml (65 lines of code) (raw):