# tutorials-and-examples/nvidia-bionemo/fine-tuning/inference/deployment.yaml

# Deployment serving the fine-tuned ESM-2 model over HTTP on port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: bionemo-training
  name: esm2-inference
  labels:
    app: esm2-inference
spec:
  replicas: 1
  selector:
    matchLabels:
      app: esm2-inference
  template:
    metadata:
      labels:
        app: esm2-inference
    spec:
      serviceAccountName: esm2-inference-sa
      containers:
        - name: inference
          image: esm2-inference-image
          ports:
            - containerPort: 8000
              name: http
          env:
            - name: MODEL_PATH
              value: "/mnt/data/model"
          resources:
            limits:
              nvidia.com/gpu: 1
              memory: "4Gi"
              cpu: "2"
            requests:
              nvidia.com/gpu: 1
              memory: "2Gi"
              cpu: "500m"
          # Read-only mount of the shared PVC holding the model weights.
          volumeMounts:
            - name: model-storage
              mountPath: /mnt/data
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 15
            periodSeconds: 5
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            allowPrivilegeEscalation: false
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: bionemo-filestore
            readOnly: true
      nodeSelector:
        cloud.google.com/gke-gpu: "true"
---
apiVersion: v1
kind: Service
metadata:
  name: esm2-inference
  # Keep the Service in the same namespace as the Deployment so its
  # selector can match the pods above.
  namespace: bionemo-training
spec:
  selector:
    app: esm2-inference
  ports:
    - port: 80
      targetPort: 8000
      protocol: TCP
      name: http
  type: ClusterIP
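# --- Usage sketch (not part of the original manifest) ---
# A minimal smoke test, assuming kubectl is pointed at the target GKE
# cluster and that the bionemo-training namespace, the esm2-inference-sa
# service account, and the bionemo-filestore PVC already exist (all names
# are taken from the manifest above):
#
#   kubectl apply -f deployment.yaml
#   kubectl -n bionemo-training rollout status deployment/esm2-inference
#   kubectl -n bionemo-training port-forward svc/esm2-inference 8080:80 &
#   curl http://localhost:8080/health
#
# The local port 8080 is an arbitrary choice, and the /health check simply
# mirrors the probe paths configured above; adjust both if your inference
# image exposes different endpoints.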