# machine_learning/ml_infrastructure/inference-server-performance/server/trtis_deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: inference-server
  labels:
    name: inference-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app: inference-server
  template:
    metadata:
      labels:
        app: inference-server
    spec:
      dnsPolicy: ClusterFirst
      imagePullSecrets:
      # Pull secret for NVIDIA GPU Cloud (nvcr.io).
      - name: ngc
      priority: 0
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: default
      serviceAccountName: default
      terminationGracePeriodSeconds: 30
      containers:
      - args:
        - trtserver
        # Replace YOUR-BUCKET-NAME with the GCS bucket that holds the model repository.
        - --model-store=gs://YOUR-BUCKET-NAME/resnet/
        image: nvcr.io/nvidia/tensorrtserver:19.05-py3
        imagePullPolicy: IfNotPresent
        # TRTIS liveness endpoint, served on the HTTP port.
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/health/live
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        name: inference-server
        # 8000: HTTP/REST, 8001: gRPC, 8002: Prometheus metrics.
        ports:
        - containerPort: 8000
          protocol: TCP
        - containerPort: 8001
          protocol: TCP
        - containerPort: 8002
          protocol: TCP
        # TRTIS readiness endpoint; the pod receives traffic only once the server reports ready.
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/health/ready
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          # Request exactly one GPU via the NVIDIA device plugin.
          limits:
            nvidia.com/gpu: "1"
          requests:
            cpu: 1000m
            nvidia.com/gpu: "1"
        securityContext:
          procMount: Default
          runAsUser: 1000
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
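
# Usage sketch (not part of the original manifest; assumes kubectl is pointed
# at a GPU-enabled cluster and the "ngc" image pull secret already exists):
#
#   kubectl apply -f trtis_deploy.yaml
#   kubectl rollout status deployment/inference-server
#   kubectl get pods -l app=inference-server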