# machine_learning/ml_infrastructure/inference-server-performance/server/trtis_deploy.yaml
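# Kubernetes Deployment that runs a single replica of NVIDIA TensorRT
# Inference Server (TRTIS), serving a ResNet model repository from
# Cloud Storage on a GPU node.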
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inference-server
  labels:
    name: inference-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app: inference-server
  template:
    metadata:
      labels:
        app: inference-server
    spec:
      dnsPolicy: ClusterFirst
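      # The `ngc` image pull secret must hold NVIDIA NGC registry
      # credentials so the node can pull the server image from nvcr.io.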
      imagePullSecrets:
      - name: ngc
      priority: 0
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: default
      serviceAccountName: default
      terminationGracePeriodSeconds: 30
      containers:
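      # Run trtserver against the model repository in Cloud Storage;
      # replace YOUR-BUCKET-NAME with the bucket holding the resnet model.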
      - args:
        - trtserver
        - --model-store=gs://YOUR-BUCKET-NAME/resnet/
        image: nvcr.io/nvidia/tensorrtserver:19.05-py3
        imagePullPolicy: IfNotPresent
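        # TRTIS serves health endpoints on its HTTP port: /api/health/live
        # reports that the server process is up; /api/health/ready (below)
        # reports that it is ready to accept inference requests.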
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/health/live
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        name: inference-server
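        # Default TRTIS ports: 8000 = HTTP/REST, 8001 = gRPC,
        # 8002 = Prometheus metrics.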
        ports:
        - containerPort: 8000
          protocol: TCP
        - containerPort: 8001
          protocol: TCP
        - containerPort: 8002
          protocol: TCP
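        # A failing readiness probe keeps the pod out of Service endpoints
        # until the model repository has finished loading.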
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/health/ready
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
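        # Request one GPU; this assumes nodes with the NVIDIA device plugin
        # installed (e.g. a GKE GPU node pool). GPU requests must equal
        # limits, so nvidia.com/gpu appears in both.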
        resources:
          limits:
            nvidia.com/gpu: "1"
          requests:
            cpu: 1000m
            nvidia.com/gpu: "1"
        securityContext:
          procMount: Default
          runAsUser: 1000
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File