machine_learning/ml_infrastructure/inference-server-performance/server/setup_trtis.ipynb (472 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TensorRT Inference Server (TRTIS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile trtis_service.yaml\n",
"\n",
"# ClusterIP Service in front of the pods labelled app=inference-server.\n",
"# Port order matters: later cells read the ports by index (0 = HTTP, 2 = metrics).\n",
"apiVersion: v1\n",
"kind: Service\n",
"metadata:\n",
"  name: inference-server\n",
"  namespace: default\n",
"  labels:\n",
"    name: inference-server\n",
"spec:\n",
"  type: ClusterIP\n",
"  sessionAffinity: None\n",
"  #externalTrafficPolicy: Cluster\n",
"  selector:\n",
"    app: inference-server\n",
"  ports:\n",
"    - name: http-inference-server\n",
"      protocol: TCP\n",
"      port: 8000\n",
"      targetPort: 8000\n",
"    - name: grpc-inference-server\n",
"      protocol: TCP\n",
"      port: 8001\n",
"      targetPort: 8001\n",
"    - name: metrics-inference-server\n",
"      protocol: TCP\n",
"      port: 8002\n",
"      targetPort: 8002"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile trtis_deploy.yaml\n",
"\n",
"# Single-replica Deployment running the tensorrtserver container with one GPU.\n",
"apiVersion: apps/v1\n",
"kind: Deployment\n",
"metadata:\n",
"  name: inference-server\n",
"  labels:\n",
"    name: inference-server\n",
"spec:\n",
"  replicas: 1\n",
"  selector:\n",
"    matchLabels:\n",
"      app: inference-server\n",
"  template:\n",
"    metadata:\n",
"      labels:\n",
"        app: inference-server\n",
"    spec:\n",
"      # Pod-level settings.\n",
"      serviceAccountName: default\n",
"      serviceAccount: default\n",
"      imagePullSecrets:\n",
"        - name: ngc\n",
"      dnsPolicy: ClusterFirst\n",
"      schedulerName: default-scheduler\n",
"      priority: 0\n",
"      restartPolicy: Always\n",
"      terminationGracePeriodSeconds: 30\n",
"      securityContext: {}\n",
"      containers:\n",
"        - name: inference-server\n",
"          image: nvcr.io/nvidia/tensorrtserver:19.05-py3\n",
"          imagePullPolicy: IfNotPresent\n",
"          # NOTE(review): ${BUCKET_NAME} is written to the file literally; neither\n",
"          # %%writefile nor kubectl expands it. Replace it with the real bucket name\n",
"          # before creating the Deployment.\n",
"          args:\n",
"            - trtserver\n",
"            - --model-store=gs://${BUCKET_NAME}/resnet/\n",
"          ports:\n",
"            - containerPort: 8000\n",
"              protocol: TCP\n",
"            - containerPort: 8001\n",
"              protocol: TCP\n",
"            - containerPort: 8002\n",
"              protocol: TCP\n",
"          resources:\n",
"            limits:\n",
"              nvidia.com/gpu: \"1\"\n",
"            requests:\n",
"              cpu: 1000m\n",
"              nvidia.com/gpu: \"1\"\n",
"          # Both probes query the HTTP health endpoints on port 8000.\n",
"          livenessProbe:\n",
"            httpGet:\n",
"              scheme: HTTP\n",
"              port: 8000\n",
"              path: /api/health/live\n",
"            initialDelaySeconds: 5\n",
"            periodSeconds: 5\n",
"            timeoutSeconds: 1\n",
"            successThreshold: 1\n",
"            failureThreshold: 3\n",
"          readinessProbe:\n",
"            httpGet:\n",
"              scheme: HTTP\n",
"              port: 8000\n",
"              path: /api/health/ready\n",
"            initialDelaySeconds: 5\n",
"            periodSeconds: 5\n",
"            timeoutSeconds: 1\n",
"            successThreshold: 1\n",
"            failureThreshold: 3\n",
"          securityContext:\n",
"            procMount: Default\n",
"            runAsUser: 1000\n",
"          terminationMessagePath: /dev/termination-log\n",
"          terminationMessagePolicy: File\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `apply` instead of `create` keeps this cell re-runnable: it creates the Service on the\n",
"# first run and updates it (rather than failing with AlreadyExists) on later runs.\n",
"!kubectl apply -f trtis_service.yaml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `apply` instead of `create` keeps this cell re-runnable: it creates the Deployment on the\n",
"# first run and updates it (rather than failing with AlreadyExists) on later runs.\n",
"!kubectl apply -f trtis_deploy.yaml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print <clusterIP>:<port> for the first port listed on the inference-server Service.\n",
"!kubectl get service inference-server --output \"jsonpath={.spec['clusterIP']}:{.spec['ports'][0]['port']}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List the pods in the current namespace and their status.\n",
"!kubectl get pod"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select the pod by the Deployment's `app=inference-server` label instead of a hard-coded,\n",
"# generated pod name, which differs on every rollout and cluster.\n",
"# `--tail=-1` prints the full log (label selectors otherwise default to the last 10 lines).\n",
"!kubectl logs -l app=inference-server --tail=-1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prometheus"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile clusterRole.yml\n",
"\n",
"# Read-only cluster access for Prometheus, bound to the `default` ServiceAccount\n",
"# of the `monitoring` namespace.\n",
"# Uses rbac.authorization.k8s.io/v1: the v1beta1 RBAC API is no longer served as of\n",
"# Kubernetes 1.22.\n",
"apiVersion: rbac.authorization.k8s.io/v1\n",
"kind: ClusterRole\n",
"metadata:\n",
"  name: prometheus\n",
"rules:\n",
"- apiGroups: [\"\"]\n",
"  resources:\n",
"  - nodes\n",
"  - nodes/proxy\n",
"  - services\n",
"  - endpoints\n",
"  - pods\n",
"  verbs: [\"get\", \"list\", \"watch\"]\n",
"# Ingresses moved from `extensions` to `networking.k8s.io`; both groups are listed so the\n",
"# rule works on old and new clusters.\n",
"- apiGroups:\n",
"  - extensions\n",
"  - networking.k8s.io\n",
"  resources:\n",
"  - ingresses\n",
"  verbs: [\"get\", \"list\", \"watch\"]\n",
"- nonResourceURLs: [\"/metrics\"]\n",
"  verbs: [\"get\"]\n",
"---\n",
"apiVersion: rbac.authorization.k8s.io/v1\n",
"kind: ClusterRoleBinding\n",
"metadata:\n",
"  name: prometheus\n",
"roleRef:\n",
"  apiGroup: rbac.authorization.k8s.io\n",
"  kind: ClusterRole\n",
"  name: prometheus\n",
"subjects:\n",
"- kind: ServiceAccount\n",
"  name: default\n",
"  namespace: monitoring"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print <clusterIP>:<port> for the third port listed on the inference-server Service.\n",
"!kubectl get service inference-server --output \"jsonpath={.spec['clusterIP']}:{.spec['ports'][2]['port']}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile prometheus-configmap.yml\n",
"\n",
"# ConfigMap holding prometheus.yml; the Prometheus Deployment mounts it at /etc/prometheus.\n",
"apiVersion: v1\n",
"kind: ConfigMap\n",
"metadata:\n",
"  name: prometheus-server-conf\n",
"  labels:\n",
"    name: prometheus-server-conf\n",
"  namespace: monitoring\n",
"data:\n",
"  prometheus.yml: |-\n",
"    # my global config\n",
"    global:\n",
"      scrape_interval: 10s\n",
"      evaluation_interval: 10s\n",
"      # scrape_timeout is set to the global default (10s).\n",
"\n",
"    # Alertmanager configuration\n",
"    alerting:\n",
"      alertmanagers:\n",
"      - static_configs:\n",
"        - targets:\n",
"          # - alertmanager:9093\n",
"\n",
"    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.\n",
"    rule_files:\n",
"      # - \"first_rules.yml\"\n",
"      # - \"second_rules.yml\"\n",
"\n",
"    # A scrape configuration containing exactly one endpoint to scrape:\n",
"    # the metrics port (8002) of the inference-server Service.\n",
"    scrape_configs:\n",
"      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.\n",
"      - job_name: 'prometheus'\n",
"\n",
"        # metrics_path defaults to '/metrics'\n",
"        # scheme defaults to 'http'.\n",
"\n",
"        # The Service is addressed by its in-cluster DNS name (<service>.<namespace>.svc)\n",
"        # instead of a hand-pasted cluster IP, so no manual edit is needed and the target\n",
"        # stays valid if the Service is recreated with a different IP.\n",
"        static_configs:\n",
"        - targets: ['inference-server.default.svc:8002']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile prometheus-deployment.yml\n",
"\n",
"# Single-replica Prometheus server; configuration comes from the\n",
"# prometheus-server-conf ConfigMap, time-series data lives in an emptyDir.\n",
"# Uses apps/v1: Deployments are no longer served from extensions/v1beta1 as of\n",
"# Kubernetes 1.16, and apps/v1 requires an explicit spec.selector.\n",
"apiVersion: apps/v1\n",
"kind: Deployment\n",
"metadata:\n",
"  name: prometheus-deployment\n",
"  namespace: monitoring\n",
"spec:\n",
"  replicas: 1\n",
"  selector:\n",
"    matchLabels:\n",
"      app: prometheus-server\n",
"  template:\n",
"    metadata:\n",
"      labels:\n",
"        app: prometheus-server\n",
"    spec:\n",
"      containers:\n",
"        - name: prometheus\n",
"          image: prom/prometheus:latest\n",
"          args:\n",
"            - \"--config.file=/etc/prometheus/prometheus.yml\"\n",
"            - \"--storage.tsdb.path=/prometheus/\"\n",
"          ports:\n",
"            - containerPort: 9090\n",
"          volumeMounts:\n",
"            - name: prometheus-config-volume\n",
"              mountPath: /etc/prometheus\n",
"            - name: prometheus-storage-volume\n",
"              mountPath: /prometheus\n",
"      volumes:\n",
"        - name: prometheus-config-volume\n",
"          configMap:\n",
"            # 420 decimal == 0644 octal\n",
"            defaultMode: 420\n",
"            name: prometheus-server-conf\n",
"        - name: prometheus-storage-volume\n",
"          emptyDir: {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile prometheus-service.yml\n",
"\n",
"# ClusterIP Service mapping port 8080 to port 9090 of the app=prometheus-server pods.\n",
"apiVersion: v1\n",
"kind: Service\n",
"metadata:\n",
"  name: prometheus-service\n",
"spec:\n",
"  type: ClusterIP\n",
"  selector:\n",
"    app: prometheus-server\n",
"  ports:\n",
"    - port: 8080\n",
"      targetPort: 9090"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the namespace first; on a re-run this line only reports AlreadyExists.\n",
"!kubectl create namespace monitoring\n",
"# `apply` instead of `create` keeps the manifest steps re-runnable: existing objects are\n",
"# updated rather than rejected with AlreadyExists.\n",
"!kubectl apply -f clusterRole.yml\n",
"!kubectl apply -f prometheus-configmap.yml -n monitoring\n",
"!kubectl apply -f prometheus-deployment.yml -n monitoring\n",
"!kubectl apply -f prometheus-service.yml -n monitoring"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print <clusterIP>:<port> of the Prometheus Service in the monitoring namespace.\n",
"!kubectl get service prometheus-service --namespace monitoring --output \"jsonpath={.spec['clusterIP']}:{.spec['ports'][0]['port']}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Grafana"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile grafana-deployment.yml\n",
"\n",
"# Single-replica Grafana server listening on container port 3000.\n",
"# Uses apps/v1: Deployments are no longer served from extensions/v1beta1 as of\n",
"# Kubernetes 1.16, and apps/v1 requires an explicit spec.selector.\n",
"apiVersion: apps/v1\n",
"kind: Deployment\n",
"metadata:\n",
"  name: grafana-deployment\n",
"  namespace: monitoring\n",
"spec:\n",
"  replicas: 1\n",
"  selector:\n",
"    matchLabels:\n",
"      app: grafana-server\n",
"  template:\n",
"    metadata:\n",
"      labels:\n",
"        app: grafana-server\n",
"    spec:\n",
"      containers:\n",
"        - name: grafana\n",
"          image: grafana/grafana:latest\n",
"          ports:\n",
"            - containerPort: 3000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile grafana-service.yml\n",
"\n",
"# LoadBalancer Service mapping port 8100 to port 3000 of the app=grafana-server pods.\n",
"apiVersion: v1\n",
"kind: Service\n",
"metadata:\n",
"  name: grafana-service\n",
"spec:\n",
"  type: LoadBalancer\n",
"  selector:\n",
"    app: grafana-server\n",
"  ports:\n",
"    - port: 8100\n",
"      targetPort: 3000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `apply` instead of `create` keeps this cell re-runnable: existing objects are updated\n",
"# rather than rejected with AlreadyExists.\n",
"!kubectl apply -f grafana-service.yml -n monitoring\n",
"!kubectl apply -f grafana-deployment.yml -n monitoring"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print <clusterIP>:<port> of the Prometheus Service.\n",
"# NOTE(review): presumably shown here so it can be entered as the Grafana data source;\n",
"# confirm this was not meant to query grafana-service instead.\n",
"!kubectl get service prometheus-service --namespace monitoring --output \"jsonpath={.spec['clusterIP']}:{.spec['ports'][0]['port']}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List the pods in the monitoring namespace and their status.\n",
"!kubectl get pod --namespace monitoring"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}