machine_learning/ml_infrastructure/inference-server-performance/server/setup_trtis.ipynb (472 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TRTIS (TensorRT Inference Server)\n",
    "\n",
    "Deploy the NVIDIA TensorRT Inference Server on Kubernetes: a `Service` exposing the HTTP (8000), gRPC (8001) and Prometheus metrics (8002) ports, and a `Deployment` that serves models from a GCS bucket.\n",
    "\n",
    "**Before applying:** replace `${BUCKET_NAME}` in `trtis_deploy.yaml` with your GCS bucket name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile trtis_service.yaml\n",
    "\n",
    "apiVersion: v1\n",
    "kind: Service\n",
    "metadata:\n",
    "  labels:\n",
    "    name: inference-server\n",
    "  name: inference-server\n",
    "  namespace: default\n",
    "spec:\n",
    "  # externalTrafficPolicy: Cluster\n",
    "  ports:\n",
    "  - name: http-inference-server\n",
    "    port: 8000\n",
    "    protocol: TCP\n",
    "    targetPort: 8000\n",
    "  - name: grpc-inference-server\n",
    "    port: 8001\n",
    "    protocol: TCP\n",
    "    targetPort: 8001\n",
    "  - name: metrics-inference-server\n",
    "    port: 8002\n",
    "    protocol: TCP\n",
    "    targetPort: 8002\n",
    "  selector:\n",
    "    app: inference-server\n",
    "  sessionAffinity: None\n",
    "  type: ClusterIP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile trtis_deploy.yaml\n",
    "\n",
    "apiVersion: apps/v1\n",
    "kind: Deployment\n",
    "metadata:\n",
    "  name: inference-server\n",
    "  labels:\n",
    "    name: inference-server\n",
    "spec:\n",
    "  replicas: 1\n",
    "  selector:\n",
    "    matchLabels:\n",
    "      app: inference-server\n",
    "  template:\n",
    "    metadata:\n",
    "      labels:\n",
    "        app: inference-server\n",
    "    spec:\n",
    "      dnsPolicy: ClusterFirst\n",
    "      imagePullSecrets:\n",
    "      - name: ngc\n",
    "      priority: 0\n",
    "      restartPolicy: Always\n",
    "      schedulerName: default-scheduler\n",
    "      securityContext: {}\n",
    "      serviceAccount: default\n",
    "      serviceAccountName: default\n",
    "      terminationGracePeriodSeconds: 30\n",
    "      containers:\n",
    "      - args:\n",
    "        - trtserver\n",
    "        # Replace ${BUCKET_NAME} with your GCS bucket before applying.\n",
    "        - --model-store=gs://${BUCKET_NAME}/resnet/\n",
    "        image: nvcr.io/nvidia/tensorrtserver:19.05-py3\n",
    "        imagePullPolicy: IfNotPresent\n",
    "        livenessProbe:\n",
    "          failureThreshold: 3\n",
    "          httpGet:\n",
    "            path: /api/health/live\n",
    "            port: 8000\n",
    "            scheme: HTTP\n",
    "          initialDelaySeconds: 5\n",
    "          periodSeconds: 5\n",
    "          successThreshold: 1\n",
    "          timeoutSeconds: 1\n",
    "        name: inference-server\n",
    "        ports:\n",
    "        - containerPort: 8000\n",
    "          protocol: TCP\n",
    "        - containerPort: 8001\n",
    "          protocol: TCP\n",
    "        - containerPort: 8002\n",
    "          protocol: TCP\n",
    "        readinessProbe:\n",
    "          failureThreshold: 3\n",
    "          httpGet:\n",
    "            path: /api/health/ready\n",
    "            port: 8000\n",
    "            scheme: HTTP\n",
    "          initialDelaySeconds: 5\n",
    "          periodSeconds: 5\n",
    "          successThreshold: 1\n",
    "          timeoutSeconds: 1\n",
    "        resources:\n",
    "          limits:\n",
    "            nvidia.com/gpu: \"1\"\n",
    "          requests:\n",
    "            cpu: 1000m\n",
    "            nvidia.com/gpu: \"1\"\n",
    "        securityContext:\n",
    "          procMount: Default\n",
    "          runAsUser: 1000\n",
    "        terminationMessagePath: /dev/termination-log\n",
    "        terminationMessagePolicy: File\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kubectl create -f trtis_service.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kubectl create -f trtis_deploy.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ClusterIP:port of the HTTP endpoint (ports[0] = 8000).\n",
    "!kubectl get svc inference-server -o \"jsonpath={.spec['clusterIP']}:{.spec['ports'][0]['port']}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kubectl get pods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the pod by label rather than by a hard-coded, ephemeral pod name,\n",
    "# so this cell survives pod restarts and fresh deployments.\n",
    "!kubectl logs -l app=inference-server"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prometheus\n",
    "\n",
    "Set up a Prometheus server in the `monitoring` namespace that scrapes the inference server's metrics endpoint (port 8002)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile clusterRole.yml\n",
    "\n",
    "apiVersion: rbac.authorization.k8s.io/v1\n",
    "kind: ClusterRole\n",
    "metadata:\n",
    "  name: prometheus\n",
    "rules:\n",
    "- apiGroups: [\"\"]\n",
    "  resources:\n",
    "  - nodes\n",
    "  - nodes/proxy\n",
    "  - services\n",
    "  - endpoints\n",
    "  - pods\n",
    "  verbs: [\"get\", \"list\", \"watch\"]\n",
    "- apiGroups:\n",
    "  - extensions\n",
    "  resources:\n",
    "  - ingresses\n",
    "  verbs: [\"get\", \"list\", \"watch\"]\n",
    "- nonResourceURLs: [\"/metrics\"]\n",
    "  verbs: [\"get\"]\n",
    "---\n",
    "apiVersion: rbac.authorization.k8s.io/v1\n",
    "kind: ClusterRoleBinding\n",
    "metadata:\n",
    "  name: prometheus\n",
    "roleRef:\n",
    "  apiGroup: rbac.authorization.k8s.io\n",
    "  kind: ClusterRole\n",
    "  name: prometheus\n",
    "subjects:\n",
    "- kind: ServiceAccount\n",
    "  name: default\n",
    "  namespace: monitoring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ClusterIP:port of the metrics endpoint (ports[2] = 8002) — use this value\n",
    "# for the CLUSTER_IP placeholder in prometheus-configmap.yml below.\n",
    "!kubectl get svc inference-server -o \"jsonpath={.spec['clusterIP']}:{.spec['ports'][2]['port']}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile prometheus-configmap.yml\n",
    "\n",
    "apiVersion: v1\n",
    "kind: ConfigMap\n",
    "metadata:\n",
    "  name: prometheus-server-conf\n",
    "  labels:\n",
    "    name: prometheus-server-conf\n",
    "  namespace: monitoring\n",
    "data:\n",
    "  prometheus.yml: |-\n",
    "    # my global config\n",
    "    global:\n",
    "      scrape_interval: 10s\n",
    "      evaluation_interval: 10s\n",
    "      # scrape_timeout is set to the global default (10s).\n",
    "\n",
    "    # Alertmanager configuration\n",
    "    alerting:\n",
    "      alertmanagers:\n",
    "      - static_configs:\n",
    "        - targets:\n",
    "          # - alertmanager:9093\n",
    "\n",
    "    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.\n",
    "    rule_files:\n",
    "      # - \"first_rules.yml\"\n",
    "      # - \"second_rules.yml\"\n",
    "\n",
    "    # A scrape configuration containing exactly one endpoint to scrape:\n",
    "    scrape_configs:\n",
    "      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.\n",
    "      - job_name: 'prometheus'\n",
    "\n",
    "        # metrics_path defaults to '/metrics'\n",
    "        # scheme defaults to 'http'.\n",
    "\n",
    "        static_configs:\n",
    "        # Replace CLUSTER_IP with the inference-server ClusterIP printed above.\n",
    "        - targets: ['CLUSTER_IP:8002']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile prometheus-deployment.yml\n",
    "\n",
    "apiVersion: apps/v1\n",
    "kind: Deployment\n",
    "metadata:\n",
    "  name: prometheus-deployment\n",
    "  namespace: monitoring\n",
    "spec:\n",
    "  replicas: 1\n",
    "  selector:\n",
    "    matchLabels:\n",
    "      app: prometheus-server\n",
    "  template:\n",
    "    metadata:\n",
    "      labels:\n",
    "        app: prometheus-server\n",
    "    spec:\n",
    "      containers:\n",
    "      - name: prometheus\n",
    "        image: prom/prometheus:latest\n",
    "        args:\n",
    "        - \"--config.file=/etc/prometheus/prometheus.yml\"\n",
    "        - \"--storage.tsdb.path=/prometheus/\"\n",
    "        ports:\n",
    "        - containerPort: 9090\n",
    "        volumeMounts:\n",
    "        - name: prometheus-config-volume\n",
    "          mountPath: /etc/prometheus\n",
    "        - name: prometheus-storage-volume\n",
    "          mountPath: /prometheus\n",
    "      volumes:\n",
    "      - name: prometheus-config-volume\n",
    "        configMap:\n",
    "          defaultMode: 420\n",
    "          name: prometheus-server-conf\n",
    "      - name: prometheus-storage-volume\n",
    "        emptyDir: {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile prometheus-service.yml\n",
    "\n",
    "apiVersion: v1\n",
    "kind: Service\n",
    "metadata:\n",
    "  name: prometheus-service\n",
    "spec:\n",
    "  selector:\n",
    "    app: prometheus-server\n",
    "  type: ClusterIP\n",
    "  ports:\n",
    "  - port: 8080\n",
    "    targetPort: 9090"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kubectl create namespace monitoring\n",
    "!kubectl create -f clusterRole.yml\n",
    "!kubectl create -f prometheus-configmap.yml -n monitoring\n",
    "!kubectl create -f prometheus-deployment.yml -n monitoring\n",
    "!kubectl create -f prometheus-service.yml -n monitoring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kubectl get svc prometheus-service -o \"jsonpath={.spec['clusterIP']}:{.spec['ports'][0]['port']}\" -n monitoring"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Grafana\n",
    "\n",
    "Deploy Grafana (exposed via a LoadBalancer on port 8100) to visualize the metrics scraped by Prometheus."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile grafana-deployment.yml\n",
    "\n",
    "apiVersion: apps/v1\n",
    "kind: Deployment\n",
    "metadata:\n",
    "  name: grafana-deployment\n",
    "  namespace: monitoring\n",
    "spec:\n",
    "  replicas: 1\n",
    "  selector:\n",
    "    matchLabels:\n",
    "      app: grafana-server\n",
    "  template:\n",
    "    metadata:\n",
    "      labels:\n",
    "        app: grafana-server\n",
    "    spec:\n",
    "      containers:\n",
    "      - name: grafana\n",
    "        image: grafana/grafana:latest\n",
    "        # args:\n",
    "        # - \"--config.file=/root/prometheus.yml\"\n",
    "        # - \"--storage.tsdb.path=/prometheus/\"\n",
    "        ports:\n",
    "        - containerPort: 3000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile grafana-service.yml\n",
    "\n",
    "apiVersion: v1\n",
    "kind: Service\n",
    "metadata:\n",
    "  name: grafana-service\n",
    "spec:\n",
    "  selector:\n",
    "    app: grafana-server\n",
    "  type: LoadBalancer\n",
    "  ports:\n",
    "  - port: 8100\n",
    "    targetPort: 3000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kubectl create -f grafana-service.yml -n monitoring\n",
    "!kubectl create -f grafana-deployment.yml -n monitoring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the Grafana service just created (not prometheus-service) to find\n",
    "# its address; with type LoadBalancer the external IP may take a minute to appear.\n",
    "!kubectl get svc grafana-service -o \"jsonpath={.spec['clusterIP']}:{.spec['ports'][0]['port']}\" -n monitoring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kubectl get pods -n monitoring"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}