best-practices/gke-batch-refarch/platform/monitoring/deploy-dashboard.yaml (1,404 lines of code) (raw):

# One-shot Job that imports the Kueue dashboard into Grafana.
# It mounts the `kueue-dashboard.json` key of the `kueue-dashboard-config`
# ConfigMap (defined below) at /kueue-dashboard.json and POSTs it to the
# Grafana dashboard import endpoint.
# The REGION and PROJECT_ID placeholders in the image reference are presumably
# substituted before this manifest is applied - TODO confirm against the
# deployment tooling.
# NOTE(review): the Grafana URL embeds the admin:admin credentials in plain
# text; consider sourcing them from a Secret.
apiVersion: batch/v1
kind: Job
metadata:
  name: deploy-grafana-kueue-dashboard
spec:
  # Clean up the finished Job (and its Pod) 30 seconds after completion.
  ttlSecondsAfterFinished: 30
  template:
    spec:
      volumes:
        - name: kueue-dashboard
          configMap:
            name: kueue-dashboard-config
      containers:
        - name: deploy-dashboard
          image: ${REGION}-docker.pkg.dev/${PROJECT_ID}/tutorial-installer/installer
          command: ["sh"]
          args:
            - -c
            # --fail makes curl exit non-zero on HTTP 4xx/5xx so that a rejected
            # import fails the Job instead of being reported as a success.
            # The payload is referenced by its absolute mount path so the
            # command does not depend on the image's working directory.
            - |
              curl --fail --silent --show-error -X POST \
                -H 'Content-Type: application/json' \
                -H 'Accept: application/json' \
                'http://admin:admin@grafana.monitoring.svc.cluster.local:3000/api/dashboards/import' \
                -d @/kueue-dashboard.json
          volumeMounts:
            - mountPath: /kueue-dashboard.json
              name: kueue-dashboard
              subPath: kueue-dashboard.json
      restartPolicy: Never
---
# Grafana import payload for the "Kueue Dashboard".
# The JSON below is sent verbatim as the request body by the Job above:
# `dashboard` holds the dashboard model, `overwrite: true` replaces an existing
# dashboard with the same uid.
# Every Prometheus panel references datasource uid P1809F7CD0C75ACF3; this
# presumably has to match the datasource provisioned in Grafana - TODO confirm.
# The "CPU requested", "Memory requested" and "Ready nodes" gauges plot 0-1
# ratios with unit `percentunit`, so their max and thresholds are expressed on
# the 0-1 scale.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kueue-dashboard-config
data:
  kueue-dashboard.json: |
    {
      "dashboard": {
        "annotations": {
          "list": [
            {
              "builtIn": 1,
              "datasource": { "type": "grafana", "uid": "-- Grafana --" },
              "enable": true,
              "hide": true,
              "iconColor": "rgba(0, 211, 255, 1)",
              "name": "Annotations & Alerts",
              "target": { "limit": 100, "matchAny": false, "tags": [], "type": "dashboard" },
              "type": "dashboard"
            }
          ]
        },
        "editable": true,
        "fiscalYearStartMonth": 0,
        "graphTooltip": 0,
        "links": [],
        "liveNow": true,
        "panels": [
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null },
                    { "color": "red", "value": 80 }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(rate(kueue_admission_attempts_total[60s]))",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Admissions /s",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                },
                "unit": "s"
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 5, "x": 4, "y": 0 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(histogram_quantile(0.99, rate(kueue_admission_attempt_duration_seconds_bucket[60s]))>0)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Admission latency [99th perc]",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of all active workloads in all cluster queues",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "builder",
                "expr": "sum(kueue_admitted_active_workloads)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Active workloads",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of all pending workloads in all cluster queues",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null },
                    { "color": "#EAB839", "value": 10 },
                    { "color": "dark-red", "value": 20 }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "builder",
                "expr": "sum(kueue_pending_workloads)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Pending workloads",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of nodes in a cluster",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null },
                    { "color": "red", "value": 80 }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_info)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Total nodes",
            "type": "stat"
          },
          {
            "collapsed": false,
            "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 },
            "panels": [],
            "title": "Cluster queues",
            "type": "row"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "palette-classic" },
                "custom": {
                  "axisCenteredZero": false,
                  "axisColorMode": "text",
                  "axisLabel": "",
                  "axisPlacement": "auto",
                  "barAlignment": 0,
                  "drawStyle": "line",
                  "fillOpacity": 50,
                  "gradientMode": "none",
                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                  "lineInterpolation": "linear",
                  "lineWidth": 1,
                  "pointSize": 5,
                  "scaleDistribution": { "type": "linear" },
                  "showPoints": "auto",
                  "spanNulls": false,
                  "stacking": { "group": "A", "mode": "none" },
                  "thresholdsStyle": { "mode": "off" }
                },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 8, "w": 9, "x": 0, "y": 5 },
            "options": {
              "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
              "tooltip": { "mode": "single", "sort": "none" }
            },
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum by(cluster_queue)(kueue_admitted_active_workloads)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Active workloads per cluster queue",
            "type": "timeseries"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "continuous-YlRd" },
                "custom": {
                  "axisCenteredZero": false,
                  "axisColorMode": "text",
                  "axisLabel": "",
                  "axisPlacement": "auto",
                  "barAlignment": 0,
                  "drawStyle": "line",
                  "fillOpacity": 50,
                  "gradientMode": "none",
                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                  "lineInterpolation": "linear",
                  "lineWidth": 1,
                  "pointSize": 5,
                  "scaleDistribution": { "type": "linear" },
                  "showPoints": "auto",
                  "spanNulls": false,
                  "stacking": { "group": "A", "mode": "none" },
                  "thresholdsStyle": { "mode": "off" }
                },
                "mappings": [],
                "min": 0,
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 8, "w": 9, "x": 9, "y": 5 },
            "options": {
              "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
              "tooltip": { "mode": "single", "sort": "none" }
            },
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum by(cluster_queue)(kueue_pending_workloads)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Pending workloads per cluster queue",
            "type": "timeseries"
          },
          {
            "collapsed": false,
            "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
            "panels": [],
            "title": "Cluster nodes",
            "type": "row"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "decimals": 0,
                "mappings": [],
                "max": 1,
                "min": 0,
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null },
                    { "color": "dark-orange", "value": 0.8 },
                    { "color": "dark-red", "value": 0.9 }
                  ]
                },
                "unit": "percentunit"
              },
              "overrides": []
            },
            "gridPos": { "h": 5, "w": 6, "x": 0, "y": 14 },
            "options": {
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "showThresholdLabels": false,
              "showThresholdMarkers": true
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "builder",
                "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "CPU requested",
            "type": "gauge"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "max": 1,
                "min": 0,
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null },
                    { "color": "dark-orange", "value": 0.8 },
                    { "color": "dark-red", "value": 0.9 }
                  ]
                },
                "unit": "percentunit"
              },
              "overrides": []
            },
            "gridPos": { "h": 5, "w": 6, "x": 6, "y": 14 },
            "options": {
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "showThresholdLabels": false,
              "showThresholdMarkers": true
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Memory requested",
            "type": "gauge"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "max": 1,
                "min": 0,
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "dark-red", "value": null },
                    { "color": "dark-green", "value": 0.8 }
                  ]
                },
                "unit": "percentunit"
              },
              "overrides": []
            },
            "gridPos": { "h": 5, "w": 6, "x": 12, "y": 14 },
            "options": {
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "showThresholdLabels": false,
              "showThresholdMarkers": true
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Ready nodes",
            "type": "gauge"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of nodes with Ready == true condition",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 0, "y": 19 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Nodes ready",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of nodes with Ready == false condition",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 3, "y": 19 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"false\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Nodes not ready",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of nodes that can't schedule PODs",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 6, "y": 19 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_spec_unschedulable)",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Nodes unschedulable",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of nodes with DiskPressure condition",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 9, "y": 19 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Nodes disk pressure",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of nodes with MemPressure condition",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 12, "y": 19 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Nodes mem pressure",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "description": "The total number of nodes with PIDPressure condition",
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "thresholds" },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 4, "w": 3, "x": 15, "y": 19 },
            "options": {
              "colorMode": "value",
              "graphMode": "none",
              "justifyMode": "auto",
              "orientation": "auto",
              "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
              "textMode": "auto"
            },
            "pluginVersion": "9.4.3",
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "A"
              }
            ],
            "title": "Nodes PID pressure",
            "type": "stat"
          },
          {
            "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "palette-classic" },
                "custom": {
                  "axisCenteredZero": false,
                  "axisColorMode": "text",
                  "axisLabel": "",
                  "axisPlacement": "auto",
                  "barAlignment": 0,
                  "drawStyle": "line",
                  "fillOpacity": 22,
                  "gradientMode": "none",
                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                  "lineInterpolation": "linear",
                  "lineWidth": 1,
                  "pointSize": 5,
                  "scaleDistribution": { "type": "linear" },
                  "showPoints": "auto",
                  "spanNulls": false,
                  "stacking": { "group": "A", "mode": "none" },
                  "thresholdsStyle": { "mode": "off" }
                },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 7, "w": 18, "x": 0, "y": 23 },
            "options": {
              "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
              "tooltip": { "mode": "single", "sort": "none" }
            },
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
                "legendFormat": "__auto",
                "range": true,
                "refId": "Ready nodes"
              },
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "editorMode": "code",
                "exemplar": false,
                "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"false\"})",
                "hide": false,
                "instant": false,
                "legendFormat": "__auto",
                "range": true,
                "refId": "Not ready nodes"
              }
            ],
            "title": "Nodes",
            "type": "timeseries"
          },
          {
            "datasource": { "default": true, "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
            "fieldConfig": {
              "defaults": {
                "color": { "mode": "palette-classic" },
                "custom": {
                  "axisBorderShow": false,
                  "axisCenteredZero": false,
                  "axisColorMode": "text",
                  "axisLabel": "",
                  "axisPlacement": "auto",
                  "barAlignment": 0,
                  "barWidthFactor": 0.6,
                  "drawStyle": "line",
                  "fillOpacity": 0,
                  "gradientMode": "none",
                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                  "insertNulls": false,
                  "lineInterpolation": "linear",
                  "lineWidth": 1,
                  "pointSize": 5,
                  "scaleDistribution": { "type": "linear" },
                  "showPoints": "auto",
                  "spanNulls": false,
                  "stacking": { "group": "A", "mode": "none" },
                  "thresholdsStyle": { "mode": "off" }
                },
                "mappings": [],
                "thresholds": {
                  "mode": "absolute",
                  "steps": [
                    { "color": "green", "value": null },
                    { "color": "red", "value": 80 }
                  ]
                }
              },
              "overrides": []
            },
            "gridPos": { "h": 8, "w": 18, "x": 0, "y": 30 },
            "id": 20,
            "options": {
              "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
              "tooltip": { "mode": "single", "sort": "none" }
            },
            "targets": [
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "disableTextWrap": false,
                "editorMode": "builder",
                "expr": "sum by(key) (kube_node_spec_taint{key=\"reserved\"})",
                "fullMetaSearch": false,
                "includeNullMetadata": true,
                "instant": false,
                "legendFormat": "__auto",
                "range": true,
                "refId": "reserved nodes",
                "useBackend": false
              },
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "disableTextWrap": false,
                "editorMode": "builder",
                "expr": "sum by(key) (kube_node_spec_taint{key=\"spot\"})",
                "fullMetaSearch": false,
                "hide": false,
                "includeNullMetadata": true,
                "instant": false,
                "legendFormat": "__auto",
                "range": true,
                "refId": "spot nodes",
                "useBackend": false
              },
              {
                "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" },
                "disableTextWrap": false,
                "editorMode": "code",
                "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
                "fullMetaSearch": false,
                "hide": false,
                "includeNullMetadata": true,
                "instant": false,
                "legendFormat": "sum of all nodes",
                "range": true,
                "refId": "Total number of nodes",
                "useBackend": false
              },
              {
                "datasource": { "name": "Expression", "type": "__expr__", "uid": "__expr__" },
                "expression": "${Total number of nodes} - ${reserved nodes} - ${spot nodes}",
                "hide": false,
                "refId": "On-demand nodes",
                "type": "math"
              }
            ],
            "title": "Node per type",
            "type": "timeseries"
          }
        ],
        "refresh": "5s",
        "revision": 1,
        "schemaVersion": 38,
        "style": "dark",
        "tags": [],
        "templating": { "list": [] },
        "time": { "from": "now-5m", "to": "now" },
        "timepicker": { "nowDelay": "" },
        "timezone": "browser",
        "title": "Kueue Dashboard",
        "uid": "DoNO0Jx4z",
        "version": 12,
        "weekStart": ""
      },
      "overwrite": true,
      "inputs": [
        { "name": "DS_PROMETHEUS", "type": "datasource", "pluginId": "prometheus" }
      ],
      "folderUid": ""
    }