# modules/kuberay-monitoring/grafana/values.yaml (2,870 lines of code) (raw)

replicas: 1 image: repository: grafana/grafana tag: 8.3.4 pullPolicy: IfNotPresent serviceAccount: create: false name: ${k8s_service_account} grafana.ini: security: allow_embedding: true auth.anonymous: enabled: true org_role: Viewer auth.basic: enabled: false auth: disable_login_form: true datasources: datasources.yaml: apiVersion: 1 datasources: - name: Prometheus type: prometheus url: http://frontend:9090 access: proxy isDefault: true plugins: - grafana-piechart-panel service: name: service type: LoadBalancer port: 80 targetPort: 3000 dashboardProviders: dashboardproviders.yaml: apiVersion: 1 providers: - name: 'default' orgId: 1 folder: '' type: file disableDeletion: true editable: false options: path: /var/lib/grafana/dashboards/default dashboards: default: custom-dashboard: json: | { "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "iteration": 1667344411089, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 26, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (State), 0)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (State), 0)", "interval": "", "legendFormat": "{{State}} (retry)", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler Task State", "tooltip": { "shared": true, "sort": 0, 
"value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "tasks", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "hiddenSeries": false, "id": 35, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (Name)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": 
"sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (Name)", "interval": "", "legendFormat": "{{Name}} (retry)", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Active Tasks by Name", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "tasks", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, "hiddenSeries": false, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], 
"spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(ray_actors{SessionName=\"$SessionName\",}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler Actor State", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "actors", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Current number of (live) actors with a particular name.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, "hiddenSeries": false, "id": 36, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": 
true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=\"$SessionName\",}) by (Name)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Active Actors by Name", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "actors", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. 
Ray's logical CPU is allocated by `num_cpus` arguments.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 2 }, "hiddenSeries": false, "id": 27, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=\"$SessionName\",}) by (instance)", "interval": "", "legendFormat": "CPU Usage: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_resources{Name=\"CPU\",SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" }, { "exemplar": true, "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) > 
(sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)))", "interval": "", "legendFormat": "MAX + PENDING", "queryType": "randomWalk", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler CPUs (logical slots)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "cores", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. 
Refer to metric_defs.cc for more information.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 2 }, "hiddenSeries": false, "id": 29, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(ray_object_store_memory{SessionName=\"$SessionName\",}) by (Location)", "interval": "", "legendFormat": "{{Location}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Object Store Memory", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "bytes", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], 
"yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, "hiddenSeries": false, "id": 28, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "GPU Usage: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_resources{Name=\"GPU\",SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" }, { "exemplar": true, "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) + 
(sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)))", "interval": "", "legendFormat": "MAX + PENDING", "queryType": "randomWalk", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler GPUs (logical slots)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "GPUs", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, "hiddenSeries": false, "id": 40, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": 
"object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(ray_placement_groups{SessionName=\"$SessionName\",}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler Placement Groups", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "placement groups", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "hiddenSeries": false, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": 
"#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=\"$SessionName\",} / 100", "interval": "", "legendFormat": "CPU Usage: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_node_cpu_count{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node CPU (hardware utilization)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "cores", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. 
", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "hiddenSeries": false, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=\"$SessionName\",} / 100", "interval": "", "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_node_gpus_available{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node GPU (hardware utilization)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "GPUs", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": 
true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }, "hiddenSeries": false, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "Disk Used: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_node_disk_free{SessionName=\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node Disk", "tooltip": { "shared": true, "sort": 0, 
"value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "bytes", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Disk IO per node.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }, "hiddenSeries": false, "id": 32, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "Write: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "Read: {{instance}}", "queryType": 
"randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node Disk IO Speed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "Bps", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "hiddenSeries": false, "id": 4, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", 
"dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "Memory Used: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_node_mem_total{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node Memory (heap + object store)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "bytes", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. 
https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, "hiddenSeries": false, "id": 44, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "OOM Killed: {{Name}}, {{instance}}", "queryType": "randomWalk", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node Out of Memory Failures by Name", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "failures", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 
10, "dashes": false, "datasource": "Prometheus", "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 }, "hiddenSeries": false, "id": 34, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "(sum(ray_component_rss_mb{SessionName=\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=\"$SessionName\",}) by (Component))", "interval": "", "legendFormat": "{{Component}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_node_mem_shared_bytes{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "shared_memory", "queryType": "randomWalk", "refId": "B" }, { "exemplar": true, "expr": 
"sum(ray_node_mem_total{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node Memory by Component", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "bytes", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 }, "hiddenSeries": false, "id": 37, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": 
"object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(ray_component_cpu_percentage{SessionName=\"$SessionName\",}) by (Component) / 100", "interval": "", "legendFormat": "{{Component}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_node_cpu_count{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node CPU by Component", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "cores", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "The physical (hardware) GPU memory usage for each node. 
The dotted line means the total amount of GPU memory from the cluster.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "hiddenSeries": false, "id": 18, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=\"$SessionName\",} * 1024 * 1024", "interval": "", "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "(sum(ray_node_gram_available{SessionName=\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=\"$SessionName\",})) * 1024 * 1024", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node GPU Memory (GRAM)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "bytes", "label": "", "logBase": 1, "max": null, "min": 
"0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Network speed per node", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "hiddenSeries": false, "id": 20, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "Recv: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", "interval": "", "legendFormat": "Send: {{instance}}", "queryType": "randomWalk", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node Network", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": 
null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "Bps", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 10, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "hiddenSeries": false, "id": 24, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], 
"spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "sum(autoscaler_active_nodes{SessionName=\"$SessionName\",}) by (NodeType)", "interval": "", "legendFormat": "Active Nodes: {{NodeType}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(autoscaler_recently_failed_nodes{SessionName=\"$SessionName\",}) by (NodeType)", "interval": "", "legendFormat": "Failed Nodes: {{NodeType}}", "queryType": "randomWalk", "refId": "B" }, { "exemplar": true, "expr": "sum(autoscaler_pending_nodes{SessionName=\"$SessionName\",}) by (NodeType)", "interval": "", "legendFormat": "Pending Nodes: {{NodeType}}", "queryType": "randomWalk", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Node Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "nodes", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) 
across the cluster.", "fieldConfig": { "defaults": {}, "overrides": [] }, "fill": 0, "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "hiddenSeries": false, "id": 41, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", "options": { "alertThreshold": true }, "percentage": false, "pluginVersion": "7.5.17", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { "$$hashKey": "object:2987", "alias": "MAX", "dashes": true, "color": "#1F60C4", "fill": 0, "stack": false }, { "$$hashKey": "object:78", "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", "hiddenSeries": true }, { "$$hashKey": "object:2987", "alias": "MAX + PENDING", "dashes": true, "color": "#777777", "fill": 0, "stack": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "exemplar": true, "expr": "avg(ray_node_cpu_utilization{SessionName=\"$SessionName\",})", "interval": "", "legendFormat": "CPU (physical)", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, "expr": "sum(ray_node_gpus_utilization{SessionName=\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=\"$SessionName\",}) or vector(0))", "interval": "", "legendFormat": "GPU (physical)", "queryType": "randomWalk", "refId": "B" }, { "exemplar": true, "expr": "sum(ray_node_mem_used{SessionName=\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=\"$SessionName\",})) * 100", "interval": "", "legendFormat": "Memory (RAM)", "queryType": "randomWalk", "refId": "C" }, { "exemplar": true, "expr": "sum(ray_node_gram_used{SessionName=\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=\"$SessionName\",})) * 100", 
"interval": "", "legendFormat": "GRAM", "queryType": "randomWalk", "refId": "D" }, { "exemplar": true, "expr": "sum(ray_object_store_memory{SessionName=\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=\"$SessionName\",}) * 100", "interval": "", "legendFormat": "Object Store Memory", "queryType": "randomWalk", "refId": "E" }, { "exemplar": true, "expr": "sum(ray_node_disk_usage{SessionName=\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=\"$SessionName\",})) * 100", "interval": "", "legendFormat": "Disk", "queryType": "randomWalk", "refId": "F" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Cluster Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:628", "format": "%", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "$$hashKey": "object:629", "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "refresh": false, "schemaVersion": 27, "style": "dark", "tags": [ "rayVersion:2.5.0" ], "templating": { "list": [ { "allValue": null, "current": { "selected": false }, "datasource": "Prometheus", "definition": "label_values(ray_node_network_receive_speed{}, SessionName)", "description": "Filter queries to specific ray sessions.", "error": null, "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "SessionName", "options": [], "query": { "query": "label_values(ray_node_network_receive_speed{}, SessionName)", "refId": "StandardVariableQuery" }, "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 2, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": ".+", "current": { 
"selected": true, "text": [ "All" ], "value": [ "$__all" ] }, "datasource": "Prometheus", "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)", "description": null, "error": null, "hide": 0, "includeAll": true, "label": null, "multi": true, "name": "Instance", "options": [], "query": { "query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)", "refId": "Prometheus-Instance-Variable-Query" }, "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Default Dashboard", "uid": "rayDefaultDashboard", "version": 4, "rayMeta": [ "supportsGlobalFilterOverride" ] }