# modules/kuberay-monitoring/grafana/values.yaml
# Replica count handed to the chart: a single Grafana instance.
replicas: 1
# Container image used for Grafana.
image:
  repository: grafana/grafana
  # Quoted so the tag is always a string; a bare two-part tag such as 9.0
  # would be read as a float and lose its trailing zero.
  tag: "8.3.4"
  # Only pull when the image is not already present on the node.
  pullPolicy: IfNotPresent
# Use an existing Kubernetes service account rather than having the chart
# create one (create: false).
serviceAccount:
  create: false
  # `${k8s_service_account}` is a placeholder; presumably substituted by an
  # external templating step before Helm reads this file (TODO confirm).
  # Quoted so the substituted name is always parsed as a string.
  name: "${k8s_service_account}"
# Grafana server configuration; each child key is one grafana.ini section.
# NOTE(review): anonymous Viewer access combined with a disabled login form
# and disabled basic auth means dashboards can be viewed without credentials.
# The `service` entry in this file is of type LoadBalancer, so confirm that
# this level of exposure is intended.
grafana.ini:
  security:
    # Allow Grafana pages to be displayed inside frames on other pages.
    allow_embedding: true
  auth.anonymous:
    # Unauthenticated visitors are admitted with the read-only Viewer role.
    enabled: true
    org_role: Viewer
  auth.basic:
    enabled: false
  auth:
    disable_login_form: true
# Provisioned data sources; the inner key is the provisioning file name.
datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      # Keep the name `Prometheus`: the dashboard panels defined in this file
      # select their data source by that name.
      - name: Prometheus
        type: prometheus
        # NOTE(review): `frontend` looks like an in-cluster service name;
        # confirm it matches the Prometheus endpoint that is deployed.
        url: http://frontend:9090
        access: proxy
        isDefault: true
# Additional Grafana plugins to install.
plugins:
  - grafana-piechart-panel
# Kubernetes Service in front of Grafana: a LoadBalancer whose port 80
# forwards to port 3000 on the target.
service:
  # NOTE(review): confirm the chart actually reads `service.name`; the
  # upstream Grafana chart names the port via `portName` instead.
  name: service
  type: LoadBalancer
  port: 80
  targetPort: 3000
# Dashboard provisioning; the inner key is the provisioning file name.
dashboardProviders:
  dashboardproviders.yaml:
    apiVersion: 1
    providers:
      # The provider name and the last path segment are both `default`,
      # matching the `default` group under the `dashboards` key in this file.
      - name: 'default'
        orgId: 1
        # Empty folder name: no dashboard folder is specified.
        folder: ''
        type: file
        # Provisioned dashboards are read-only and protected from deletion.
        disableDeletion: true
        editable: false
        options:
          path: /var/lib/grafana/dashboards/default
dashboards:
default:
custom-dashboard:
json: |
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"iteration": 1667344411089,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 26,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (State), 0)",
"interval": "",
"legendFormat": "{{State}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (State), 0)",
"interval": "",
"legendFormat": "{{State}} (retry)",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Scheduler Task State",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "tasks",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"hiddenSeries": false,
"id": 35,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (Name)",
"interval": "",
"legendFormat": "{{Name}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (Name)",
"interval": "",
"legendFormat": "{{Name}} (retry)",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Active Tasks by Name",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "tasks",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 1
},
"hiddenSeries": false,
"id": 33,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(ray_actors{SessionName=\"$SessionName\",}) by (State)",
"interval": "",
"legendFormat": "{{State}}",
"queryType": "randomWalk",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Scheduler Actor State",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "actors",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Current number of (live) actors with a particular name.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 1
},
"hiddenSeries": false,
"id": 36,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(ray_actors{State!=\"DEAD\",SessionName=\"$SessionName\",}) by (Name)",
"interval": "",
"legendFormat": "{{Name}}",
"queryType": "randomWalk",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Active Actors by Name",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "actors",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 2
},
"hiddenSeries": false,
"id": 27,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=\"$SessionName\",}) by (instance)",
"interval": "",
"legendFormat": "CPU Usage: {{instance}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_resources{Name=\"CPU\",SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
},
{
"exemplar": true,
"expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)))",
"interval": "",
"legendFormat": "MAX + PENDING",
"queryType": "randomWalk",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Scheduler CPUs (logical slots)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "cores",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 2
},
"hiddenSeries": false,
"id": 29,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(ray_object_store_memory{SessionName=\"$SessionName\",}) by (Location)",
"interval": "",
"legendFormat": "{{Location}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Object Store Memory",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "bytes",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 3
},
"hiddenSeries": false,
"id": 28,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "GPU Usage: {{instance}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_resources{Name=\"GPU\",SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
},
{
"exemplar": true,
"expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)))",
"interval": "",
"legendFormat": "MAX + PENDING",
"queryType": "randomWalk",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Scheduler GPUs (logical slots)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "GPUs",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 3
},
"hiddenSeries": false,
"id": 40,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(ray_placement_groups{SessionName=\"$SessionName\",}) by (State)",
"interval": "",
"legendFormat": "{{State}}",
"queryType": "randomWalk",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Scheduler Placement Groups",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "placement groups",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"hiddenSeries": false,
"id": 2,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=\"$SessionName\",} / 100",
"interval": "",
"legendFormat": "CPU Usage: {{instance}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_node_cpu_count{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node CPU (hardware utilization)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "cores",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"hiddenSeries": false,
"id": 8,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=\"$SessionName\",} / 100",
"interval": "",
"legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_node_gpus_available{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node GPU (hardware utilization)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "GPUs",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 5
},
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "Disk Used: {{instance}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_node_disk_free{SessionName=\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Disk",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "bytes",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Disk IO per node.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 5
},
"hiddenSeries": false,
"id": 32,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "Write: {{instance}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "Read: {{instance}}",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Disk IO Speed",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "Bps",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 6
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "Memory Used: {{instance}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_node_mem_total{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Memory (heap + object store)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "bytes",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 6
},
"hiddenSeries": false,
"id": 44,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "OOM Killed: {{Name}}, {{instance}}",
"queryType": "randomWalk",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Out of Memory Failures by Name",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "failures",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 7
},
"hiddenSeries": false,
"id": 34,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "(sum(ray_component_rss_mb{SessionName=\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=\"$SessionName\",}) by (Component))",
"interval": "",
"legendFormat": "{{Component}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_node_mem_shared_bytes{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "shared_memory",
"queryType": "randomWalk",
"refId": "B"
},
{
"exemplar": true,
"expr": "sum(ray_node_mem_total{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Memory by Component",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "bytes",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 7
},
"hiddenSeries": false,
"id": 37,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(ray_component_cpu_percentage{SessionName=\"$SessionName\",}) by (Component) / 100",
"interval": "",
"legendFormat": "{{Component}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_node_cpu_count{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node CPU by Component",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "cores",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"hiddenSeries": false,
"id": 18,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=\"$SessionName\",} * 1024 * 1024",
"interval": "",
"legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "(sum(ray_node_gram_available{SessionName=\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=\"$SessionName\",})) * 1024 * 1024",
"interval": "",
"legendFormat": "MAX",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node GPU Memory (GRAM)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "bytes",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Network speed per node",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"hiddenSeries": false,
"id": 20,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "Recv: {{instance}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}",
"interval": "",
"legendFormat": "Send: {{instance}}",
"queryType": "randomWalk",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Network",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "Bps",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 10,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 9
},
"hiddenSeries": false,
"id": 24,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(autoscaler_active_nodes{SessionName=\"$SessionName\",}) by (NodeType)",
"interval": "",
"legendFormat": "Active Nodes: {{NodeType}}",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(autoscaler_recently_failed_nodes{SessionName=\"$SessionName\",}) by (NodeType)",
"interval": "",
"legendFormat": "Failed Nodes: {{NodeType}}",
"queryType": "randomWalk",
"refId": "B"
},
{
"exemplar": true,
"expr": "sum(autoscaler_pending_nodes{SessionName=\"$SessionName\",}) by (NodeType)",
"interval": "",
"legendFormat": "Pending Nodes: {{NodeType}}",
"queryType": "randomWalk",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node Count",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "nodes",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 9
},
"hiddenSeries": false,
"id": 41,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.17",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:2987",
"alias": "MAX",
"dashes": true,
"color": "#1F60C4",
"fill": 0,
"stack": false
},
{
"$$hashKey": "object:78",
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
"hiddenSeries": true
},
{
"$$hashKey": "object:2987",
"alias": "MAX + PENDING",
"dashes": true,
"color": "#777777",
"fill": 0,
"stack": false
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "avg(ray_node_cpu_utilization{SessionName=\"$SessionName\",})",
"interval": "",
"legendFormat": "CPU (physical)",
"queryType": "randomWalk",
"refId": "A"
},
{
"exemplar": true,
"expr": "sum(ray_node_gpus_utilization{SessionName=\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=\"$SessionName\",}) or vector(0))",
"interval": "",
"legendFormat": "GPU (physical)",
"queryType": "randomWalk",
"refId": "B"
},
{
"exemplar": true,
"expr": "sum(ray_node_mem_used{SessionName=\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=\"$SessionName\",})) * 100",
"interval": "",
"legendFormat": "Memory (RAM)",
"queryType": "randomWalk",
"refId": "C"
},
{
"exemplar": true,
"expr": "sum(ray_node_gram_used{SessionName=\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=\"$SessionName\",})) * 100",
"interval": "",
"legendFormat": "GRAM",
"queryType": "randomWalk",
"refId": "D"
},
{
"exemplar": true,
"expr": "sum(ray_object_store_memory{SessionName=\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=\"$SessionName\",}) * 100",
"interval": "",
"legendFormat": "Object Store Memory",
"queryType": "randomWalk",
"refId": "E"
},
{
"exemplar": true,
"expr": "sum(ray_node_disk_usage{SessionName=\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=\"$SessionName\",})) * 100",
"interval": "",
"legendFormat": "Disk",
"queryType": "randomWalk",
"refId": "F"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Cluster Utilization",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:628",
"format": "%",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:629",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": false,
"schemaVersion": 27,
"style": "dark",
"tags": [
"rayVersion:2.5.0"
],
"templating": {
"list": [
{
"allValue": null,
"current": {
"selected": false
},
"datasource": "Prometheus",
"definition": "label_values(ray_node_network_receive_speed{}, SessionName)",
"description": "Filter queries to specific ray sessions.",
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "SessionName",
"options": [],
"query": {
"query": "label_values(ray_node_network_receive_speed{}, SessionName)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": "Prometheus",
"definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "Instance",
"options": [],
"query": {
"query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)",
"refId": "Prometheus-Instance-Variable-Query"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Default Dashboard",
"uid": "rayDefaultDashboard",
"version": 4,
"rayMeta": [
"supportsGlobalFilterOverride"
]
}