From b63a90b9921b85e78b021b0b4daa3c9794c6bf51 Mon Sep 17 00:00:00 2001 From: Alan Guo Date: Fri, 3 Oct 2025 16:43:06 -0700 Subject: [PATCH] Update grafana dashboards to ray 2.49.2 + add README instructions on how to do the update Signed-off-by: Alan Guo --- config/grafana/README.md | 25 + config/grafana/data_grafana_dashboard.json | 9909 ++++++++++------- config/grafana/default_grafana_dashboard.json | 1632 ++- .../serve_deployment_grafana_dashboard.json | 135 +- config/grafana/serve_grafana_dashboard.json | 201 +- .../grafana/serve_llm_grafana_dashboard.json | 2826 +++++ config/grafana/train_grafana_dashboard.json | 1859 +++- 7 files changed, 11943 insertions(+), 4644 deletions(-) create mode 100644 config/grafana/README.md create mode 100644 config/grafana/serve_llm_grafana_dashboard.json diff --git a/config/grafana/README.md b/config/grafana/README.md new file mode 100644 index 00000000000..cf70daf97e5 --- /dev/null +++ b/config/grafana/README.md @@ -0,0 +1,25 @@ +# Grafana dashboards + +The Grafana dashboards in this directory are copied over from the Ray repo. + +## Updating the dashboards + +To update the dashboards with the latest versions from Ray, follow these steps: + +Install Ray: + +```bash +pip install "ray[default]" +``` + +Start a local Ray head node: + +```bash +ray start --head +``` + +Copy the dashboards to this directory (run from `config/grafana`): + +```bash +cp -r /tmp/ray/session_latest/metrics/grafana/dashboards/* . 
+``` diff --git a/config/grafana/data_grafana_dashboard.json b/config/grafana/data_grafana_dashboard.json index a26bd6ac9d4..f28c0dbf98e 100644 --- a/config/grafana/data_grafana_dashboard.json +++ b/config/grafana/data_grafana_dashboard.json @@ -18,27 +18,40 @@ "iteration": 1667344411089, "links": [], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 99, + "title": "Overview", + "type": "row", + "panels": [] + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", + "description": "Byte size of output blocks generated by tasks per second.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 0 + "y": 1 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 1, + "id": 23, "legend": { "alignAsTable": true, "avg": false, @@ -56,7 +69,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -94,9 +107,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "expr": "sum(rate(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", "interval": "", - "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", + "legendFormat": "Bytes Generated / Second: {{dataset}}, {{operator}}", "queryType": "randomWalk", "refId": "A" } @@ -105,7 +118,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Bytes Spilled", + "title": "Bytes Generated by Tasks / Second", "tooltip": { "shared": true, 
"sort": 0, @@ -122,7 +135,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "bytes", + "format": "Bps", "label": "", "logBase": 1, "max": null, @@ -150,21 +163,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Amount allocated by dataset operators.", + "description": "Number of output blocks generated by tasks per second.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 0 + "y": 1 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 2, + "id": 22, "legend": { "alignAsTable": true, "avg": false, @@ -182,7 +195,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -220,9 +233,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "expr": "sum(rate(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", "interval": "", - "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", + "legendFormat": "Blocks Generated / Second: {{dataset}}, {{operator}}", "queryType": "randomWalk", "refId": "A" } @@ -231,7 +244,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Bytes Allocated", + "title": "Blocks Generated by Tasks / Second", "tooltip": { "shared": true, "sort": 0, @@ -248,7 +261,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "bytes", + "format": "blocks/sec", "label": "", "logBase": 1, "max": null, @@ -276,21 +289,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Amount freed by dataset operators.", + "description": "Number of rows in generated output blocks from finished tasks per second.", "fieldConfig": { "defaults": {}, 
"overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 1 + "y": 9 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 3, + "id": 24, "legend": { "alignAsTable": true, "avg": false, @@ -308,7 +321,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -346,9 +359,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "expr": "sum(rate(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", "interval": "", - "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}", + "legendFormat": "Rows Generated / Second: {{dataset}}, {{operator}}", "queryType": "randomWalk", "refId": "A" } @@ -357,7 +370,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Bytes Freed", + "title": "Rows Generated by Tasks / Second", "tooltip": { "shared": true, "sort": 0, @@ -374,7 +387,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "bytes", + "format": "rows/sec", "label": "", "logBase": 1, "max": null, @@ -407,14 +420,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 1 + "y": 9 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 4, "legend": { @@ -434,7 +447,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -472,7 +485,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "expr": 
"sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", "interval": "", "legendFormat": "Current Usage: {{dataset}}, {{operator}}", "queryType": "randomWalk", @@ -528,21 +541,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Logical CPUs allocated to dataset operators.", + "description": "Number of running tasks.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 2 + "y": 17 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 5, + "id": 30, "legend": { "alignAsTable": true, "avg": false, @@ -560,7 +573,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -598,9 +611,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", "interval": "", - "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", + "legendFormat": "Running Tasks: {{dataset}}, {{operator}}", "queryType": "randomWalk", "refId": "A" } @@ -609,7 +622,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CPUs (logical slots)", + "title": "Running Tasks", "tooltip": { "shared": true, "sort": 0, @@ -626,7 +639,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "cores", + "format": "tasks", "label": "", "logBase": 1, "max": null, @@ -654,21 +667,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Logical GPUs allocated to dataset operators.", + "description": "Total number of blocks in operator's internal + external input 
queue.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 2 + "y": 17 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 6, + "id": 56, "legend": { "alignAsTable": true, "avg": false, @@ -686,7 +699,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -724,9 +737,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"} + ray_data_num_external_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", "interval": "", - "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", + "legendFormat": "Combined Blocks: {{dataset}}, {{operator}}", "queryType": "randomWalk", "refId": "A" } @@ -735,7 +748,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "GPUs (logical slots)", + "title": "Operator Combined Internal + External Inqueue Size (Blocks)", "tooltip": { "shared": true, "sort": 0, @@ -752,7 +765,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "cores", + "format": "blocks", "label": "", "logBase": 1, "max": null, @@ -775,4043 +788,5941 @@ } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Bytes output per second by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, + "collapsed": true, "gridPos": { - "h": 8, - "w": 12, + "h": 1, + "w": 24, "x": 0, - "y": 3 - }, - "hiddenSeries": false, - "id": 7, - "legend": { - "alignAsTable": 
true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "y": 25 + }, + "id": 100, + "title": "Pending Inputs", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of blocks in operator's internal input queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, "fill": 0, - "stack": false + "fillGradient": 0, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Inqueue Size (Blocks)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks in the operator's internal input queue.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Inqueue Size (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of blocks in operator's external input queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, "fill": 0, - "stack": 
false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Output / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Output / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ + "fillGradient": 0, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_external_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + 
"interval": "", + "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator External InQueue Size (Blocks)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of blocks in operator's external input queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + 
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_external_inqueue_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Number of Bytes: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator External InQueue Size (bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks used by pending tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": 
false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Size of Blocks used in Pending Tasks (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - 
"datasource": "${datasource}", - "description": "Total rows output per second by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, + "collapsed": true, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 3 - }, - "hiddenSeries": false, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 101, + "title": "Inputs", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of input blocks received by operator per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Received / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Blocks Received by Operator / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks received by operator per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, "fill": 0, - "stack": false + "fillGradient": 0, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": 
false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Received / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Bytes Received by Operator / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "aliasColors": {}, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of input blocks that operator's tasks have finished processing per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 59 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Processed / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Blocks Processed by Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + 
"name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks that operator's tasks have finished processing per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 59 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Rows Output / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows Output / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "rows/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true + "fillGradient": 0, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": 
"current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Processed / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Bytes Processed by Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks passed to submitted tasks per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 67 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 21, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Submitted / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Bytes Submitted to Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": 
true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of input blocks received by operator per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, + "collapsed": true, "gridPos": { - "h": 8, - "w": 12, + "h": 1, + "w": 24, "x": 0, - "y": 4 - }, - "hiddenSeries": false, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "y": 75 + }, + "id": 102, + "title": "Pending Outputs", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of blocks in operator's internal output queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 76 + }, "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true 
+ "fillGradient": 0, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Outqueue Size (Blocks)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks in the operator's internal output queue.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 76 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Received / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Blocks Received by Operator / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true + "fillGradient": 0, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Outqueue Size (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Maximum bytes to read from streaming generator buffer.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 84 + }, + "fill": 0, + "fillGradient": 0, + 
"hiddenSeries": false, + "id": 55, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_max_bytes_to_read{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Max Bytes to Read: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Max Bytes to Read", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null 
+ } } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks received by operator per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, + "collapsed": true, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "hiddenSeries": false, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ + "h": 1, + "w": 24, + "x": 0, + "y": 92 + }, + "id": 103, + "title": "Outputs", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of output blocks taken by downstream operators per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 93 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { 
+ "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Taken / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Output Blocks Taken by Downstream Operators / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks taken by downstream operators per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 93 + }, "fill": 0, - "stack": false + "fillGradient": 0, + "hiddenSeries": 
false, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Taken / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Output Bytes Taken by Downstream Operators / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + 
"align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks from finished tasks per second, grouped by node.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 101 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 43, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_outputs_of_finished_tasks_per_node{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, node_ip)", + "interval": "", + "legendFormat": "Bytes output / Second: {{dataset}}, {{node_ip}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": 
"Output Bytes from Finished Tasks / Second (by Node)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of output blocks from finished tasks per second, grouped by node.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 101 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "fillGradient": 0, + "hiddenSeries": false, + "id": 48, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": 
"MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_blocks_outputs_of_finished_tasks_per_node{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, node_ip)", + "interval": "", + "legendFormat": "Blocks output / Second: {{dataset}}, {{node_ip}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Blocks from Finished Tasks / Second (by Node)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Received / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Bytes Received by Operator / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": 
false, + "datasource": "${datasource}", + "description": "Bytes output per second by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 109 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Output / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Output / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": 
"Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total rows output per second by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 109 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, operator)", + "interval": "", + 
"legendFormat": "Rows Output / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows Output / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "rows/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Average byte size of output blocks generated by tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 117 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 49, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m]) / increase(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m])", + "interval": "", + "legendFormat": "Average Bytes Generated / Output Block: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average Bytes Generated / Output Block", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Average number of output blocks generated by tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 117 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 50, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + 
"max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m]) / increase(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m])", + "interval": "", + "legendFormat": "Average Number of Output Blocks / Task: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average Number of Output Blocks / Task", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { 
+ "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent generating blocks in tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 125 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Block Generation Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + 
"name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of input blocks that operator's tasks have finished processing per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, + "collapsed": true, "gridPos": { - "h": 8, - "w": 12, + "h": 1, + "w": 24, "x": 0, - "y": 5 - }, - "hiddenSeries": false, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "y": 133 + }, + "id": 104, + "title": "Tasks", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent running tasks to completion w/ backpressure.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 134 + }, "fill": 0, - "stack": false + "fillGradient": 0, + "hiddenSeries": false, + "id": 38, + "legend": 
{ + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(ray_data_task_completion_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m]) / increase(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m])", + "interval": "", + "legendFormat": "Task Completion Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Completion Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + 
"max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent running tasks to completion w/o backpressure.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 134 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(ray_data_task_completion_time_without_backpressure{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m]) / increase(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m])", + "interval": "", + "legendFormat": "Task Completion Time 
w/o Backpressure: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Completion Time Without Backpressure", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent in output backpressure.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 142 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Processed / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Blocks Processed by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ + "fillGradient": 
0, + "hiddenSeries": false, + "id": 39, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(ray_data_task_output_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m]) / increase(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[5m])", + "interval": "", + "legendFormat": "Task Output Backpressure Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Output Backpressure Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent in task submission backpressure.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 142 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}", + "queryType": 
"randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Submission Backpressure Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks that operator's tasks have finished processing per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 5 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of finished tasks per 
second, grouped by node.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 150 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 46, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_num_tasks_finished_per_node{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}[1m])) by (dataset, node_ip)", + "interval": "", + "legendFormat": "Finished Tasks: {{dataset}}, {{node_ip}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Throughput (by Node)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks/s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + 
"show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of tasks that already have output.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 150 + }, "fill": 0, - "stack": false + "fillGradient": 0, + "hiddenSeries": false, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": 
"A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Tasks with output blocks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of submitted tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 158 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 29, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", 
+ "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Submitted Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of finished tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 158 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Processed / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - 
"timeRegions": [], - "timeShift": null, - "title": "Input Bytes Processed by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true + "fillGradient": 0, + "hiddenSeries": false, + "id": 32, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Finished Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of failed tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 166 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Failed Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks passed to submitted tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, + "collapsed": true, "gridPos": { - "h": 8, - "w": 12, + "h": 1, + "w": 24, "x": 0, - "y": 6 - }, - "hiddenSeries": false, - "id": 21, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - 
"alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "y": 174 + }, + "id": 105, + "title": "Resource Budget / Usage", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical CPUs currently being used by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 175 + }, "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "fillGradient": 0, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Logical Slots Being Used (CPU)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical GPUs currently being used by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 175 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Submitted / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Bytes Submitted to Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true 
+ "fillGradient": 0, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Logical Slots Being Used (GPU)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": 
false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of output blocks generated by tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 6 - }, - "hiddenSeries": false, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Budget (CPU) for the operator.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 183 + }, "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "fillGradient": 0, + "hiddenSeries": false, + "id": 51, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + 
"linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_cpu_budget{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Budget (CPU): {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Budget (CPU)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cpu", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Budget (GPU) for the operator.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 183 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Generated / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Blocks Generated by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks generated by tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 - }, - "hiddenSeries": false, - "id": 23, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - 
"pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Generated / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Generated by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of rows in generated output blocks from finished tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 7 - }, - "hiddenSeries": false, - "id": 24, - 
"legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Rows Generated / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows Generated by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "rows/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - 
"aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of output blocks taken by downstream operators per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "hiddenSeries": false, - "id": 25, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Taken / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Output Blocks Taken by Downstream Operators / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": 
null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks taken by downstream operators per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "hiddenSeries": false, - "id": 26, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes 
Taken / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Output Bytes Taken by Downstream Operators / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks from finished tasks per second, grouped by node.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "hiddenSeries": false, - "id": 43, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX 
+ PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_outputs_of_finished_tasks_per_node{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, node_ip)", - "interval": "", - "legendFormat": "Bytes output / Second: {{dataset}}, {{node_ip}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Output Bytes from Finished Tasks / Second (by Node)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of output blocks from finished tasks per second, grouped by node.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "hiddenSeries": false, - "id": 48, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - 
"percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_blocks_outputs_of_finished_tasks_per_node{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, node_ip)", - "interval": "", - "legendFormat": "Blocks output / Second: {{dataset}}, {{node_ip}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Blocks from Finished Tasks / Second (by Node)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/s", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of submitted tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 29, - 
"legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Submitted Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - 
"dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of running tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 30, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Running Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Running Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 
1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of tasks that already have output.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 11 - }, - "hiddenSeries": false, - "id": 31, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": 
null, - "title": "Tasks with output blocks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of finished tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 11 - }, - "hiddenSeries": false, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Finished Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of finished tasks per second, grouped by node.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 12 - }, - "hiddenSeries": false, - "id": 46, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_tasks_finished_per_node{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, node_ip)", - "interval": "", - "legendFormat": "Finished Tasks: {{dataset}}, {{node_ip}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Task Throughput (by Node)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks/s", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of failed tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 12 - }, - "hiddenSeries": false, - "id": 33, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - 
"alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Failed Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Time spent generating blocks in tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - 
"avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Block Generation Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": 
false, - "datasource": "${datasource}", - "description": "Time spent in task submission backpressure.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "hiddenSeries": false, - "id": 37, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Task Submission Backpressure Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": 
"seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of blocks in operator's internal input queue", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 14 - }, - "hiddenSeries": false, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - 
"timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Inqueue Size (Blocks)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks in the operator's internal input queue.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 14 - }, - "hiddenSeries": false, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - 
"steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Inqueue Size (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of blocks in operator's internal output queue", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 15 - }, - "hiddenSeries": false, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, 
- "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Outqueue Size (Blocks)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks in the operator's internal output queue.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 15 - }, - "hiddenSeries": false, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - 
"sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Outqueue Size (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks used by pending tasks.", - "fieldConfig": { 
- "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Size of Blocks used in Pending Tasks (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - 
"format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of freed memory in object store.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "hiddenSeries": false, - "id": 35, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Freed Memory in Object Store (Bytes)", - "tooltip": { - "shared": true, - "sort": 
0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of spilled memory in object store.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Spilled Memory in Object Store (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds spent in iterator initialization code", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 17 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": 
"object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration Initialization Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds user thread is blocked by iter_batches()", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 18 - }, - "hiddenSeries": false, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", 
- "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ + "fillGradient": 0, + "hiddenSeries": false, + "id": 52, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_gpu_budget{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Budget (GPU): {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Budget (GPU)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "gpu", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Budget (Memory) for the operator.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 191 + }, "fill": 0, - "stack": false + "fillGradient": 0, + "hiddenSeries": false, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_memory_budget{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Budget (Memory): {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": 
null, + "timeRegions": [], + "timeShift": null, + "title": "Budget (Memory)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Budget (Object Store Memory) for the operator.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 191 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 54, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + 
], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_object_store_memory_budget{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Budget (Object Store Memory): {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Budget (Object Store Memory)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of freed memory in object store.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 199 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "fillGradient": 0, + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": 
"connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Freed Memory in Object Store (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "exemplar": true, - "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": 
null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration Blocked Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of spilled memory in object store.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 199 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": 
[], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Spilled Memory in Object Store (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 207 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true 
+ }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Spilled", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount freed by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 207 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 
1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Freed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds spent in user code", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, + "collapsed": true, 
"gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 18 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", + "h": 1, + "w": 24, + "x": 0, + "y": 215 + }, + "id": 106, + "title": "Scheduling Loop", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Duration of the scheduling loop in seconds.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 216 + }, "fill": 0, - "stack": false + "fillGradient": 0, + "hiddenSeries": false, + "id": 47, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + 
"alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_sched_loop_duration_s{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (dataset)", + "interval": "", + "legendFormat": "Scheduling Loop Duration: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduling Loop Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 224 + }, + "id": 107, + "title": "Iteration", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds spent in iterator initialization code", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 225 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + 
"nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration Initialization Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds user thread is blocked by iter_batches()", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 225 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration Blocked Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + 
], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds spent in user code", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 233 + }, "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" + "fillGradient": 0, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration User Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration User Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 241 + }, + "id": 108, + "title": "Operator Panels", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Shows all logical resources utilization on a single graph. 
Filtering by operator is recommended.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 242 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 57, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "CPU: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\", operator=~\"$Operator\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "GPU: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "All logical resources utilization", + "tooltip": { + 
"shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] } ], "refresh": false, "schemaVersion": 27, "style": "dark", "tags": [ - "rayVersion:2.46.0" + "rayVersion:2.49.2" ], "templating": { "list": [ @@ -4837,7 +6748,7 @@ "selected": false }, "datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", + "definition": "query_result(count by (SessionName)(last_over_time(ray_data_output_bytes{}[$__range])))", "description": "Filter queries to specific ray sessions.", "error": null, "hide": 0, @@ -4847,11 +6758,11 @@ "name": "SessionName", "options": [], "query": { - "query": "label_values(ray_data_allocated_bytes{}, SessionName)", + "query": "query_result(count by (SessionName)(last_over_time(ray_data_output_bytes{}[$__range])))", "refId": "StandardVariableQuery" }, "refresh": 2, - "regex": "", + "regex": "{SessionName=\"(?.*)\".*", "skipUrlSync": false, "sort": 2, "tagValuesQuery": "", @@ -4872,7 +6783,7 @@ ] }, "datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, dataset)", + "definition": "query_result(count by (dataset)(last_over_time(ray_data_output_bytes{SessionName=~\"$SessionName\",}[$__range])))", "description": null, "error": null, "hide": 0, @@ -4882,11 +6793,46 @@ "name": "DatasetID", "options": [], "query": { - "query": "label_values(ray_data_allocated_bytes{}, dataset)", + "query": "query_result(count by 
(dataset)(last_over_time(ray_data_output_bytes{SessionName=~\"$SessionName\",}[$__range])))", "refId": "Prometheus-Dataset-Variable-Query" }, "refresh": 2, - "regex": "", + "regex": "{dataset=\"(?.*)\".*", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "query_result(count by (operator)(last_over_time(ray_data_output_bytes{SessionName=~\"$SessionName\",}[$__range])))", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Operator", + "options": [], + "query": { + "query": "query_result(count by (operator)(last_over_time(ray_data_output_bytes{SessionName=~\"$SessionName\",}[$__range])))", + "refId": "Prometheus-Dataset-Variable-Query" + }, + "refresh": 2, + "regex": "{operator=\"(?.*)\".*", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", @@ -4896,6 +6842,7 @@ "useTags": false }, { + "allValue": ".*", "current": { "selected": false }, diff --git a/config/grafana/default_grafana_dashboard.json b/config/grafana/default_grafana_dashboard.json index a38569e8d11..1473ba9a120 100644 --- a/config/grafana/default_grafana_dashboard.json +++ b/config/grafana/default_grafana_dashboard.json @@ -18,27 +18,40 @@ "iteration": 1667344411089, "links": [], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1001, + "title": "Overview and Health", + "type": "row", + "panels": [] + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + "description": "Note: not impacted by \"Instance\" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 0 + "y": 1 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 26, + "id": 24, "legend": { "alignAsTable": true, "avg": false, @@ -56,7 +69,149 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (NodeType)", + "interval": "", + "legendFormat": "Active Nodes: {{NodeType}}", + "queryType": "randomWalk", + 
"refId": "A" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (NodeType)", + "interval": "", + "legendFormat": "Failed Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (NodeType)", + "interval": "", + "legendFormat": "Pending Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "nodes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) 
across the cluster.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -94,26 +249,58 @@ "targets": [ { "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State), 0)", + "expr": "avg(ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", - "legendFormat": "{{State}}", + "legendFormat": "CPU (physical)", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State), 0)", + "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) / on() (sum(ray_node_gpus_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0))", "interval": "", - "legendFormat": "{{State}} (retry)", + "legendFormat": "GPU (physical)", "queryType": "randomWalk", 
"refId": "B" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) / on() (sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})) * 100", + "interval": "", + "legendFormat": "Memory (RAM)", + "queryType": "randomWalk", + "refId": "C" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) / on() (sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})) * 100", + "interval": "", + "legendFormat": "GRAM", + "queryType": "randomWalk", + "refId": "D" + }, + { + "exemplar": true, + "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) / on() sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) * 100", + "interval": "", + "legendFormat": "Object Store Memory", + "queryType": "randomWalk", + "refId": "E" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) / on() (sum(ray_node_disk_free{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) + sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})) * 100", + "interval": "", + "legendFormat": "Disk", + "queryType": "randomWalk", + "refId": "F" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Scheduler Task State", + "title": "Cluster Utilization", "tooltip": { "shared": true, "sort": 0, @@ -130,7 +317,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": 
"tasks", + "format": "%", "label": "", "logBase": 1, "max": null, @@ -158,21 +345,160 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_memory_manager_worker_eviction_total{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Name, instance, RayNodeType)", + "interval": "", + 
"legendFormat": "OOM Killed: {{Name}}, {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ray OOM Kills (Tasks and Actors)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "failures", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 1002, + "title": "Ray Tasks, Actors and Placement Groups", + "type": "row", + "panels": [] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current count of tasks, grouped by scheduler state (e.g., pending, running, finished).\n\nState: the task state, as described by rpc::TaskStatus proto in common.proto. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 0 + "x": 0, + "y": 18 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 35, + "id": 26, "legend": { "alignAsTable": true, "avg": false, @@ -190,7 +516,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -228,17 +554,17 @@ "targets": [ { "exemplar": true, - "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", + "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (State), 0)", "interval": "", - "legendFormat": "{{Name}}", + "legendFormat": "{{State}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", + "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (State), 0)", "interval": "", - "legendFormat": "{{Name}} (retry)", + "legendFormat": "{{State}} (retry)", "queryType": "randomWalk", "refId": "B" } @@ -247,7 +573,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": 
null, - "title": "Requested Live Tasks by Name", + "title": "All Tasks by State", "tooltip": { "shared": true, "sort": 0, @@ -292,21 +618,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Current number of (running) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + "description": "Current count of active tasks (i.e. pending or running; not finished), grouped by task name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 1 + "x": 12, + "y": 18 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 38, + "id": 35, "legend": { "alignAsTable": true, "avg": false, @@ -324,7 +650,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -362,7 +688,7 @@ "targets": [ { "exemplar": true, - "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", + "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Name), 0)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", @@ -370,7 +696,7 @@ }, { "exemplar": true, - "expr": "clamp_min(sum(ray_tasks{IsRetry!=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", + "expr": "clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Name), 0)", "interval": "", "legendFormat": "{{Name}} (retry)", "queryType": "randomWalk", @@ -381,7 +707,7 @@ 
"timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Running Tasks by Name", + "title": "Active Tasks by Name", "tooltip": { "shared": true, "sort": 0, @@ -426,21 +752,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Note: not impacted by \"Instance\" variable.\n\nCurrent number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", + "description": "Current count of tasks that are currently executing, grouped by task name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 1 + "x": 0, + "y": 26 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 33, + "id": 38, "legend": { "alignAsTable": true, "avg": false, @@ -458,7 +784,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "connected", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -491,23 +817,31 @@ } ], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{Source=\"gcs\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State)", + "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Name), 0)", "interval": "", - "legendFormat": "{{State}}", + "legendFormat": "{{Name}}", "queryType": "randomWalk", "refId": "A" + }, + { + "exemplar": true, + "expr": "clamp_min(sum(ray_tasks{IsRetry!=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Name), 0)", + "interval": "", + "legendFormat": "{{Name}} (retry)", + "queryType": "randomWalk", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, - "title": "Scheduler Actor State", + "title": "Running Tasks by Name", "tooltip": { "shared": true, "sort": 0, @@ -524,7 +858,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "actors", + "format": "tasks", "label": "", "logBase": 1, "max": null, @@ -552,21 +886,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Current number of alive actors in a particular state.\n\nState: IDLE, RUNNING_TASK, RUNNING_IN_RAY_GET, RUNNING_IN_RAY_WAIT", + "description": "Note: not impacted by \"Instance\" variable.\n\nCurrent count of actors, grouped by lifecycle state (e.g., alive, restarting, dead/terminated).\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 2 + "x": 12, + "y": 26 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 42, + "id": 33, "legend": { "alignAsTable": true, "avg": false, @@ -622,7 +956,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State)", + "expr": "sum(ray_actors{Source=\"gcs\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -633,7 +967,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Live Actor State", + "title": "All Actors by State", "tooltip": { "shared": true, "sort": 0, @@ -678,21 +1012,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Current number of alive actors with a particular name.", + "description": "Current count of alive actors (i.e. 
not dead/terminated), grouped by state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 2 + "x": 0, + "y": 34 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 36, + "id": 42, "legend": { "alignAsTable": true, "avg": false, @@ -748,9 +1082,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{State!=\"DEAD\",Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name)", + "expr": "sum(ray_actors{Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (State)", "interval": "", - "legendFormat": "{{Name}}", + "legendFormat": "{{State}}", "queryType": "randomWalk", "refId": "A" } @@ -759,7 +1093,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Live Actors by Name", + "title": "Alive Actors by State", "tooltip": { "shared": true, "sort": 0, @@ -804,21 +1138,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. 
Ray's logical CPU is allocated by `num_cpus` arguments.", + "description": "Current count of alive actors, grouped by actor name.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 3 + "x": 12, + "y": 34 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 27, + "id": 36, "legend": { "alignAsTable": true, "avg": false, @@ -874,34 +1208,18 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_actors{State!=\"DEAD\",Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Name)", "interval": "", - "legendFormat": "CPU Usage: {{instance}}", + "legendFormat": "{{Name}}", "queryType": "randomWalk", "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)))", - "interval": "", - "legendFormat": "MAX + PENDING", - "queryType": "randomWalk", - "refId": "C" } ], "thresholds": [], "timeFrom": 
null, "timeRegions": [], "timeShift": null, - "title": "Scheduler CPUs (logical slots)", + "title": "Alive Actors by Name", "tooltip": { "shared": true, "sort": 0, @@ -918,7 +1236,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "cores", + "format": "actors", "label": "", "logBase": 1, "max": null, @@ -946,21 +1264,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.", + "description": "Note: not impacted by \"Instance\" variable.\n\nCurrent count of placement groups, grouped by state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTableData proto in gcs.proto.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 3 + "x": 0, + "y": 42 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 29, + "id": 40, "legend": { "alignAsTable": true, "avg": false, @@ -1016,26 +1334,18 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Location)", + "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (State)", "interval": "", - "legendFormat": "{{Location}}", + "legendFormat": "{{State}}", "queryType": "randomWalk", "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" } ], 
"thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Object Store Memory", + "title": "All Placement Groups by State", "tooltip": { "shared": true, "sort": 0, @@ -1052,7 +1362,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "bytes", + "format": "placement groups", "label": "", "logBase": 1, "max": null, @@ -1074,27 +1384,40 @@ "alignLevel": null } }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 1003, + "title": "Ray Resources", + "type": "row", + "panels": [] + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", + "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. 
Ray's logical CPU is allocated by `num_cpus` arguments.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 4 + "y": 51 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 28, + "id": 27, "legend": { "alignAsTable": true, "avg": false, @@ -1150,15 +1473,15 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_resources{Name=\"GPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance)", "interval": "", - "legendFormat": "GPU Usage: {{instance}}", + "legendFormat": "CPU Usage: {{instance}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_resources{Name=\"GPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", + "expr": "sum(ray_resources{Name=\"CPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1166,7 +1489,7 @@ }, { "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)))", + "expr": 
"((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)))", "interval": "", "legendFormat": "MAX + PENDING", "queryType": "randomWalk", @@ -1177,7 +1500,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Scheduler GPUs (logical slots)", + "title": "Logical CPUs used", "tooltip": { "shared": true, "sort": 0, @@ -1194,7 +1517,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "GPUs", + "format": "cores", "label": "", "logBase": 1, "max": null, @@ -1222,21 +1545,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Note: not impacted by \"Instance\" variable.\n\nCurrent number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", + "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. 
PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 4 + "y": 51 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 40, + "id": 28, "legend": { "alignAsTable": true, "avg": false, @@ -1292,18 +1615,34 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State)", + "expr": "sum(ray_resources{Name=\"GPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance)", "interval": "", - "legendFormat": "{{State}}", + "legendFormat": "GPU Usage: {{instance}}", "queryType": "randomWalk", "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_resources{Name=\"GPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) or vector(0)))", + "interval": "", + "legendFormat": "MAX + PENDING", + "queryType": "randomWalk", + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Scheduler Placement Groups", + 
"title": "Logical GPUs used", "tooltip": { "shared": true, "sort": 0, @@ -1320,7 +1659,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "placement groups", + "format": "GPUs", "label": "", "logBase": 1, "max": null, @@ -1348,21 +1687,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "", + "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 5 + "y": 59 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 2, + "id": 29, "legend": { "alignAsTable": true, "avg": false, @@ -1418,34 +1757,26 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_cpu_utilization{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100) by (instance)", + "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Location)", "interval": "", - "legendFormat": "CPU Usage: {{instance}}", + "legendFormat": "{{Location}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_cpu_utilization{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100) by (instance)", - "interval": "", - "legendFormat": "CPU Usage: {{instance}} (head)", - "queryType": 
"randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", + "expr": "sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", - "refId": "C" + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node CPU (hardware utilization)", + "title": "Object Store Memory", "tooltip": { "shared": true, "sort": 0, @@ -1462,7 +1793,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "cores", + "format": "bytes", "label": "", "logBase": 1, "max": null, @@ -1484,27 +1815,40 @@ "alignLevel": null } }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 67 + }, + "id": 1004, + "title": "Hardware Utilization by Ray Component", + "type": "row", + "panels": [] + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ", + "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 5 + "x": 0, + "y": 68 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 8, + "id": 37, "legend": { "alignAsTable": true, "avg": false, @@ -1560,34 +1904,26 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100) by (instance, GpuIndex, GpuDeviceName)", + "expr": "sum(ray_component_cpu_percentage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Component) / 100", "interval": "", - "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "legendFormat": "{{Component}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100) by (instance, GpuIndex, GpuDeviceName)", - "interval": "", - "legendFormat": "GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gpus_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", + "expr": "sum(ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", - "refId": "C" + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node GPU (hardware utilization)", + "title": "Node CPU by Component", "tooltip": { "shared": true, "sort": 0, @@ -1604,7 +1940,7 @@ 
"yaxes": [ { "$$hashKey": "object:628", - "format": "GPUs", + "format": "cores", "label": "", "logBase": 1, "max": null, @@ -1632,21 +1968,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ", + "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 6 + "x": 12, + "y": 68 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 6, + "id": 34, "legend": { "alignAsTable": true, "avg": false, @@ -1702,23 +2038,23 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "(sum(ray_component_rss_mb{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} * 1024 * 1024) by (Component)) - (sum(ray_component_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (Component))", "interval": "", - "legendFormat": "Disk Used: {{instance}}", + "legendFormat": "{{Component}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": 
"sum(ray_node_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", - "legendFormat": "Disk Used: {{instance}} (head)", + "legendFormat": "shared_memory", "queryType": "randomWalk", "refId": "B" }, { "exemplar": true, - "expr": "sum(ray_node_disk_free{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", + "expr": "sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1729,7 +2065,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Disk", + "title": "Node Memory by Component", "tooltip": { "shared": true, "sort": 0, @@ -1774,21 +2110,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Disk IO per node.", + "description": "The physical (hardware) GPU usage across the cluster, broken down by component. 
This reports the summed GPU usage per Ray component.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 6 + "x": 0, + "y": 76 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 32, + "id": 45, "legend": { "alignAsTable": true, "avg": false, @@ -1832,54 +2168,30 @@ { "$$hashKey": "object:2987", "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_node_disk_io_write_speed{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", - "interval": "", - "legendFormat": "Write: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_io_write_speed{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", - "interval": "", - "legendFormat": "Write: {{instance}} (head)", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_io_read_speed{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", - "interval": "", - "legendFormat": "Read: {{instance}}", - "queryType": "randomWalk", - "refId": "C" - }, + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ { "exemplar": true, - "expr": "sum(ray_node_disk_io_read_speed{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_component_gpu_percentage{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} / 100) by (Component)", "interval": "", - "legendFormat": "Read: 
{{instance}} (head)", + "legendFormat": "{{Component}}", "queryType": "randomWalk", - "refId": "D" + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Disk IO Speed", + "title": "Node GPU by Component", "tooltip": { "shared": true, "sort": 0, @@ -1896,7 +2208,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "Bps", + "format": "GPUs", "label": "", "logBase": 1, "max": null, @@ -1924,21 +2236,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", + "description": "The physical (hardware) GPU memory usage across the cluster, broken down by component. 
This reports the summed GPU memory usage per Ray component.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 7 + "x": 12, + "y": 76 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 4, + "id": 46, "legend": { "alignAsTable": true, "avg": false, @@ -1994,34 +2306,26 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_component_gpu_memory_mb{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} * 1024 * 1024) by (Component)", "interval": "", - "legendFormat": "Memory Used: {{instance}}", + "legendFormat": "{{Component}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", - "interval": "", - "legendFormat": "Memory Used: {{instance}} (head)", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", + "expr": "(sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})) * 1024 * 1024", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", - "refId": "C" + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Memory (heap + object store)", + "title": "Node GPU Memory by Component", "tooltip": { "shared": true, "sort": 0, @@ -2060,27 +2364,40 @@ "alignLevel": null } }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 84 + }, + "id": 1005, + 
"title": "Hardware Utilization by Node", + "type": "row", + "panels": [] + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "The percentage of physical (hardware) memory usage for each node.", + "description": "", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 7 + "x": 0, + "y": 85 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 48, + "id": 2, "legend": { "alignAsTable": true, "avg": false, @@ -2098,7 +2415,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": "connected", "options": { "alertThreshold": true }, @@ -2131,22 +2448,22 @@ } ], "spaceLength": 10, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, - "expr": "sum(ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}/ray_node_mem_total{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 100) by (instance)", + "expr": "sum(ray_node_cpu_utilization{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} * ray_node_cpu_count{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} / 100) by (instance, RayNodeType)", "interval": "", - "legendFormat": "Memory Used: {{instance}}", + "legendFormat": "CPU Usage: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}/ray_node_mem_total{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 100) by (instance)", + "expr": 
"sum(ray_node_cpu_count{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", - "legendFormat": "Memory Used: {{instance}} (head)", + "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } @@ -2155,7 +2472,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Memory Percentage (heap + object store)", + "title": "Node CPU utilization", "tooltip": { "shared": true, "sort": 0, @@ -2172,7 +2489,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "%", + "format": "cores", "label": "", "logBase": 1, "max": null, @@ -2200,21 +2517,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", + "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. 
", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 8 + "x": 12, + "y": 85 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 44, + "id": 8, "legend": { "alignAsTable": true, "avg": false, @@ -2270,18 +2587,26 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name, instance)", + "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} / 100) by (instance, RayNodeType, GpuIndex, GpuDeviceName)", "interval": "", - "legendFormat": "OOM Killed: {{Name}}, {{instance}}", + "legendFormat": "GPU Usage: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gpus_available{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Out of Memory Failures by Name", + "title": "Node GPU utilization", "tooltip": { "shared": true, "sort": 0, @@ -2298,7 +2623,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "failures", + "format": "GPUs", "label": "", "logBase": 1, "max": null, @@ -2326,21 +2651,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", + "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 8 + "x": 0, + "y": 93 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 34, + "id": 4, "legend": { "alignAsTable": true, "avg": false, @@ -2396,34 +2721,26 @@ "targets": [ { "exemplar": true, - "expr": "(sum(ray_component_rss_mb{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Component))", + "expr": "sum(ray_node_mem_used{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, RayNodeType)", "interval": "", - "legendFormat": "{{Component}}", + "legendFormat": "Memory Used: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", - "interval": "", - "legendFormat": "shared_memory", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", + "expr": "sum(ray_node_mem_total{instance=~\"$Instance\", 
RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", - "refId": "C" + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Memory by Component", + "title": "Node Memory (heap + object store)", "tooltip": { "shared": true, "sort": 0, @@ -2468,21 +2785,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", + "description": "The percentage of physical (hardware) memory usage for each node.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 9 + "x": 12, + "y": 93 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, - "id": 37, + "id": 48, "legend": { "alignAsTable": true, "avg": false, @@ -2500,7 +2817,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "connected", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -2533,31 +2850,23 @@ } ], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { "exemplar": true, - "expr": "sum(ray_component_cpu_percentage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Component) / 100", + "expr": "sum(ray_node_mem_used{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}/ray_node_mem_total{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} * 100) by (instance, RayNodeType)", "interval": "", - "legendFormat": "{{Component}}", + 
"legendFormat": "Memory Used: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node CPU by Component", + "title": "Node Memory % (heap + object store)", "tooltip": { "shared": true, "sort": 0, @@ -2574,7 +2883,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "cores", + "format": "%", "label": "", "logBase": 1, "max": null, @@ -2607,14 +2916,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 9 + "x": 0, + "y": 101 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 18, "legend": { @@ -2672,15 +2981,15 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 1024 * 1024) by (instance, GpuIndex, GpuDeviceName)", + "expr": "sum(ray_node_gram_used{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"} * 1024 * 1024) by (instance, RayNodeType, GpuIndex, GpuDeviceName)", "interval": "", - "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "legendFormat": "Used GRAM: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "(sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 1024 * 1024", + "expr": "(sum(ray_node_gram_available{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", 
SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) + sum(ray_node_gram_used{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})) * 1024 * 1024", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2736,21 +3045,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Network speed per node", + "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 10 + "x": 12, + "y": 101 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 20, + "id": 6, "legend": { "alignAsTable": true, "avg": false, @@ -2806,17 +3115,17 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, RayNodeType)", "interval": "", - "legendFormat": "Recv: {{instance}}", + "legendFormat": "Disk Used: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_node_disk_free{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) + sum(ray_node_disk_usage{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"})", "interval": "", - "legendFormat": "Send: 
{{instance}}", + "legendFormat": "MAX", "queryType": "randomWalk", "refId": "B" } @@ -2825,7 +3134,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Network", + "title": "Node Disk", "tooltip": { "shared": true, "sort": 0, @@ -2842,7 +3151,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "Bps", + "format": "bytes", "label": "", "logBase": 1, "max": null, @@ -2870,21 +3179,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Note: not impacted by \"Instance\" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. 
The node is unavailable now because it is being provisioned and initialized.", + "description": "Disk IO per node.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 10 + "x": 0, + "y": 109 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 24, + "id": 32, "legend": { "alignAsTable": true, "avg": false, @@ -2940,34 +3249,26 @@ "targets": [ { "exemplar": true, - "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (NodeType)", + "expr": "sum(ray_node_disk_io_write_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, RayNodeType)", "interval": "", - "legendFormat": "Active Nodes: {{NodeType}}", + "legendFormat": "Write: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (NodeType)", + "expr": "sum(ray_node_disk_io_read_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, RayNodeType)", "interval": "", - "legendFormat": "Failed Nodes: {{NodeType}}", + "legendFormat": "Read: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (NodeType)", - "interval": "", - "legendFormat": "Pending Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Node Count", + "title": "Node Disk IO Speed", "tooltip": { "shared": true, "sort": 0, @@ -2984,7 +3285,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "nodes", + "format": "Bps", "label": "", "logBase": 1, 
"max": null, @@ -3012,21 +3313,21 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster.", + "description": "Network speed per node", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 11 + "x": 12, + "y": 109 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, - "id": 41, + "id": 20, "legend": { "alignAsTable": true, "avg": false, @@ -3044,7 +3345,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": "connected", "options": { "alertThreshold": true }, @@ -3077,63 +3378,31 @@ } ], "spaceLength": 10, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", + "expr": "sum(ray_node_network_receive_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, RayNodeType)", "interval": "", - "legendFormat": "CPU (physical)", + "legendFormat": "Recv: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "A" }, { "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(autoscaler_cluster_resources{resource=\"GPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0))", + "expr": "sum(ray_node_network_send_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, RayNodeType)", "interval": "", - "legendFormat": "GPU (physical)", + "legendFormat": "Send: {{instance}} ({{RayNodeType}})", "queryType": "randomWalk", "refId": "B" - }, - { - "exemplar": true, 
- "expr": "sum(ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 100", - "interval": "", - "legendFormat": "Memory (RAM)", - "queryType": "randomWalk", - "refId": "C" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 100", - "interval": "", - "legendFormat": "GRAM", - "queryType": "randomWalk", - "refId": "D" - }, - { - "exemplar": true, - "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) * 100", - "interval": "", - "legendFormat": "Object Store Memory", - "queryType": "randomWalk", - "refId": "E" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_disk_free{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 100", - "interval": "", - "legendFormat": "Disk", - "queryType": "randomWalk", - "refId": "F" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Cluster Utilization", + "title": "Node Network", "tooltip": { "shared": true, "sort": 0, @@ -3150,7 +3419,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "%", + "format": "Bps", "label": "", 
"logBase": 1, "max": null, @@ -3171,13 +3440,539 @@ "align": false, "alignLevel": null } + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 117 + }, + "id": 1006, + "title": "TPU Utilization by Node", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Percentage of tensorcore utilization for the TPUs on this node. Computed by dividing the number of tensorcore operations by the maximum supported number of operations during the sample period.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 118 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 50, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_tpu_tensorcore_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, TpuIndex, TpuDeviceName, TpuType, TpuTopology)", + "interval": "", 
+ "legendFormat": "{{instance}}, tpu.{{TpuIndex}}, {{TpuType}}, {{TpuTopology}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node TPU Tensorcore Utilization %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Percentage of bandwidth memory utilization for the TPUs on this node. 
Computed by dividing the memory bandwidth used by the maximum supported memory bandwidth limit during the sample period.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 118 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 51, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_tpu_memory_bandwidth_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, TpuIndex, TpuDeviceName, TpuType, TpuTopology)", + "interval": "", + "legendFormat": "{{instance}}, tpu.{{TpuIndex}}, {{TpuType}}, {{TpuTopology}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node TPU High Bandwidth Memory Utilization %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Percentage of time over the sample period during which the TPU is actively processing.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 126 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 52, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_tpu_duty_cycle{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, TpuIndex, TpuDeviceName, TpuType, TpuTopology) or vector(0)", + "interval": "", + 
"legendFormat": "{{instance}}, tpu.{{TpuIndex}}, {{TpuType}}, {{TpuTopology}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node TPU Duty Cycle %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total memory used/allocated for the TPUs on this node.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 126 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + 
PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_tpu_memory_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, TpuIndex, TpuDeviceName, TpuType, TpuTopology) or vector(0)", + "interval": "", + "legendFormat": "Memory Used: {{instance}}, tpu.{{TpuIndex}}, {{TpuType}}, {{TpuTopology}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_tpu_memory_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\"}) by (instance, TpuIndex, TpuDeviceName, TpuType, TpuTopology) or vector(0)", + "interval": "", + "legendFormat": "Memory Total: {{instance}}, tpu.{{TpuIndex}}, {{TpuType}}, {{TpuTopology}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node TPU Memory Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ] } ], "refresh": false, "schemaVersion": 27, "style": "dark", "tags": [ - "rayVersion:2.46.0" + "rayVersion:2.49.2" ], "templating": { "list": [ @@ -3204,7 +3999,7 @@ }, "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{}, SessionName)", - "description": "Filter queries to specific ray sessions.", + "description": "Filter queries to specific Ray 
sessions.", "error": null, "hide": 0, "includeAll": true, @@ -3216,7 +4011,7 @@ "query": "label_values(ray_node_network_receive_speed{}, SessionName)", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 2, @@ -3239,7 +4034,7 @@ }, "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{SessionName=~\"$SessionName\",}, instance)", - "description": null, + "description": "Filter queries to specific Ray nodes by their IP address.", "error": null, "hide": 0, "includeAll": true, @@ -3251,7 +4046,7 @@ "query": "label_values(ray_node_network_receive_speed{SessionName=~\"$SessionName\",}, instance)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -3262,12 +4057,13 @@ "useTags": false }, { + "allValue": ".*", "current": { "selected": false }, "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", - "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automaticaly with Prometheus PodMonitor.", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple Ray clusters, the ray_io_cluster label should be set per cluster. 
For KubeRay users, this is done automatically with Prometheus PodMonitor.", "error": null, "hide": 0, "includeAll": true, @@ -3279,7 +4075,7 @@ "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 2, @@ -3288,9 +4084,46 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "description": "Filter queries to specific Ray node types (head or worker).", + "includeAll": true, + "multi": true, + "name": "RayNodeType", + "options": [ + { + "selected": false, + "text": "All", + "value": "$__all" + }, + { + "selected": false, + "text": "Head Node", + "value": "head" + }, + { + "selected": false, + "text": "Worker Node", + "value": "worker" + } + ], + "query": "head, worker", + "type": "custom" } ] }, + "rayMeta": [ + "supportsFullGrafanaView", + "supportsGlobalFilterOverride" + ], "time": { "from": "now-30m", "to": "now" @@ -3299,8 +4132,5 @@ "timezone": "", "title": "Default Dashboard", "uid": "rayDefaultDashboard", - "version": 4, - "rayMeta": [ - "supportsGlobalFilterOverride" - ] + "version": 4 } \ No newline at end of file diff --git a/config/grafana/serve_deployment_grafana_dashboard.json b/config/grafana/serve_deployment_grafana_dashboard.json index 07d37238f21..a00fecc879a 100644 --- a/config/grafana/serve_deployment_grafana_dashboard.json +++ b/config/grafana/serve_deployment_grafana_dashboard.json @@ -29,14 +29,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 0, "y": 0, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 1, "legend": { @@ -94,7 +94,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", 
+ "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}) by (application, deployment)", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -155,14 +155,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 8, "y": 0, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 2, "legend": { @@ -220,7 +220,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_request_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica)", + "expr": "sum(rate(ray_serve_deployment_request_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, replica)", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -281,14 +281,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 16, "y": 0, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 3, "legend": { @@ -346,7 +346,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_error_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica)", + "expr": "sum(rate(ray_serve_deployment_error_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, replica)", "interval": "", "legendFormat": 
"{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -407,14 +407,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 1, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 4, "legend": { @@ -434,7 +434,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -472,7 +472,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -480,7 +480,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -541,14 +541,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 8, "y": 1, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 
5, "legend": { @@ -568,7 +568,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -606,7 +606,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -614,7 +614,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -675,14 +675,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 16, "y": 1, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 6, "legend": { @@ -702,7 +702,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -740,7 +740,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -748,7 +748,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -809,14 +809,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 2, "w": 12, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 7, "legend": { @@ -836,7 +836,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -874,7 +874,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", + "expr": 
"sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}) by (application, deployment)", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -935,14 +935,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 12, "y": 2, "w": 12, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 8, "legend": { @@ -962,7 +962,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1000,7 +1000,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}) by (application, deployment, replica)", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -1061,14 +1061,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 3, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 9, "legend": { @@ -1088,7 +1088,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1126,7 +1126,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}) by 
(application, deployment, replica)", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -1187,14 +1187,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 8, "y": 3, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 10, "legend": { @@ -1214,7 +1214,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1252,7 +1252,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}) by (application, deployment, replica)", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -1313,14 +1313,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 16, "y": 3, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 11, "legend": { @@ -1340,7 +1340,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1378,7 +1378,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_multiplexed_models_unload_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_multiplexed_models_unload_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}) by (application, deployment, replica)", "interval": "", "legendFormat": 
"{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -1434,19 +1434,19 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "P99 latency of mutliplexed model load per replica.", + "description": "P99 latency of multiplexed model load per replica.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 4, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 12, "legend": { @@ -1466,7 +1466,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1504,7 +1504,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -1560,19 +1560,19 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "P99 latency of mutliplexed model unload per replica.", + "description": "P99 latency of multiplexed model unload per replica.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 8, "y": 4, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 13, "legend": { @@ -1592,7 +1592,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1630,7 +1630,7 @@ "targets": [ { "exemplar": true, - "expr": 
"histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -1691,14 +1691,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 16, "y": 4, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 14, "legend": { @@ -1718,7 +1718,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1756,7 +1756,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}", + "expr": "ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}", "interval": "", "legendFormat": "{{replica}}:{{model_id}}", "queryType": "randomWalk", @@ -1817,14 +1817,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 0, "y": 5, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 15, "legend": { @@ -1882,7 +1882,7 @@ "targets": [ { "exemplar": true, - "expr": "(1 - 
sum(rate(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])))", + "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\"}[5m])))", "interval": "", "legendFormat": "{{application}}#{{deployment}}#{{replica}}", "queryType": "randomWalk", @@ -1937,7 +1937,7 @@ "schemaVersion": 27, "style": "dark", "tags": [ - "rayVersion:2.46.0" + "rayVersion:2.49.2" ], "templating": { "list": [ @@ -1982,7 +1982,7 @@ "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2017,7 +2017,7 @@ "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2052,7 +2052,7 @@ "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2087,7 +2087,7 @@ "query": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ 
-2098,6 +2098,7 @@ "useTags": false }, { + "allValue": ".*", "current": { "selected": false }, @@ -2115,7 +2116,7 @@ "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 2, diff --git a/config/grafana/serve_grafana_dashboard.json b/config/grafana/serve_grafana_dashboard.json index 7b9b8492ed4..334cb0fa4c4 100644 --- a/config/grafana/serve_grafana_dashboard.json +++ b/config/grafana/serve_grafana_dashboard.json @@ -29,14 +29,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 5, "legend": { @@ -56,7 +56,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -94,7 +94,7 @@ "targets": [ { "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{ray_io_cluster=~\"$Cluster\",})", + "expr": "avg(ray_node_cpu_utilization{ray_io_cluster=~\"$Cluster\"})", "interval": "", "legendFormat": "CPU (physical)", "queryType": "randomWalk", @@ -102,7 +102,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{ray_io_cluster=~\"$Cluster\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',ray_io_cluster=~\"$Cluster\",}) or vector(0))", + "expr": "sum(ray_node_gpus_utilization{ray_io_cluster=~\"$Cluster\"}) / on() (sum(autoscaler_cluster_resources{resource='GPU',ray_io_cluster=~\"$Cluster\"}) or vector(0))", "interval": "", "legendFormat": "GPU (physical)", "queryType": "randomWalk", @@ -110,7 +110,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_used{ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_mem_total{ray_io_cluster=~\"$Cluster\",})) * 100", + "expr": "sum(ray_node_mem_used{ray_io_cluster=~\"$Cluster\"}) / on() (sum(ray_node_mem_total{ray_io_cluster=~\"$Cluster\"})) * 100", "interval": "", "legendFormat": 
"Memory (RAM)", "queryType": "randomWalk", @@ -118,7 +118,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gram_used{ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_gram_available{ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_gram_used{ray_io_cluster=~\"$Cluster\",})) * 100", + "expr": "sum(ray_node_gram_used{ray_io_cluster=~\"$Cluster\"}) / on() (sum(ray_node_gram_available{ray_io_cluster=~\"$Cluster\"}) + sum(ray_node_gram_used{ray_io_cluster=~\"$Cluster\"})) * 100", "interval": "", "legendFormat": "GRAM", "queryType": "randomWalk", @@ -126,7 +126,7 @@ }, { "exemplar": true, - "expr": "sum(ray_object_store_memory{ray_io_cluster=~\"$Cluster\",}) / on() sum(ray_resources{Name=\"object_store_memory\",ray_io_cluster=~\"$Cluster\",}) * 100", + "expr": "sum(ray_object_store_memory{ray_io_cluster=~\"$Cluster\"}) / on() sum(ray_resources{Name=\"object_store_memory\",ray_io_cluster=~\"$Cluster\"}) * 100", "interval": "", "legendFormat": "Object Store Memory", "queryType": "randomWalk", @@ -134,7 +134,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_disk_usage{ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_disk_free{ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_disk_usage{ray_io_cluster=~\"$Cluster\",})) * 100", + "expr": "sum(ray_node_disk_usage{ray_io_cluster=~\"$Cluster\"}) / on() (sum(ray_node_disk_free{ray_io_cluster=~\"$Cluster\"}) + sum(ray_node_disk_usage{ray_io_cluster=~\"$Cluster\"})) * 100", "interval": "", "legendFormat": "Disk", "queryType": "randomWalk", @@ -195,14 +195,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 7, "legend": { @@ -260,7 +260,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_num_http_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route)", + "expr": 
"sum(rate(ray_serve_num_http_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, route)", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -268,7 +268,7 @@ }, { "exemplar": true, - "expr": "sum(rate(ray_serve_num_grpc_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method)", + "expr": "sum(rate(ray_serve_num_grpc_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, method)", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -329,14 +329,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 0, "y": 1, "w": 12, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 8, "legend": { @@ -394,7 +394,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route)", + "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, route)", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -402,7 +402,7 @@ }, { "exemplar": true, - "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method)", + "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, 
method)", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -463,14 +463,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 12, "y": 1, "w": 12, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 17, "legend": { @@ -528,7 +528,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, error_code)", + "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, route, error_code)", "interval": "", "legendFormat": "{{application, route, error_code}}", "queryType": "randomWalk", @@ -536,7 +536,7 @@ }, { "exemplar": true, - "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, error_code)", + "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, method, error_code)", "interval": "", "legendFormat": "{{application, method, error_code}}", "queryType": "randomWalk", @@ -597,14 +597,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 2, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 12, "legend": { @@ -624,7 +624,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -662,7 +662,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.5, 
sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, route, le))", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -670,7 +670,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, method, le))", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -678,7 +678,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -739,14 +739,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 8, "y": 2, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 15, "legend": { @@ -766,7 +766,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ 
-804,7 +804,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, route, le))", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -812,7 +812,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, method, le))", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -820,7 +820,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -881,14 +881,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 16, "y": 2, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 
16, "legend": { @@ -908,7 +908,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -946,7 +946,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, route, le))", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -954,7 +954,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, method, le))", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -962,7 +962,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -1023,14 +1023,14 @@ 
"defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 0, "y": 3, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 2, "legend": { @@ -1088,7 +1088,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_deployment_replica_healthy{ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", + "expr": "sum(ray_serve_deployment_replica_healthy{ray_io_cluster=~\"$Cluster\"}) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1149,14 +1149,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 8, "y": 3, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 13, "legend": { @@ -1214,7 +1214,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_request_counter_total{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment)", + "expr": "sum(rate(ray_serve_deployment_request_counter_total{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1275,14 +1275,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 16, "y": 3, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 14, "legend": { @@ -1340,7 +1340,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_error_counter_total{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment)", + "expr": "sum(rate(ray_serve_deployment_error_counter_total{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": 
"randomWalk", @@ -1401,14 +1401,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 4, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 9, "legend": { @@ -1428,7 +1428,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1466,7 +1466,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, le))", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1474,7 +1474,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -1535,14 +1535,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 8, "y": 4, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 10, "legend": { @@ -1562,7 +1562,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1600,7 +1600,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.9, 
sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, le))", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1608,7 +1608,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -1669,14 +1669,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 16, "y": 4, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 11, "legend": { @@ -1696,7 +1696,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1734,7 +1734,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (application, deployment, le))", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1742,7 +1742,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\"}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -1803,14 +1803,14 @@ "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "x": 0, "y": 5, "w": 8, "h": 8 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 3, "legend": { @@ -1830,7 +1830,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -1868,7 +1868,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_deployment_queued_queries{ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", + "expr": "sum(ray_serve_deployment_queued_queries{ray_io_cluster=~\"$Cluster\"}) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1929,14 +1929,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 8, "y": 5, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 4, "legend": { @@ -1994,7 +1994,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(autoscaler_active_nodes{ray_io_cluster=~\"$Cluster\",}) by (NodeType)", + "expr": "sum(autoscaler_active_nodes{ray_io_cluster=~\"$Cluster\"}) by (NodeType)", "interval": "", "legendFormat": "Active Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -2002,7 +2002,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{ray_io_cluster=~\"$Cluster\",}) by (NodeType)", + "expr": "sum(autoscaler_recently_failed_nodes{ray_io_cluster=~\"$Cluster\"}) by (NodeType)", "interval": "", "legendFormat": "Failed Nodes: {{NodeType}}", "queryType": 
"randomWalk", @@ -2010,7 +2010,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{ray_io_cluster=~\"$Cluster\",}) by (NodeType)", + "expr": "sum(autoscaler_pending_nodes{ray_io_cluster=~\"$Cluster\"}) by (NodeType)", "interval": "", "legendFormat": "Pending Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -2071,14 +2071,14 @@ "defaults": {}, "overrides": [] }, - "fill": 1, - "fillGradient": 0, "gridPos": { "x": 16, "y": 5, "w": 8, "h": 8 }, + "fill": 1, + "fillGradient": 0, "hiddenSeries": false, "id": 6, "legend": { @@ -2098,7 +2098,7 @@ }, "lines": true, "linewidth": 2, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -2136,7 +2136,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_network_receive_speed{ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_node_network_receive_speed{ray_io_cluster=~\"$Cluster\"}) by (instance)", "interval": "", "legendFormat": "Recv: {{instance}}", "queryType": "randomWalk", @@ -2144,7 +2144,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_network_send_speed{ray_io_cluster=~\"$Cluster\",}) by (instance)", + "expr": "sum(ray_node_network_send_speed{ray_io_cluster=~\"$Cluster\"}) by (instance)", "interval": "", "legendFormat": "Send: {{instance}}", "queryType": "randomWalk", @@ -2205,14 +2205,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 0, "y": 6, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 20, "legend": { @@ -2270,7 +2270,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_num_ongoing_http_requests{ray_io_cluster=~\"$Cluster\",}", + "expr": "ray_serve_num_ongoing_http_requests{ray_io_cluster=~\"$Cluster\"}", "interval": "", "legendFormat": "Ongoing HTTP Requests", "queryType": "randomWalk", @@ -2331,14 +2331,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 8, "y": 6, "w": 8, "h": 8 }, + "fill": 10, + 
"fillGradient": 0, "hiddenSeries": false, "id": 21, "legend": { @@ -2396,7 +2396,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_num_ongoing_grpc_requests{ray_io_cluster=~\"$Cluster\",}", + "expr": "ray_serve_num_ongoing_grpc_requests{ray_io_cluster=~\"$Cluster\"}", "interval": "", "legendFormat": "Ongoing gRPC Requests", "queryType": "randomWalk", @@ -2457,14 +2457,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 16, "y": 6, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 22, "legend": { @@ -2522,7 +2522,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_num_scheduling_tasks{ray_io_cluster=~\"$Cluster\",}", + "expr": "ray_serve_num_scheduling_tasks{ray_io_cluster=~\"$Cluster\"}", "interval": "", "legendFormat": "Scheduling Tasks", "queryType": "randomWalk", @@ -2583,14 +2583,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 0, "y": 7, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 23, "legend": { @@ -2648,7 +2648,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_num_scheduling_tasks_in_backoff{ray_io_cluster=~\"$Cluster\",}", + "expr": "ray_serve_num_scheduling_tasks_in_backoff{ray_io_cluster=~\"$Cluster\"}", "interval": "", "legendFormat": "Scheduling Tasks in Backoff", "queryType": "randomWalk", @@ -2709,14 +2709,14 @@ "defaults": {}, "overrides": [] }, - "fill": 10, - "fillGradient": 0, "gridPos": { "x": 8, "y": 7, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 24, "legend": { @@ -2774,7 +2774,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_controller_control_loop_duration_s{ray_io_cluster=~\"$Cluster\",}", + "expr": "ray_serve_controller_control_loop_duration_s{ray_io_cluster=~\"$Cluster\"}", "interval": "", "legendFormat": "Control Loop Duration", "queryType": "randomWalk", @@ -2835,14 +2835,14 @@ "defaults": {}, "overrides": [] }, 
- "fill": 10, - "fillGradient": 0, "gridPos": { "x": 16, "y": 7, "w": 8, "h": 8 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 25, "legend": { @@ -2900,7 +2900,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_controller_num_control_loops{ray_io_cluster=~\"$Cluster\",}", + "expr": "ray_serve_controller_num_control_loops{ray_io_cluster=~\"$Cluster\"}", "interval": "", "legendFormat": "Control Loops", "queryType": "randomWalk", @@ -2955,7 +2955,7 @@ "schemaVersion": 27, "style": "dark", "tags": [ - "rayVersion:2.46.0" + "rayVersion:2.49.2" ], "templating": { "list": [ @@ -3000,7 +3000,7 @@ "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -3035,7 +3035,7 @@ "query": "label_values(ray_serve_num_http_requests_total{}, route)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -3070,7 +3070,7 @@ "query": "label_values(ray_serve_num_grpc_requests{}, method)", "refId": "Prometheus-Instance-Variable-Query" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -3081,6 +3081,7 @@ "useTags": false }, { + "allValue": ".*", "current": { "selected": false }, @@ -3098,7 +3099,7 @@ "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", "refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 2, diff --git a/config/grafana/serve_llm_grafana_dashboard.json b/config/grafana/serve_llm_grafana_dashboard.json new file mode 100644 index 00000000000..d0a713df55e --- /dev/null +++ b/config/grafana/serve_llm_grafana_dashboard.json @@ -0,0 +1,2826 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": 
"Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of tokens processed per second", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by (model_name, WorkerId) (rate(ray_vllm:request_prompt_tokens_sum{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "interval": "", + "legendFormat": "Prompt Tokens/Sec - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum by (model_name, WorkerId) (rate(ray_vllm:generation_tokens_total{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", 
}[$interval]))", + "interval": "", + "legendFormat": "Generation Tokens/Sec - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Token Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tokens/s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time per output token latency.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + 
PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by(le, model_name, WorkerId) (rate(ray_vllm:time_per_output_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P99 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum by(le, model_name, WorkerId) (rate(ray_vllm:time_per_output_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P95 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum by(le, model_name, WorkerId) (rate(ray_vllm:time_per_output_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P90 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum by(le, model_name, WorkerId) (rate(ray_vllm:time_per_output_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P50 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "D" + }, + { + "exemplar": true, + "expr": "(sum by(model_name, WorkerId) (rate(ray_vllm:time_per_output_token_seconds_sum{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))\n/\nsum by(model_name, WorkerId) (rate(ray_vllm:time_per_output_token_seconds_count{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "Mean - {{model_name}} - {{WorkerId}}", + "queryType": 
"randomWalk", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Time Per Output Token Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Percentage of used cache blocks by vLLM.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 0, + "y": 8, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + 
"stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_vllm:gpu_cache_usage_perc{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }", + "interval": "", + "legendFormat": "GPU Cache Usage - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "ray_vllm:cpu_cache_usage_perc{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }", + "interval": "", + "legendFormat": "CPU Cache Usage - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Cache Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P50, P90, P95, and P99 TTFT latency.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 12, + "y": 8, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + 
"percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "(sum by(model_name, WorkerId) (rate(ray_vllm:time_to_first_token_seconds_sum{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))\n/\nsum by(model_name, WorkerId) (rate(ray_vllm:time_to_first_token_seconds_count{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "Average - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum by(le, model_name, WorkerId)(rate(ray_vllm:time_to_first_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P50 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum by(le, model_name, WorkerId)(rate(ray_vllm:time_to_first_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P90 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum by(le, model_name, WorkerId) (rate(ray_vllm:time_to_first_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + 
"legendFormat": "P95 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "D" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by(le, model_name, WorkerId)(rate(ray_vllm:time_to_first_token_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P99 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Time To First Token Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Latency from request start to first token returned (in seconds).", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 0, + "y": 16, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": 
"flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(model_name, WorkerId) (rate(ray_vllm:e2e_request_latency_seconds_sum{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))\n/\nsum by(model_name, WorkerId) (rate(ray_vllm:e2e_request_latency_seconds_count{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "interval": "", + "legendFormat": "Average - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum by(le, model_name, WorkerId) (rate(ray_vllm:e2e_request_latency_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P50 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum by(le, model_name, WorkerId) (rate(ray_vllm:e2e_request_latency_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P90 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum by(le, model_name, WorkerId) (rate(ray_vllm:e2e_request_latency_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P95 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "D" + }, + 
{ + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by(le, model_name, WorkerId) (rate(ray_vllm:e2e_request_latency_seconds_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])))", + "interval": "", + "legendFormat": "P99 - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: E2E Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 12, + "y": 16, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + 
"fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_vllm:num_requests_running{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }", + "interval": "", + "legendFormat": "Num Running - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "ray_vllm:num_requests_swapped{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }", + "interval": "", + "legendFormat": "Num Swapped - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "ray_vllm:num_requests_waiting{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }", + "interval": "", + "legendFormat": "Num Waiting - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Scheduler State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "${datasource}", + "description": "Heatmap of request prompt length", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "id": 8, + "options": 
{ + "calculate": false, + "cellGap": 1, + "cellValues": { + "unit": "none" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "range": true, + "useBackend": false, + "expr": "sum by(le, model_name, WorkerId) (increase(ray_vllm:request_prompt_tokens_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "vLLM: Request Prompt Length", + "type": "heatmap", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "gridPos": { + "x": 0, + "y": 24, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "description": "Heatmap of request generation length", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "id": 9, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": { + "unit": "none" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + 
"filterValues": { + "le": 1e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "range": true, + "useBackend": false, + "expr": "sum by(le, model_name, WorkerId) (increase(ray_vllm:request_generation_tokens_bucket{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "vLLM: Request Generation Length", + "type": "heatmap", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "gridPos": { + "x": 12, + "y": 24, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 0, + "y": 32, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(finished_reason, model_name, WorkerId) (increase(ray_vllm:request_success_total{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "interval": "", + "legendFormat": "{{finished_reason}} - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Finish Reason", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 12, + "y": 32, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": 
true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(model_name, WorkerId) (rate(ray_vllm:request_queue_time_seconds_sum{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "interval": "", + "legendFormat": "{{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Queue Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 0, + "y": 40, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(model_name, WorkerId) (rate(ray_vllm:request_decode_time_seconds_sum{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "interval": "", + "legendFormat": "Decode - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum by(model_name, WorkerId) (rate(ray_vllm:request_prefill_time_seconds_sum{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]))", + "interval": "", + "legendFormat": "Prefill - {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Requests Prefill and Decode Time", + "tooltip": { 
+ "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 12, + "y": 40, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(model_name, WorkerId) (rate(ray_vllm:request_max_num_generation_tokens_sum{model_name=~\"$vllm_model_name\", 
WorkerId=~\"$workerid\", }[$interval]))", + "interval": "", + "legendFormat": "{{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Max Generation Token in Sequence Group", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Percentage of prefix cache queries that resulted in a cache hit (GPU).", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "x": 0, + "y": 48, + "w": 12, + "h": 8 + }, + "fill": 1, + "fillGradient": 0, + "hiddenSeries": false, + "id": 28, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true 
+ }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(ray_vllm:gpu_prefix_cache_hits_total{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval]) / increase(ray_vllm:gpu_prefix_cache_queries_total{model_name=~\"$vllm_model_name\", WorkerId=~\"$workerid\", }[$interval])", + "interval": "", + "legendFormat": "GPU: {{model_name}} - {{WorkerId}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "vLLM: Prefix Cache Hit Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "rgb(230, 230, 230)", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 27, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "text": { + "titleSize": 12 + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "sum by (model_name) (delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", 
}[1w])) / sum by (model_name) (delta(ray_vllm:request_success_total{WorkerId=~\"$workerid\", }[1w]))", + "interval": "", + "legendFormat": "In: {{ model_name}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum by (model_name) (delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1w])) / sum by (model_name) (delta(ray_vllm:request_success_total{WorkerId=~\"$workerid\", }[1w]))", + "interval": "", + "legendFormat": "Out: {{ model_name}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Tokens Per Request Per Model Last 7 Days", + "type": "gauge", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Tokens", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 12, + "y": 48, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "super-light-yellow", + "value": null + }, + { + "color": "super-light-green", + "value": 50 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "(sum by (model_name) (delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", }[1d])))", + "interval": "", + "legendFormat": "Input: {{model_name}}", + "queryType": "randomWalk", + "refId": "A" + }, + { 
+ "exemplar": true, + "expr": "(sum by (model_name) (delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1d])))", + "interval": "", + "legendFormat": "Generated: {{model_name}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Tokens Last 24 Hours", + "type": "stat", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Tokens", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 0, + "y": 56, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "super-light-yellow", + "value": null + }, + { + "color": "super-light-green", + "value": 50 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", }[1h])", + "interval": "", + "legendFormat": "Input: {{model_name}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1h])", + "interval": "", + "legendFormat": "Generated: {{model_name}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Tokens Last Hour", + "type": 
"stat", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Tokens", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 12, + "y": 56, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "id": 16, + "options": { + "displayLabels": [], + "legend": { + "displayMode": "table", + "placement": "right", + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {} + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "sum by (model_name) (delta(ray_vllm:request_success_total{WorkerId=~\"$workerid\", }[1d]))", + "interval": "", + "legendFormat": "{{model_name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Distribution of Requests Per Model Last 24 Hours", + "type": "piechart", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "gridPos": { + "x": 0, + "y": 64, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "super-light-yellow", + "value": null + }, + { + "color": "super-light-green", + "value": 50 + }, + { + "color": "green", + "value": 
100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "sum by (model_name) (delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", }[1d])) / sum by (model_name) (delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1d]))", + "interval": "", + "legendFormat": "{{model_name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Ratio Input:Generated Tokens Last 24 Hours", + "type": "stat", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "none", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 12, + "y": 64, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "super-light-yellow", + "value": null + }, + { + "color": "super-light-green", + "value": 50 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "sum by (model_name) 
(delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", }[1d])) + sum by (model_name) (delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1d]))", + "interval": "", + "legendFormat": "{{model_name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Tokens Per Model Last 24 Hours", + "type": "stat", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Tokens", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 0, + "y": 72, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "super-light-yellow", + "value": null + }, + { + "color": "super-light-green", + "value": 50 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "max_over_time(sum by (model_name) (rate(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[2m]))[24h:])", + "interval": "", + "legendFormat": "{{model_name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Peak Tokens Per Second Per Model Last 24 Hours", + "type": "stat", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Tokens/s", + "label": "", + "logBase": 1, + "max": 
null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 12, + "y": 72, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "rgb(230, 230, 230)", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 23, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "text": { + "titleSize": 12 + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "sum by (model_name) (delta(ray_vllm:request_success_total{WorkerId=~\"$workerid\", }[1w]))", + "interval": "", + "legendFormat": "{{ model_name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Requests Per Model Last Week", + "type": "gauge", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 0, + "y": 80, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "rgb(230, 230, 230)", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 24, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + 
"fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "text": { + "titleSize": 12 + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "(sum by (model_name) (delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", }[1w])) +\nsum by (model_name) (delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1w]))) / sum by (model_name) (delta(ray_vllm:request_success_total{WorkerId=~\"$workerid\", }[1w]))", + "interval": "", + "legendFormat": "{{ model_name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Avg Total Tokens Per Request Last 7 Days", + "type": "gauge", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 12, + "y": 80, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "rgb(230, 230, 230)", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 25, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "text": { + "titleSize": 12 + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "(sum by (model_name) (delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", }[1w])) + sum by (model_name) (delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1w])))/ sum by (model_name) (delta(ray_vllm:request_success_total{WorkerId=~\"$workerid\", }[1w]))", + 
"interval": "", + "legendFormat": "{{ model_name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Avg Total Tokens Per Request Per Model Last 7 Days", + "type": "gauge", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "description": "", + "gridPos": { + "x": 0, + "y": 88, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "rgb(230, 230, 230)", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "id": 26, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "text": { + "titleSize": 12 + } + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "exemplar": true, + "expr": "sum by (model_name) (delta(ray_vllm:prompt_tokens_total{WorkerId=~\"$workerid\", }[1w]))", + "interval": "", + "legendFormat": "In: {{ model_name}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum by (model_name) (delta(ray_vllm:generation_tokens_total{WorkerId=~\"$workerid\", }[1w]))", + "interval": "", + "legendFormat": "Out: {{ model_name }}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Tokens Per Model Last 7 Days", + "type": "gauge", + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Tokens", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": true + } + ], + "description": "", + "gridPos": { + "x": 12, + "y": 88, + "w": 12, + "h": 8 + }, + "fill": 1, + "stack": false, + "linewidth": 2 + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.49.2" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "name": "vllm_model_name", + "label": "vLLM Model Name", + "type": "query", + "hide": 0, + "datasource": "${datasource}", + "definition": "label_values(ray_vllm:request_prompt_tokens_sum{}, model_name)", + "query": { + "query": "label_values(ray_vllm:request_prompt_tokens_sum{}, model_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "includeAll": true, + "multi": false, + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "workerid", + "label": "Worker ID", + "type": "query", + "hide": 0, + "datasource": "${datasource}", + "definition": "label_values(ray_vllm:request_prompt_tokens_sum{}, WorkerId)", + "query": { + "query": "label_values(ray_vllm:request_prompt_tokens_sum{}, WorkerId)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "includeAll": true, + "multi": false, + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "interval", + "label": "Interval", + "type": "custom", + "hide": 0, + "includeAll": false, + "multi": false, + "options": [ + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" 
+ }, + { + "selected": false, + "text": "15m", + "value": "15m" + } + ], + "current": { + "selected": true, + "text": "5m", + "value": "5m" + } + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Serve LLM Dashboard", + "uid": "rayServeLlmDashboard", + "version": 1 +} \ No newline at end of file diff --git a/config/grafana/train_grafana_dashboard.json b/config/grafana/train_grafana_dashboard.json index 29b85781d6f..c4d7eeaa245 100644 --- a/config/grafana/train_grafana_dashboard.json +++ b/config/grafana/train_grafana_dashboard.json @@ -1,42 +1,45 @@ { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, + "title": "Train Dashboard", + "uid": "rayTrainDashboard", + "version": 1, + "schemaVersion": 27, + "style": "dark", "editable": true, - "gnetId": null, - "graphTooltip": 0, - "iteration": 1667344411089, - "links": [], + "graphTooltip": 1, + "refresh": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 14, + "title": "Train Metrics", + "type": "row", + "panels": [] + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Time taken to report a checkpoint to storage.", + "description": "Current state of the train controller.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 0 + "y": 1 }, + "fill": 10, + "fillGradient": 0, "hiddenSeries": false, "id": 1, "legend": { @@ -56,7 +59,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": "connected", "options": { "alertThreshold": true }, @@ -89,14 +92,14 @@ } ], 
"spaceLength": 10, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { "exemplar": true, - "expr": "sum(ray_train_report_total_blocked_time_s{SessionName=~\"$SessionName\",ray_train_run_name=~\"$TrainRunName\",}) by (ray_train_run_name, ray_train_worker_world_rank)", + "expr": "sum(ray_train_controller_state{ray_train_run_name=~\"$TrainRunName\", ray_train_run_id=~\"$TrainRunId\", SessionName=~\"$SessionName\"}) by (ray_train_run_name, ray_train_controller_state)", "interval": "", - "legendFormat": "Run Name: {{ray_train_run_name}}, World Rank: {{ray_train_worker_world_rank}}", + "legendFormat": "Run Name: {{ray_train_run_name}}, Controller State: {{ray_train_controller_state}}", "queryType": "randomWalk", "refId": "A" } @@ -105,7 +108,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Checkpoint Report Time", + "title": "Controller State", "tooltip": { "shared": true, "sort": 0, @@ -122,7 +125,7 @@ "yaxes": [ { "$$hashKey": "object:628", - "format": "seconds", + "format": "", "label": "", "logBase": 1, "max": null, @@ -150,19 +153,19 @@ "dashLength": 10, "dashes": false, "datasource": "${datasource}", - "description": "Time taken by the controller to perform various operations.", + "description": "Time taken by the controller for worker group operations.", "fieldConfig": { "defaults": {}, "overrides": [] }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 0 + "y": 1 }, + "fill": 0, + "fillGradient": 0, "hiddenSeries": false, "id": 2, "legend": { @@ -182,7 +185,7 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "nullPointMode": null, "options": { "alertThreshold": true }, @@ -220,7 +223,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_train_worker_group_start_total_time_s{SessionName=~\"$SessionName\",ray_train_run_name=~\"$TrainRunName\",}) by (ray_train_run_name)", + "expr": "sum(ray_train_worker_group_start_total_time_s{ray_train_run_name=~\"$TrainRunName\", 
ray_train_run_id=~\"$TrainRunId\", SessionName=~\"$SessionName\"}) by (ray_train_run_name)", "interval": "", "legendFormat": "Run Name: {{ray_train_run_name}}, Worker Group Start Time", "queryType": "randomWalk", @@ -228,7 +231,7 @@ }, { "exemplar": true, - "expr": "sum(ray_train_worker_group_shutdown_total_time_s{SessionName=~\"$SessionName\",ray_train_run_name=~\"$TrainRunName\",}) by (ray_train_run_name)", + "expr": "sum(ray_train_worker_group_shutdown_total_time_s{ray_train_run_name=~\"$TrainRunName\", ray_train_run_id=~\"$TrainRunId\", SessionName=~\"$SessionName\"}) by (ray_train_run_name)", "interval": "", "legendFormat": "Run Name: {{ray_train_run_name}}, Worker Group Shutdown Time", "queryType": "randomWalk", @@ -239,7 +242,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Train Controller Operation Time", + "title": "Controller Operation Time", "tooltip": { "shared": true, "sort": 0, @@ -277,101 +280,1767 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time taken to report a checkpoint to storage.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "fill": 0, + "fillGradient": 0, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": null, + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + 
}, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_train_report_total_blocked_time_s{ray_train_run_name=~\"$TrainRunName\", ray_train_run_id=~\"$TrainRunId\", ray_train_worker_world_rank=~\"$TrainWorkerWorldRank\", ray_train_worker_actor_id=~\"$TrainWorkerActorId\", SessionName=~\"$SessionName\"}) by (ray_train_run_name, ray_train_worker_world_rank, ray_train_worker_actor_id)", + "interval": "", + "legendFormat": "Run Name: {{ray_train_run_name}}, World Rank: {{ray_train_worker_world_rank}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Checkpoint Report Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 15, + "title": "Resource Utilization", + "type": "row", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "CPU core utilization across all workers.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, 
+ "y": 18 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_cpu_utilization{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"} * ray_node_cpu_count{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"} / 100) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "CPU Usage: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_cpu_count{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": 
null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total physical memory used vs total available memory.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_mem_used{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Memory Used: {{instance}} 
({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_total{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Memory allocation details including available and shared memory.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": 
false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_mem_available{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Available Memory: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_shared_bytes{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Shared Memory: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Allocation Details", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "GPU utilization across all workers.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "fill": 10, + "fillGradient": 0, + 
"hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", GpuIndex=~\"$GpuIndex\", GpuDeviceName=~\"$GpuDeviceName\", SessionName=~\"$SessionName\"} / 100) by (instance, RayNodeType, GpuIndex, GpuDeviceName)", + "interval": "", + "legendFormat": "GPU Usage: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gpus_available{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", GpuIndex=~\"$GpuIndex\", GpuDeviceName=~\"$GpuDeviceName\", SessionName=~\"$SessionName\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "GPUs", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "GPU memory usage across all workers.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_gram_used{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", GpuIndex=~\"$GpuIndex\", GpuDeviceName=~\"$GpuDeviceName\", SessionName=~\"$SessionName\"} * 1024 * 1024) by (instance, 
RayNodeType, GpuIndex, GpuDeviceName)", + "interval": "", + "legendFormat": "Used GRAM: {{instance}} ({{RayNodeType}}), gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "(sum(ray_node_gram_available{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", GpuIndex=~\"$GpuIndex\", GpuDeviceName=~\"$GpuDeviceName\", SessionName=~\"$SessionName\"}) + sum(ray_node_gram_used{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", GpuIndex=~\"$GpuIndex\", GpuDeviceName=~\"$GpuDeviceName\", SessionName=~\"$SessionName\"})) * 1024 * 1024", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Disk space usage across all workers.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Disk Used: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_free{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) + sum(ray_node_disk_usage{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": 
false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current disk read/write throughput.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_disk_io_read_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Read Speed: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_io_write_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Write Speed: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + 
"refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current disk read/write operations per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + 
"steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_disk_read_iops{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Read IOPS: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_write_iops{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Write IOPS: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Operations", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ops/s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current network send/receive throughput.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + 
"nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_network_receive_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Receive Speed: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_network_send_speed{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Send Speed: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": 
{}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total network traffic sent/received.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "fill": 10, + "fillGradient": 0, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_network_sent{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Total Sent: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_network_received{instance=~\"$Instance\", RayNodeType=~\"$RayNodeType\", SessionName=~\"$SessionName\"}) by (instance, RayNodeType)", + "interval": "", + "legendFormat": "Total Received: {{instance}} ({{RayNodeType}})", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeRegions": [], + "timeShift": null, + "title": "Network Total Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ] } ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [ - "rayVersion:2.46.0" - ], + "time": { + "from": "now-30m", + "to": "now" + }, + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, "templating": { "list": [ { - "current": { - "selected": false - }, + "name": "datasource", + "type": "datasource", "description": "Filter queries of a specific Prometheus type.", + "datasource": null, + "query": "prometheus", + "refresh": 1, "hide": 2, "includeAll": false, "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".+", "current": { "selected": false - }, - "datasource": "${datasource}", - "definition": "label_values(ray_train_report_total_blocked_time_s{}, SessionName)", + } + }, + { + "name": "SessionName", + "type": "query", "description": "Filter queries to specific ray sessions.", - "error": null, + "datasource": "${datasource}", + "definition": "label_values(ray_train_worker_group_start_total_time_s{}, SessionName)", + "query": { + "query": "label_values(ray_train_worker_group_start_total_time_s{}, SessionName)", + "refId": 
"StandardVariableQuery" + }, + "refresh": 1, "hide": 0, "includeAll": true, - "label": null, "multi": false, - "name": "SessionName", - "options": [], + "allValue": ".*", + "sort": 2, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "TrainRunName", + "type": "query", + "description": "Filter queries to specific Ray Train run names.", + "datasource": "${datasource}", + "definition": "label_values(ray_train_worker_group_start_total_time_s{}, ray_train_run_name)", "query": { - "query": "label_values(ray_train_report_total_blocked_time_s{}, SessionName)", + "query": "label_values(ray_train_worker_group_start_total_time_s{}, ray_train_run_name)", "refId": "StandardVariableQuery" }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, + "refresh": 1, + "hide": 0, + "includeAll": true, + "multi": false, + "allValue": ".*", "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "TrainRunId", "type": "query", - "useTags": false + "description": "Filter queries to specific Ray Train run ids.", + "datasource": "${datasource}", + "definition": "label_values(ray_train_worker_group_start_total_time_s{}, ray_train_run_id)", + "query": { + "query": "label_values(ray_train_worker_group_start_total_time_s{}, ray_train_run_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "hide": 2, + "includeAll": true, + "multi": false, + "allValue": ".*", + "sort": 2, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } }, { - "allValue": ".+", + "name": "TrainWorkerWorldRank", + "type": "query", + "description": "Filter queries to specific Ray Train worker world ranks.", + "datasource": "${datasource}", + "definition": "label_values(ray_train_report_total_blocked_time_s{}, ray_train_worker_world_rank)", + "query": { + "query": 
"label_values(ray_train_report_total_blocked_time_s{}, ray_train_worker_world_rank)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "hide": 0, + "includeAll": true, + "multi": false, + "allValue": ".*", + "sort": 2, "current": { - "selected": false + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "TrainWorkerActorId", + "type": "query", + "description": "Filter queries to specific Ray Train worker actor ids.", + "datasource": "${datasource}", + "definition": "label_values(ray_train_report_total_blocked_time_s{}, ray_train_worker_actor_id)", + "query": { + "query": "label_values(ray_train_report_total_blocked_time_s{}, ray_train_worker_actor_id)", + "refId": "StandardVariableQuery" }, + "refresh": 1, + "hide": 2, + "includeAll": true, + "multi": false, + "allValue": ".*", + "sort": 2, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "Instance", + "type": "query", + "description": "Filter queries to specific node instances.", "datasource": "${datasource}", - "definition": "label_values(ray_train_report_total_blocked_time_s{}, ray_train_run_name)", - "description": "Filter queries to specific ray sessions.", - "error": null, - "hide": 0, + "definition": "label_values(ray_node_network_receive_speed{}, instance)", + "query": { + "query": "label_values(ray_node_network_receive_speed{}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "hide": 2, "includeAll": true, - "label": null, "multi": false, - "name": "TrainRunName", - "options": [], + "allValue": ".*", + "sort": 2, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "GpuIndex", + "type": "query", + "description": "Filter queries to specific GPU indices.", + "datasource": "${datasource}", + "definition": "label_values(ray_node_gpus_utilization{}, GpuIndex)", "query": { - "query": 
"label_values(ray_train_report_total_blocked_time_s{}, ray_train_run_name)", + "query": "label_values(ray_node_gpus_utilization{}, GpuIndex)", "refId": "StandardVariableQuery" }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, + "refresh": 1, + "hide": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "name": "GpuDeviceName", "type": "query", - "useTags": false + "description": "Filter queries to specific GPU device names.", + "datasource": "${datasource}", + "definition": "label_values(ray_node_gpus_utilization{}, GpuDeviceName)", + "query": { + "query": "label_values(ray_node_gpus_utilization{}, GpuDeviceName)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "hide": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "sort": 2, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + } + }, + { + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "description": "Filter queries to specific Ray node types (head or worker).", + "includeAll": true, + "multi": true, + "name": "RayNodeType", + "options": [ + { + "selected": false, + "text": "All", + "value": "$__all" + }, + { + "selected": false, + "text": "Head Node", + "value": "head" + }, + { + "selected": false, + "text": "Worker Node", + "value": "worker" + } + ], + "query": "head, worker", + "type": "custom" } ] }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Train Dashboard", - "uid": "rayTrainDashboard", - "version": 1, + "tags": [ + "rayVersion:2.49.2" + ], "rayMeta": [ "supportsGlobalFilterOverride" ]