From ccad76a671d19b8aab80ab048dd87bc10c189216 Mon Sep 17 00:00:00 2001 From: Vadim Rutkovsky Date: Tue, 25 Apr 2023 11:14:56 +0200 Subject: [PATCH 1/6] dashboard: use recording rules for most metrics Add more recording rules to reduce the load on Thanos querier and Prometheus. This removes "auto" interval as it can't be cached via recording rules --- ...-operator_04_servicemonitor-apiserver.yaml | 118 ++++++++++++++++++ ...operator_05_api_performance_dashboard.yaml | 71 +++++------ 2 files changed, 151 insertions(+), 38 deletions(-) diff --git a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml index 889ebf673b..b6aa7a729e 100644 --- a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml +++ b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml @@ -138,3 +138,121 @@ spec: - record: cluster:apiserver_current_inflight_requests:sum:max_over_time:2m expr: | max_over_time(sum(apiserver_current_inflight_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver,request_kind)[2m:]) + - name: api-performance + rules: + - record: resource_verb:apiserver_request_duration_seconds_bucket:rate:1m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[1m])) by (apiserver, resource, verb, le) + - record: resource_verb:apiserver_request_duration_seconds_bucket:rate:5m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[5m])) by (apiserver, resource, verb, le) + - record: list:apiserver_request_duration_seconds_bucket:rate1m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[1m])) by 
(apiserver, le) + - record: list:apiserver_request_duration_seconds_bucket:rate5m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[5m])) by (apiserver, le) + - record: write:apiserver_request_duration_seconds_bucket:rate1m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[1m])) by (apiserver, le) + - record: write:apiserver_request_duration_seconds_bucket:rate5m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[5m])) by (apiserver, le) + - record: verb:apiserver_request_duration_seconds_bucket:rate1m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[1m])) by (apiserver, verb, le) + - record: verb:apiserver_request_duration_seconds_bucket:rate5m + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[5m])) by (apiserver, verb, le) + - record: operation:etcd_request_duration_seconds_bucket:rate1m + expr: sum(rate(etcd_request_duration_seconds_bucket[1m])) by (operation, le) + - record: operation:etcd_request_duration_seconds_bucket:rate5m + expr: sum(rate(etcd_request_duration_seconds_bucket[5m])) by (operation, le) + - record: resource_verb:apiserver_request_total:rate1m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + - record: resource_verb:apiserver_request_total:rate5m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + - record: read:apiserver_request_total:rate1m + expr: 
sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[1m])) by (apiserver) + - record: read:apiserver_request_total:rate5m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[5m])) by (apiserver) + - record: write:apiserver_request_total:rate1m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[1m])) by (apiserver) + - record: write:apiserver_request_total:rate5m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[5m])) by (apiserver) + - record: request_kind:apiserver_dropped_requests_total:rate1m + expr: sum(rate(apiserver_dropped_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, request_kind) + - record: request_kind:apiserver_dropped_requests_total:rate5m + expr: sum(rate(apiserver_dropped_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, request_kind) + - record: component_resource:apiserver_request_terminations_total:rate:1m + expr: sum(rate(apiserver_request_terminations_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, component, resource) + - record: component_resource:apiserver_request_terminations_total:rate:5m + expr: sum(rate(apiserver_request_terminations_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, component, resource) + - record: code:apiserver_request_total:rate1m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, code) + - record: code:apiserver_request_total:rate5m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, code) + - record: instance:apiserver_request_total:rate1m + expr: 
sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, instance) + - record: instance:apiserver_request_total:rate5m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, instance) + - record: resource:apiserver_longrunning_requests:sum + expr: sum(apiserver_longrunning_gauge{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, resource) + - record: instance:apiserver_longrunning_requests:sum + expr: sum(apiserver_longrunning_gauge{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, instance) + - record: instance_request_kind:apiserver_current_inflight_requests:sum + expr: sum(apiserver_current_inflight_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, instance, request_kind) + - record: instance:apiserver_response_sizes_sum:rate1m + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, instance) + - record: instance:apiserver_response_sizes_sum:rate5m + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, instance) + - record: resource_verb:apiserver_response_sizes_sum:rate1m + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + - record: resource_verb:apiserver_response_sizes_sum:rate5m + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + - record: flow_schema_priority_reason:apiserver_flowcontrol_request_queue_length_after_enqueue_bucket:rate1m + expr: sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, reason, le) + - record: 
flow_schema_priority_reason:apiserver_flowcontrol_request_queue_length_after_enqueue_bucket:rate5m + expr: sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, reason, le) + - record: flow_schema_priority_level:apiserver_flowcontrol_request_wait_duration_seconds_bucket:rate1m + expr: sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver", execute="true"}[1m])) by (apiserver, flow_schema, priority_level, le) + - record: flow_schema_priority_level:apiserver_flowcontrol_request_wait_duration_seconds_bucket:rate5m + expr: sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver", execute="true"}[5m])) by (apiserver, flow_schema, priority_level, le) + - record: flow_schema_priority_level_reason:apiserver_flowcontrol_rejected_requests_total:rate1m + expr: sum(rate(apiserver_flowcontrol_rejected_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, reason) + - record: flow_schema_priority_level_reason:apiserver_flowcontrol_rejected_requests_total:rate5m + expr: sum(rate(apiserver_flowcontrol_rejected_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, reason) + - record: flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate1m + expr: sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, le) + - record: flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate5m + expr: sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, le) + - record: 
flow_schema_priority_level:apiserver_flowcontrol_request_execution_seconds_bucket:rate1m + expr: sum without (le) (flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate1m) + - record: flow_schema_priority_level:apiserver_flowcontrol_request_execution_seconds_bucket:rate5m + expr: sum without (le) (flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate5m) + - record: flow_schema_priority_level:apiserver_flowcontrol_current_executing_requests:sum + expr: sum(apiserver_flowcontrol_current_executing_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, flow_schema, priority_level) + - record: priority_level:apiserver_flowcontrol_request_concurrency_limit:sum + expr: sum(apiserver_flowcontrol_request_concurrency_limit{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, priority_level) + - record: flow_schema_priority_level:apiserver_flowcontrol_current_inqueue_requests:sum + expr: sum(apiserver_flowcontrol_current_inqueue_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, flow_schema, priority_level) + - record: resource_verb:apiserver_selfrequest_total:rate1m + expr: sum(rate(apiserver_selfrequest_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + - record: resource_verb:apiserver_selfrequest_total:rate5m + expr: sum(rate(apiserver_selfrequest_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + - record: resource_verb:apiserver_request_aborts_total:rate1m + expr: sum(rate(apiserver_request_aborts_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + - record: resource_verb:apiserver_request_aborts_total:rate5m + expr: sum(rate(apiserver_request_aborts_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + - record: filter:apiserver_request_filter_duration_seconds_bucket:rate1m + 
expr: sum(rate(apiserver_request_filter_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, filter, le) + - record: filter:apiserver_request_filter_duration_seconds_bucket:rate5m + expr: sum(rate(apiserver_request_filter_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, filter, le) + - record: group_kind:apiserver_watch_events_total:rate1m + expr: sum(rate(apiserver_watch_events_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, group, kind) + - record: group_kind:apiserver_watch_events_total:rate5m + expr: sum(rate(apiserver_watch_events_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, group, kind) + - record: group_kind:apiserver_watch_events_sizes_sum:rate1m + expr: sum(rate(apiserver_watch_events_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, group, kind) + - record: group_kind:apiserver_watch_events_sizes_sum:rate5m + expr: sum(rate(apiserver_watch_events_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, group, kind) + - record: group_kind:apiserver_registered_watchers:sum + expr: sum(apiserver_registered_watchers{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, group, kind) + - record: cluster:apiserver_tls_handshake_errors_total:rate1m + expr: sum(rate(apiserver_tls_handshake_errors_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver) + - record: cluster:apiserver_tls_handshake_errors_total:rate5m + expr: sum(rate(apiserver_tls_handshake_errors_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver) + - record: resource:apiserver_storage_objects:max + expr: max(apiserver_storage_objects{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, resource) diff --git a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml 
b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml index 7b3bf71202..e7c457e408 100644 --- a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml +++ b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml @@ -81,7 +81,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=\"$apiserver\",subresource!=\"log\",verb!~\"WATCH|WATCHLIST|PROXY\"}[$period])) by(verb,le))", + "expr": "histogram_quantile(0.99, sum(resource_verb:apiserver_request_duration_seconds_bucket:rate:$period{apiserver=\"$apiserver\"}) by (verb, le))", "format": "time_series", "interval": "", "legendFormat": "{{verb}}", @@ -189,7 +189,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(etcd_request_duration_seconds_bucket[$period])) by(operation,le))", + "expr": "histogram_quantile(0.99, operation:etcd_request_duration_seconds_bucket:rate$period)", "format": "time_series", "interval": "", "legendFormat": "{{operation}}", @@ -297,7 +297,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(20, histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=\"$apiserver\",subresource!=\"log\",verb!~\"WATCH|WATCHLIST|PROXY\"}[$period])) by(resource,verb,le)))", + "expr": "topk(20, histogram_quantile(0.99, resource_verb:apiserver_request_duration_seconds_bucket:rate:$period{apiserver=\"$apiserver\"}))", "format": "time_series", "interval": "", "legendFormat": "{{resource}}-{{verb}}", @@ -404,7 +404,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(20, sum(rate(apiserver_request_total{apiserver=\"$apiserver\"}[$period])) by(resource,verb))", + "expr": "topk(20, resource_verb:apiserver_request_total:rate$period{apiserver=\"$apiserver\"})", "interval": "", "legendFormat": "{{resource}}-{{verb}}", "refId": "A" @@ -502,14 +502,14 @@ data: "steppedLine": false, 
"targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=\"$apiserver\",verb=~\"LIST|GET\"}[$period])) by(le))", + "expr": "histogram_quantile(0.99, list:apiserver_request_duration_seconds_bucket:rate$period{apiserver=\"$apiserver\"})", "format": "time_series", "interval": "", "legendFormat": "read", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver=\"$apiserver\",verb=~\"POST|PUT|PATCH|UPDATE|DELETE\"}[$period])) by(le))", + "expr": "histogram_quantile(0.99, write:apiserver_request_duration_seconds_bucket:rate$period{apiserver=\"$apiserver\"})", "interval": "", "legendFormat": "write", "refId": "B" @@ -615,13 +615,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_request_total{apiserver=\"$apiserver\",verb=~\"LIST|GET\"}[$period]))", + "expr": "read:apiserver_request_total:rate$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "read", "refId": "B" }, { - "expr": "sum(rate(apiserver_request_total{apiserver=\"$apiserver\",verb=~\"POST|PUT|PATCH|UPDATE|DELETE\"}[$period]))", + "expr": "write:apiserver_request_total:rate$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "write", "refId": "A" @@ -718,7 +718,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_dropped_requests_total{apiserver=\"$apiserver\"}[$period])) by (request_kind)", + "expr": "request_kind:apiserver_dropped_requests_total:rate$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{request_kind}}", "refId": "A" @@ -817,7 +817,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_request_terminations_total{apiserver=\"$apiserver\"}[$period])) by(component,resource)", + "expr": "component_resource:apiserver_request_terminations_total:rate:$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{component}}-{{resource}}", "refId": "A" @@ -914,7 
+914,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_request_total{apiserver=\"$apiserver\"}[$period])) by(code)", + "expr": "code:apiserver_request_total:rate$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{code}}", "refId": "A" @@ -1012,7 +1012,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_request_total{apiserver=\"$apiserver\"}[$period])) by(instance)", + "expr": "instance:apiserver_request_total:rate$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{instance}}", "refId": "A" @@ -1109,7 +1109,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(20, sum(apiserver_longrunning_gauge{apiserver=\"$apiserver\"}) by(resource))", + "expr": "topk(20, resource:apiserver_longrunning_requests:sum{apiserver=\"$apiserver\"})", "interval": "", "legendFormat": "{{resource}}", "refId": "A" @@ -1206,7 +1206,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(apiserver_longrunning_gauge{apiserver=\"$apiserver\"}) by(instance)", + "expr": "instance:apiserver_longrunning_requests:sum{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{instance}}", "refId": "A" @@ -1302,7 +1302,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(apiserver_current_inflight_requests{apiserver=\"$apiserver\"}) by (instance,request_kind)", + "expr": "instance_request_kind:apiserver_current_inflight_requests:sum{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{instance}}:{{request_kind}}", "refId": "A" @@ -1400,7 +1400,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_response_sizes_sum{apiserver=\"$apiserver\"}[$period])) by(instance)", + "expr": "instance:apiserver_response_sizes_sum:rate$period{apiserver=\"$apiserver\"}", "format": "time_series", "interval": "", "legendFormat": "{{instance}}", @@ -1508,7 +1508,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(10, 
sum(rate(apiserver_response_sizes_sum{apiserver=\"$apiserver\"}[$period])) by(resource,verb))", + "expr": "topk(10, resource_verb:apiserver_response_sizes_sum:rate$period{apiserver=\"$apiserver\"})", "format": "time_series", "interval": "", "legendFormat": "{{resource}}:{{verb}}", @@ -1617,7 +1617,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_flowcontrol_rejected_requests_total{apiserver=\"$apiserver\"}[$period])) by (flow_schema,priority_level,reason)", + "expr": "flow_schema_priority_level_reason:apiserver_flowcontrol_rejected_requests_total:rate$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{flow_schema}}:{{priority_level}}:{{reason}}", "refId": "A" @@ -1714,7 +1714,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{apiserver=\"$apiserver\",execute=\"true\"}[$period])) by(flow_schema, priority_level, le))", + "expr": "histogram_quantile(0.99, flow_schema_priority_level:apiserver_flowcontrol_request_wait_duration_seconds_bucket:rate$period{apiserver=\"$apiserver\"})", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -1816,7 +1816,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{apiserver=\"$apiserver\"}[$period])) by(flow_schema, priority_level, le))", + "expr": "histogram_quantile(0.99, flow_schema_priority_reason:apiserver_flowcontrol_request_queue_length_after_enqueue_bucket:rate$period{apiserver=\"$apiserver\"})", "interval": "", "legendFormat": "{{flow_schema}}:{{priority_level}}", "refId": "A" @@ -1913,7 +1913,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_flowcontrol_dispatched_requests_total{apiserver=\"$apiserver\"}[$period])) by(flow_schema,priority_level)", + "expr": 
"flow_schema_priority_level:apiserver_flowcontrol_request_execution_seconds_bucket:rate$period{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{flow_schema}}:{{priority_level}}", "refId": "A" @@ -2010,7 +2010,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{apiserver=\"$apiserver\"}[$period])) by(flow_schema, priority_level, le)) ", + "expr": "histogram_quantile(0.99, flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate$period{apiserver=\"$apiserver\"}) ", "interval": "", "legendFormat": "{{flow_schema}}:{{priority_level}}", "refId": "A" @@ -2107,7 +2107,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(apiserver_flowcontrol_current_executing_requests{apiserver=\"$apiserver\"}) by (flow_schema,priority_level)", + "expr": "flow_schema_priority_level:apiserver_flowcontrol_current_executing_requests:sum{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{flow_schema}}:{{priority_level}}", "refId": "A" @@ -2204,7 +2204,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(apiserver_flowcontrol_request_concurrency_limit{apiserver=\"$apiserver\"}) by (priority_level)", + "expr": "priority_level:apiserver_flowcontrol_request_concurrency_limit:sum{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{priority_level}}", "refId": "A" @@ -2301,7 +2301,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(apiserver_flowcontrol_current_inqueue_requests{apiserver=\"$apiserver\"}) by (flow_schema,priority_level)", + "expr": "flow_schema_priority_level:apiserver_flowcontrol_current_inqueue_requests:sum{apiserver=\"$apiserver\"}", "interval": "", "legendFormat": "{{flow_schema}}:{{priority_level}}", "refId": "A" @@ -2399,7 +2399,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_selfrequest_total{apiserver=\"$apiserver\"}[$period])) by(resource,verb)", + 
"expr": "resource_verb:apiserver_selfrequest_total:rate$period{apiserver=\"$apiserver\"}", "format": "time_series", "interval": "", "legendFormat": "{{resource}}:{{verb}}", @@ -2507,7 +2507,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_request_aborts_total{apiserver=\"$apiserver\"}[$period])) by(resource,verb)", + "expr": "resource_verb:apiserver_request_aborts_total:rate$period{apiserver=\"$apiserver\"}", "format": "time_series", "interval": "", "legendFormat": "{{resource}}:{{verb}}", @@ -2615,7 +2615,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(20, histogram_quantile(0.99, sum(rate(apiserver_request_filter_duration_seconds_bucket{apiserver=\"$apiserver\"}[$period])) by(filter,le)))", + "expr": "topk(20, histogram_quantile(0.99, filter:apiserver_request_filter_duration_seconds_bucket:rate$period{apiserver=\"$apiserver\"}))", "format": "time_series", "interval": "", "legendFormat": "{{filter}}", @@ -2723,7 +2723,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(25, sum(rate(apiserver_watch_events_total{apiserver=\"$apiserver\"}[$period])) by(group,kind))", + "expr": "topk(25, group_kind:apiserver_watch_events_total:rate$period{apiserver=\"$apiserver\"})", "format": "time_series", "interval": "", "legendFormat": "{{group}}:{{kind}}", @@ -2831,7 +2831,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(20, sum(rate(apiserver_watch_events_sizes_sum{apiserver=\"$apiserver\"}[$period])) by(group,kind))", + "expr": "topk(20, group_kind:apiserver_watch_events_sizes_sum:rate$period{apiserver=\"$apiserver\"})", "format": "time_series", "interval": "", "legendFormat": "{{group}}:{{kind}}", @@ -2939,7 +2939,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(25, sum(apiserver_registered_watchers{apiserver=\"$apiserver\"}) by(group,kind))", + "expr": "topk(25, group_kind:apiserver_registered_watchers:sum{apiserver=\"$apiserver\"})", "format": "time_series", "interval": "", "legendFormat": 
"{{group}}:{{kind}}", @@ -3047,7 +3047,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(apiserver_tls_handshake_errors_total{apiserver=\"$apiserver\"}[$period])) by()", + "expr": "cluster:apiserver_tls_handshake_errors_total:rate$period{apiserver=\"$apiserver\"}", "format": "time_series", "interval": "", "legendFormat": "", @@ -3285,11 +3285,6 @@ data: "label": "interval", "name": "period", "options": [ - { - "selected": false, - "text": "auto", - "value": "$__auto_interval_period" - }, { "selected": false, "text": "1m", @@ -3328,7 +3323,7 @@ data: ] }, "timezone": "", - "title": "API Performance", + "title": "API Performance v2", "uid": "X9gzM6XFF", "version": 2 } From 6c8f73cc32f1300901739bf3244f8ce07df4e03f Mon Sep 17 00:00:00 2001 From: Vadim Rutkovsky Date: Wed, 28 Jun 2023 11:18:47 +0200 Subject: [PATCH 2/6] manifests: remove unnecessary v2 suffix in dashboard name --- ...90_kube-apiserver-operator_05_api_performance_dashboard.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml index e7c457e408..b1ee1ca3a1 100644 --- a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml +++ b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml @@ -3323,7 +3323,7 @@ data: ] }, "timezone": "", - "title": "API Performance v2", + "title": "API Performance", "uid": "X9gzM6XFF", "version": 2 } From 205d6582ba49456b9564155634d3e89b788b619c Mon Sep 17 00:00:00 2001 From: Vadim Rutkovsky Date: Wed, 28 Jun 2023 11:26:05 +0200 Subject: [PATCH 3/6] manifests: add new PrometheusRule for recording rules Move recording rules out of `kube-apiserver` PrometheusRule as it is being removed by CVO (has "delete" annotation) --- ...iserver-operator_04_servicemonitor-apiserver.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git 
a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml index b6aa7a729e..af6485b89e 100644 --- a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml +++ b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml @@ -138,6 +138,18 @@ spec: - record: cluster:apiserver_current_inflight_requests:sum:max_over_time:2m expr: | max_over_time(sum(apiserver_current_inflight_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver,request_kind)[2m:]) +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: kube-apiserver-recording-rules + namespace: openshift-kube-apiserver + annotations: + include.release.openshift.io/self-managed-high-availability: "true" + include.release.openshift.io/single-node-developer: "true" + exclude.release.openshift.io/internal-openshift-hosted: "true" +spec: + groups: - name: api-performance rules: - record: resource_verb:apiserver_request_duration_seconds_bucket:rate:1m From e0d14fc3927df629c5ffe04dfdb6afbb7d878632 Mon Sep 17 00:00:00 2001 From: Vadim Rutkovsky Date: Wed, 28 Jun 2023 11:55:10 +0200 Subject: [PATCH 4/6] manifests: add openshift-oauth-apiserver on API Servers dashboard Update recording rules to include openshift-oauth-apiserver too --- ...-operator_04_servicemonitor-apiserver.yaml | 114 +++++++++--------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml index af6485b89e..1a4acfb204 100644 --- a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml +++ b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml @@ -153,118 +153,118 @@ spec: - name: api-performance rules: - record: resource_verb:apiserver_request_duration_seconds_bucket:rate:1m - expr: 
sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[1m])) by (apiserver, resource, verb, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[1m])) by (apiserver, resource, verb, le) - record: resource_verb:apiserver_request_duration_seconds_bucket:rate:5m - expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[5m])) by (apiserver, resource, verb, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[5m])) by (apiserver, resource, verb, le) - record: list:apiserver_request_duration_seconds_bucket:rate1m - expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[1m])) by (apiserver, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"LIST|GET"}[1m])) by (apiserver, le) - record: list:apiserver_request_duration_seconds_bucket:rate5m - expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[5m])) by (apiserver, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"LIST|GET"}[5m])) by (apiserver, le) - record: write:apiserver_request_duration_seconds_bucket:rate1m - expr: 
sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[1m])) by (apiserver, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[1m])) by (apiserver, le) - record: write:apiserver_request_duration_seconds_bucket:rate5m - expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[5m])) by (apiserver, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[5m])) by (apiserver, le) - record: verb:apiserver_request_duration_seconds_bucket:rate1m - expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[1m])) by (apiserver, verb, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[1m])) by (apiserver, verb, le) - record: verb:apiserver_request_duration_seconds_bucket:rate5m - expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[5m])) by (apiserver, verb, le) + expr: sum(rate(apiserver_request_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",subresource!~"proxy|attach|log|exec|portforward",verb!~"WATCH|WATCHLIST|PROXY"}[5m])) by (apiserver, verb, le) - record: operation:etcd_request_duration_seconds_bucket:rate1m expr: sum(rate(etcd_request_duration_seconds_bucket[1m])) by (operation, le) - record: 
operation:etcd_request_duration_seconds_bucket:rate5m expr: sum(rate(etcd_request_duration_seconds_bucket[5m])) by (operation, le) - record: resource_verb:apiserver_request_total:rate1m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, resource, verb) - record: resource_verb:apiserver_request_total:rate5m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, resource, verb) - record: read:apiserver_request_total:rate1m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[1m])) by (apiserver) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"LIST|GET"}[1m])) by (apiserver) - record: read:apiserver_request_total:rate5m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"LIST|GET"}[5m])) by (apiserver) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"LIST|GET"}[5m])) by (apiserver) - record: write:apiserver_request_total:rate1m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[1m])) by (apiserver) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[1m])) by (apiserver) - record: write:apiserver_request_total:rate5m - expr: 
sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[5m])) by (apiserver) - - record: request_kind:apiserver_dropped_requests_total:rate1m - expr: sum(rate(apiserver_dropped_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, request_kind) - - record: request_kind:apiserver_dropped_requests_total:rate5m - expr: sum(rate(apiserver_dropped_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, request_kind) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[5m])) by (apiserver) + - record: group_resource:apiserver_request_total:rate1m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",code="429"}[1m])) by (apiserver, group, resource) + - record: group_resource:apiserver_request_total:rate5m + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver",code="429"}[5m])) by (apiserver, group, resource) - record: component_resource:apiserver_request_terminations_total:rate:1m - expr: sum(rate(apiserver_request_terminations_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, component, resource) + expr: sum(rate(apiserver_request_terminations_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, component, resource) - record: component_resource:apiserver_request_terminations_total:rate:5m - expr: sum(rate(apiserver_request_terminations_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, component, resource) + expr: sum(rate(apiserver_request_terminations_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, component, resource) - record: code:apiserver_request_total:rate1m - expr: 
sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, code) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, code) - record: code:apiserver_request_total:rate5m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, code) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, code) - record: instance:apiserver_request_total:rate1m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, instance) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, instance) - record: instance:apiserver_request_total:rate5m - expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, instance) + expr: sum(rate(apiserver_request_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, instance) - record: resource:apiserver_longrunning_requests:sum - expr: sum(apiserver_longrunning_gauge{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, resource) + expr: sum(apiserver_longrunning_gauge{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, resource) - record: instance:apiserver_longrunning_requests:sum - expr: sum(apiserver_longrunning_gauge{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, instance) + expr: sum(apiserver_longrunning_gauge{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, instance) - record: instance_request_kind:apiserver_current_inflight_requests:sum - expr: sum(apiserver_current_inflight_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, 
instance, request_kind) + expr: sum(apiserver_current_inflight_requests{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, instance, request_kind) - record: instance:apiserver_response_sizes_sum:rate1m - expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, instance) + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, instance) - record: instance:apiserver_response_sizes_sum:rate5m - expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, instance) + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, instance) - record: resource_verb:apiserver_response_sizes_sum:rate1m - expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, resource, verb) - record: resource_verb:apiserver_response_sizes_sum:rate5m - expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_response_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, resource, verb) - record: flow_schema_priority_reason:apiserver_flowcontrol_request_queue_length_after_enqueue_bucket:rate1m - expr: sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, reason, le) + expr: 
sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, reason, le) - record: flow_schema_priority_reason:apiserver_flowcontrol_request_queue_length_after_enqueue_bucket:rate5m - expr: sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, reason, le) + expr: sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, reason, le) - record: flow_schema_priority_level:apiserver_flowcontrol_request_wait_duration_seconds_bucket:rate1m - expr: sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver", execute="true"}[1m])) by (apiserver, flow_schema, priority_level, le) + expr: sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver", execute="true"}[1m])) by (apiserver, flow_schema, priority_level, le) - record: flow_schema_priority_level:apiserver_flowcontrol_request_wait_duration_seconds_bucket:rate5m - expr: sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver", execute="true"}[5m])) by (apiserver, flow_schema, priority_level, le) + expr: sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver", execute="true"}[5m])) by (apiserver, flow_schema, priority_level, le) - record: flow_schema_priority_level_reason:apiserver_flowcontrol_rejected_requests_total:rate1m - expr: sum(rate(apiserver_flowcontrol_rejected_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, 
flow_schema, priority_level, reason) + expr: sum(rate(apiserver_flowcontrol_rejected_requests_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, reason) - record: flow_schema_priority_level_reason:apiserver_flowcontrol_rejected_requests_total:rate5m - expr: sum(rate(apiserver_flowcontrol_rejected_requests_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, reason) + expr: sum(rate(apiserver_flowcontrol_rejected_requests_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, reason) - record: flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate1m - expr: sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, le) + expr: sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, flow_schema, priority_level, le) - record: flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate5m - expr: sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, le) + expr: sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, flow_schema, priority_level, le) - record: flow_schema_priority_level:apiserver_flowcontrol_request_execution_seconds_bucket:rate1m expr: sum without (le) (flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate1m) - record: flow_schema_priority_level:apiserver_flowcontrol_request_execution_seconds_bucket:rate5m expr: sum without (le) 
(flow_schema_priority_level_le:apiserver_flowcontrol_request_execution_seconds_bucket:rate5m) - record: flow_schema_priority_level:apiserver_flowcontrol_current_executing_requests:sum - expr: sum(apiserver_flowcontrol_current_executing_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, flow_schema, priority_level) + expr: sum(apiserver_flowcontrol_current_executing_requests{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, flow_schema, priority_level) - record: priority_level:apiserver_flowcontrol_request_concurrency_limit:sum - expr: sum(apiserver_flowcontrol_request_concurrency_limit{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, priority_level) + expr: sum(apiserver_flowcontrol_request_concurrency_limit{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, priority_level) - record: flow_schema_priority_level:apiserver_flowcontrol_current_inqueue_requests:sum - expr: sum(apiserver_flowcontrol_current_inqueue_requests{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, flow_schema, priority_level) + expr: sum(apiserver_flowcontrol_current_inqueue_requests{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, flow_schema, priority_level) - record: resource_verb:apiserver_selfrequest_total:rate1m - expr: sum(rate(apiserver_selfrequest_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_selfrequest_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, resource, verb) - record: resource_verb:apiserver_selfrequest_total:rate5m - expr: sum(rate(apiserver_selfrequest_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_selfrequest_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by 
(apiserver, resource, verb) - record: resource_verb:apiserver_request_aborts_total:rate1m - expr: sum(rate(apiserver_request_aborts_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_request_aborts_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, resource, verb) - record: resource_verb:apiserver_request_aborts_total:rate5m - expr: sum(rate(apiserver_request_aborts_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, resource, verb) + expr: sum(rate(apiserver_request_aborts_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, resource, verb) - record: filter:apiserver_request_filter_duration_seconds_bucket:rate1m - expr: sum(rate(apiserver_request_filter_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, filter, le) + expr: sum(rate(apiserver_request_filter_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, filter, le) - record: filter:apiserver_request_filter_duration_seconds_bucket:rate5m - expr: sum(rate(apiserver_request_filter_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, filter, le) + expr: sum(rate(apiserver_request_filter_duration_seconds_bucket{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, filter, le) - record: group_kind:apiserver_watch_events_total:rate1m - expr: sum(rate(apiserver_watch_events_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, group, kind) + expr: sum(rate(apiserver_watch_events_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, group, kind) - record: group_kind:apiserver_watch_events_total:rate5m - expr: 
sum(rate(apiserver_watch_events_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, group, kind) + expr: sum(rate(apiserver_watch_events_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, group, kind) - record: group_kind:apiserver_watch_events_sizes_sum:rate1m - expr: sum(rate(apiserver_watch_events_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver, group, kind) + expr: sum(rate(apiserver_watch_events_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver, group, kind) - record: group_kind:apiserver_watch_events_sizes_sum:rate5m - expr: sum(rate(apiserver_watch_events_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver, group, kind) - - record: group_kind:apiserver_registered_watchers:sum - expr: sum(apiserver_registered_watchers{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, group, kind) + expr: sum(rate(apiserver_watch_events_sizes_sum{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver, group, kind) + - record: group_resource:apiserver_longrunning_requests:sum + expr: sum(apiserver_longrunning_gauge{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, group, resource) - record: cluster:apiserver_tls_handshake_errors_total:rate1m - expr: sum(rate(apiserver_tls_handshake_errors_total{apiserver=~"openshift-apiserver|kube-apiserver"}[1m])) by (apiserver) + expr: sum(rate(apiserver_tls_handshake_errors_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[1m])) by (apiserver) - record: cluster:apiserver_tls_handshake_errors_total:rate5m - expr: sum(rate(apiserver_tls_handshake_errors_total{apiserver=~"openshift-apiserver|kube-apiserver"}[5m])) by (apiserver) + expr: 
sum(rate(apiserver_tls_handshake_errors_total{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}[5m])) by (apiserver) - record: resource:apiserver_storage_objects:max - expr: max(apiserver_storage_objects{apiserver=~"openshift-apiserver|kube-apiserver"}) by (apiserver, resource) + expr: max(apiserver_storage_objects{apiserver=~"openshift-apiserver|kube-apiserver|openshift-oauth-apiserver"}) by (apiserver, resource) From f72ddb1454239abdb426b7f2bc4d775951ab21b4 Mon Sep 17 00:00:00 2001 From: Vadim Rutkovsky Date: Tue, 5 Sep 2023 11:06:10 +0200 Subject: [PATCH 5/6] manifests: don't include recording rules when console is disabled In order to avoid additional load on Prometheus the recording rules for kube-apiserver dashboard are not included when Console capability is not enabled. These are not used anywhere else, so it should not affect any other components. --- ...90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml | 3 +-- ...0_kube-apiserver-operator_05_api_performance_dashboard.yaml | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml index 1a4acfb204..f4112733e6 100644 --- a/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml +++ b/manifests/0000_90_kube-apiserver-operator_04_servicemonitor-apiserver.yaml @@ -146,8 +146,7 @@ metadata: namespace: openshift-kube-apiserver annotations: include.release.openshift.io/self-managed-high-availability: "true" - include.release.openshift.io/single-node-developer: "true" - exclude.release.openshift.io/internal-openshift-hosted: "true" + capability.openshift.io/name: Console spec: groups: - name: api-performance diff --git a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml index b1ee1ca3a1..0beb539935
100644 --- a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml +++ b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml @@ -5,6 +5,7 @@ metadata: namespace: openshift-config-managed annotations: include.release.openshift.io/self-managed-high-availability: 'true' + capability.openshift.io/name: Console labels: console.openshift.io/dashboard: 'true' data: From 27bd18d6491ba4821db22ed623c2a452446ef7aa Mon Sep 17 00:00:00 2001 From: Vadim Rutkovsky Date: Tue, 10 Oct 2023 14:30:03 +0200 Subject: [PATCH 6/6] manifests: rename API performance dashboard In a previous PR this manifest was labelled as "available only when Console capability enabled". This causes CVO to force enable Console capability when upgrading from a baseline 4.13 cluster - as this manifest is present. In order to avoid this, the manifest needs to be renamed, so that CVO would treat it as a new one (since its applicability has changed) --- ...rver-operator_05_api_performance_dashboard.yaml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml index 0beb539935..47a2cccd93 100644 --- a/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml +++ b/manifests/0000_90_kube-apiserver-operator_05_api_performance_dashboard.yaml @@ -1,7 +1,8 @@ +--- apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-api-performance + name: grafana-dashboard-apiserver-performance namespace: openshift-config-managed annotations: include.release.openshift.io/self-managed-high-availability: 'true' @@ -3328,3 +3329,14 @@ data: "uid": "X9gzM6XFF", "version": 2 } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-api-performance + namespace: openshift-config-managed + annotations: + include.release.openshift.io/self-managed-high-availability: "true" +
release.openshift.io/delete: "true" + labels: + console.openshift.io/dashboard: 'true'