From 7c7a9ceb520ab12cb819d30e9bdb79b9cc38c477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Thu, 4 May 2023 23:38:07 +0200 Subject: [PATCH] Implement multi-cluster alerts --- examples/prometheus-alerting-rules/alerts.yaml | 14 +++++++------- jsonnet/kube-state-metrics-mixin/alerts.libsonnet | 14 +++++++------- jsonnet/kube-state-metrics-mixin/config.libsonnet | 1 + 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/prometheus-alerting-rules/alerts.yaml b/examples/prometheus-alerting-rules/alerts.yaml index ba80354da0..7171ea4ead 100644 --- a/examples/prometheus-alerting-rules/alerts.yaml +++ b/examples/prometheus-alerting-rules/alerts.yaml @@ -6,9 +6,9 @@ groups: description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. summary: kube-state-metrics is experiencing errors in list operations. expr: | - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) > 0.01 for: 15m labels: @@ -18,9 +18,9 @@ groups: description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. summary: kube-state-metrics is experiencing errors in watch operations. expr: | - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) > 0.01 for: 15m labels: @@ -30,7 +30,7 @@ groups: description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. summary: kube-state-metrics sharding is misconfigured. expr: | - stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 for: 15m labels: severity: critical @@ -39,9 +39,9 @@ groups: description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. summary: kube-state-metrics shards are missing. expr: | - 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1 - - sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) != 0 for: 15m labels: diff --git a/jsonnet/kube-state-metrics-mixin/alerts.libsonnet b/jsonnet/kube-state-metrics-mixin/alerts.libsonnet index 396f910912..e378df18ee 100644 --- a/jsonnet/kube-state-metrics-mixin/alerts.libsonnet +++ b/jsonnet/kube-state-metrics-mixin/alerts.libsonnet @@ -7,9 +7,9 @@ { alert: 'KubeStateMetricsListErrors', expr: ||| - (sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) + (sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) by (%(clusterLabel)s) / - sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s}[5m]))) + sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s}[5m])) by (%(clusterLabel)s)) > 0.01 ||| % $._config, 'for': '15m', @@ -24,9 +24,9 @@ { alert: 'KubeStateMetricsWatchErrors', expr: ||| - (sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) + (sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) by (%(clusterLabel)s) / - sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s}[5m]))) + sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s}[5m])) by (%(clusterLabel)s)) > 0.01 ||| % $._config, 'for': '15m', @@ -42,7 +42,7 @@ alert: 'KubeStateMetricsShardingMismatch', // expr: ||| - stdvar (kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) != 0 + stdvar (kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) != 0 ||| % $._config, 'for': '15m', labels: { @@ -61,9 +61,9 @@ // A handy side effect of this computation is the result indicates what ordinals are missing. // Eg. a result of "5" decimal, which translates to binary "101", means shards #0 and #2 are not available. expr: ||| - 2^max(kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) - 1 + 2^max(kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - 1 - - sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{%(kubeStateMetricsSelector)s}) ) + sum( 2 ^ max by (%(clusterLabel)s, shard_ordinal) (kube_state_metrics_shard_ordinal{%(kubeStateMetricsSelector)s}) ) by (%(clusterLabel)s) != 0 ||| % $._config, 'for': '15m', diff --git a/jsonnet/kube-state-metrics-mixin/config.libsonnet b/jsonnet/kube-state-metrics-mixin/config.libsonnet index 8a5402c693..f4e66f12f3 100644 --- a/jsonnet/kube-state-metrics-mixin/config.libsonnet +++ b/jsonnet/kube-state-metrics-mixin/config.libsonnet @@ -2,5 +2,6 @@ _config+:: { // Select the metrics coming from the kube state metrics. kubeStateMetricsSelector: 'job="kube-state-metrics"', + clusterLabel: 'cluster', }, }