@@ -2,6 +2,7 @@ apiVersion: monitoring.coreos.com/v1
22kind : PrometheusRule
33metadata :
44 labels :
5+ app.kubernetes.io/component : kubernetes
56 app.kubernetes.io/managed-by : cluster-monitoring-operator
67 app.kubernetes.io/name : kube-prometheus
78 app.kubernetes.io/part-of : openshift-monitoring
@@ -243,20 +244,33 @@ spec:
243244 rules :
244245 - alert : KubeCPUOvercommit
245246 annotations :
246- description : Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
247+ description : Cluster has overcommitted CPU resource requests for Pods by {{ printf "%.2f" $value }} CPU shares and cannot tolerate node failure.
247248 summary : Cluster has overcommitted CPU resource requests.
248249 expr : |
249- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) -
250- sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 0
251- and
252- count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3)
250+ # Non-HA clusters.
251+ (
252+ (
253+ sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
254+ -
255+ sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 0
256+ )
257+ and
258+ count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
259+ )
253260 or
254- (sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) -
255- (sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) -
256- max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0
257- and
258- (sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) -
259- max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0)
261+ # HA clusters.
262+ (
263+ sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
264+ -
265+ (
266+ # Skip clusters with only one allocatable node.
267+ (
268+ sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
269+ -
270+ max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
271+ ) > 0
272+ ) > 0
273+ )
260274 for : 10m
261275 labels :
262276 namespace : kube-system
@@ -266,17 +280,30 @@ spec:
266280 description : Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
267281 summary : Cluster has overcommitted memory resource requests.
268282 expr : |
269- (sum(namespace_memory:kube_pod_container_resource_requests:sum{}) -
270- sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 0
271- and
272- count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3)
283+ # Non-HA clusters.
284+ (
285+ (
286+ sum(namespace_memory:kube_pod_container_resource_requests:sum{})
287+ -
288+ sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 0
289+ )
290+ and
291+ count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
292+ )
273293 or
274- (sum(namespace_memory:kube_pod_container_resource_requests:sum{}) -
275- (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) -
276- max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0
277- and
278- (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) -
279- max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0)
294+ # HA clusters.
295+ (
296+ sum(namespace_memory:kube_pod_container_resource_requests:sum{})
297+ -
298+ (
299+ # Skip clusters with only one allocatable node.
300+ (
301+ sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
302+ -
303+ max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
304+ ) > 0
305+ ) > 0
306+ )
280307 for : 10m
281308 labels :
282309 namespace : kube-system
@@ -468,7 +495,18 @@ spec:
468495 description : Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
469496 summary : Kubelet Pod startup latency is too high.
470497 expr : |
471- histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
498+ histogram_quantile(0.99,
499+ sum by (cluster, instance, le) (
500+ topk by (cluster, instance, le, operation_type) (1,
501+ rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])
502+ )
503+ )
504+ )
505+ * on(cluster, instance) group_left(node)
506+ topk by (cluster, instance, node) (1,
507+ kubelet_node_name{job="kubelet", metrics_path="/metrics"}
508+ )
509+ > 60
472510 for : 15m
473511 labels :
474512 namespace : kube-system
0 commit comments