fix: kubelet job name in prometheus rules
JGiola committed Oct 9, 2024
1 parent a292afa commit 97a6f48
Showing 5 changed files with 25 additions and 29 deletions.
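Every rules file gets the same treatment: the templated $kubeletJob value (built from the mia-monitoring.kubelet.fullname helper) is dropped and the job selector is hardcoded to "kubelet", the job label the kubelet scrape targets are expected to carry. A quick sanity check against the cluster's Prometheus is a query along these lines (an illustrative sketch, not part of the commit):

    # Expect one series per kubelet endpoint and metrics_path when the job label is "kubelet"
    count by (job, metrics_path) (up{job="kubelet"})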
2 changes: 1 addition & 1 deletion charts/monitoring/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: monitoring
-version: 2.2.2
+version: 2.2.3
 kubeVersion: ">= 1.20.0-0"
 appVersion: v0.74.0
 description: "A Kubernetes monitoring stack based on Prometheus Operator"
11 changes: 5 additions & 6 deletions charts/monitoring/templates/default-rules/k8s.rules.yaml
@@ -3,7 +3,6 @@ This rules are lifted from the 'k8s.rules' group from https://raw.githubusercont
 We only divide the list in sub resources for better skimming through them
 */ -}}
 {{- $kubeStateMetricsJob := printf "%s" (include "mia-monitoring.kubeStateMetrics.name" .) }}
-{{- $kubeletJob := printf "%s" (include "mia-monitoring.kubelet.fullname" .) }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -20,31 +19,31 @@ spec:
     rules:
     - expr: |
         sum by (cluster, namespace, pod, container) (
-          irate(container_cpu_usage_seconds_total{job="{{ $kubeletJob }}", metrics_path="/metrics/cadvisor", image!=""}[5m])
+          irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
         ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
           1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
         )
       record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
     - expr: |
-        container_memory_working_set_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics/cadvisor", image!=""}
+        container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
         * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
           max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
         )
       record: node_namespace_pod_container:container_memory_working_set_bytes
     - expr: |
-        container_memory_rss{job="{{ $kubeletJob }}", metrics_path="/metrics/cadvisor", image!=""}
+        container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
         * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
           max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
         )
       record: node_namespace_pod_container:container_memory_rss
     - expr: |
-        container_memory_cache{job="{{ $kubeletJob }}", metrics_path="/metrics/cadvisor", image!=""}
+        container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
         * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
           max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
         )
       record: node_namespace_pod_container:container_memory_cache
     - expr: |
-        container_memory_swap{job="{{ $kubeletJob }}", metrics_path="/metrics/cadvisor", image!=""}
+        container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
         * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
           max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
         )
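These expressions feed the node_namespace_pod_container recording rules, so once the PrometheusRule reloads with the corrected job label the recorded series should be populated again. Per-namespace CPU usage, for example, can then be read straight from the recorded metric (illustrative query, not part of the commit):

    # Per-namespace CPU usage derived from the cadvisor-based recording rule above
    sum by (namespace) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate)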
7 changes: 3 additions & 4 deletions charts/monitoring/templates/default-rules/kubelet.rules.yaml
@@ -2,7 +2,6 @@
 This rules are lifted from the 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml
 We only divide the list in sub resources for better skimming through them
 */ -}}
-{{- $kubeletJob := printf "%s" (include "mia-monitoring.kubelet.fullname" .) }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -18,17 +17,17 @@ spec:
   - name: kubelet.rules
     rules:
     - expr: |
-        histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="{{ $kubeletJob }}"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="{{ $kubeletJob }}", metrics_path="/metrics"})
+        histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
       labels:
         quantile: "0.99"
       record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="{{ $kubeletJob }}"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="{{ $kubeletJob }}", metrics_path="/metrics"})
+        histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
       labels:
         quantile: "0.9"
       record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="{{ $kubeletJob }}"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="{{ $kubeletJob }}", metrics_path="/metrics"})
+        histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
       labels:
         quantile: "0.5"
       record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
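The three rules record the 0.99, 0.9, and 0.5 quantiles of kubelet PLEG relist latency per node. Dashboards and ad-hoc queries can read the recorded series instead of recomputing the histogram_quantile, for instance (illustrative):

    # 99th-percentile PLEG relist duration per node, from the recording rule above
    node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"}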
29 changes: 14 additions & 15 deletions charts/monitoring/templates/default-rules/kubernetes-storage.yaml
@@ -3,7 +3,6 @@ This rules are lifted from the 'kuberentes-storage' group from https://raw.githu
 We only divide the list in sub resources for better skimming through them
 */ -}}
 {{- $kubeStateMetricsJob := printf "%s" (include "mia-monitoring.kubeStateMetrics.name" .) }}
-{{- $kubeletJob := printf "%s" (include "mia-monitoring.kubelet.fullname" .) }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -25,12 +24,12 @@ spec:
         summary: PersistentVolume is filling up.
       expr: |
         (
-          kubelet_volume_stats_available_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
           /
-          kubelet_volume_stats_capacity_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
         ) < 0.03
         and
-        kubelet_volume_stats_used_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics"} > 0
+        kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
         unless on(cluster, namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
         unless on(cluster, namespace, persistentvolumeclaim)
@@ -45,14 +44,14 @@ spec:
         summary: PersistentVolume is filling up.
       expr: |
         (
-          kubelet_volume_stats_available_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
           /
-          kubelet_volume_stats_capacity_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
         ) < 0.15
         and
-        kubelet_volume_stats_used_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics"} > 0
+        kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
         and
-        predict_linear(kubelet_volume_stats_available_bytes{job="{{ $kubeletJob }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
+        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
         unless on(cluster, namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
         unless on(cluster, namespace, persistentvolumeclaim)
@@ -67,12 +66,12 @@ spec:
         summary: PersistentVolumeInodes are filling up.
       expr: |
         (
-          kubelet_volume_stats_inodes_free{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"}
           /
-          kubelet_volume_stats_inodes{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_inodes{job="kubelet", metrics_path="/metrics"}
         ) < 0.03
         and
-        kubelet_volume_stats_inodes_used{job="{{ $kubeletJob }}", metrics_path="/metrics"} > 0
+        kubelet_volume_stats_inodes_used{job="kubelet", metrics_path="/metrics"} > 0
         unless on(cluster, namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
         unless on(cluster, namespace, persistentvolumeclaim)
@@ -87,14 +86,14 @@ spec:
         summary: PersistentVolumeInodes are filling up.
       expr: |
         (
-          kubelet_volume_stats_inodes_free{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"}
           /
-          kubelet_volume_stats_inodes{job="{{ $kubeletJob }}", metrics_path="/metrics"}
+          kubelet_volume_stats_inodes{job="kubelet", metrics_path="/metrics"}
         ) < 0.15
         and
-        kubelet_volume_stats_inodes_used{job="{{ $kubeletJob }}", metrics_path="/metrics"} > 0
+        kubelet_volume_stats_inodes_used{job="kubelet", metrics_path="/metrics"} > 0
         and
-        predict_linear(kubelet_volume_stats_inodes_free{job="{{ $kubeletJob }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
+        predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
         unless on(cluster, namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
         unless on(cluster, namespace, persistentvolumeclaim)
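The predict_linear(...[6h], 4 * 24 * 3600) < 0 terms extrapolate the last six hours of free-space (or free-inode) samples four days ahead (4 * 24 * 3600 = 345600 seconds); a negative projection means the volume is on track to fill up within that window, on top of the immediate free-ratio threshold (< 0.15). The same lookahead can be run ad hoc against a single claim (illustrative query; the persistentvolumeclaim value is a placeholder):

    # Projected free bytes four days out; a negative result means the PVC is filling up
    predict_linear(
      kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics", persistentvolumeclaim="my-claim"}[6h],
      4 * 24 * 3600
    )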
5 changes: 2 additions & 3 deletions charts/monitoring/templates/default-rules/kubernetes-system-kubelet.yaml
@@ -3,7 +3,6 @@ This rules are lifted from the 'kubernetes-system-kubelet' group from https://ra
 We only divide the list in sub resources for better skimming through them
 */ -}}
 {{- $kubeStateMetricsJob := printf "%s" (include "mia-monitoring.kubeStateMetrics.name" .) }}
-{{- $kubeletJob := printf "%s" (include "mia-monitoring.kubelet.fullname" .) }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -80,7 +79,7 @@ spec:
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh/
         summary: Kubelet Pod startup latency is too high.
       expr: |
-        histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="{{ $kubeletJob }}", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="{{ $kubeletJob }}", metrics_path="/metrics"} > 60
+        histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
       for: 15m
       labels:
         severity: warning
@@ -146,7 +145,7 @@ spec:
         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown/
         summary: Target disappeared from Prometheus target discovery.
       expr: |
-        absent(up{job="{{ $kubeletJob }}", metrics_path="/metrics"} == 1)
+        absent(up{job="kubelet", metrics_path="/metrics"} == 1)
       for: 15m
       labels:
         severity: critical
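The KubeletDown alert relies on absent(), which returns a single series with value 1 only when nothing matches the inner selector, so it fires once no kubelet target has reported up == 1 for 15 minutes. A complementary ad-hoc check during an incident is to list the kubelet targets that are currently failing (illustrative):

    # Kubelet scrape targets that are currently down
    up{job="kubelet", metrics_path="/metrics"} == 0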
