apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations:
    source: https://github.com/projectsyn/component-prometheus
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/managed-by: commodore
    app.kubernetes.io/name: kube-prometheus
    app.kubernetes.io/part-of: kube-prometheus
    monitoring.syn.tools/enabled: 'true'
    prometheus: default-instance
    role: alert-rules
  name: kube-prometheus-rules
  namespace: syn-prometheus
spec:
  groups:
    - name: general.rules
      rules:
        - alert: TargetDown
          annotations:
            description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
            summary: One or more targets are unreachable.
          expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
          for: 10m
          labels:
            severity: warning
        - alert: Watchdog
          annotations:
            description: |
              This is an alert meant to ensure that the entire alerting pipeline is functional.
              This alert is always firing, therefore it should always be firing in Alertmanager
              and always fire against a receiver. There are integrations with various notification
              mechanisms that send a notification when this alert is not firing. For example the
              "DeadMansSnitch" integration in PagerDuty.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
            summary: An alert that should always be firing to certify that Alertmanager is working properly.
          expr: vector(1)
          labels:
            severity: none
        - alert: InfoInhibitor
          annotations:
            description: |
              This is an alert that is used to inhibit info alerts.
              By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
              other alerts.
              This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
              severity of 'warning' or 'critical' starts firing on the same namespace.
              This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
            summary: Info-level alert inhibition.
          expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
          labels:
            severity: none
    - name: node-network
      rules:
        - alert: NodeNetworkInterfaceFlapping
          annotations:
            description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
            summary: Network interface is often changing its status
          expr: |
            changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
          for: 2m
          labels:
            severity: warning
    - name: kube-prometheus-node-recording.rules
      rules:
        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
          record: instance:node_cpu:rate:sum
        - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
          record: instance:node_network_receive_bytes:rate:sum
        - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
          record: instance:node_network_transmit_bytes:rate:sum
        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
          record: instance:node_cpu:ratio
        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
          record: cluster:node_cpu:sum_rate5m
        - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
          record: cluster:node_cpu:ratio
    - name: kube-prometheus-general.rules
      rules:
        - expr: count without(instance, pod, node) (up == 1)
          record: count:up1
        - expr: count without(instance, pod, node) (up == 0)
          record: count:up0