-
Notifications
You must be signed in to change notification settings - Fork 27
/
alert-rules_CPU_RAM_HDD_NODESTATUS.yml
42 lines (37 loc) · 1.51 KB
/
alert-rules_CPU_RAM_HDD_NODESTATUS.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Setup: create the rules directory, then save this file into it:
#   mkdir /etc/prometheus/rules
#   vim /etc/prometheus/rules/alert-rules.yml
# Prometheus alerting rules: exporter availability plus host disk, memory and
# CPU thresholds. Required nesting is groups -> rules -> one mapping per alert
# (alert / expr / for / labels / annotations); the indentation below encodes it.
groups:
  - name: alert-rules
    rules:
      # A scrape target has reported up == 0 continuously for 5 minutes.
      - alert: ExporterDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down for more than 5 minutes.'
          summary: 'Exporter down (instance {{ $labels.instance }})'

      # Less than 15% of a filesystem is available. The `and ON (...)` clause
      # keeps only series whose matching node_filesystem_readonly sample is 0,
      # i.e. read-only filesystems are excluded.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 15 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of disk space (instance {{ $labels.instance }})'
          # Double-quoted so that \n is rendered as a real newline.
          description: "Disk is almost full (< 15% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # MemAvailable is below 15% of MemTotal for 2 minutes.
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 15
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of memory (instance {{ $labels.instance }})'
          description: "Node memory is filling up (< 15% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Per-instance CPU usage, computed as 100 minus the average idle rate
      # over a 2m window, is above 85%.
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 85
        # 0m means no pending period: the alert fires on the first evaluation
        # where the expression is true.
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host high CPU load (instance {{ $labels.instance }})'
          description: "CPU load is > 85%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"