Skip to content

Commit

Permalink
Merge pull request #34 from nuriel77/add/alerts
Browse files Browse the repository at this point in the history
Added alertmanager and prometheus alerting rules
  • Loading branch information
nuriel77 authored Dec 21, 2017
2 parents b3810c6 + 5a70158 commit b127407
Show file tree
Hide file tree
Showing 14 changed files with 1,316 additions and 39 deletions.
11 changes: 11 additions & 0 deletions group_vars/all/monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,14 @@ nodesource_version: 8
# IOTA Prometheus exporter settings.
# NOTE(review): basedir is a bare directory name; presumably resolved relative
# to an install root defined elsewhere — confirm against the role that uses it.
iota_prom_exporter_basedir: 'iota-prom-exporter'
# Port and address for the exporter; 127.0.0.1 is the loopback address.
iota_prom_exporter_port: 9311
iota_prom_exporter_bind: '127.0.0.1'

# Alertmanager settings.
alertmanager_basedir: '/opt/prometheus/alertmanager'
# Quoted so the version always stays a string, even if a future value would
# otherwise look like a float (e.g. 0.13).
alertmanager_version: '0.12.0'
# Alertmanager port and bind address (127.0.0.1 is the loopback address).
alertmanager_port: 9093
# NOTE(review): presumably the port nginx exposes in front of Alertmanager —
# confirm against the nginx configuration.
alertmanager_nginx_port: 9993
alertmanager_bind: '127.0.0.1'
# Sender and recipient used for alert e-mails.
alertmanager_email_from: 'alertmanager'
alertmanager_email_to: 'root@localhost'
alertmanager_loglevel: 'info'
# SMTP server used for mail delivery.
smtp_host: 'localhost'
smtp_port: 25
224 changes: 224 additions & 0 deletions roles/monitoring/files/alert.rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Prometheus alerting rules: host checks (up, CPU, load, disk, swap, memory)
# followed by IOTA node checks (milestones, neighbors).
#
# NOTE(review): InstanceDown uses lowercase `description`/`summary` annotation
# keys while every other rule uses uppercase `DESCRIPTION`/`SUMMARY`. The keys
# are left unchanged here because notification templates may reference either
# form — confirm and unify.
groups:
  - name: alert.rules
    rules:

      # Fires when a target has reported up == 0 for 1 minute.
      # Since we're configuring prometheus on this host, I doubt this alert
      # will ever trigger.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
            for more than 1 minute.'
          summary: Instance {{ $labels.instance }} down

      # CPU Usage Alerts
      # Usage is computed as 100 minus the per-instance average idle rate
      # (irate over a 5m range) expressed as a percentage.
      # NOTE(review): the selector only matches series carrying the label
      # name="node-exporter" — confirm the scrape config sets that label,
      # otherwise these two rules can never fire.
      - alert: NodeCPUUsageWarning
        expr: (100 - (avg(irate(node_cpu{mode="idle",name="node-exporter"}[5m])) BY (instance)
          * 100)) > 75
        for: 3m
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: CPU usage is above 75% (current value is:
            {{ $value }})'
          SUMMARY: '{{$labels.instance}}: High CPU usage detected'

      - alert: NodeCPUUsageCritical
        expr: (100 - (avg(irate(node_cpu{mode="idle",name="node-exporter"}[5m])) BY (instance)
          * 100)) > 90
        for: 3m
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: CPU usage is above 90% (current value is:
            {{ $value }})'
          SUMMARY: '{{$labels.instance}}: High CPU usage detected'

      # Load avg Alerts
      # node_load5 divided by the number of node_cpu{mode="system"} series
      # (presumably one per CPU core — confirm), i.e. load per core.
      - alert: NodeLoadAverageWarning
        expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 4)
        for: 5m
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: LA is high'
          SUMMARY: '{{$labels.instance}}: High LA detected'

      - alert: NodeLoadAverageCritical
        expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 7)
        for: 5m
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: LA is very high'
          SUMMARY: '{{$labels.instance}}: Very high LA detected'

      # Low Disk Space Alerts
      # Used space as a percentage of filesystem size.
      # NOTE(review): only the filesystem whose mountpoint label is exactly
      # "/root-disk" is matched — confirm the root filesystem is exported
      # under that mountpoint on these hosts (it is commonly "/").
      - alert: NodeLowRootDiskWarning
        expr: ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"})
          / node_filesystem_size{mountpoint="/root-disk"} * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Root disk usage is above 80% (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Low root disk space'

      - alert: NodeLowRootDiskCritical
        expr: ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"})
          / node_filesystem_size{mountpoint="/root-disk"} * 100) > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Root disk usage is above 95% (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Low root disk space'

      # Swap Usage Alerts
      # Used swap (SwapTotal - SwapFree) as a percentage of SwapTotal.
      - alert: NodeSwapUsageWarning
        expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
          * 100) > 80
        for: 3m
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Swap usage is above 80% (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Swap usage detected'

      - alert: NodeSwapUsageCritical
        expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
          * 100) > 95
        for: 3m
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Swap usage is above 95% (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Swap usage detected'

      # Memory Usage Alerts
      # (MemTotal - MemFree - Cached) as a percentage of MemTotal.
      - alert: NodeMemoryUsageWarning
        expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
          * 100)) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Memory usage is above 80% (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: High memory usage detected'

      - alert: NodeMemoryUsageCritical
        expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
          * 100)) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Memory usage is above 95% (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: High memory usage detected'

      # Latest Milestone Alerts
      # NOTE(review): 243000 is a hard-coded milestone index; per the
      # annotations it is the value reported after a reset. Confirm it still
      # matches the value the node reports in that state.
      - alert: LatestMileStoneWarning
        expr: iota_node_info_latest_milestone == 243000
        for: 2h
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Latest Milestone Reset (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Latest Milestone reset and stuck'

      - alert: LatestMileStoneCritical
        expr: iota_node_info_latest_milestone == 243000
        for: 4h
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Latest Milestone Reset (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Latest Milestone reset and stuck'

      # Gap between the latest milestone and the latest subtangle milestone,
      # sustained for 1 hour.
      - alert: LatestSubtangleMileStoneBehindWarning
        expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone)
          > 5
        for: 1h
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Latest Subtangle Milestone lagging'

      - alert: LatestSubtangleMileStoneBehindCritical
        expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone)
          > 10
        for: 1h
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Latest Subtangle Milestone lagging'

      # Neighbor Alerts
      - alert: TotalNeighborsFewWarning
        expr: iota_node_info_total_neighbors < 2
        for: 1m
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Too few neighbors (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Too few neighbors'

      # NOTE(review): the two rules below are worded as "active neighbors" but
      # evaluate the total-neighbors metric — confirm which metric is intended.
      - alert: TotalNeighborsActiveWarning
        expr: iota_node_info_total_neighbors > 11
        for: 1h
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Too many active neighbors (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Too many active neighbors'

      - alert: TotalNeighborsActiveCritical
        expr: iota_node_info_total_neighbors > 15
        for: 1h
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Too many active neighbors (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Too many active neighbors'

      # Inactive neighbors = total neighbors minus active neighbors.
      - alert: InactiveNeighborsWarning
        expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors)
          > 1
        for: 1h
        labels:
          severity: warning
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Inactive Neighbors (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Inactive Neighbors'

      - alert: InactiveNeighborsCritical
        expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors)
          > 3
        for: 1h
        labels:
          severity: critical
        annotations:
          DESCRIPTION: '{{$labels.instance}}: Inactive Neighbors (current value
            is: {{ $value }})'
          SUMMARY: '{{$labels.instance}}: Inactive Neighbors'

Loading

0 comments on commit b127407

Please sign in to comment.