Skip to content

Commit

Permalink
metrics: add grafana and alertmanager scripts (#870)
Browse files Browse the repository at this point in the history
  • Loading branch information
WangXiangUSTC authored and july2993 committed Jan 9, 2020
1 parent d0987e4 commit ede19d5
Show file tree
Hide file tree
Showing 2 changed files with 2,213 additions and 0 deletions.
84 changes: 84 additions & 0 deletions metrics/alertmanager/binlog.rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
groups:
- name: alert.rules
rules:
- alert: binlog_pump_storage_error_count
expr: changes(binlog_pump_storage_error_count[1m]) > 0
labels:
env: ENV_LABELS_ENV
level: emergency
expr: changes(binlog_pump_storage_error_count[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: binlog pump storage write some binlogs failed

- alert: binlog_drainer_checkpoint_high_delay
expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: binlog drainer checkpoint delay more than 1 hour

- alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket
expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m]))
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: binlog pump write binlog RPC latency is too high

- alert: binlog_pump_storage_write_binlog_duration_time_bucket
expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m]))
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: binlog pump write binlog to disk is too slow

- alert: binlog_pump_storage_available_size_less_than_20G
expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024
for: 10s
labels:
env: ENV_LABELS_ENV
level: warning
expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: binlog pump storage available size less than 20G

- alert: binlog_drainer_execute_duration_time_more_than_10s
expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(binlog_drainer_txn_duration_time_bucket[1m])) > 10
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: binlog binlog drainer execute_duration_time_more_than_10s

- alert: binlog_drainer_checkpoint_tso_no_change_for_1m
expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1
labels:
env: ENV_LABELS_ENV
level: warning
expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: binlog drainer checkpoint tso no change for 1m
Loading

0 comments on commit ede19d5

Please sign in to comment.