Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,14 +163,14 @@ parameters:
severity: critical
K8upBackupNotRunning:
annotations:
message: No K8up jobs were run in {{ $labels.exported_namespace }} within the last 24 hours. Check the operator, there might be a deadlock
expr: sum(rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0
message: No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock
expr: sum by (namespace) (rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: Figure out whether we need to limit this alert to selected namespaces. Currently, we'd probably get paged for any stuck customer backup.

for: 1m
labels:
severity: critical
K8upJobStuck:
annotations:
message: Queued K8up jobs in {{ $labels.exported_namespace }} for the last hour.
message: Queued K8up jobs in {{ $labels.namespace }} for the last hour.
expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0
for: 1h
labels:
Expand Down
13 changes: 13 additions & 0 deletions component/monitoring.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,19 @@ local service_monitor = com.namespaced(params.namespace, {
{
interval: '10s',
port: 'http',
// NOTE(sg): This is required to ensure that the backup namespace is
// preserved as label `namespace`. Without this, the scraped metrics
// have the backup namespace as `exported_namespace` and are useless
// for OCP User Workload monitoring users, because UWM only allows
// querying metrics whose `namespace` label matches the alert rule
// source namespace.
honorLabels: true,
// Copy the scrape target's own `namespace` label (the namespace K8up runs in) to the label `k8up_namespace`.
relabelings: [ {
action: 'replace',
sourceLabels: [ 'namespace' ],
targetLabel: 'k8up_namespace',
} ],
},
],
selector: {
Expand Down
19 changes: 12 additions & 7 deletions tests/golden/defaults/backup-k8up/backup-k8up/30_monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,18 @@ spec:
rules:
- alert: K8upBackupNotRunning
annotations:
message: No K8up jobs were run in {{ $labels.exported_namespace }} within
the last 24 hours. Check the operator, there might be a deadlock
expr: sum(rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge
> 0
message: No K8up jobs were run in {{ $labels.namespace }} within the last
24 hours. Check the operator, there might be a deadlock
expr: sum by (namespace) (rate(k8up_jobs_total[25h])) == 0 and on(namespace)
k8up_schedules_gauge > 0
for: 1m
labels:
severity: critical
syn: 'true'
syn_component: backup-k8up
- alert: K8upJobStuck
annotations:
message: Queued K8up jobs in {{ $labels.exported_namespace }} for the
last hour.
message: Queued K8up jobs in {{ $labels.namespace }} for the last hour.
expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge
> 0
for: 1h
Expand Down Expand Up @@ -127,8 +126,14 @@ metadata:
namespace: syn-backup-k8up
spec:
endpoints:
- interval: 10s
- honorLabels: true
interval: 10s
port: http
relabelings:
- action: replace
sourceLabels:
- namespace
targetLabel: k8up_namespace
selector:
matchLabels:
app.kubernetes.io/instance: k8up
Expand Down