Skip to content

Commit d800670

Browse files
committed
Fix discrepancies concerning rule generations
Signed-off-by: Kemal Akkoyun <[email protected]>
1 parent 95d42c1 commit d800670

13 files changed

+306
-186
lines changed

README.md

+27-12
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
# observatorium/configuration
2+
23
This project holds all the configuration files for our internal Observatorium deployments.
34

4-
## Installing jsonnet dependencies.
5+
## Installing jsonnet dependencies
6+
57
To install all dependencies:
6-
```
8+
9+
```console
710
jb install
811
#installs pinned versions from `jsonnetfile.lock.json` file.
912
```
1013

1114
To update a dependency:
12-
```
15+
16+
```console
1317
jb update https://github.com/thanos-io/kube-thanos
1418
#updates `kube-thanos` to master and sets the new hash in `jsonnetfile.lock.json`.
1519

@@ -18,35 +22,46 @@ jb update
1822
```
1923

2024
## Grafana dashboards
25+
2126
All dashboards are generated in `manifests/production/grafana` with:
22-
```
27+
28+
```console
2329
make grafana
2430
```
2531

2632
**Staging**: deploys on every commit to master.
2733

2834
**Production**: update the commit hash ref in [`https://gitlab.cee.redhat.com/service/app-interface/blob/master/data/services/observability/cicd/saas/saas-grafana.yaml`](https://gitlab.cee.redhat.com/service/app-interface/blob/master/data/services/observability/cicd/saas/saas-grafana.yaml)
2935

36+
## Prometheus Rules
37+
38+
Use `synchronize.sh` to create an MR against `app-interface` to update dashboards.
39+
3040
## Components - Deployments, ServiceMonitors, ConfigMaps etc...
41+
3142
All components manifests are generated in `manifests/production/` with:
32-
```
43+
44+
```console
3345
make manifests
3446
```
47+
3548
**Staging**: deploys on every commit to master.
3649

3750
**Production**: update the commit hash ref in [`https://gitlab.cee.redhat.com/service/app-interface/blob/master/data/services/telemeter/cicd/saas.yaml`](https://gitlab.cee.redhat.com/service/app-interface/blob/master/data/services/telemeter/cicd/saas.yaml)
3851

39-
4052
## CI Jobs
41-
Jobs runs are posted in:<br/>
42-
`#sd-app-sre-info` for grafana dashboards <br/>
43-
and <br/>
44-
`#team-monitoring-info` for everyrhing else.
53+
54+
Job runs are posted in:
55+
56+
`#sd-app-sre-info` for grafana dashboards
57+
58+
and
59+
60+
`#team-monitoring-info` for everything else.
4561

4662
## Troubleshooting
63+
4764
1. Enable port forwarding for a user - [example](
4865
https://gitlab.cee.redhat.com/service/app-interface/-/blob/ee91aac666ee39a273332c59ad4bdf7e0f50eeba/data/teams/telemeter/users/fbranczy.yml#L14)
4966
2. Add a pod name to the allowed list for port forwarding - [example](
5067
https://gitlab.cee.redhat.com/service/app-interface/-/blob/ee91aac666ee39a273332c59ad4bdf7e0f50eeba/resources/app-sre/telemeter-production/observatorium-allow-port-forward.role.yaml#L10)
51-
52-

prometheusrules.jsonnet

+52-22
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,17 @@ local absent(name, job) = {
2929

3030
// Add dashboards and runbook annotations.
3131
// Overwrite severity to medium and high.
32-
local appSREOverwrites(prometheusAlerts, namespace) = {
32+
local appSREOverwrites(namespace) = {
33+
local environment = std.split(namespace, '-')[1],
34+
local dashboardDatasource = function(environment) {
35+
datasource:
36+
if
37+
environment == 'stage' then 'app-sre-stage-01-prometheus'
38+
else if
39+
environment == 'production' then 'telemeter-prod-01-prometheus'
40+
else error 'no datasource for environment %s' % environment,
41+
},
42+
3343
local dashboardID = function(name) {
3444
id:
3545
if
@@ -57,34 +67,54 @@ local appSREOverwrites(prometheusAlerts, namespace) = {
5767
else error 'no dashboard id for group %s' % name,
5868
},
5969

60-
local setSeverity = function(label, alertName) {
61-
label: if std.startsWith(alertName, 'Loki') then 'info'
62-
else if label == 'critical' then
63-
// For thanos page only for `ThanosNoRuleEvaluations`.
64-
if std.startsWith(alertName, 'Thanos') then
65-
if alertName != 'ThanosNoRuleEvaluations' then 'high' else label
66-
else label
67-
else if label == 'warning' then 'medium'
68-
else 'high',
70+
local setSeverity = function(label, environment, alertName) {
71+
label:
72+
if
73+
std.startsWith(alertName, 'Loki') then 'info'
74+
else if
75+
label == 'critical' then
76+
if
77+
environment == 'stage' then 'high'
78+
else if
79+
// For thanos, page only for `ThanosNoRuleEvaluations`.
80+
std.startsWith(alertName, 'Thanos') && alertName != 'ThanosNoRuleEvaluations' then 'high'
81+
else label
82+
else if
83+
label == 'warning' then 'medium'
84+
else 'high',
6985
},
7086

7187
groups: [
7288
g {
7389
rules: [
7490
if std.objectHas(r, 'alert') then
7591
r {
76-
annotations+: {
77-
runbook: 'https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#%s' % std.asciiLower(r.alert),
78-
dashboard: if std.startsWith(g.name, 'telemeter') then 'https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADJ/telemeter?orgId=1&refresh=1m&var-datasource=telemeter-prod-01-prometheus'
79-
else 'https://grafana.app-sre.devshift.net/d/%s/%s?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=%s&var-job=All&var-pod=All&var-interval=5m' % [
80-
dashboardID(g.name).id,
81-
g.name,
82-
namespace,
83-
],
84-
},
92+
annotations+:
93+
{
94+
// Message is a required field. Upstream thanos-mixin doesn't have it.
95+
message: if std.objectHasAll(self, 'description') then self.description else r.annotations.message,
96+
} +
97+
if std.startsWith(g.name, 'telemeter') then
98+
{
99+
runbook: 'https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/telemeter.md#%s' % std.asciiLower(r.alert),
100+
dashboard: 'https://grafana.app-sre.devshift.net/d/%s/telemeter?orgId=1&refresh=1m&var-datasource=%s' % [
101+
dashboardID(g.name).id,
102+
dashboardDatasource(environment).datasource,
103+
],
104+
}
105+
else
106+
{
107+
runbook: 'https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#%s' % std.asciiLower(r.alert),
108+
dashboard: 'https://grafana.app-sre.devshift.net/d/%s/%s?orgId=1&refresh=10s&var-datasource=%s&var-namespace=%s&var-job=All&var-pod=All&var-interval=5m' % [
109+
dashboardID(g.name).id,
110+
g.name,
111+
dashboardDatasource(environment).datasource,
112+
namespace,
113+
],
114+
},
85115
labels+: {
86116
service: 'telemeter',
87-
severity: setSeverity(r.labels.severity, r.alert).label,
117+
severity: setSeverity(r.labels.severity, environment, r.alert).label,
88118
},
89119
} else r
90120
for r in super.rules
@@ -152,7 +182,7 @@ local renderAlerts(name, namespace, mixin) = {
152182
},
153183

154184
spec: mixin {
155-
prometheusAlerts+:: appSREOverwrites(super.prometheusAlerts, namespace),
185+
prometheusAlerts+:: appSREOverwrites(namespace),
156186
}.prometheusAlerts,
157187
};
158188

@@ -204,7 +234,7 @@ local renderAlerts(name, namespace, mixin) = {
204234
},
205235

206236
'telemeter-slos-stage.prometheusrules': renderAlerts('telemeter-slos-stage', 'telemeter-stage', telemeter),
207-
'telemeter-slos-production.prometheusrules': renderAlerts('telemeter-slos-production', 'telemeter-prodcution', telemeter),
237+
'telemeter-slos-production.prometheusrules': renderAlerts('telemeter-slos-production', 'telemeter-production', telemeter),
208238
}
209239

210240
{

resources/observability/prometheusrules/observatorium-api-production.prometheusrules.yaml

+12-12
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ spec:
6969
record: http_requests_total:burnrate6h
7070
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
7171
annotations:
72-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
72+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
7373
message: 'High error budget burn for handler=receive,job=observatorium-observatorium-api (current value: {{ $value }})'
7474
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
7575
expr: |
@@ -84,7 +84,7 @@ spec:
8484
severity: critical
8585
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
8686
annotations:
87-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
87+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
8888
message: 'High error budget burn for handler=receive,job=observatorium-observatorium-api (current value: {{ $value }})'
8989
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
9090
expr: |
@@ -99,7 +99,7 @@ spec:
9999
severity: critical
100100
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
101101
annotations:
102-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
102+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
103103
message: 'High error budget burn for handler=receive,job=observatorium-observatorium-api (current value: {{ $value }})'
104104
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
105105
expr: |
@@ -114,7 +114,7 @@ spec:
114114
severity: medium
115115
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
116116
annotations:
117-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
117+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-write-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
118118
message: 'High error budget burn for handler=receive,job=observatorium-observatorium-api (current value: {{ $value }})'
119119
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
120120
expr: |
@@ -187,7 +187,7 @@ spec:
187187
record: http_requests_total:burnrate6h
188188
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
189189
annotations:
190-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
190+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
191191
message: 'High error budget burn for handler=~query|query_legacy,job=observatorium-observatorium-api (current value: {{ $value }})'
192192
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
193193
expr: |
@@ -202,7 +202,7 @@ spec:
202202
severity: critical
203203
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
204204
annotations:
205-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
205+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
206206
message: 'High error budget burn for handler=~query|query_legacy,job=observatorium-observatorium-api (current value: {{ $value }})'
207207
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
208208
expr: |
@@ -217,7 +217,7 @@ spec:
217217
severity: critical
218218
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
219219
annotations:
220-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
220+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
221221
message: 'High error budget burn for handler=~query|query_legacy,job=observatorium-observatorium-api (current value: {{ $value }})'
222222
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
223223
expr: |
@@ -232,7 +232,7 @@ spec:
232232
severity: medium
233233
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
234234
annotations:
235-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
235+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
236236
message: 'High error budget burn for handler=~query|query_legacy,job=observatorium-observatorium-api (current value: {{ $value }})'
237237
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
238238
expr: |
@@ -305,7 +305,7 @@ spec:
305305
record: http_requests_total:burnrate6h
306306
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
307307
annotations:
308-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
308+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
309309
message: 'High error budget burn for handler=query_range,job=observatorium-observatorium-api (current value: {{ $value }})'
310310
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
311311
expr: |
@@ -320,7 +320,7 @@ spec:
320320
severity: critical
321321
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
322322
annotations:
323-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
323+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
324324
message: 'High error budget burn for handler=query_range,job=observatorium-observatorium-api (current value: {{ $value }})'
325325
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
326326
expr: |
@@ -335,7 +335,7 @@ spec:
335335
severity: critical
336336
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
337337
annotations:
338-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
338+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
339339
message: 'High error budget burn for handler=query_range,job=observatorium-observatorium-api (current value: {{ $value }})'
340340
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
341341
expr: |
@@ -350,7 +350,7 @@ spec:
350350
severity: medium
351351
- alert: ObservatoriumAPIErrorsSLOBudgetBurn
352352
annotations:
353-
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=app-sre-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
353+
dashboard: https://grafana.app-sre.devshift.net/d/Tg-mH0rizaSJDKSADX/observatorium-api-query-range-errors.slo.rules?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace=telemeter-production&var-job=All&var-pod=All&var-interval=5m
354354
message: 'High error budget burn for handler=query_range,job=observatorium-observatorium-api (current value: {{ $value }})'
355355
runbook: https://gitlab.cee.redhat.com/observatorium/configuration/blob/master/docs/sop/observatorium.md#observatoriumapierrorsslobudgetburn
356356
expr: |

0 commit comments

Comments
 (0)