Skip to content

Commit ac7ef4c

Browse files
committed
OCPBUGS-15430: remove Kubernetes API alerting rules
This commit removes the Kubernetes API alerting rules from the `kubernetes-monitoring-rules` PrometheusRule resource. From now on, these rules will be managed by the Kubernetes API server operator.

Signed-off-by: Simon Pasquier <[email protected]>
1 parent 54a344d commit ac7ef4c

File tree

4 files changed

+5
-91
lines changed

4 files changed

+5
-91
lines changed

assets/control-plane/prometheus-rule.yaml

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -331,46 +331,6 @@ spec:
331331
for: 15m
332332
labels:
333333
severity: warning
334-
- name: kubernetes-system-apiserver
335-
rules:
336-
- alert: KubeAggregatedAPIErrors
337-
annotations:
338-
description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors.
339-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAggregatedAPIErrors.md
340-
summary: Kubernetes aggregated API has reported errors.
341-
expr: |
342-
sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0
343-
for: 10m
344-
labels:
345-
severity: warning
346-
- alert: KubeAggregatedAPIDown
347-
annotations:
348-
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
349-
summary: Kubernetes aggregated API is down.
350-
expr: |
351-
(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
352-
for: 15m
353-
labels:
354-
severity: warning
355-
- alert: KubeAPIDown
356-
annotations:
357-
description: KubeAPI has disappeared from Prometheus target discovery.
358-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md
359-
summary: Target disappeared from Prometheus target discovery.
360-
expr: |
361-
absent(up{job="apiserver"} == 1)
362-
for: 15m
363-
labels:
364-
severity: critical
365-
- alert: KubeAPITerminatedRequests
366-
annotations:
367-
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
368-
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
369-
expr: |
370-
sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
371-
for: 5m
372-
labels:
373-
severity: warning
374334
- name: kubernetes-system-kubelet
375335
rules:
376336
- alert: KubeNodeNotReady

jsonnet/utils/sanitize-rules.libsonnet

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ local k8sMixinUtils = import 'github.com/kubernetes-monitoring/kubernetes-mixin/
22

33
// List of rule groups which are dropped from the final manifests.
44
local excludedRuleGroups = [
5-
'kube-apiserver-availability.rules',
65
// rules managed by openshift/cluster-kube-controller-manager-operator.
76
'kubernetes-system-controller-manager',
87
// rules managed by openshift/cluster-kube-scheduler-operator.
@@ -12,6 +11,8 @@ local excludedRuleGroups = [
1211
'kube-apiserver.rules',
1312
'kube-apiserver-burnrate.rules',
1413
'kube-apiserver-histogram.rules',
14+
'kube-apiserver-availability.rules',
15+
'kubernetes-system-apiserver',
1516
// Availability of kube-proxy depends on the selected CNO plugin hence the
1617
// rules should be managed by CNO directly.
1718
'kubernetes-system-kube-proxy',
@@ -64,15 +65,6 @@ local excludedRules = [
6465
{ alert: 'KubeMemoryQuotaOvercommit' },
6566
],
6667
},
67-
{
68-
name: 'kubernetes-system-apiserver',
69-
rules: [
70-
// KubeClientCertificateExpiration alert isn't
71-
// actionable because the cluster admin has no way to
72-
// prevent a client from using an expired certificate.
73-
{ alert: 'KubeClientCertificateExpiration' },
74-
],
75-
},
7668
{
7769
name: 'kubernetes-system-kubelet',
7870
rules: [
@@ -383,15 +375,6 @@ local patchedRules = [
383375
},
384376
],
385377
},
386-
{
387-
name: 'kubernetes-system-apiserver',
388-
rules: [
389-
{
390-
alert: 'KubeAggregatedAPIDown',
391-
'for': '15m',
392-
},
393-
],
394-
},
395378
{
396379
name: 'prometheus',
397380
rules: [
@@ -510,8 +493,6 @@ local includeRunbooks = {
510493
ClusterMonitoringOperatorDeprecatedConfig: openShiftRunbookCMO('ClusterMonitoringOperatorDeprecatedConfig.md'),
511494
ClusterOperatorDegraded: openShiftRunbookCMO('ClusterOperatorDegraded.md'),
512495
ClusterOperatorDown: openShiftRunbookCMO('ClusterOperatorDown.md'),
513-
KubeAggregatedAPIErrors: openShiftRunbookCMO('KubeAggregatedAPIErrors.md'),
514-
KubeAPIDown: openShiftRunbookCMO('KubeAPIDown.md'),
515496
KubeDeploymentReplicasMismatch: openShiftRunbookCMO('KubeDeploymentReplicasMismatch.md'),
516497
KubeJobFailed: openShiftRunbookCMO('KubeJobFailed.md'),
517498
KubeNodeNotReady: openShiftRunbookCMO('KubeNodeNotReady.md'),

pkg/manifests/manifests.go

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2474,21 +2474,6 @@ func (f *Factory) ControlPlanePrometheusRule() (*monv1.PrometheusRule, error) {
24742474

24752475
r.Namespace = f.namespace
24762476

2477-
if f.infrastructure.HostedControlPlane() {
2478-
groups := []monv1.RuleGroup{}
2479-
for _, g := range r.Spec.Groups {
2480-
switch g.Name {
2481-
case "kubernetes-system-apiserver",
2482-
"kubernetes-system-controller-manager",
2483-
"kubernetes-system-scheduler":
2484-
// skip
2485-
default:
2486-
groups = append(groups, g)
2487-
}
2488-
}
2489-
r.Spec.Groups = groups
2490-
}
2491-
24922477
return r, nil
24932478
}
24942479

pkg/manifests/manifests_test.go

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3873,25 +3873,14 @@ func TestPrometheusK8sControlPlaneRulesFiltered(t *testing.T) {
38733873
tests := []struct {
38743874
name string
38753875
infrastructure InfrastructureReader
3876-
verify func(bool)
38773876
}{
38783877
{
38793878
name: "default config",
38803879
infrastructure: defaultInfrastructureReader(),
3881-
verify: func(api bool) {
3882-
if !api {
3883-
t.Fatal("did not get all expected kubernetes control plane rules")
3884-
}
3885-
},
38863880
},
38873881
{
38883882
name: "hosted control plane",
38893883
infrastructure: &fakeInfrastructureReader{highlyAvailableInfrastructure: true, hostedControlPlane: true},
3890-
verify: func(api bool) {
3891-
if api {
3892-
t.Fatalf("kubernetes control plane rules found, none expected")
3893-
}
3894-
},
38953884
},
38963885
}
38973886

@@ -3901,14 +3890,13 @@ func TestPrometheusK8sControlPlaneRulesFiltered(t *testing.T) {
39013890
if err != nil {
39023891
t.Fatal(err)
39033892
}
3904-
apiServerRulesFound := false
3893+
39053894
for _, g := range r.Spec.Groups {
39063895
switch g.Name {
3907-
case "kubernetes-system-apiserver":
3908-
apiServerRulesFound = true
3896+
case "kubernetes-system-apiserver", "kubernetes-system-controller-manager", "kubernetes-system-scheduler":
3897+
t.Fatalf("Kubernetes control plane rule group %s found, none expected", g.Name)
39093898
}
39103899
}
3911-
tc.verify(apiServerRulesFound)
39123900
}
39133901
}
39143902

0 commit comments

Comments
 (0)