Skip to content

Commit 0142d3f

Browse files
Merge pull request #2671 from simonpasquier/OCPBUGS-15430
OCPBUGS-15430: remove Kubernetes API alerting rules
2 parents 2d71c48 + ac7ef4c commit 0142d3f

File tree

4 files changed

+5
-91
lines changed

4 files changed

+5
-91
lines changed

assets/control-plane/prometheus-rule.yaml

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -358,46 +358,6 @@ spec:
358358
for: 15m
359359
labels:
360360
severity: warning
361-
- name: kubernetes-system-apiserver
362-
rules:
363-
- alert: KubeAggregatedAPIErrors
364-
annotations:
365-
description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors.
366-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAggregatedAPIErrors.md
367-
summary: Kubernetes aggregated API has reported errors.
368-
expr: |
369-
sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0
370-
for: 10m
371-
labels:
372-
severity: warning
373-
- alert: KubeAggregatedAPIDown
374-
annotations:
375-
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
376-
summary: Kubernetes aggregated API is down.
377-
expr: |
378-
(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
379-
for: 15m
380-
labels:
381-
severity: warning
382-
- alert: KubeAPIDown
383-
annotations:
384-
description: KubeAPI has disappeared from Prometheus target discovery.
385-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md
386-
summary: Target disappeared from Prometheus target discovery.
387-
expr: |
388-
absent(up{job="apiserver"} == 1)
389-
for: 15m
390-
labels:
391-
severity: critical
392-
- alert: KubeAPITerminatedRequests
393-
annotations:
394-
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
395-
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
396-
expr: |
397-
sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
398-
for: 5m
399-
labels:
400-
severity: warning
401361
- name: kubernetes-system-kubelet
402362
rules:
403363
- alert: KubeNodeNotReady

jsonnet/utils/sanitize-rules.libsonnet

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ local k8sMixinUtils = import 'github.com/kubernetes-monitoring/kubernetes-mixin/
22

33
// List of rule groups which are dropped from the final manifests.
44
local excludedRuleGroups = [
5-
'kube-apiserver-availability.rules',
65
// rules managed by openshift/cluster-kube-controller-manager-operator.
76
'kubernetes-system-controller-manager',
87
// rules managed by openshift/cluster-kube-scheduler-operator.
@@ -12,6 +11,8 @@ local excludedRuleGroups = [
1211
'kube-apiserver.rules',
1312
'kube-apiserver-burnrate.rules',
1413
'kube-apiserver-histogram.rules',
14+
'kube-apiserver-availability.rules',
15+
'kubernetes-system-apiserver',
1516
// Availability of kube-proxy depends on the selected CNO plugin hence the
1617
// rules should be managed by CNO directly.
1718
'kubernetes-system-kube-proxy',
@@ -64,15 +65,6 @@ local excludedRules = [
6465
{ alert: 'KubeMemoryQuotaOvercommit' },
6566
],
6667
},
67-
{
68-
name: 'kubernetes-system-apiserver',
69-
rules: [
70-
// KubeClientCertificateExpiration alert isn't
71-
// actionable because the cluster admin has no way to
72-
// prevent a client from using an expired certificate.
73-
{ alert: 'KubeClientCertificateExpiration' },
74-
],
75-
},
7668
{
7769
name: 'kubernetes-system-kubelet',
7870
rules: [
@@ -387,15 +379,6 @@ local patchedRules = [
387379
},
388380
],
389381
},
390-
{
391-
name: 'kubernetes-system-apiserver',
392-
rules: [
393-
{
394-
alert: 'KubeAggregatedAPIDown',
395-
'for': '15m',
396-
},
397-
],
398-
},
399382
{
400383
name: 'prometheus',
401384
rules: [
@@ -514,8 +497,6 @@ local includeRunbooks = {
514497
ClusterMonitoringOperatorDeprecatedConfig: openShiftRunbookCMO('ClusterMonitoringOperatorDeprecatedConfig.md'),
515498
ClusterOperatorDegraded: openShiftRunbookCMO('ClusterOperatorDegraded.md'),
516499
ClusterOperatorDown: openShiftRunbookCMO('ClusterOperatorDown.md'),
517-
KubeAggregatedAPIErrors: openShiftRunbookCMO('KubeAggregatedAPIErrors.md'),
518-
KubeAPIDown: openShiftRunbookCMO('KubeAPIDown.md'),
519500
KubeDeploymentReplicasMismatch: openShiftRunbookCMO('KubeDeploymentReplicasMismatch.md'),
520501
KubeJobFailed: openShiftRunbookCMO('KubeJobFailed.md'),
521502
KubeNodeNotReady: openShiftRunbookCMO('KubeNodeNotReady.md'),

pkg/manifests/manifests.go

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2474,21 +2474,6 @@ func (f *Factory) ControlPlanePrometheusRule() (*monv1.PrometheusRule, error) {
24742474

24752475
r.Namespace = f.namespace
24762476

2477-
if f.infrastructure.HostedControlPlane() {
2478-
groups := []monv1.RuleGroup{}
2479-
for _, g := range r.Spec.Groups {
2480-
switch g.Name {
2481-
case "kubernetes-system-apiserver",
2482-
"kubernetes-system-controller-manager",
2483-
"kubernetes-system-scheduler":
2484-
// skip
2485-
default:
2486-
groups = append(groups, g)
2487-
}
2488-
}
2489-
r.Spec.Groups = groups
2490-
}
2491-
24922477
return r, nil
24932478
}
24942479

pkg/manifests/manifests_test.go

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3873,25 +3873,14 @@ func TestPrometheusK8sControlPlaneRulesFiltered(t *testing.T) {
38733873
tests := []struct {
38743874
name string
38753875
infrastructure InfrastructureReader
3876-
verify func(bool)
38773876
}{
38783877
{
38793878
name: "default config",
38803879
infrastructure: defaultInfrastructureReader(),
3881-
verify: func(api bool) {
3882-
if !api {
3883-
t.Fatal("did not get all expected kubernetes control plane rules")
3884-
}
3885-
},
38863880
},
38873881
{
38883882
name: "hosted control plane",
38893883
infrastructure: &fakeInfrastructureReader{highlyAvailableInfrastructure: true, hostedControlPlane: true},
3890-
verify: func(api bool) {
3891-
if api {
3892-
t.Fatalf("kubernetes control plane rules found, none expected")
3893-
}
3894-
},
38953884
},
38963885
}
38973886

@@ -3901,14 +3890,13 @@ func TestPrometheusK8sControlPlaneRulesFiltered(t *testing.T) {
39013890
if err != nil {
39023891
t.Fatal(err)
39033892
}
3904-
apiServerRulesFound := false
3893+
39053894
for _, g := range r.Spec.Groups {
39063895
switch g.Name {
3907-
case "kubernetes-system-apiserver":
3908-
apiServerRulesFound = true
3896+
case "kubernetes-system-apiserver", "kubernetes-system-controller-manager", "kubernetes-system-scheduler":
3897+
t.Fatalf("Kubernetes control plane rule group %s found, none expected", g.Name)
39093898
}
39103899
}
3911-
tc.verify(apiServerRulesFound)
39123900
}
39133901
}
39143902

0 commit comments

Comments
 (0)