From 9100c81eb02f20c0878b3491b7de929eb32cc582 Mon Sep 17 00:00:00 2001 From: Amber Brown Date: Thu, 2 Nov 2023 01:10:44 +1100 Subject: [PATCH] clean up some of the monitor emitting metrics for aro operator statuses and prometheus (#3235) --- pkg/monitor/cluster/arooperatorconditions.go | 19 +++- .../cluster/arooperatorconditions_test.go | 103 ++++++++++++++++++ .../cluster/clusteroperatorconditions.go | 13 ++- pkg/monitor/cluster/prometheusalerts.go | 5 + 4 files changed, 133 insertions(+), 7 deletions(-) create mode 100644 pkg/monitor/cluster/arooperatorconditions_test.go diff --git a/pkg/monitor/cluster/arooperatorconditions.go b/pkg/monitor/cluster/arooperatorconditions.go index 62c4b043d2a..598e1d62357 100644 --- a/pkg/monitor/cluster/arooperatorconditions.go +++ b/pkg/monitor/cluster/arooperatorconditions.go @@ -5,6 +5,7 @@ package cluster import ( "context" + "strings" operatorv1 "github.com/openshift/api/operator/v1" "github.com/sirupsen/logrus" @@ -13,9 +14,16 @@ import ( arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" ) +const ( + operatorConditionsMetricsTopic = "arooperator.conditions" +) + var aroOperatorConditionsExpected = map[string]operatorv1.ConditionStatus{ arov1alpha1.InternetReachableFromMaster: operatorv1.ConditionTrue, arov1alpha1.InternetReachableFromWorker: operatorv1.ConditionTrue, + arov1alpha1.ServicePrincipalValid: operatorv1.ConditionTrue, + arov1alpha1.DefaultIngressCertificate: operatorv1.ConditionTrue, + arov1alpha1.MachineValid: operatorv1.ConditionTrue, } func (mon *Monitor) emitAroOperatorConditions(ctx context.Context) error { @@ -29,14 +37,21 @@ func (mon *Monitor) emitAroOperatorConditions(ctx context.Context) error { continue } - mon.emitGauge("arooperator.conditions", 1, map[string]string{ + // filter out expected conditions (available=true, progressing=false, degraded=false) + if (strings.HasSuffix(c.Type, "ControllerAvailable") && c.Status == operatorv1.ConditionTrue) || + (strings.HasSuffix(c.Type, "ControllerProgressing") && c.Status == operatorv1.ConditionFalse) || + (strings.HasSuffix(c.Type, "ControllerDegraded") && c.Status == operatorv1.ConditionFalse) { + continue + } + + mon.emitGauge(operatorConditionsMetricsTopic, 1, map[string]string{ "status": string(c.Status), "type": c.Type, }) if mon.hourlyRun && c.Status == operatorv1.ConditionFalse { mon.log.WithFields(logrus.Fields{ - "metric": "arooperator.conditions", + "metric": operatorConditionsMetricsTopic, "status": c.Status, "type": c.Type, "message": c.Message, diff --git a/pkg/monitor/cluster/arooperatorconditions_test.go b/pkg/monitor/cluster/arooperatorconditions_test.go new file mode 100644 index 00000000000..cc903661379 --- /dev/null +++ b/pkg/monitor/cluster/arooperatorconditions_test.go @@ -0,0 +1,103 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "testing" + + "github.com/golang/mock/gomock" + operatorv1 "github.com/openshift/api/operator/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" + arofake "github.com/Azure/ARO-RP/pkg/operator/clientset/versioned/fake" + mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics" +) + +func TestEmitAROOperatorConditions(t *testing.T) { + baseCluster := &arov1alpha1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: arov1alpha1.SingletonClusterName, + }, + Spec: arov1alpha1.ClusterSpec{}, + } + + for _, tt := range []struct { + name string + conditions []operatorv1.OperatorCondition + expectMetricsDims []map[string]string + }{ + { + name: "expected values are ignored", + conditions: []operatorv1.OperatorCondition{ + { + Type: "DnsmasqClusterControllerDegraded", + Status: operatorv1.ConditionFalse, + }, + { + Type: "DnsmasqClusterControllerProgressing", + Status: operatorv1.ConditionFalse, + }, { + Type: "DnsmasqClusterControllerAvailable", + Status: operatorv1.ConditionTrue, + }, + { + Type: "MachineValid", + Status: operatorv1.ConditionTrue, + }, + }, + }, + { + name: "non-expected values are emitted", + conditions: []operatorv1.OperatorCondition{ + { + Type: "DnsmasqClusterControllerDegraded", + Status: operatorv1.ConditionTrue, + }, + { + Type: "DnsmasqClusterControllerProgressing", + Status: operatorv1.ConditionTrue, + }, { + Type: "DnsmasqClusterControllerAvailable", + Status: operatorv1.ConditionFalse, + }, + { + Type: "MachineValid", + Status: operatorv1.ConditionUnknown, + }, + }, + expectMetricsDims: []map[string]string{ + {"type": "MachineValid", "status": "Unknown"}, + {"type": "DnsmasqClusterControllerDegraded", "status": "True"}, + {"type": "DnsmasqClusterControllerProgressing", "status": "True"}, + {"type": "DnsmasqClusterControllerAvailable", "status": "False"}, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + controller := gomock.NewController(t) + defer controller.Finish() + + ctx := context.Background() + baseCluster.Status.Conditions = tt.conditions + arocli := arofake.NewSimpleClientset(baseCluster) + m := mock_metrics.NewMockEmitter(controller) + + mon := &Monitor{ + arocli: arocli, + m: m, + } + + for _, i := range tt.expectMetricsDims { + m.EXPECT().EmitGauge(operatorConditionsMetricsTopic, int64(1), i) + } + + err := mon.emitAroOperatorConditions(ctx) + if err != nil { + t.Fatal(err) + } + }) + } +} diff --git a/pkg/monitor/cluster/clusteroperatorconditions.go b/pkg/monitor/cluster/clusteroperatorconditions.go index 38b561c1de7..c9d7d2cca11 100644 --- a/pkg/monitor/cluster/clusteroperatorconditions.go +++ b/pkg/monitor/cluster/clusteroperatorconditions.go @@ -19,11 +19,14 @@ type clusterOperatorConditionsIgnoreStruct struct { // clusterOperatorConditionsIgnore contains list of failures we know we can // ignore for now var clusterOperatorConditionsIgnore = map[clusterOperatorConditionsIgnoreStruct]struct{}{ - {"insights", "Disabled", configv1.ConditionFalse}: {}, - {"insights", "Disabled", configv1.ConditionTrue}: {}, - {"openshift-controller-manager", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {}, - {"service-ca", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {}, - {"service-catalog-apiserver", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {}, + {"insights", "Disabled", configv1.ConditionFalse}: {}, + {"insights", "Disabled", configv1.ConditionTrue}: {}, + {"openshift-controller-manager", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {}, + {"service-ca", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {}, + {"service-catalog-apiserver", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {}, + {"cloud-controller-manager", "TrustedCABundleControllerControllerDegraded", configv1.ConditionFalse}: {}, + {"cloud-controller-manager", "TrustedCABundleControllerControllerAvailable", configv1.ConditionTrue}: {}, + {"cloud-controller-manager", "CloudConfigControllerDegraded", configv1.ConditionFalse}: {}, } var clusterOperatorConditionsExpected = map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{ diff --git a/pkg/monitor/cluster/prometheusalerts.go b/pkg/monitor/cluster/prometheusalerts.go index 8a4f1d26919..10d63e71e0b 100644 --- a/pkg/monitor/cluster/prometheusalerts.go +++ b/pkg/monitor/cluster/prometheusalerts.go @@ -19,6 +19,7 @@ import ( var ignoredAlerts = map[string]struct{}{ "ImagePruningDisabled": {}, + "InsightsDisabled": {}, } func (mon *Monitor) emitPrometheusAlerts(ctx context.Context) error { @@ -107,9 +108,13 @@ func (mon *Monitor) emitPrometheusAlerts(ctx context.Context) error { } func alertIsIgnored(alertName string) bool { + // Customers using deprecated/removed APIs is not useful for us to scrape if strings.HasPrefix(alertName, "UsingDeprecatedAPI") { return true } + if strings.HasPrefix(alertName, "APIRemovedInNext") { + return true + } if _, ok := ignoredAlerts[alertName]; ok { return true