Skip to content

Commit

Permalink
clean up some of the monitor emitting metrics for aro operator status…
Browse files Browse the repository at this point in the history
…es and prometheus (Azure#3235)
  • Loading branch information
hawkowl authored Nov 1, 2023
1 parent 0dd1ec9 commit 9100c81
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 7 deletions.
19 changes: 17 additions & 2 deletions pkg/monitor/cluster/arooperatorconditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package cluster

import (
"context"
"strings"

operatorv1 "github.com/openshift/api/operator/v1"
"github.com/sirupsen/logrus"
Expand All @@ -13,9 +14,16 @@ import (
arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1"
)

const (
operatorConditionsMetricsTopic = "arooperator.conditions"
)

var aroOperatorConditionsExpected = map[string]operatorv1.ConditionStatus{
arov1alpha1.InternetReachableFromMaster: operatorv1.ConditionTrue,
arov1alpha1.InternetReachableFromWorker: operatorv1.ConditionTrue,
arov1alpha1.ServicePrincipalValid: operatorv1.ConditionTrue,
arov1alpha1.DefaultIngressCertificate: operatorv1.ConditionTrue,
arov1alpha1.MachineValid: operatorv1.ConditionTrue,
}

func (mon *Monitor) emitAroOperatorConditions(ctx context.Context) error {
Expand All @@ -29,14 +37,21 @@ func (mon *Monitor) emitAroOperatorConditions(ctx context.Context) error {
continue
}

mon.emitGauge("arooperator.conditions", 1, map[string]string{
// filter out expected conditions (available=true, progressing=false, degraded=false)
if (strings.HasSuffix(c.Type, "ControllerAvailable") && c.Status == operatorv1.ConditionTrue) ||
(strings.HasSuffix(c.Type, "ControllerProgressing") && c.Status == operatorv1.ConditionFalse) ||
(strings.HasSuffix(c.Type, "ControllerDegraded") && c.Status == operatorv1.ConditionFalse) {
continue
}

mon.emitGauge(operatorConditionsMetricsTopic, 1, map[string]string{
"status": string(c.Status),
"type": c.Type,
})

if mon.hourlyRun && c.Status == operatorv1.ConditionFalse {
mon.log.WithFields(logrus.Fields{
"metric": "arooperator.conditions",
"metric": operatorConditionsMetricsTopic,
"status": c.Status,
"type": c.Type,
"message": c.Message,
Expand Down
103 changes: 103 additions & 0 deletions pkg/monitor/cluster/arooperatorconditions_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package cluster

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"testing"

"github.com/golang/mock/gomock"
operatorv1 "github.com/openshift/api/operator/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1"
arofake "github.com/Azure/ARO-RP/pkg/operator/clientset/versioned/fake"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)

func TestEmitAROOperatorConditions(t *testing.T) {
baseCluster := &arov1alpha1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: arov1alpha1.SingletonClusterName,
},
Spec: arov1alpha1.ClusterSpec{},
}

for _, tt := range []struct {
name string
conditions []operatorv1.OperatorCondition
expectMetricsDims []map[string]string
}{
{
name: "expected values are ignored",
conditions: []operatorv1.OperatorCondition{
{
Type: "DnsmasqClusterControllerDegraded",
Status: operatorv1.ConditionFalse,
},
{
Type: "DnsmasqClusterControllerProgressing",
Status: operatorv1.ConditionFalse,
}, {
Type: "DnsmasqClusterControllerAvailable",
Status: operatorv1.ConditionTrue,
},
{
Type: "MachineValid",
Status: operatorv1.ConditionTrue,
},
},
},
{
name: "non-expected values are emitted",
conditions: []operatorv1.OperatorCondition{
{
Type: "DnsmasqClusterControllerDegraded",
Status: operatorv1.ConditionTrue,
},
{
Type: "DnsmasqClusterControllerProgressing",
Status: operatorv1.ConditionTrue,
}, {
Type: "DnsmasqClusterControllerAvailable",
Status: operatorv1.ConditionFalse,
},
{
Type: "MachineValid",
Status: operatorv1.ConditionUnknown,
},
},
expectMetricsDims: []map[string]string{
{"type": "MachineValid", "status": "Unknown"},
{"type": "DnsmasqClusterControllerDegraded", "status": "True"},
{"type": "DnsmasqClusterControllerProgressing", "status": "True"},
{"type": "DnsmasqClusterControllerAvailable", "status": "False"},
},
},
} {
t.Run(tt.name, func(t *testing.T) {
controller := gomock.NewController(t)
defer controller.Finish()

ctx := context.Background()
baseCluster.Status.Conditions = tt.conditions
arocli := arofake.NewSimpleClientset(baseCluster)
m := mock_metrics.NewMockEmitter(controller)

mon := &Monitor{
arocli: arocli,
m: m,
}

for _, i := range tt.expectMetricsDims {
m.EXPECT().EmitGauge(operatorConditionsMetricsTopic, int64(1), i)
}

err := mon.emitAroOperatorConditions(ctx)
if err != nil {
t.Fatal(err)
}
})
}
}
13 changes: 8 additions & 5 deletions pkg/monitor/cluster/clusteroperatorconditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@ type clusterOperatorConditionsIgnoreStruct struct {
// clusterOperatorConditionsIgnore contains list of failures we know we can
// ignore for now
var clusterOperatorConditionsIgnore = map[clusterOperatorConditionsIgnoreStruct]struct{}{
{"insights", "Disabled", configv1.ConditionFalse}: {},
{"insights", "Disabled", configv1.ConditionTrue}: {},
{"openshift-controller-manager", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"service-ca", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"service-catalog-apiserver", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"insights", "Disabled", configv1.ConditionFalse}: {},
{"insights", "Disabled", configv1.ConditionTrue}: {},
{"openshift-controller-manager", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"service-ca", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"service-catalog-apiserver", configv1.OperatorUpgradeable, configv1.ConditionUnknown}: {},
{"cloud-controller-manager", "TrustedCABundleControllerControllerDegraded", configv1.ConditionFalse}: {},
{"cloud-controller-manager", "TrustedCABundleControllerControllerAvailable", configv1.ConditionTrue}: {},
{"cloud-controller-manager", "CloudConfigControllerDegraded", configv1.ConditionFalse}: {},
}

var clusterOperatorConditionsExpected = map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{
Expand Down
5 changes: 5 additions & 0 deletions pkg/monitor/cluster/prometheusalerts.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (

var ignoredAlerts = map[string]struct{}{
"ImagePruningDisabled": {},
"InsightsDisabled": {},
}

func (mon *Monitor) emitPrometheusAlerts(ctx context.Context) error {
Expand Down Expand Up @@ -107,9 +108,13 @@ func (mon *Monitor) emitPrometheusAlerts(ctx context.Context) error {
}

func alertIsIgnored(alertName string) bool {
// Customers using deprecated/removed APIs is not useful for us to scrape
if strings.HasPrefix(alertName, "UsingDeprecatedAPI") {
return true
}
if strings.HasPrefix(alertName, "APIRemovedInNext") {
return true
}

if _, ok := ignoredAlerts[alertName]; ok {
return true
Expand Down

0 comments on commit 9100c81

Please sign in to comment.