From 9d1aa64f0e2df2661a9afb9fa9988adeaf5c3732 Mon Sep 17 00:00:00 2001 From: Steven Fairchild Date: Wed, 12 Jun 2024 15:11:40 -0400 Subject: [PATCH 1/2] Emit cluster features enabled/disabled during cluster provisioning Add workload identity or cluster service principal metrics emission Add emit features Emit features concerning newly installed clusters. Add emitMetrics to all provisioning states Add logging statements for metrics; this allows the metrics being emitted to also be used in Kusto dashboards. Constant values are used for all metric names in dimensions. If an error is encountered while gathering metrics, that metric is omitted. Unit tests provide 100% coverage for emitMetrics. Rename emitMetrics to emitProvisioningMetrics and rename emitInstallMetrics to emitMetrics. emitMetrics can be used for all provisioning states, not just the creating state, so the name was changed to reflect this. --- pkg/backend/metrics.go | 216 ++++++++++++++++++++++++++ pkg/backend/metrics_const.go | 75 +++++++++ pkg/backend/metrics_provisioning.go | 28 ++++ pkg/backend/metrics_test.go | 224 +++++++++++++++++++++++++++ pkg/backend/openshiftcluster.go | 51 +++--- pkg/backend/openshiftcluster_test.go | 127 +++++++++++++++ 6 files changed, 688 insertions(+), 33 deletions(-) create mode 100644 pkg/backend/metrics.go create mode 100644 pkg/backend/metrics_const.go create mode 100644 pkg/backend/metrics_provisioning.go create mode 100644 pkg/backend/metrics_test.go diff --git a/pkg/backend/metrics.go b/pkg/backend/metrics.go new file mode 100644 index 00000000000..e2222c1adfd --- /dev/null +++ b/pkg/backend/metrics.go @@ -0,0 +1,216 @@ +package backend + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. 
+ +import ( + "fmt" + "strconv" + + "github.com/sirupsen/logrus" + + "github.com/Azure/ARO-RP/pkg/api" + "github.com/Azure/ARO-RP/pkg/util/dns" + utillog "github.com/Azure/ARO-RP/pkg/util/log" +) + +func (ocb *openShiftClusterBackend) emitMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) map[string]string { + dimensions := map[string]string{} + + ocb.gatherOperationMetrics(operationType, provisioningState, backendErr, dimensions) + ocb.gatherCorrelationID(doc, dimensions) + ocb.gatherMiscMetrics(doc, dimensions) + ocb.gatherAuthMetrics(doc, dimensions) + ocb.gatherNetworkMetrics(doc, dimensions) + ocb.gatherNodeMetrics(doc, dimensions) + + ocb.logMetricDimensions(log, operationType, dimensions) + ocb.m.EmitGauge(ocb.getMetricName(operationType), metricValue, dimensions) + + // dimensions is returned here for testing purposes + return dimensions +} + +func (ocb *openShiftClusterBackend) getMetricName(operationType api.ProvisioningState) string { + return fmt.Sprintf("%s.%s", metricPackage, operationType) +} + +func (ocb *openShiftClusterBackend) getResultType(backendErr error) utillog.ResultType { + var resultType utillog.ResultType + err, ok := backendErr.(*api.CloudError) + if ok { + resultType = utillog.MapStatusCodeToResultType(err.StatusCode) + } + return resultType +} + +func (ocb *openShiftClusterBackend) logMetricDimensions(log *logrus.Entry, operationType api.ProvisioningState, dimensions map[string]string) { + for metric, value := range dimensions { + log.Info(fmt.Sprintf("%s.%s: %s = %s", metricPackage, operationType, metric, value)) + } +} + +func (m *openShiftClusterBackend) gatherCorrelationID(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { + if doc.CorrelationData != nil { + dimensions[correlationDataIdMetricName] = doc.CorrelationData.CorrelationID + dimensions[correlationDataClientRequestIdMetricName] = doc.CorrelationData.ClientRequestID + 
dimensions[correlationDataRequestIdMetricName] = doc.CorrelationData.RequestID + } else { + dimensions[correlationDataIdMetricName] = empty + dimensions[correlationDataClientRequestIdMetricName] = empty + dimensions[correlationDataRequestIdMetricName] = empty + } +} + +func (ocb *openShiftClusterBackend) gatherOperationMetrics(operationType, provisioningState api.ProvisioningState, backendErr error, dimensions map[string]string) { + dimensions[operationTypeMetricName] = operationType.String() + dimensions[provisioningStateMetricName] = provisioningState.String() + dimensions[resultTypeMetricName] = string(ocb.getResultType(backendErr)) +} + +func (ocb *openShiftClusterBackend) gatherMiscMetrics(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { + dimensions[subscriptionIdMetricName] = ocb.env.SubscriptionID() + dimensions[resourceIdMetricName] = doc.ResourceID + if doc.OpenShiftCluster != nil { + dimensions[clusterNameMetricName] = doc.OpenShiftCluster.Name + dimensions[locationMetricName] = doc.OpenShiftCluster.Location + dimensions[ocpVersionMetricName] = doc.OpenShiftCluster.Properties.ClusterProfile.Version + dimensions[rpVersionMetricName] = doc.OpenShiftCluster.Properties.ProvisionedBy + dimensions[resourecGroupMetricName] = doc.OpenShiftCluster.Properties.ClusterProfile.ResourceGroupID + + for flag, feature := range doc.OpenShiftCluster.Properties.OperatorFlags { + dimensions[fmt.Sprintf("%s-%s", operatorFlagsMetricName, flag)] = feature + } + } + + dimensions[asyncOperationsIdMetricName] = doc.AsyncOperationID + + if doc.OpenShiftCluster.Properties.WorkerProfiles != nil { + dimensions[workerProfileCountMetricName] = strconv.FormatInt(int64(len(doc.OpenShiftCluster.Properties.WorkerProfiles)), 10) + } + + if doc.OpenShiftCluster.Tags != nil { + dimensions[tagsMetricName] = enabled + } else { + dimensions[tagsMetricName] = disabled + } +} + +func (ocb *openShiftClusterBackend) gatherNodeMetrics(doc *api.OpenShiftClusterDocument, dimensions 
map[string]string) { + if doc.OpenShiftCluster.Properties.MasterProfile.DiskEncryptionSetID != "" { + dimensions[masterProfileEncryptionSetIdMetricName] = enabled + } else { + dimensions[masterProfileEncryptionSetIdMetricName] = disabled + } + + mp := doc.OpenShiftCluster.Properties.MasterProfile + dimensions[masterProfileVmSizeMetricName] = string(mp.VMSize) + + if doc.OpenShiftCluster.Properties.MasterProfile.EncryptionAtHost == api.EncryptionAtHostEnabled { + dimensions[masterEncryptionAtHostMetricName] = string(api.EncryptionAtHostEnabled) + } else if doc.OpenShiftCluster.Properties.MasterProfile.EncryptionAtHost == api.EncryptionAtHostDisabled { + dimensions[masterEncryptionAtHostMetricName] = string(api.EncryptionAtHostDisabled) + } else { + dimensions[masterEncryptionAtHostMetricName] = unknown + } + + if len(doc.OpenShiftCluster.Properties.WorkerProfiles) > 0 { + wp := doc.OpenShiftCluster.Properties.WorkerProfiles[0] + dimensions[workerVmSizeMetricName] = string(wp.VMSize) + dimensions[workerVmDiskSizeMetricName] = strconv.FormatInt(int64(wp.DiskSizeGB), 10) + + dimensions[workerVmSizeMetricName] = string(wp.VMSize) + dimensions[workerVmDiskSizeMetricName] = strconv.FormatInt(int64(wp.DiskSizeGB), 10) + + if wp.EncryptionAtHost == api.EncryptionAtHostEnabled { + dimensions[workerEncryptionAtHostMetricName] = string(api.EncryptionAtHostEnabled) + } else if wp.EncryptionAtHost == api.EncryptionAtHostDisabled { + dimensions[workerEncryptionAtHostMetricName] = string(api.EncryptionAtHostDisabled) + } else { + dimensions[workerEncryptionAtHostMetricName] = unknown + } + } + + if doc.OpenShiftCluster.Properties.ClusterProfile.FipsValidatedModules == api.FipsValidatedModulesEnabled { + dimensions[fipsMetricName] = string(api.FipsValidatedModulesEnabled) + } else if doc.OpenShiftCluster.Properties.ClusterProfile.FipsValidatedModules == api.FipsValidatedModulesDisabled { + dimensions[fipsMetricName] = string(api.FipsValidatedModulesDisabled) + } else { + 
dimensions[fipsMetricName] = unknown + } +} + +func (ocb *openShiftClusterBackend) gatherAuthMetrics(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { + if doc.OpenShiftCluster.Properties.PlatformWorkloadIdentityProfile != nil { + dimensions[clusterIdentityMetricName] = clusterIdentityManagedIdMetricName + } else if doc.OpenShiftCluster.Properties.ServicePrincipalProfile != nil { + dimensions[clusterIdentityMetricName] = clusterIdentityServicePrincipalMetricName + } else { + dimensions[clusterIdentityMetricName] = unknown + } + + if doc.OpenShiftCluster.Properties.ClusterProfile.PullSecret != "" { + dimensions[pullSecretMetricName] = enabled + } else { + dimensions[pullSecretMetricName] = disabled + } +} + +func (ocb *openShiftClusterBackend) gatherNetworkMetrics(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { + for _, p := range doc.OpenShiftCluster.Properties.IngressProfiles { + if p.Visibility == api.VisibilityPrivate { + dimensions[ingressProfileMetricName] = fmt.Sprintf("%s.%s", string(api.VisibilityPrivate), p.Name) + } else if p.Visibility == api.VisibilityPublic { + dimensions[ingressProfileMetricName] = fmt.Sprintf("%s.%s", string(api.VisibilityPublic), p.Name) + } else { + dimensions[ingressProfileMetricName] = unknown + } + } + + if doc.OpenShiftCluster.Properties.NetworkProfile.OutboundType == api.OutboundTypeUserDefinedRouting { + dimensions[networkProfileOutboundTypeMetricName] = string(api.OutboundTypeUserDefinedRouting) + } else if doc.OpenShiftCluster.Properties.NetworkProfile.OutboundType == api.OutboundTypeLoadbalancer { + dimensions[networkProfileOutboundTypeMetricName] = string(api.OutboundTypeLoadbalancer) + } else { + dimensions[networkProfileOutboundTypeMetricName] = unknown + } + + if doc.OpenShiftCluster.Properties.NetworkProfile.PodCIDR != podCidrDefaultValue { + dimensions[podCidrMetricName] = custom + } else { + dimensions[podCidrMetricName] = defaultSet + } + + if 
doc.OpenShiftCluster.Properties.NetworkProfile.ServiceCIDR != serviceCidrDefaultValue { + dimensions[serviceCidrMetricName] = custom + } else { + dimensions[serviceCidrMetricName] = defaultSet + } + + domain, err := dns.ManagedDomain(ocb.env, doc.OpenShiftCluster.Properties.ClusterProfile.Domain) + if err == nil { + if domain != "" { + dimensions[clusterProfileDomainMetricName] = custom + } else { + dimensions[clusterProfileDomainMetricName] = managed + } + } + + if doc.OpenShiftCluster.Properties.NetworkProfile.LoadBalancerProfile.ManagedOutboundIPs != nil { + dimensions[networkProfileManagedOutboundIpsMetricName] = strconv.FormatInt(int64(doc.OpenShiftCluster.Properties.NetworkProfile.LoadBalancerProfile.ManagedOutboundIPs.Count), 10) + } + + if doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGEnabled { + dimensions[networkProfilePreConfiguredNSGMetricName] = string(api.PreconfiguredNSGEnabled) + } else if doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGDisabled { + dimensions[networkProfilePreConfiguredNSGMetricName] = string(api.PreconfiguredNSGDisabled) + } else { + dimensions[networkProfilePreConfiguredNSGMetricName] = unknown + } + + if doc.OpenShiftCluster.Properties.FeatureProfile.GatewayEnabled { + dimensions[featureProfileGatewayEnabledMetricName] = enabled + } else { + dimensions[featureProfileGatewayEnabledMetricName] = disabled + } +} diff --git a/pkg/backend/metrics_const.go b/pkg/backend/metrics_const.go new file mode 100644 index 00000000000..a3c294bac9c --- /dev/null +++ b/pkg/backend/metrics_const.go @@ -0,0 +1,75 @@ +package backend + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. 
+ +const ( + metricPackage = "backend.openshiftcluster" + metricValue int64 = 1 + enabled = "Enabled" + disabled = "Disabled" + custom = "Custom" + defaultSet = "Default" + unknown = "unknown" + empty = "empty" + managed = "managed" + + encryptionAtHostMetricName = "encryptionathost" + diskSizeMetricName = "disksize" + vmSizeMetricName = "vmsize" + countMetricName = "count" + + workerProfileMetricName = "workprofile" + workerVmSizeMetricName = workerProfileMetricName + "." + vmSizeMetricName + workerVmDiskSizeMetricName = workerProfileMetricName + "." + diskSizeMetricName + workerEncryptionAtHostMetricName = workerProfileMetricName + "." + encryptionAtHostMetricName + workerProfileCountMetricName = workerProfileMetricName + "." + countMetricName + + masterProfileMetricName = "masterprofile" + masterEncryptionAtHostMetricName = masterProfileMetricName + "." + encryptionAtHostMetricName + masterProfileEncryptionSetIdMetricName = masterProfileMetricName + "." + "diskencryptionsetid" + masterProfileVmSizeMetricName = masterProfileMetricName + "." + vmSizeMetricName + + fipsMetricName = "fips" + clusterIdentityMetricName = "clusteridentity" + clusterIdentityManagedIdMetricName = managed + "id" + clusterIdentityServicePrincipalMetricName = "serviceprincipal" + pullSecretMetricName = "pullsecret" + + ingressProfileMetricName = "ingressprofile" + networkProfileMetricName = "networkprofile" + networkProfileOutboundTypeMetricName = networkProfileMetricName + "." + "outboundtype" + networkProfileManagedOutboundIpsMetricName = networkProfileMetricName + "." + "managedoutboundips" + networkProfilePreConfiguredNSGMetricName = networkProfileMetricName + "." + "preconfigurednsg" + podCidrMetricName = networkProfileMetricName + "." + "podcidr" + serviceCidrMetricName = networkProfileMetricName + "." 
+ "servicecidr" + podCidrDefaultValue = "10.128.0.0/14" + serviceCidrDefaultValue = "172.30.0.0/16" + + featureProfileMetricName = "featureprofile" + featureProfileGatewayEnabledMetricName = featureProfileMetricName + "." + "gatewayenabled" + + clusterProfileMetricName = "clusterprofile" + clusterProfileDomainMetricName = clusterProfileMetricName + "." + "domain" + + tagsMetricName = "tags" + operatorFlagsMetricName = "operatorflags" + + asyncOperationsIdMetricName = "async_operationsid" + rpVersionMetricName = "rpversion" + ocpVersionMetricName = "ocpversion" + clusterNameMetricName = "clustername" + resourecGroupMetricName = "resourcegroup" + locationMetricName = "location" + resourceIdMetricName = "resourceid" + subscriptionIdMetricName = "subscriptionid" + + correlationDataMetricName = "correlationdata" + correlationDataRequestIdMetricName = correlationDataMetricName + "." + "requestid" + correlationDataClientRequestIdMetricName = correlationDataMetricName + "." + "client_requestid" + correlationDataIdMetricName = correlationDataMetricName + "." + "correlationid" + + operationTypeMetricName = "operationtype" + provisioningStateMetricName = "provisioningstate" + resultTypeMetricName = "resulttype" +) diff --git a/pkg/backend/metrics_provisioning.go b/pkg/backend/metrics_provisioning.go new file mode 100644 index 00000000000..ce7bad07039 --- /dev/null +++ b/pkg/backend/metrics_provisioning.go @@ -0,0 +1,28 @@ +package backend + +import ( + "time" + + "github.com/Azure/ARO-RP/pkg/api" +) + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. 
+ +func (ocb *openShiftClusterBackend) emitProvisioningMetrics(doc *api.OpenShiftClusterDocument, provisioningState api.ProvisioningState) { + if doc.CorrelationData == nil { + return + } + + duration := time.Since(doc.CorrelationData.RequestTime).Milliseconds() + + ocb.m.EmitGauge("backend.openshiftcluster.duration", duration, map[string]string{ + "oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState), + "newProvisioningState": string(provisioningState), + }) + + ocb.m.EmitGauge("backend.openshiftcluster.count", 1, map[string]string{ + "oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState), + "newProvisioningState": string(provisioningState), + }) +} diff --git a/pkg/backend/metrics_test.go b/pkg/backend/metrics_test.go new file mode 100644 index 00000000000..913dce459e4 --- /dev/null +++ b/pkg/backend/metrics_test.go @@ -0,0 +1,224 @@ +package backend + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. 
+ +import ( + "fmt" + "reflect" + "testing" + + "github.com/golang/mock/gomock" + "github.com/sirupsen/logrus" + + "github.com/Azure/ARO-RP/pkg/api" + mock_env "github.com/Azure/ARO-RP/pkg/util/mocks/env" + mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics" +) + +func TestEmitMetrics(t *testing.T) { + controller := gomock.NewController(t) + emitter := mock_metrics.NewMockEmitter(controller) + env := mock_env.NewMockInterface(controller) + env.EXPECT().SubscriptionID().AnyTimes() + env.EXPECT().Domain().AnyTimes() + + log := logrus.NewEntry(&logrus.Logger{}) + + b := &backend{ + baseLog: log, + env: env, + m: emitter, + } + ocb := newOpenShiftClusterBackend(b) + + mockSubID := "00000000-0000-0000-0000-000000000000" + resourceGroup := "resourceGroup" + resourceID := fmt.Sprintf("/subscriptions/%s/resourcegroups/%s/providers/Microsoft.RedHatOpenShift/openShiftClusters/resourceName", mockSubID, resourceGroup) + + for _, tt := range []struct { + name string + operationType api.ProvisioningState + provisioningState api.ProvisioningState + doc *api.OpenShiftClusterDocument + backendErr error + managedDomain bool + }{ + { + name: "Pass default cluster install", + doc: &api.OpenShiftClusterDocument{ + CorrelationData: &api.CorrelationData{ + CorrelationID: "id", + ClientRequestID: "client request id", + RequestID: "request id", + }, + ResourceID: resourceID, + OpenShiftCluster: &api.OpenShiftCluster{ + Location: "eastus", + Properties: api.OpenShiftClusterProperties{ + ClusterProfile: api.ClusterProfile{ + Domain: "cluster.domain.example", + PullSecret: api.SecureString("super secret"), + FipsValidatedModules: api.FipsValidatedModulesDisabled, + }, + NetworkProfile: api.NetworkProfile{ + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 1, + }, + }, + PodCIDR: podCidrDefaultValue, + ServiceCIDR: serviceCidrDefaultValue, + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + }, + OperatorFlags: 
api.OperatorFlags{"testFlag": "true"}, + WorkerProfiles: api.ExampleOpenShiftClusterDocument().OpenShiftCluster.Properties.WorkerProfiles, + MasterProfile: api.ExampleOpenShiftClusterDocument().OpenShiftCluster.Properties.MasterProfile, + ServicePrincipalProfile: &api.ServicePrincipalProfile{}, + IngressProfiles: api.ExampleOpenShiftClusterDocument().OpenShiftCluster.Properties.IngressProfiles, + FeatureProfile: api.FeatureProfile{ + GatewayEnabled: true, + }, + }, + }, + }, + operationType: api.ProvisioningStateCreating, + provisioningState: api.ProvisioningStateSucceeded, + }, + { + name: "Pass workload identity cluster install", + doc: &api.OpenShiftClusterDocument{ + CorrelationData: &api.CorrelationData{ + CorrelationID: "id", + ClientRequestID: "client request id", + RequestID: "request id", + }, + ResourceID: resourceID, + OpenShiftCluster: &api.OpenShiftCluster{ + Location: "eastus", + Tags: map[string]string{"tag1": "true"}, + Properties: api.OpenShiftClusterProperties{ + ClusterProfile: api.ClusterProfile{ + Domain: "cluster.domain.example", + PullSecret: api.SecureString("super secret"), + FipsValidatedModules: api.FipsValidatedModulesEnabled, + }, + NetworkProfile: api.NetworkProfile{ + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 1, + }, + }, + PodCIDR: "10.128.0.1/14", + ServiceCIDR: "172.30.0.1/16", + PreconfiguredNSG: api.PreconfiguredNSGEnabled, + }, + OperatorFlags: api.OperatorFlags{"testFlag": "true"}, + WorkerProfiles: []api.WorkerProfile{ + { + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostEnabled, + }, + }, + MasterProfile: api.MasterProfile{ + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostEnabled, + }, + PlatformWorkloadIdentityProfile: &api.PlatformWorkloadIdentityProfile{}, + IngressProfiles: []api.IngressProfile{ + { + Name: "PrivateIngressProfile", + Visibility: api.VisibilityPrivate, + }, + 
}, + FeatureProfile: api.FeatureProfile{ + GatewayEnabled: true, + }, + }, + }, + }, + operationType: api.ProvisioningStateCreating, + provisioningState: api.ProvisioningStateSucceeded, + }, + { + name: "Pass backend error", + backendErr: &api.CloudError{ + StatusCode: 200, + }, + doc: &api.OpenShiftClusterDocument{ + CorrelationData: &api.CorrelationData{ + CorrelationID: "id", + ClientRequestID: "client request id", + RequestID: "request id", + }, + ResourceID: resourceID, + OpenShiftCluster: &api.OpenShiftCluster{ + Location: "eastus", + Tags: map[string]string{"tag1": "true"}, + Properties: api.OpenShiftClusterProperties{ + ClusterProfile: api.ClusterProfile{ + Domain: "cluster", + PullSecret: api.SecureString("super secret"), + FipsValidatedModules: api.FipsValidatedModulesEnabled, + }, + NetworkProfile: api.NetworkProfile{ + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 1, + }, + }, + PodCIDR: "10.128.0.1/14", + ServiceCIDR: "172.30.0.1/16", + }, + OperatorFlags: api.OperatorFlags{"testFlag": "true"}, + WorkerProfiles: []api.WorkerProfile{ + { + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostDisabled, + }, + }, + MasterProfile: api.MasterProfile{ + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostDisabled, + }, + PlatformWorkloadIdentityProfile: &api.PlatformWorkloadIdentityProfile{}, + IngressProfiles: []api.IngressProfile{ + { + Name: "EmptyIngressProfile", + }, + }, + FeatureProfile: api.FeatureProfile{ + GatewayEnabled: true, + }, + }, + }, + }, + operationType: api.ProvisioningStateCreating, + provisioningState: api.ProvisioningStateSucceeded, + managedDomain: true, + }, + } { + t.Run(tt.name, func(t *testing.T) { + if tt.managedDomain { + t.Setenv("DOMAIN_NAME", "aro-managed.example") + } + + dimensions := map[string]string{} + ocb.gatherOperationMetrics(tt.operationType, tt.provisioningState, tt.backendErr, 
dimensions) + ocb.gatherCorrelationID(tt.doc, dimensions) + ocb.gatherMiscMetrics(tt.doc, dimensions) + ocb.gatherAuthMetrics(tt.doc, dimensions) + ocb.gatherNetworkMetrics(tt.doc, dimensions) + ocb.gatherNodeMetrics(tt.doc, dimensions) + + emitter.EXPECT().EmitGauge(ocb.getMetricName(tt.operationType), metricValue, dimensions).MaxTimes(1) + + d := ocb.emitMetrics(log, tt.doc, tt.operationType, tt.provisioningState, tt.backendErr) + + ok := reflect.DeepEqual(dimensions, d) + if !ok { + t.Errorf("%s != %s", dimensions, d) + } + }) + } +} diff --git a/pkg/backend/openshiftcluster.go b/pkg/backend/openshiftcluster.go index 2ad95b6f5f1..a7f9fddc9d5 100644 --- a/pkg/backend/openshiftcluster.go +++ b/pkg/backend/openshiftcluster.go @@ -58,7 +58,7 @@ func (ocb *openShiftClusterBackend) try(ctx context.Context) (bool, error) { if doc.Dequeues > maxDequeueCount { err := fmt.Errorf("dequeued %d times, failing", doc.Dequeues) - return true, ocb.endLease(ctx, log, nil, doc, api.ProvisioningStateFailed, err) + return true, ocb.endLease(ctx, log, nil, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err) } log.Print("dequeued") @@ -131,7 +131,7 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr m, err := ocb.newManager(ctx, log, ocb.env, ocb.dbOpenShiftClusters, ocb.dbGateway, ocb.dbOpenShiftVersions, ocb.aead, ocb.billing, doc, subscriptionDoc, hr, ocb.m) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err) } switch doc.OpenShiftCluster.Properties.ProvisioningState { @@ -140,7 +140,7 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr err = m.Install(ctx) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err) } // 
re-get document and check the state: // if Install = nil, we are done with the install. @@ -148,12 +148,12 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr // backend worker to pick up next install phase doc, err = ocb.dbOpenShiftClusters.Get(ctx, strings.ToLower(doc.OpenShiftCluster.ID)) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err) } if doc.OpenShiftCluster.Properties.Install == nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateSucceeded, nil) } - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateCreating, nil) case api.ProvisioningStateAdminUpdating: log.Printf("admin updating (type: %s)", doc.OpenShiftCluster.Properties.MaintenanceTask) @@ -161,23 +161,23 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr err = m.AdminUpdate(ctx) if err != nil { // Customer will continue to see the cluster in an ongoing maintenance state - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err) } // Maintenance task is complete, so we can clear the maintenance state doc, err = ocb.setNoMaintenanceState(ctx, doc) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err) } - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, 
api.ProvisioningStateSucceeded, nil) case api.ProvisioningStateUpdating: log.Print("updating") err = m.Update(ctx) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateFailed, err) } - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateSucceeded, nil) case api.ProvisioningStateDeleting: log.Print("deleting") @@ -185,12 +185,12 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr err = m.Delete(ctx) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err) } err = ocb.updateAsyncOperation(ctx, log, doc.AsyncOperationID, nil, api.ProvisioningStateSucceeded, "", nil) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err) } stop() @@ -288,7 +288,7 @@ func (ocb *openShiftClusterBackend) updateAsyncOperation(ctx context.Context, lo return nil } -func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.Entry, stop func(), doc *api.OpenShiftClusterDocument, provisioningState api.ProvisioningState, backendErr error) error { +func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.Entry, stop func(), doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) error { var adminUpdateError *string var failedProvisioningState api.ProvisioningState initialProvisioningState := doc.OpenShiftCluster.Properties.ProvisioningState @@ -306,7 +306,8 @@ func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log 
*logrus.En return err } ocb.asyncOperationResultLog(log, initialProvisioningState, backendErr) - ocb.emitMetrics(doc, provisioningState) + ocb.emitMetrics(log, doc, operationType, provisioningState, nil) + ocb.emitProvisioningMetrics(doc, provisioningState) } if initialProvisioningState == api.ProvisioningStateAdminUpdating { @@ -324,6 +325,8 @@ func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.En stop() } + ocb.emitMetrics(log, doc, operationType, provisioningState, nil) + _, err := ocb.dbOpenShiftClusters.EndLease(ctx, doc.Key, provisioningState, failedProvisioningState, adminUpdateError) return err } @@ -362,24 +365,6 @@ func (ocb *openShiftClusterBackend) asyncOperationResultLog(log *logrus.Entry, i log.Info("long running operation failed") } -func (ocb *openShiftClusterBackend) emitMetrics(doc *api.OpenShiftClusterDocument, provisioningState api.ProvisioningState) { - if doc.CorrelationData == nil { - return - } - - duration := time.Since(doc.CorrelationData.RequestTime).Milliseconds() - - ocb.m.EmitGauge("backend.openshiftcluster.duration", duration, map[string]string{ - "oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState), - "newProvisioningState": string(provisioningState), - }) - - ocb.m.EmitGauge("backend.openshiftcluster.count", 1, map[string]string{ - "oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState), - "newProvisioningState": string(provisioningState), - }) -} - func (ocb *openShiftClusterBackend) setNoMaintenanceState(ctx context.Context, doc *api.OpenShiftClusterDocument) (*api.OpenShiftClusterDocument, error) { return ocb.dbOpenShiftClusters.Patch(ctx, doc.Key, func(doc *api.OpenShiftClusterDocument) error { doc.OpenShiftCluster.Properties.MaintenanceState = api.MaintenanceStateNone diff --git a/pkg/backend/openshiftcluster_test.go b/pkg/backend/openshiftcluster_test.go index a8de5fce56d..eff59b5e98b 100644 --- a/pkg/backend/openshiftcluster_test.go +++ 
b/pkg/backend/openshiftcluster_test.go @@ -51,6 +51,11 @@ func TestBackendTry(t *testing.T) { fixture: func(f *testdatabase.Fixture) { f.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{ Key: strings.ToLower(resourceID), + CorrelationData: &api.CorrelationData{ + CorrelationID: "correlationId", + ClientRequestID: "clientRequestId", + RequestID: "requestId", + }, OpenShiftCluster: &api.OpenShiftCluster{ ID: resourceID, Name: "resourceName", @@ -58,6 +63,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeUserDefinedRouting, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -78,6 +94,17 @@ func TestBackendTry(t *testing.T) { Install: &api.Install{ Phase: api.InstallPhaseBootstrap, }, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeUserDefinedRouting, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -104,6 +131,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -121,6 +159,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: 
api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateSucceeded, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -147,6 +196,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -166,6 +226,17 @@ func TestBackendTry(t *testing.T) { Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateFailed, FailedProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -192,6 +263,17 @@ func TestBackendTry(t *testing.T) { LastAdminUpdateError: "oh no", MaintenanceTask: api.MaintenanceTaskEverything, MaintenanceState: api.MaintenanceStateUnplanned, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -210,6 +292,17 @@ func TestBackendTry(t 
*testing.T) { Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateSucceeded, MaintenanceState: api.MaintenanceStateNone, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -234,6 +327,17 @@ func TestBackendTry(t *testing.T) { FailedProvisioningState: api.ProvisioningStateUpdating, MaintenanceTask: api.MaintenanceTaskEverything, MaintenanceState: api.MaintenanceStateUnplanned, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -254,6 +358,17 @@ func TestBackendTry(t *testing.T) { FailedProvisioningState: api.ProvisioningStateUpdating, LastAdminUpdateError: "oh no!", MaintenanceState: api.MaintenanceStateUnplanned, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -274,6 +389,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateDeleting, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + 
Count: 0, + }, + }, + }, }, }, }) @@ -297,6 +423,7 @@ func TestBackendTry(t *testing.T) { manager := mock_cluster.NewMockInterface(controller) _env := mock_env.NewMockInterface(controller) _env.EXPECT().LiveConfig().AnyTimes().Return(tlc) + _env.EXPECT().SubscriptionID().AnyTimes().Return(mockSubID) dbOpenShiftClusters, clientOpenShiftClusters := testdatabase.NewFakeOpenShiftClusters() dbSubscriptions, _ := testdatabase.NewFakeSubscriptions() From a99acf924838ff59d6d5804b5d48e542f3aaed83 Mon Sep 17 00:00:00 2001 From: Steven Fairchild Date: Thu, 27 Jun 2024 16:43:54 -0400 Subject: [PATCH 2/2] Improve logging, handling of empty metric values --- pkg/backend/metrics.go | 94 +++++++++++++++++++++++------------- pkg/backend/metrics_const.go | 31 ++++++------ pkg/backend/metrics_test.go | 24 ++++----- 3 files changed, 87 insertions(+), 62 deletions(-) diff --git a/pkg/backend/metrics.go b/pkg/backend/metrics.go index e2222c1adfd..fb57cb6a558 100644 --- a/pkg/backend/metrics.go +++ b/pkg/backend/metrics.go @@ -17,12 +17,12 @@ import ( func (ocb *openShiftClusterBackend) emitMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) map[string]string { dimensions := map[string]string{} - ocb.gatherOperationMetrics(operationType, provisioningState, backendErr, dimensions) - ocb.gatherCorrelationID(doc, dimensions) - ocb.gatherMiscMetrics(doc, dimensions) - ocb.gatherAuthMetrics(doc, dimensions) - ocb.gatherNetworkMetrics(doc, dimensions) - ocb.gatherNodeMetrics(doc, dimensions) + ocb.gatherOperationMetrics(log, operationType, provisioningState, backendErr, dimensions) + ocb.gatherCorrelationID(log, doc, dimensions) + ocb.gatherMiscMetrics(log, doc, dimensions) + ocb.gatherAuthMetrics(log, doc, dimensions) + ocb.gatherNetworkMetrics(log, doc, dimensions) + ocb.gatherNodeMetrics(log, doc, dimensions) ocb.logMetricDimensions(log, operationType, dimensions) 
ocb.m.EmitGauge(ocb.getMetricName(operationType), metricValue, dimensions) @@ -44,49 +44,64 @@ func (ocb *openShiftClusterBackend) getResultType(backendErr error) utillog.Resu return resultType } +func (ocb *openShiftClusterBackend) getStringMetricValue(log *logrus.Entry, metricName, value string) string { + if value != "" { + return value + } + + log.Warnf("%s %s", metricFailToCollectErr, metricName) + return empty +} + func (ocb *openShiftClusterBackend) logMetricDimensions(log *logrus.Entry, operationType api.ProvisioningState, dimensions map[string]string) { for metric, value := range dimensions { log.Info(fmt.Sprintf("%s.%s: %s = %s", metricPackage, operationType, metric, value)) } } -func (m *openShiftClusterBackend) gatherCorrelationID(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { +func (ocb *openShiftClusterBackend) gatherCorrelationID(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) { if doc.CorrelationData != nil { - dimensions[correlationDataIdMetricName] = doc.CorrelationData.CorrelationID - dimensions[correlationDataClientRequestIdMetricName] = doc.CorrelationData.ClientRequestID - dimensions[correlationDataRequestIdMetricName] = doc.CorrelationData.RequestID + dimensions[correlationDataIdMetricName] = ocb.getStringMetricValue(log, correlationDataIdMetricName, doc.CorrelationData.CorrelationID) + dimensions[correlationDataClientRequestIdMetricName] = ocb.getStringMetricValue(log, correlationDataClientRequestIdMetricName, doc.CorrelationData.ClientRequestID) + dimensions[correlationDataRequestIdMetricName] = ocb.getStringMetricValue(log, correlationDataRequestIdMetricName, doc.CorrelationData.RequestID) } else { + log.Warnf("%s %s", metricFailToCollectErr, correlationDataMetricName) dimensions[correlationDataIdMetricName] = empty dimensions[correlationDataClientRequestIdMetricName] = empty dimensions[correlationDataRequestIdMetricName] = empty } } -func (ocb *openShiftClusterBackend) 
gatherOperationMetrics(operationType, provisioningState api.ProvisioningState, backendErr error, dimensions map[string]string) { +func (ocb *openShiftClusterBackend) gatherOperationMetrics(log *logrus.Entry, operationType, provisioningState api.ProvisioningState, backendErr error, dimensions map[string]string) { + // These are provided internally by endLease, not expected to be "" dimensions[operationTypeMetricName] = operationType.String() dimensions[provisioningStateMetricName] = provisioningState.String() - dimensions[resultTypeMetricName] = string(ocb.getResultType(backendErr)) + + dimensions[resultTypeMetricName] = ocb.getStringMetricValue(log, resultTypeMetricName, string(ocb.getResultType(backendErr))) } -func (ocb *openShiftClusterBackend) gatherMiscMetrics(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { - dimensions[subscriptionIdMetricName] = ocb.env.SubscriptionID() - dimensions[resourceIdMetricName] = doc.ResourceID - if doc.OpenShiftCluster != nil { - dimensions[clusterNameMetricName] = doc.OpenShiftCluster.Name - dimensions[locationMetricName] = doc.OpenShiftCluster.Location - dimensions[ocpVersionMetricName] = doc.OpenShiftCluster.Properties.ClusterProfile.Version - dimensions[rpVersionMetricName] = doc.OpenShiftCluster.Properties.ProvisionedBy - dimensions[resourecGroupMetricName] = doc.OpenShiftCluster.Properties.ClusterProfile.ResourceGroupID - - for flag, feature := range doc.OpenShiftCluster.Properties.OperatorFlags { - dimensions[fmt.Sprintf("%s-%s", operatorFlagsMetricName, flag)] = feature - } +func (ocb *openShiftClusterBackend) gatherMiscMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) { + dimensions[subscriptionIdMetricName] = ocb.getStringMetricValue(log, subscriptionIdMetricName, ocb.env.SubscriptionID()) + dimensions[resourceIdMetricName] = ocb.getStringMetricValue(log, resourceIdMetricName, doc.ResourceID) + + dimensions[clusterNameMetricName] = ocb.getStringMetricValue(log, 
clusterNameMetricName, doc.OpenShiftCluster.Name) + dimensions[clusterIdMetricName] = ocb.getStringMetricValue(log, clusterIdMetricName, doc.OpenShiftCluster.ID) + dimensions[locationMetricName] = ocb.getStringMetricValue(log, locationMetricName, doc.OpenShiftCluster.Location) + dimensions[ocpVersionMetricName] = ocb.getStringMetricValue(log, ocpVersionMetricName, doc.OpenShiftCluster.Properties.ClusterProfile.Version) + dimensions[rpVersionMetricName] = ocb.getStringMetricValue(log, rpVersionMetricName, doc.OpenShiftCluster.Properties.ProvisionedBy) + dimensions[resourecGroupMetricName] = ocb.getStringMetricValue(log, resourecGroupMetricName, doc.OpenShiftCluster.Properties.ClusterProfile.ResourceGroupID) + + for flag, feature := range doc.OpenShiftCluster.Properties.OperatorFlags { + flagMetricName := fmt.Sprintf("%s-%s", operatorFlagsMetricName, flag) + dimensions[flagMetricName] = ocb.getStringMetricValue(log, flagMetricName, feature) } - dimensions[asyncOperationsIdMetricName] = doc.AsyncOperationID + dimensions[asyncOperationsIdMetricName] = ocb.getStringMetricValue(log, asyncOperationsIdMetricName, doc.AsyncOperationID) if doc.OpenShiftCluster.Properties.WorkerProfiles != nil { dimensions[workerProfileCountMetricName] = strconv.FormatInt(int64(len(doc.OpenShiftCluster.Properties.WorkerProfiles)), 10) + } else { + dimensions[workerProfileCountMetricName] = ocb.getStringMetricValue(log, workerProfileCountMetricName, "") } if doc.OpenShiftCluster.Tags != nil { @@ -96,7 +111,7 @@ func (ocb *openShiftClusterBackend) gatherMiscMetrics(doc *api.OpenShiftClusterD } } -func (ocb *openShiftClusterBackend) gatherNodeMetrics(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { +func (ocb *openShiftClusterBackend) gatherNodeMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) { if doc.OpenShiftCluster.Properties.MasterProfile.DiskEncryptionSetID != "" { dimensions[masterProfileEncryptionSetIdMetricName] = enabled } 
else { @@ -104,22 +119,21 @@ func (ocb *openShiftClusterBackend) gatherNodeMetrics(doc *api.OpenShiftClusterD } mp := doc.OpenShiftCluster.Properties.MasterProfile - dimensions[masterProfileVmSizeMetricName] = string(mp.VMSize) + dimensions[masterProfileVmSizeMetricName] = ocb.getStringMetricValue(log, masterProfileVmSizeMetricName, string(mp.VMSize)) if doc.OpenShiftCluster.Properties.MasterProfile.EncryptionAtHost == api.EncryptionAtHostEnabled { dimensions[masterEncryptionAtHostMetricName] = string(api.EncryptionAtHostEnabled) } else if doc.OpenShiftCluster.Properties.MasterProfile.EncryptionAtHost == api.EncryptionAtHostDisabled { dimensions[masterEncryptionAtHostMetricName] = string(api.EncryptionAtHostDisabled) } else { + log.Warnf("%s %s", metricFailToCollectErr, masterEncryptionAtHostMetricName) dimensions[masterEncryptionAtHostMetricName] = unknown } if len(doc.OpenShiftCluster.Properties.WorkerProfiles) > 0 { wp := doc.OpenShiftCluster.Properties.WorkerProfiles[0] - dimensions[workerVmSizeMetricName] = string(wp.VMSize) dimensions[workerVmDiskSizeMetricName] = strconv.FormatInt(int64(wp.DiskSizeGB), 10) - - dimensions[workerVmSizeMetricName] = string(wp.VMSize) + dimensions[workerVmSizeMetricName] = ocb.getStringMetricValue(log, workerVmSizeMetricName, string(wp.VMSize)) dimensions[workerVmDiskSizeMetricName] = strconv.FormatInt(int64(wp.DiskSizeGB), 10) if wp.EncryptionAtHost == api.EncryptionAtHostEnabled { @@ -127,6 +141,7 @@ func (ocb *openShiftClusterBackend) gatherNodeMetrics(doc *api.OpenShiftClusterD } else if wp.EncryptionAtHost == api.EncryptionAtHostDisabled { dimensions[workerEncryptionAtHostMetricName] = string(api.EncryptionAtHostDisabled) } else { + log.Warnf("%s %s", metricFailToCollectErr, workerEncryptionAtHostMetricName) dimensions[workerEncryptionAtHostMetricName] = unknown } } @@ -136,16 +151,18 @@ func (ocb *openShiftClusterBackend) gatherNodeMetrics(doc *api.OpenShiftClusterD } else if 
doc.OpenShiftCluster.Properties.ClusterProfile.FipsValidatedModules == api.FipsValidatedModulesDisabled { dimensions[fipsMetricName] = string(api.FipsValidatedModulesDisabled) } else { + log.Warnf("%s %s", metricFailToCollectErr, fipsMetricName) dimensions[fipsMetricName] = unknown } } -func (ocb *openShiftClusterBackend) gatherAuthMetrics(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { +func (ocb *openShiftClusterBackend) gatherAuthMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) { if doc.OpenShiftCluster.Properties.PlatformWorkloadIdentityProfile != nil { dimensions[clusterIdentityMetricName] = clusterIdentityManagedIdMetricName } else if doc.OpenShiftCluster.Properties.ServicePrincipalProfile != nil { dimensions[clusterIdentityMetricName] = clusterIdentityServicePrincipalMetricName } else { + log.Warnf("%s %s", metricFailToCollectErr, clusterIdentityMetricName) dimensions[clusterIdentityMetricName] = unknown } @@ -156,13 +173,14 @@ func (ocb *openShiftClusterBackend) gatherAuthMetrics(doc *api.OpenShiftClusterD } } -func (ocb *openShiftClusterBackend) gatherNetworkMetrics(doc *api.OpenShiftClusterDocument, dimensions map[string]string) { +func (ocb *openShiftClusterBackend) gatherNetworkMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) { for _, p := range doc.OpenShiftCluster.Properties.IngressProfiles { if p.Visibility == api.VisibilityPrivate { dimensions[ingressProfileMetricName] = fmt.Sprintf("%s.%s", string(api.VisibilityPrivate), p.Name) } else if p.Visibility == api.VisibilityPublic { dimensions[ingressProfileMetricName] = fmt.Sprintf("%s.%s", string(api.VisibilityPublic), p.Name) } else { + log.Warnf("%s %s", metricFailToCollectErr, ingressProfileMetricName) dimensions[ingressProfileMetricName] = unknown } } @@ -172,6 +190,7 @@ func (ocb *openShiftClusterBackend) gatherNetworkMetrics(doc *api.OpenShiftClust } else if 
doc.OpenShiftCluster.Properties.NetworkProfile.OutboundType == api.OutboundTypeLoadbalancer { dimensions[networkProfileOutboundTypeMetricName] = string(api.OutboundTypeLoadbalancer) } else { + log.Warnf("%s %s", metricFailToCollectErr, networkProfileManagedOutboundIpsMetricName) dimensions[networkProfileOutboundTypeMetricName] = unknown } @@ -188,7 +207,10 @@ func (ocb *openShiftClusterBackend) gatherNetworkMetrics(doc *api.OpenShiftClust } domain, err := dns.ManagedDomain(ocb.env, doc.OpenShiftCluster.Properties.ClusterProfile.Domain) - if err == nil { + if err != nil { + dimensions[clusterProfileDomainMetricName] = empty + log.Warnf("%s %s, due to %s", metricFailToCollectErr, clusterProfileDomainMetricName, err.Error()) + } else { if domain != "" { dimensions[clusterProfileDomainMetricName] = custom } else { @@ -198,6 +220,9 @@ func (ocb *openShiftClusterBackend) gatherNetworkMetrics(doc *api.OpenShiftClust if doc.OpenShiftCluster.Properties.NetworkProfile.LoadBalancerProfile.ManagedOutboundIPs != nil { dimensions[networkProfileManagedOutboundIpsMetricName] = strconv.FormatInt(int64(doc.OpenShiftCluster.Properties.NetworkProfile.LoadBalancerProfile.ManagedOutboundIPs.Count), 10) + } else { + log.Warnf("%s %s", metricFailToCollectErr, networkProfileManagedOutboundIpsMetricName) + dimensions[networkProfileManagedOutboundIpsMetricName] = unknown } if doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGEnabled { @@ -205,6 +230,7 @@ func (ocb *openShiftClusterBackend) gatherNetworkMetrics(doc *api.OpenShiftClust } else if doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGDisabled { dimensions[networkProfilePreConfiguredNSGMetricName] = string(api.PreconfiguredNSGDisabled) } else { + log.Warnf("%s %s", metricFailToCollectErr, networkProfilePreConfiguredNSGMetricName) dimensions[networkProfilePreConfiguredNSGMetricName] = unknown } diff --git a/pkg/backend/metrics_const.go 
b/pkg/backend/metrics_const.go index a3c294bac9c..9e64b63378a 100644 --- a/pkg/backend/metrics_const.go +++ b/pkg/backend/metrics_const.go @@ -4,15 +4,16 @@ package backend // Licensed under the Apache License 2.0. const ( - metricPackage = "backend.openshiftcluster" - metricValue int64 = 1 - enabled = "Enabled" - disabled = "Disabled" - custom = "Custom" - defaultSet = "Default" - unknown = "unknown" - empty = "empty" - managed = "managed" + metricPackage = "backend.openshiftcluster" + metricValue int64 = 1 + enabled = "Enabled" + disabled = "Disabled" + custom = "Custom" + defaultSet = "Default" + unknown = "unknown" + empty = "empty" + managed = "managed" + metricFailToCollectErr = "failed to collect metric:" encryptionAtHostMetricName = "encryptionathost" diskSizeMetricName = "disksize" @@ -56,11 +57,13 @@ const ( operatorFlagsMetricName = "operatorflags" asyncOperationsIdMetricName = "async_operationsid" - rpVersionMetricName = "rpversion" - ocpVersionMetricName = "ocpversion" - clusterNameMetricName = "clustername" - resourecGroupMetricName = "resourcegroup" - locationMetricName = "location" + openshiftClusterMetricName = "openshiftcluster" + rpVersionMetricName = openshiftClusterMetricName + "." + "rpversion" + ocpVersionMetricName = openshiftClusterMetricName + "." + "ocpversion" + clusterNameMetricName = openshiftClusterMetricName + "." + "clustername" + clusterIdMetricName = openshiftClusterMetricName + "." + "clusterid" + resourecGroupMetricName = openshiftClusterMetricName + "." + "resourcegroup" + locationMetricName = openshiftClusterMetricName + "." 
+ "location" resourceIdMetricName = "resourceid" subscriptionIdMetricName = "subscriptionid" diff --git a/pkg/backend/metrics_test.go b/pkg/backend/metrics_test.go index 913dce459e4..8d73765faa8 100644 --- a/pkg/backend/metrics_test.go +++ b/pkg/backend/metrics_test.go @@ -104,14 +104,10 @@ func TestEmitMetrics(t *testing.T) { FipsValidatedModules: api.FipsValidatedModulesEnabled, }, NetworkProfile: api.NetworkProfile{ - LoadBalancerProfile: &api.LoadBalancerProfile{ - ManagedOutboundIPs: &api.ManagedOutboundIPs{ - Count: 1, - }, - }, - PodCIDR: "10.128.0.1/14", - ServiceCIDR: "172.30.0.1/16", - PreconfiguredNSG: api.PreconfiguredNSGEnabled, + LoadBalancerProfile: &api.LoadBalancerProfile{}, + PodCIDR: "10.128.0.1/14", + ServiceCIDR: "172.30.0.1/16", + PreconfiguredNSG: api.PreconfiguredNSGEnabled, }, OperatorFlags: api.OperatorFlags{"testFlag": "true"}, WorkerProfiles: []api.WorkerProfile{ @@ -204,12 +200,12 @@ func TestEmitMetrics(t *testing.T) { } dimensions := map[string]string{} - ocb.gatherOperationMetrics(tt.operationType, tt.provisioningState, tt.backendErr, dimensions) - ocb.gatherCorrelationID(tt.doc, dimensions) - ocb.gatherMiscMetrics(tt.doc, dimensions) - ocb.gatherAuthMetrics(tt.doc, dimensions) - ocb.gatherNetworkMetrics(tt.doc, dimensions) - ocb.gatherNodeMetrics(tt.doc, dimensions) + ocb.gatherOperationMetrics(log, tt.operationType, tt.provisioningState, tt.backendErr, dimensions) + ocb.gatherCorrelationID(log, tt.doc, dimensions) + ocb.gatherMiscMetrics(log, tt.doc, dimensions) + ocb.gatherAuthMetrics(log, tt.doc, dimensions) + ocb.gatherNetworkMetrics(log, tt.doc, dimensions) + ocb.gatherNodeMetrics(log, tt.doc, dimensions) emitter.EXPECT().EmitGauge(ocb.getMetricName(tt.operationType), metricValue, dimensions).MaxTimes(1)