diff --git a/pkg/backend/metrics.go b/pkg/backend/metrics.go
new file mode 100644
index 00000000000..fb57cb6a558
--- /dev/null
+++ b/pkg/backend/metrics.go
@@ -0,0 +1,289 @@
+package backend
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+
+	"github.com/sirupsen/logrus"
+
+	"github.com/Azure/ARO-RP/pkg/api"
+	"github.com/Azure/ARO-RP/pkg/util/dns"
+	utillog "github.com/Azure/ARO-RP/pkg/util/log"
+)
+
+// emitMetrics gathers the dimensions describing a backend operation on doc,
+// logs each of them and emits them together as one gauge whose name is derived
+// from operationType.
+func (ocb *openShiftClusterBackend) emitMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) map[string]string {
+	dimensions := map[string]string{}
+
+	ocb.gatherOperationMetrics(log, operationType, provisioningState, backendErr, dimensions)
+	ocb.gatherCorrelationID(log, doc, dimensions)
+	ocb.gatherMiscMetrics(log, doc, dimensions)
+	ocb.gatherAuthMetrics(log, doc, dimensions)
+	ocb.gatherNetworkMetrics(log, doc, dimensions)
+	ocb.gatherNodeMetrics(log, doc, dimensions)
+
+	ocb.logMetricDimensions(log, operationType, dimensions)
+	ocb.m.EmitGauge(ocb.getMetricName(operationType), metricValue, dimensions)
+
+	// dimensions is returned here for testing purposes
+	return dimensions
+}
+
+// getMetricName returns the gauge name for operationType, prefixed with
+// metricPackage.
+func (ocb *openShiftClusterBackend) getMetricName(operationType api.ProvisioningState) string {
+	return fmt.Sprintf("%s.%s", metricPackage, operationType)
+}
+
+// getResultType maps backendErr to a result type when it is, or wraps, an
+// *api.CloudError. Any other error (including nil) yields the zero value.
+func (ocb *openShiftClusterBackend) getResultType(backendErr error) utillog.ResultType {
+	var resultType utillog.ResultType
+
+	// errors.As also matches a CloudError wrapped by an intermediate layer.
+	var cloudErr *api.CloudError
+	if errors.As(backendErr, &cloudErr) && cloudErr != nil {
+		resultType = utillog.MapStatusCodeToResultType(cloudErr.StatusCode)
+	}
+
+	return resultType
+}
+
+// getStringMetricValue returns value unchanged when it is non-empty; otherwise
+// it logs a warning naming metricName and returns the empty placeholder.
+func (ocb *openShiftClusterBackend) getStringMetricValue(log *logrus.Entry, metricName, value string) string {
+	if value != "" {
+		return value
+	}
+
+	log.Warnf("%s %s", metricFailToCollectErr, metricName)
+	return empty
+}
+
+// logMetricDimensions writes one info-level log line per dimension.
+func (ocb *openShiftClusterBackend) logMetricDimensions(log *logrus.Entry, operationType api.ProvisioningState, dimensions map[string]string) {
+	for metric, value := range dimensions {
+		log.Infof("%s.%s: %s = %s", metricPackage, operationType, metric, value)
+	}
+}
+
+// gatherCorrelationID records the correlation, client request and request IDs
+// from doc.CorrelationData, or the empty placeholder for all three when the
+// correlation data is missing.
+func (ocb *openShiftClusterBackend) gatherCorrelationID(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
+	if doc.CorrelationData == nil {
+		log.Warnf("%s %s", metricFailToCollectErr, correlationDataMetricName)
+		dimensions[correlationDataIdMetricName] = empty
+		dimensions[correlationDataClientRequestIdMetricName] = empty
+		dimensions[correlationDataRequestIdMetricName] = empty
+		return
+	}
+
+	dimensions[correlationDataIdMetricName] = ocb.getStringMetricValue(log, correlationDataIdMetricName, doc.CorrelationData.CorrelationID)
+	dimensions[correlationDataClientRequestIdMetricName] = ocb.getStringMetricValue(log, correlationDataClientRequestIdMetricName, doc.CorrelationData.ClientRequestID)
+	dimensions[correlationDataRequestIdMetricName] = ocb.getStringMetricValue(log, correlationDataRequestIdMetricName, doc.CorrelationData.RequestID)
+}
+
+// gatherOperationMetrics records the operation type, the resulting
+// provisioning state and the result type derived from backendErr.
+func (ocb *openShiftClusterBackend) gatherOperationMetrics(log *logrus.Entry, operationType, provisioningState api.ProvisioningState, backendErr error, dimensions map[string]string) {
+	// These are provided internally by endLease, not expected to be ""
+	dimensions[operationTypeMetricName] = operationType.String()
+	dimensions[provisioningStateMetricName] = provisioningState.String()
+
+	dimensions[resultTypeMetricName] = ocb.getStringMetricValue(log, resultTypeMetricName, string(ocb.getResultType(backendErr)))
+}
+
+// gatherMiscMetrics records identifying cluster attributes (IDs, name,
+// location, versions, resource group), one dimension per operator flag, the
+// async operation ID, the worker profile count and whether tags are set.
+func (ocb *openShiftClusterBackend) gatherMiscMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
+	oc := doc.OpenShiftCluster
+
+	dimensions[subscriptionIdMetricName] = ocb.getStringMetricValue(log, subscriptionIdMetricName, ocb.env.SubscriptionID())
+	dimensions[resourceIdMetricName] = ocb.getStringMetricValue(log, resourceIdMetricName, doc.ResourceID)
+
+	dimensions[clusterNameMetricName] = ocb.getStringMetricValue(log, clusterNameMetricName, oc.Name)
+	dimensions[clusterIdMetricName] = ocb.getStringMetricValue(log, clusterIdMetricName, oc.ID)
+	dimensions[locationMetricName] = ocb.getStringMetricValue(log, locationMetricName, oc.Location)
+	dimensions[ocpVersionMetricName] = ocb.getStringMetricValue(log, ocpVersionMetricName, oc.Properties.ClusterProfile.Version)
+	dimensions[rpVersionMetricName] = ocb.getStringMetricValue(log, rpVersionMetricName, oc.Properties.ProvisionedBy)
+	dimensions[resourceGroupMetricName] = ocb.getStringMetricValue(log, resourceGroupMetricName, oc.Properties.ClusterProfile.ResourceGroupID)
+
+	for flag, feature := range oc.Properties.OperatorFlags {
+		flagMetricName := fmt.Sprintf("%s-%s", operatorFlagsMetricName, flag)
+		dimensions[flagMetricName] = ocb.getStringMetricValue(log, flagMetricName, feature)
+	}
+
+	dimensions[asyncOperationsIdMetricName] = ocb.getStringMetricValue(log, asyncOperationsIdMetricName, doc.AsyncOperationID)
+
+	if oc.Properties.WorkerProfiles != nil {
+		dimensions[workerProfileCountMetricName] = strconv.Itoa(len(oc.Properties.WorkerProfiles))
+	} else {
+		dimensions[workerProfileCountMetricName] = ocb.getStringMetricValue(log, workerProfileCountMetricName, "")
+	}
+
+	if oc.Tags != nil {
+		dimensions[tagsMetricName] = enabled
+	} else {
+		dimensions[tagsMetricName] = disabled
+	}
+}
+
+// gatherNodeMetrics records master and worker node settings (disk encryption
+// set, VM size, disk size, encryption at host) and the FIPS setting. Worker
+// values are taken from the first worker profile only.
+func (ocb *openShiftClusterBackend) gatherNodeMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
+	mp := doc.OpenShiftCluster.Properties.MasterProfile
+
+	if mp.DiskEncryptionSetID != "" {
+		dimensions[masterProfileEncryptionSetIdMetricName] = enabled
+	} else {
+		dimensions[masterProfileEncryptionSetIdMetricName] = disabled
+	}
+
+	dimensions[masterProfileVmSizeMetricName] = ocb.getStringMetricValue(log, masterProfileVmSizeMetricName, string(mp.VMSize))
+
+	switch mp.EncryptionAtHost {
+	case api.EncryptionAtHostEnabled:
+		dimensions[masterEncryptionAtHostMetricName] = string(api.EncryptionAtHostEnabled)
+	case api.EncryptionAtHostDisabled:
+		dimensions[masterEncryptionAtHostMetricName] = string(api.EncryptionAtHostDisabled)
+	default:
+		log.Warnf("%s %s", metricFailToCollectErr, masterEncryptionAtHostMetricName)
+		dimensions[masterEncryptionAtHostMetricName] = unknown
+	}
+
+	if len(doc.OpenShiftCluster.Properties.WorkerProfiles) > 0 {
+		wp := doc.OpenShiftCluster.Properties.WorkerProfiles[0]
+		dimensions[workerVmSizeMetricName] = ocb.getStringMetricValue(log, workerVmSizeMetricName, string(wp.VMSize))
+		dimensions[workerVmDiskSizeMetricName] = strconv.FormatInt(int64(wp.DiskSizeGB), 10)
+
+		switch wp.EncryptionAtHost {
+		case api.EncryptionAtHostEnabled:
+			dimensions[workerEncryptionAtHostMetricName] = string(api.EncryptionAtHostEnabled)
+		case api.EncryptionAtHostDisabled:
+			dimensions[workerEncryptionAtHostMetricName] = string(api.EncryptionAtHostDisabled)
+		default:
+			log.Warnf("%s %s", metricFailToCollectErr, workerEncryptionAtHostMetricName)
+			dimensions[workerEncryptionAtHostMetricName] = unknown
+		}
+	}
+
+	switch doc.OpenShiftCluster.Properties.ClusterProfile.FipsValidatedModules {
+	case api.FipsValidatedModulesEnabled:
+		dimensions[fipsMetricName] = string(api.FipsValidatedModulesEnabled)
+	case api.FipsValidatedModulesDisabled:
+		dimensions[fipsMetricName] = string(api.FipsValidatedModulesDisabled)
+	default:
+		log.Warnf("%s %s", metricFailToCollectErr, fipsMetricName)
+		dimensions[fipsMetricName] = unknown
+	}
+}
+
+// gatherAuthMetrics records which cluster identity type is configured and
+// whether a pull secret is present.
+func (ocb *openShiftClusterBackend) gatherAuthMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
+	switch {
+	case doc.OpenShiftCluster.Properties.PlatformWorkloadIdentityProfile != nil:
+		dimensions[clusterIdentityMetricName] = clusterIdentityManagedIdMetricName
+	case doc.OpenShiftCluster.Properties.ServicePrincipalProfile != nil:
+		dimensions[clusterIdentityMetricName] = clusterIdentityServicePrincipalMetricName
+	default:
+		log.Warnf("%s %s", metricFailToCollectErr, clusterIdentityMetricName)
+		dimensions[clusterIdentityMetricName] = unknown
+	}
+
+	if doc.OpenShiftCluster.Properties.ClusterProfile.PullSecret != "" {
+		dimensions[pullSecretMetricName] = enabled
+	} else {
+		dimensions[pullSecretMetricName] = disabled
+	}
+}
+
+// gatherNetworkMetrics records ingress visibility, outbound type, whether the
+// pod/service CIDRs and cluster domain differ from their defaults, the managed
+// outbound IP count, the preconfigured NSG setting and the gateway flag.
+func (ocb *openShiftClusterBackend) gatherNetworkMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
+	np := doc.OpenShiftCluster.Properties.NetworkProfile
+
+	// NOTE(review): every ingress profile writes the same key, so with several
+	// profiles only the last one iterated is reported.
+	for _, p := range doc.OpenShiftCluster.Properties.IngressProfiles {
+		switch p.Visibility {
+		case api.VisibilityPrivate:
+			dimensions[ingressProfileMetricName] = fmt.Sprintf("%s.%s", string(api.VisibilityPrivate), p.Name)
+		case api.VisibilityPublic:
+			dimensions[ingressProfileMetricName] = fmt.Sprintf("%s.%s", string(api.VisibilityPublic), p.Name)
+		default:
+			log.Warnf("%s %s", metricFailToCollectErr, ingressProfileMetricName)
+			dimensions[ingressProfileMetricName] = unknown
+		}
+	}
+
+	switch np.OutboundType {
+	case api.OutboundTypeUserDefinedRouting:
+		dimensions[networkProfileOutboundTypeMetricName] = string(api.OutboundTypeUserDefinedRouting)
+	case api.OutboundTypeLoadbalancer:
+		dimensions[networkProfileOutboundTypeMetricName] = string(api.OutboundTypeLoadbalancer)
+	default:
+		log.Warnf("%s %s", metricFailToCollectErr, networkProfileOutboundTypeMetricName)
+		dimensions[networkProfileOutboundTypeMetricName] = unknown
+	}
+
+	if np.PodCIDR != podCidrDefaultValue {
+		dimensions[podCidrMetricName] = custom
+	} else {
+		dimensions[podCidrMetricName] = defaultSet
+	}
+
+	if np.ServiceCIDR != serviceCidrDefaultValue {
+		dimensions[serviceCidrMetricName] = custom
+	} else {
+		dimensions[serviceCidrMetricName] = defaultSet
+	}
+
+	// NOTE(review): a non-empty ManagedDomain result is reported as custom and an
+	// empty one as managed; confirm this matches dns.ManagedDomain's contract.
+	domain, err := dns.ManagedDomain(ocb.env, doc.OpenShiftCluster.Properties.ClusterProfile.Domain)
+	switch {
+	case err != nil:
+		dimensions[clusterProfileDomainMetricName] = empty
+		log.Warnf("%s %s, due to %s", metricFailToCollectErr, clusterProfileDomainMetricName, err.Error())
+	case domain != "":
+		dimensions[clusterProfileDomainMetricName] = custom
+	default:
+		dimensions[clusterProfileDomainMetricName] = managed
+	}
+
+	// LoadBalancerProfile is a pointer and may be unset, so guard it before
+	// dereferencing ManagedOutboundIPs.
+	if lbp := np.LoadBalancerProfile; lbp != nil && lbp.ManagedOutboundIPs != nil {
+		dimensions[networkProfileManagedOutboundIpsMetricName] = strconv.FormatInt(int64(lbp.ManagedOutboundIPs.Count), 10)
+	} else {
+		log.Warnf("%s %s", metricFailToCollectErr, networkProfileManagedOutboundIpsMetricName)
+		dimensions[networkProfileManagedOutboundIpsMetricName] = unknown
+	}
+
+	switch np.PreconfiguredNSG {
+	case api.PreconfiguredNSGEnabled:
+		dimensions[networkProfilePreConfiguredNSGMetricName] = string(api.PreconfiguredNSGEnabled)
+	case api.PreconfiguredNSGDisabled:
+		dimensions[networkProfilePreConfiguredNSGMetricName] = string(api.PreconfiguredNSGDisabled)
+	default:
+		log.Warnf("%s %s", metricFailToCollectErr, networkProfilePreConfiguredNSGMetricName)
+		dimensions[networkProfilePreConfiguredNSGMetricName] = unknown
+	}
+
+	if doc.OpenShiftCluster.Properties.FeatureProfile.GatewayEnabled {
+		dimensions[featureProfileGatewayEnabledMetricName] = enabled
+	} else {
+		dimensions[featureProfileGatewayEnabledMetricName] = disabled
+	}
+}
diff --git a/pkg/backend/metrics_const.go b/pkg/backend/metrics_const.go
new file mode 100644
index 00000000000..9e64b63378a
--- /dev/null
+++ b/pkg/backend/metrics_const.go
@@ -0,0 +1,81 @@
+package backend
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+// Gauge name prefix, placeholder values and dimension names used by emitMetrics.
+const (
+	metricPackage                = "backend.openshiftcluster"
+	metricValue            int64 = 1
+	enabled                      = "Enabled"
+	disabled                     = "Disabled"
+	custom                       = "Custom"
+	defaultSet                   = "Default"
+	unknown                      = "unknown"
+	empty                        = "empty"
+	managed                      = "managed"
+	metricFailToCollectErr       = "failed to collect metric:"
+
+	encryptionAtHostMetricName = "encryptionathost"
+	diskSizeMetricName         = "disksize"
+	vmSizeMetricName           = "vmsize"
+	countMetricName            = "count"
+
+	// NOTE(review): "workprofile" looks like a typo for "workerprofile"; it is
+	// emitted at runtime, so confirm with metric consumers before changing it.
+	workerProfileMetricName          = "workprofile"
+	workerVmSizeMetricName           = workerProfileMetricName + "." + vmSizeMetricName
+	workerVmDiskSizeMetricName       = workerProfileMetricName + "." + diskSizeMetricName
+	workerEncryptionAtHostMetricName = workerProfileMetricName + "." + encryptionAtHostMetricName
+	workerProfileCountMetricName     = workerProfileMetricName + "." + countMetricName
+
+	masterProfileMetricName                = "masterprofile"
+	masterEncryptionAtHostMetricName       = masterProfileMetricName + "." + encryptionAtHostMetricName
+	masterProfileEncryptionSetIdMetricName = masterProfileMetricName + "." + "diskencryptionsetid"
+	masterProfileVmSizeMetricName          = masterProfileMetricName + "." + vmSizeMetricName
+
+	fipsMetricName                            = "fips"
+	clusterIdentityMetricName                 = "clusteridentity"
+	clusterIdentityManagedIdMetricName        = managed + "id"
+	clusterIdentityServicePrincipalMetricName = "serviceprincipal"
+	pullSecretMetricName                      = "pullsecret"
+
+	ingressProfileMetricName                   = "ingressprofile"
+	networkProfileMetricName                   = "networkprofile"
+	networkProfileOutboundTypeMetricName       = networkProfileMetricName + "." + "outboundtype"
+	networkProfileManagedOutboundIpsMetricName = networkProfileMetricName + "." + "managedoutboundips"
+	networkProfilePreConfiguredNSGMetricName   = networkProfileMetricName + "." + "preconfigurednsg"
+	podCidrMetricName                          = networkProfileMetricName + "." + "podcidr"
+	serviceCidrMetricName                      = networkProfileMetricName + "." + "servicecidr"
+	podCidrDefaultValue                        = "10.128.0.0/14"
+	serviceCidrDefaultValue                    = "172.30.0.0/16"
+
+	featureProfileMetricName               = "featureprofile"
+	featureProfileGatewayEnabledMetricName = featureProfileMetricName + "." + "gatewayenabled"
+
+	clusterProfileMetricName       = "clusterprofile"
+	clusterProfileDomainMetricName = clusterProfileMetricName + "." + "domain"
+
+	tagsMetricName          = "tags"
+	operatorFlagsMetricName = "operatorflags"
+
+	asyncOperationsIdMetricName = "async_operationsid"
+	openshiftClusterMetricName  = "openshiftcluster"
+	rpVersionMetricName         = openshiftClusterMetricName + "." + "rpversion"
+	ocpVersionMetricName        = openshiftClusterMetricName + "." + "ocpversion"
+	clusterNameMetricName       = openshiftClusterMetricName + "." + "clustername"
+	clusterIdMetricName         = openshiftClusterMetricName + "." + "clusterid"
+	resourceGroupMetricName     = openshiftClusterMetricName + "." + "resourcegroup"
+	locationMetricName          = openshiftClusterMetricName + "." + "location"
+	resourceIdMetricName        = "resourceid"
+	subscriptionIdMetricName    = "subscriptionid"
+
+	correlationDataMetricName                = "correlationdata"
+	correlationDataRequestIdMetricName       = correlationDataMetricName + "." + "requestid"
+	correlationDataClientRequestIdMetricName = correlationDataMetricName + "." + "client_requestid"
+	correlationDataIdMetricName              = correlationDataMetricName + "." + "correlationid"
+
+	operationTypeMetricName     = "operationtype"
+	provisioningStateMetricName = "provisioningstate"
+	resultTypeMetricName        = "resulttype"
+)
diff --git a/pkg/backend/metrics_provisioning.go b/pkg/backend/metrics_provisioning.go
new file mode 100644
index 00000000000..ce7bad07039
--- /dev/null
+++ b/pkg/backend/metrics_provisioning.go
@@ -0,0 +1,31 @@
+package backend
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"time"
+
+	"github.com/Azure/ARO-RP/pkg/api"
+)
+
+// emitProvisioningMetrics emits the milliseconds elapsed since
+// doc.CorrelationData.RequestTime and a count of one, both tagged with the old
+// and new provisioning states. It emits nothing without correlation data.
+func (ocb *openShiftClusterBackend) emitProvisioningMetrics(doc *api.OpenShiftClusterDocument, provisioningState api.ProvisioningState) {
+	if doc.CorrelationData == nil {
+		return
+	}
+
+	duration := time.Since(doc.CorrelationData.RequestTime).Milliseconds()
+
+	ocb.m.EmitGauge("backend.openshiftcluster.duration", duration, map[string]string{
+		"oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState),
+		"newProvisioningState": string(provisioningState),
+	})
+
+	ocb.m.EmitGauge("backend.openshiftcluster.count", 1, map[string]string{
+		"oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState),
+		"newProvisioningState": string(provisioningState),
+	})
+}
diff --git a/pkg/backend/metrics_test.go b/pkg/backend/metrics_test.go
new file mode 100644
index 00000000000..8d73765faa8
--- /dev/null
+++ b/pkg/backend/metrics_test.go
@@ -0,0 +1,220 @@
+package backend
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+ +import ( + "fmt" + "reflect" + "testing" + + "github.com/golang/mock/gomock" + "github.com/sirupsen/logrus" + + "github.com/Azure/ARO-RP/pkg/api" + mock_env "github.com/Azure/ARO-RP/pkg/util/mocks/env" + mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics" +) + +func TestEmitMetrics(t *testing.T) { + controller := gomock.NewController(t) + emitter := mock_metrics.NewMockEmitter(controller) + env := mock_env.NewMockInterface(controller) + env.EXPECT().SubscriptionID().AnyTimes() + env.EXPECT().Domain().AnyTimes() + + log := logrus.NewEntry(&logrus.Logger{}) + + b := &backend{ + baseLog: log, + env: env, + m: emitter, + } + ocb := newOpenShiftClusterBackend(b) + + mockSubID := "00000000-0000-0000-0000-000000000000" + resourceGroup := "resourceGroup" + resourceID := fmt.Sprintf("/subscriptions/%s/resourcegroups/%s/providers/Microsoft.RedHatOpenShift/openShiftClusters/resourceName", mockSubID, resourceGroup) + + for _, tt := range []struct { + name string + operationType api.ProvisioningState + provisioningState api.ProvisioningState + doc *api.OpenShiftClusterDocument + backendErr error + managedDomain bool + }{ + { + name: "Pass default cluster install", + doc: &api.OpenShiftClusterDocument{ + CorrelationData: &api.CorrelationData{ + CorrelationID: "id", + ClientRequestID: "client request id", + RequestID: "request id", + }, + ResourceID: resourceID, + OpenShiftCluster: &api.OpenShiftCluster{ + Location: "eastus", + Properties: api.OpenShiftClusterProperties{ + ClusterProfile: api.ClusterProfile{ + Domain: "cluster.domain.example", + PullSecret: api.SecureString("super secret"), + FipsValidatedModules: api.FipsValidatedModulesDisabled, + }, + NetworkProfile: api.NetworkProfile{ + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 1, + }, + }, + PodCIDR: podCidrDefaultValue, + ServiceCIDR: serviceCidrDefaultValue, + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + }, + OperatorFlags: 
api.OperatorFlags{"testFlag": "true"}, + WorkerProfiles: api.ExampleOpenShiftClusterDocument().OpenShiftCluster.Properties.WorkerProfiles, + MasterProfile: api.ExampleOpenShiftClusterDocument().OpenShiftCluster.Properties.MasterProfile, + ServicePrincipalProfile: &api.ServicePrincipalProfile{}, + IngressProfiles: api.ExampleOpenShiftClusterDocument().OpenShiftCluster.Properties.IngressProfiles, + FeatureProfile: api.FeatureProfile{ + GatewayEnabled: true, + }, + }, + }, + }, + operationType: api.ProvisioningStateCreating, + provisioningState: api.ProvisioningStateSucceeded, + }, + { + name: "Pass workload identity cluster install", + doc: &api.OpenShiftClusterDocument{ + CorrelationData: &api.CorrelationData{ + CorrelationID: "id", + ClientRequestID: "client request id", + RequestID: "request id", + }, + ResourceID: resourceID, + OpenShiftCluster: &api.OpenShiftCluster{ + Location: "eastus", + Tags: map[string]string{"tag1": "true"}, + Properties: api.OpenShiftClusterProperties{ + ClusterProfile: api.ClusterProfile{ + Domain: "cluster.domain.example", + PullSecret: api.SecureString("super secret"), + FipsValidatedModules: api.FipsValidatedModulesEnabled, + }, + NetworkProfile: api.NetworkProfile{ + LoadBalancerProfile: &api.LoadBalancerProfile{}, + PodCIDR: "10.128.0.1/14", + ServiceCIDR: "172.30.0.1/16", + PreconfiguredNSG: api.PreconfiguredNSGEnabled, + }, + OperatorFlags: api.OperatorFlags{"testFlag": "true"}, + WorkerProfiles: []api.WorkerProfile{ + { + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostEnabled, + }, + }, + MasterProfile: api.MasterProfile{ + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostEnabled, + }, + PlatformWorkloadIdentityProfile: &api.PlatformWorkloadIdentityProfile{}, + IngressProfiles: []api.IngressProfile{ + { + Name: "PrivateIngressProfile", + Visibility: api.VisibilityPrivate, + }, + }, + FeatureProfile: api.FeatureProfile{ + GatewayEnabled: true, + 
}, + }, + }, + }, + operationType: api.ProvisioningStateCreating, + provisioningState: api.ProvisioningStateSucceeded, + }, + { + name: "Pass backend error", + backendErr: &api.CloudError{ + StatusCode: 200, + }, + doc: &api.OpenShiftClusterDocument{ + CorrelationData: &api.CorrelationData{ + CorrelationID: "id", + ClientRequestID: "client request id", + RequestID: "request id", + }, + ResourceID: resourceID, + OpenShiftCluster: &api.OpenShiftCluster{ + Location: "eastus", + Tags: map[string]string{"tag1": "true"}, + Properties: api.OpenShiftClusterProperties{ + ClusterProfile: api.ClusterProfile{ + Domain: "cluster", + PullSecret: api.SecureString("super secret"), + FipsValidatedModules: api.FipsValidatedModulesEnabled, + }, + NetworkProfile: api.NetworkProfile{ + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 1, + }, + }, + PodCIDR: "10.128.0.1/14", + ServiceCIDR: "172.30.0.1/16", + }, + OperatorFlags: api.OperatorFlags{"testFlag": "true"}, + WorkerProfiles: []api.WorkerProfile{ + { + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostDisabled, + }, + }, + MasterProfile: api.MasterProfile{ + DiskEncryptionSetID: "testing/disk/encryptionset", + EncryptionAtHost: api.EncryptionAtHostDisabled, + }, + PlatformWorkloadIdentityProfile: &api.PlatformWorkloadIdentityProfile{}, + IngressProfiles: []api.IngressProfile{ + { + Name: "EmptyIngressProfile", + }, + }, + FeatureProfile: api.FeatureProfile{ + GatewayEnabled: true, + }, + }, + }, + }, + operationType: api.ProvisioningStateCreating, + provisioningState: api.ProvisioningStateSucceeded, + managedDomain: true, + }, + } { + t.Run(tt.name, func(t *testing.T) { + if tt.managedDomain { + t.Setenv("DOMAIN_NAME", "aro-managed.example") + } + + dimensions := map[string]string{} + ocb.gatherOperationMetrics(log, tt.operationType, tt.provisioningState, tt.backendErr, dimensions) + ocb.gatherCorrelationID(log, tt.doc, dimensions) + 
ocb.gatherMiscMetrics(log, tt.doc, dimensions) + ocb.gatherAuthMetrics(log, tt.doc, dimensions) + ocb.gatherNetworkMetrics(log, tt.doc, dimensions) + ocb.gatherNodeMetrics(log, tt.doc, dimensions) + + emitter.EXPECT().EmitGauge(ocb.getMetricName(tt.operationType), metricValue, dimensions).MaxTimes(1) + + d := ocb.emitMetrics(log, tt.doc, tt.operationType, tt.provisioningState, tt.backendErr) + + ok := reflect.DeepEqual(dimensions, d) + if !ok { + t.Errorf("%s != %s", dimensions, d) + } + }) + } +} diff --git a/pkg/backend/openshiftcluster.go b/pkg/backend/openshiftcluster.go index 2ad95b6f5f1..a7f9fddc9d5 100644 --- a/pkg/backend/openshiftcluster.go +++ b/pkg/backend/openshiftcluster.go @@ -58,7 +58,7 @@ func (ocb *openShiftClusterBackend) try(ctx context.Context) (bool, error) { if doc.Dequeues > maxDequeueCount { err := fmt.Errorf("dequeued %d times, failing", doc.Dequeues) - return true, ocb.endLease(ctx, log, nil, doc, api.ProvisioningStateFailed, err) + return true, ocb.endLease(ctx, log, nil, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err) } log.Print("dequeued") @@ -131,7 +131,7 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr m, err := ocb.newManager(ctx, log, ocb.env, ocb.dbOpenShiftClusters, ocb.dbGateway, ocb.dbOpenShiftVersions, ocb.aead, ocb.billing, doc, subscriptionDoc, hr, ocb.m) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, api.ProvisioningStateFailed, err) } switch doc.OpenShiftCluster.Properties.ProvisioningState { @@ -140,7 +140,7 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr err = m.Install(ctx) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err) } // re-get document and check the state: // 
if Install = nil, we are done with the install. @@ -148,12 +148,12 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr // backend worker to pick up next install phase doc, err = ocb.dbOpenShiftClusters.Get(ctx, strings.ToLower(doc.OpenShiftCluster.ID)) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateFailed, err) } if doc.OpenShiftCluster.Properties.Install == nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateSucceeded, nil) } - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateCreating, api.ProvisioningStateCreating, nil) case api.ProvisioningStateAdminUpdating: log.Printf("admin updating (type: %s)", doc.OpenShiftCluster.Properties.MaintenanceTask) @@ -161,23 +161,23 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr err = m.AdminUpdate(ctx) if err != nil { // Customer will continue to see the cluster in an ongoing maintenance state - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err) } // Maintenance task is complete, so we can clear the maintenance state doc, err = ocb.setNoMaintenanceState(ctx, doc) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateFailed, err) } - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateAdminUpdating, api.ProvisioningStateSucceeded, nil) case 
api.ProvisioningStateUpdating: log.Print("updating") err = m.Update(ctx) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateFailed, err) } - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateUpdating, api.ProvisioningStateSucceeded, nil) case api.ProvisioningStateDeleting: log.Print("deleting") @@ -185,12 +185,12 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr err = m.Delete(ctx) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err) } err = ocb.updateAsyncOperation(ctx, log, doc.AsyncOperationID, nil, api.ProvisioningStateSucceeded, "", nil) if err != nil { - return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateDeleting, api.ProvisioningStateFailed, err) } stop() @@ -288,7 +288,7 @@ func (ocb *openShiftClusterBackend) updateAsyncOperation(ctx context.Context, lo return nil } -func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.Entry, stop func(), doc *api.OpenShiftClusterDocument, provisioningState api.ProvisioningState, backendErr error) error { +func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.Entry, stop func(), doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) error { var adminUpdateError *string var failedProvisioningState api.ProvisioningState initialProvisioningState := doc.OpenShiftCluster.Properties.ProvisioningState @@ -306,7 +306,8 @@ func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.En return err } 
ocb.asyncOperationResultLog(log, initialProvisioningState, backendErr) - ocb.emitMetrics(doc, provisioningState) + ocb.emitMetrics(log, doc, operationType, provisioningState, nil) + ocb.emitProvisioningMetrics(doc, provisioningState) } if initialProvisioningState == api.ProvisioningStateAdminUpdating { @@ -324,6 +325,8 @@ func (ocb *openShiftClusterBackend) endLease(ctx context.Context, log *logrus.En stop() } + ocb.emitMetrics(log, doc, operationType, provisioningState, nil) + _, err := ocb.dbOpenShiftClusters.EndLease(ctx, doc.Key, provisioningState, failedProvisioningState, adminUpdateError) return err } @@ -362,24 +365,6 @@ func (ocb *openShiftClusterBackend) asyncOperationResultLog(log *logrus.Entry, i log.Info("long running operation failed") } -func (ocb *openShiftClusterBackend) emitMetrics(doc *api.OpenShiftClusterDocument, provisioningState api.ProvisioningState) { - if doc.CorrelationData == nil { - return - } - - duration := time.Since(doc.CorrelationData.RequestTime).Milliseconds() - - ocb.m.EmitGauge("backend.openshiftcluster.duration", duration, map[string]string{ - "oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState), - "newProvisioningState": string(provisioningState), - }) - - ocb.m.EmitGauge("backend.openshiftcluster.count", 1, map[string]string{ - "oldProvisioningState": string(doc.OpenShiftCluster.Properties.ProvisioningState), - "newProvisioningState": string(provisioningState), - }) -} - func (ocb *openShiftClusterBackend) setNoMaintenanceState(ctx context.Context, doc *api.OpenShiftClusterDocument) (*api.OpenShiftClusterDocument, error) { return ocb.dbOpenShiftClusters.Patch(ctx, doc.Key, func(doc *api.OpenShiftClusterDocument) error { doc.OpenShiftCluster.Properties.MaintenanceState = api.MaintenanceStateNone diff --git a/pkg/backend/openshiftcluster_test.go b/pkg/backend/openshiftcluster_test.go index a8de5fce56d..eff59b5e98b 100644 --- a/pkg/backend/openshiftcluster_test.go +++ 
b/pkg/backend/openshiftcluster_test.go @@ -51,6 +51,11 @@ func TestBackendTry(t *testing.T) { fixture: func(f *testdatabase.Fixture) { f.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{ Key: strings.ToLower(resourceID), + CorrelationData: &api.CorrelationData{ + CorrelationID: "correlationId", + ClientRequestID: "clientRequestId", + RequestID: "requestId", + }, OpenShiftCluster: &api.OpenShiftCluster{ ID: resourceID, Name: "resourceName", @@ -58,6 +63,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeUserDefinedRouting, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -78,6 +94,17 @@ func TestBackendTry(t *testing.T) { Install: &api.Install{ Phase: api.InstallPhaseBootstrap, }, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeUserDefinedRouting, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -104,6 +131,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -121,6 +159,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: 
api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateSucceeded, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -147,6 +196,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -166,6 +226,17 @@ func TestBackendTry(t *testing.T) { Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateFailed, FailedProvisioningState: api.ProvisioningStateCreating, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -192,6 +263,17 @@ func TestBackendTry(t *testing.T) { LastAdminUpdateError: "oh no", MaintenanceTask: api.MaintenanceTaskEverything, MaintenanceState: api.MaintenanceStateUnplanned, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -210,6 +292,17 @@ func TestBackendTry(t 
*testing.T) { Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateSucceeded, MaintenanceState: api.MaintenanceStateNone, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -234,6 +327,17 @@ func TestBackendTry(t *testing.T) { FailedProvisioningState: api.ProvisioningStateUpdating, MaintenanceTask: api.MaintenanceTaskEverything, MaintenanceState: api.MaintenanceStateUnplanned, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -254,6 +358,17 @@ func TestBackendTry(t *testing.T) { FailedProvisioningState: api.ProvisioningStateUpdating, LastAdminUpdateError: "oh no!", MaintenanceState: api.MaintenanceStateUnplanned, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + Count: 0, + }, + }, + }, }, }, }) @@ -274,6 +389,17 @@ func TestBackendTry(t *testing.T) { Location: "location", Properties: api.OpenShiftClusterProperties{ ProvisioningState: api.ProvisioningStateDeleting, + NetworkProfile: api.NetworkProfile{ + PodCIDR: "10.128.0.0/14", + ServiceCIDR: "172.30.0.0/16", + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + OutboundType: api.OutboundTypeLoadbalancer, + LoadBalancerProfile: &api.LoadBalancerProfile{ + ManagedOutboundIPs: &api.ManagedOutboundIPs{ + 
Count: 0, + }, + }, + }, }, }, }) @@ -297,6 +423,7 @@ func TestBackendTry(t *testing.T) { manager := mock_cluster.NewMockInterface(controller) _env := mock_env.NewMockInterface(controller) _env.EXPECT().LiveConfig().AnyTimes().Return(tlc) + _env.EXPECT().SubscriptionID().AnyTimes().Return(mockSubID) dbOpenShiftClusters, clientOpenShiftClusters := testdatabase.NewFakeOpenShiftClusters() dbSubscriptions, _ := testdatabase.NewFakeSubscriptions()