Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emit Cluster Feature Metrics for Cluster Operations #3631

Merged
merged 2 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 242 additions & 0 deletions pkg/backend/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
package backend

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"fmt"
"strconv"

"github.com/sirupsen/logrus"

"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/util/dns"
utillog "github.com/Azure/ARO-RP/pkg/util/log"
)

// emitMetrics builds the full set of metric dimensions for a finished cluster
// operation, logs each dimension, and emits a single gauge named after the
// operation type with those dimensions attached.
//
// The populated dimensions map is returned so that tests can inspect it.
func (ocb *openShiftClusterBackend) emitMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, operationType, provisioningState api.ProvisioningState, backendErr error) map[string]string {
	dimensions := make(map[string]string)

	// Operation-level dimensions do not depend on the document.
	ocb.gatherOperationMetrics(log, operationType, provisioningState, backendErr, dimensions)

	// Document-derived dimensions, gathered in a fixed order so that any
	// warnings are logged in a stable sequence.
	docGatherers := []func(*logrus.Entry, *api.OpenShiftClusterDocument, map[string]string){
		ocb.gatherCorrelationID,
		ocb.gatherMiscMetrics,
		ocb.gatherAuthMetrics,
		ocb.gatherNetworkMetrics,
		ocb.gatherNodeMetrics,
	}
	for _, gather := range docGatherers {
		gather(log, doc, dimensions)
	}

	ocb.logMetricDimensions(log, operationType, dimensions)

	metricName := ocb.getMetricName(operationType)
	ocb.m.EmitGauge(metricName, metricValue, dimensions)

	return dimensions
}

// getMetricName returns the gauge name for an operation: the metric package
// prefix and the operation type joined by a dot.
func (ocb *openShiftClusterBackend) getMetricName(operationType api.ProvisioningState) string {
	return metricPackage + "." + operationType.String()
}

// getResultType maps the backend error to a log result type.
//
// Only errors whose dynamic type is *api.CloudError carry a status code that
// can be mapped; for any other error (including nil) the zero ResultType is
// returned.
func (ocb *openShiftClusterBackend) getResultType(backendErr error) utillog.ResultType {
	var none utillog.ResultType

	cloudErr, ok := backendErr.(*api.CloudError)
	// The assertion also succeeds for an interface holding a nil
	// *api.CloudError; guard against it so reading StatusCode cannot panic.
	if !ok || cloudErr == nil {
		return none
	}

	return utillog.MapStatusCodeToResultType(cloudErr.StatusCode)
}

// getStringMetricValue returns value unchanged when it is non-empty.
// Otherwise it logs a warning naming the metric that could not be collected
// and returns the "empty" placeholder.
func (ocb *openShiftClusterBackend) getStringMetricValue(log *logrus.Entry, metricName, value string) string {
	if value == "" {
		log.Warnf("%s %s", metricFailToCollectErr, metricName)
		return empty
	}

	return value
}

// logMetricDimensions writes one info-level log line per dimension, prefixed
// with the metric package and operation type. Lines follow map iteration
// order, so their relative order is not stable between calls.
func (ocb *openShiftClusterBackend) logMetricDimensions(log *logrus.Entry, operationType api.ProvisioningState, dimensions map[string]string) {
	for name, val := range dimensions {
		log.Infof("%s.%s: %s = %s", metricPackage, operationType, name, val)
	}
}

// gatherCorrelationID records the correlation ID, client request ID and
// request ID from the document's correlation data. When the document has no
// correlation data, a single warning is logged and all three dimensions are
// set to the "empty" placeholder.
func (ocb *openShiftClusterBackend) gatherCorrelationID(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
	correlation := doc.CorrelationData
	if correlation == nil {
		log.Warnf("%s %s", metricFailToCollectErr, correlationDataMetricName)
		for _, name := range []string{
			correlationDataIdMetricName,
			correlationDataClientRequestIdMetricName,
			correlationDataRequestIdMetricName,
		} {
			dimensions[name] = empty
		}
		return
	}

	dimensions[correlationDataIdMetricName] = ocb.getStringMetricValue(log, correlationDataIdMetricName, correlation.CorrelationID)
	dimensions[correlationDataClientRequestIdMetricName] = ocb.getStringMetricValue(log, correlationDataClientRequestIdMetricName, correlation.ClientRequestID)
	dimensions[correlationDataRequestIdMetricName] = ocb.getStringMetricValue(log, correlationDataRequestIdMetricName, correlation.RequestID)
}

// gatherOperationMetrics records the operation type, the provisioning state
// and the result type derived from backendErr.
func (ocb *openShiftClusterBackend) gatherOperationMetrics(log *logrus.Entry, operationType, provisioningState api.ProvisioningState, backendErr error, dimensions map[string]string) {
	resultType := string(ocb.getResultType(backendErr))

	// operationType and provisioningState are supplied internally by
	// endLease and are not expected to be "", so they skip the empty check.
	dimensions[operationTypeMetricName] = operationType.String()
	dimensions[provisioningStateMetricName] = provisioningState.String()

	// The result type is empty whenever backendErr is not an
	// *api.CloudError, so it goes through the empty-value fallback.
	dimensions[resultTypeMetricName] = ocb.getStringMetricValue(log, resultTypeMetricName, resultType)
}

// gatherMiscMetrics records identifying and general-purpose dimensions:
// subscription and resource IDs, cluster name/ID/location, OCP and RP
// versions, the cluster resource group, one dimension per operator flag, the
// async operation ID, the worker profile count and whether tags are set.
func (ocb *openShiftClusterBackend) gatherMiscMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
	cluster := doc.OpenShiftCluster

	// Plain string dimensions, processed in order so that warnings for
	// empty values are logged in a stable sequence.
	stringMetrics := []struct {
		name  string
		value string
	}{
		{subscriptionIdMetricName, ocb.env.SubscriptionID()},
		{resourceIdMetricName, doc.ResourceID},
		{clusterNameMetricName, cluster.Name},
		{clusterIdMetricName, cluster.ID},
		{locationMetricName, cluster.Location},
		{ocpVersionMetricName, cluster.Properties.ClusterProfile.Version},
		{rpVersionMetricName, cluster.Properties.ProvisionedBy},
		{resourecGroupMetricName, cluster.Properties.ClusterProfile.ResourceGroupID},
	}
	for _, m := range stringMetrics {
		dimensions[m.name] = ocb.getStringMetricValue(log, m.name, m.value)
	}

	// Each operator flag becomes its own dimension, keyed by the flag name.
	for flag, feature := range cluster.Properties.OperatorFlags {
		name := fmt.Sprintf("%s-%s", operatorFlagsMetricName, flag)
		dimensions[name] = ocb.getStringMetricValue(log, name, feature)
	}

	dimensions[asyncOperationsIdMetricName] = ocb.getStringMetricValue(log, asyncOperationsIdMetricName, doc.AsyncOperationID)

	if cluster.Properties.WorkerProfiles == nil {
		// Passing "" forces the warning and the "empty" placeholder.
		dimensions[workerProfileCountMetricName] = ocb.getStringMetricValue(log, workerProfileCountMetricName, "")
	} else {
		dimensions[workerProfileCountMetricName] = strconv.Itoa(len(cluster.Properties.WorkerProfiles))
	}

	// Only the presence of tags is reported, never their content.
	tagsState := disabled
	if cluster.Tags != nil {
		tagsState = enabled
	}
	dimensions[tagsMetricName] = tagsState
}

// gatherNodeMetrics records master and worker node dimensions plus the FIPS
// setting:
//   - whether a master disk encryption set is configured,
//   - the master VM size and encryption-at-host setting,
//   - the VM size, disk size and encryption-at-host setting of the first
//     worker profile (no worker dimensions are written when there are no
//     worker profiles),
//   - whether FIPS validated modules are enabled.
//
// Enum-like settings that hold neither the enabled nor the disabled value are
// reported as "unknown" and a warning is logged.
func (ocb *openShiftClusterBackend) gatherNodeMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
	mp := doc.OpenShiftCluster.Properties.MasterProfile

	// Only the presence of a disk encryption set is reported, not its ID.
	if mp.DiskEncryptionSetID != "" {
		dimensions[masterProfileEncryptionSetIdMetricName] = enabled
	} else {
		dimensions[masterProfileEncryptionSetIdMetricName] = disabled
	}

	dimensions[masterProfileVmSizeMetricName] = ocb.getStringMetricValue(log, masterProfileVmSizeMetricName, string(mp.VMSize))

	switch mp.EncryptionAtHost {
	case api.EncryptionAtHostEnabled, api.EncryptionAtHostDisabled:
		dimensions[masterEncryptionAtHostMetricName] = string(mp.EncryptionAtHost)
	default:
		log.Warnf("%s %s", metricFailToCollectErr, masterEncryptionAtHostMetricName)
		dimensions[masterEncryptionAtHostMetricName] = unknown
	}

	// Only the first worker profile is sampled.
	if len(doc.OpenShiftCluster.Properties.WorkerProfiles) > 0 {
		wp := doc.OpenShiftCluster.Properties.WorkerProfiles[0]
		dimensions[workerVmSizeMetricName] = ocb.getStringMetricValue(log, workerVmSizeMetricName, string(wp.VMSize))
		dimensions[workerVmDiskSizeMetricName] = strconv.FormatInt(int64(wp.DiskSizeGB), 10)

		switch wp.EncryptionAtHost {
		case api.EncryptionAtHostEnabled, api.EncryptionAtHostDisabled:
			dimensions[workerEncryptionAtHostMetricName] = string(wp.EncryptionAtHost)
		default:
			log.Warnf("%s %s", metricFailToCollectErr, workerEncryptionAtHostMetricName)
			dimensions[workerEncryptionAtHostMetricName] = unknown
		}
	}

	fips := doc.OpenShiftCluster.Properties.ClusterProfile.FipsValidatedModules
	switch fips {
	case api.FipsValidatedModulesEnabled, api.FipsValidatedModulesDisabled:
		dimensions[fipsMetricName] = string(fips)
	default:
		log.Warnf("%s %s", metricFailToCollectErr, fipsMetricName)
		dimensions[fipsMetricName] = unknown
	}
}

// gatherAuthMetrics records which identity type the cluster uses and whether
// a pull secret is configured.
//
// A platform workload identity profile takes precedence over a service
// principal profile; when neither is set the identity is reported as
// "unknown" and a warning is logged.
func (ocb *openShiftClusterBackend) gatherAuthMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
	switch {
	case doc.OpenShiftCluster.Properties.PlatformWorkloadIdentityProfile != nil:
		dimensions[clusterIdentityMetricName] = clusterIdentityManagedIdMetricName
	case doc.OpenShiftCluster.Properties.ServicePrincipalProfile != nil:
		dimensions[clusterIdentityMetricName] = clusterIdentityServicePrincipalMetricName
	default:
		log.Warnf("%s %s", metricFailToCollectErr, clusterIdentityMetricName)
		dimensions[clusterIdentityMetricName] = unknown
	}

	// Only the presence of a pull secret is reported, never its value.
	pullSecretState := disabled
	if doc.OpenShiftCluster.Properties.ClusterProfile.PullSecret != "" {
		pullSecretState = enabled
	}
	dimensions[pullSecretMetricName] = pullSecretState
}

// gatherNetworkMetrics records networking-related dimensions: ingress
// visibility, outbound type, whether the pod and service CIDRs differ from
// the defaults, the domain type, the managed outbound IP count, the
// preconfigured NSG setting and whether the gateway feature is enabled.
//
// Enum-like settings that hold none of the recognised values are reported as
// "unknown" and a warning naming the affected metric is logged.
func (ocb *openShiftClusterBackend) gatherNetworkMetrics(log *logrus.Entry, doc *api.OpenShiftClusterDocument, dimensions map[string]string) {
	// Every ingress profile writes the same dimension key, so the last
	// profile in the slice determines the reported value.
	for _, p := range doc.OpenShiftCluster.Properties.IngressProfiles {
		switch p.Visibility {
		case api.VisibilityPrivate, api.VisibilityPublic:
			dimensions[ingressProfileMetricName] = fmt.Sprintf("%s.%s", string(p.Visibility), p.Name)
		default:
			log.Warnf("%s %s", metricFailToCollectErr, ingressProfileMetricName)
			dimensions[ingressProfileMetricName] = unknown
		}
	}

	np := doc.OpenShiftCluster.Properties.NetworkProfile

	switch np.OutboundType {
	case api.OutboundTypeUserDefinedRouting, api.OutboundTypeLoadbalancer:
		dimensions[networkProfileOutboundTypeMetricName] = string(np.OutboundType)
	default:
		// Log the outbound-type metric name here; the previous code named
		// the managed-outbound-IPs metric in this warning by mistake.
		log.Warnf("%s %s", metricFailToCollectErr, networkProfileOutboundTypeMetricName)
		dimensions[networkProfileOutboundTypeMetricName] = unknown
	}

	if np.PodCIDR != podCidrDefaultValue {
		dimensions[podCidrMetricName] = custom
	} else {
		dimensions[podCidrMetricName] = defaultSet
	}

	if np.ServiceCIDR != serviceCidrDefaultValue {
		dimensions[serviceCidrMetricName] = custom
	} else {
		dimensions[serviceCidrMetricName] = defaultSet
	}

	// NOTE(review): a non-empty result from dns.ManagedDomain is reported as
	// "Custom" and an empty one as "managed". If ManagedDomain returns the
	// managed FQDN for domains we manage (as its name suggests), these two
	// branches are inverted — confirm against the dns package before relying
	// on this dimension.
	domain, err := dns.ManagedDomain(ocb.env, doc.OpenShiftCluster.Properties.ClusterProfile.Domain)
	switch {
	case err != nil:
		dimensions[clusterProfileDomainMetricName] = empty
		log.Warnf("%s %s, due to %s", metricFailToCollectErr, clusterProfileDomainMetricName, err.Error())
	case domain != "":
		dimensions[clusterProfileDomainMetricName] = custom
	default:
		dimensions[clusterProfileDomainMetricName] = managed
	}

	// NOTE(review): if LoadBalancerProfile is a pointer field, this
	// dereference panics when it is nil — confirm and guard if so.
	if np.LoadBalancerProfile.ManagedOutboundIPs != nil {
		dimensions[networkProfileManagedOutboundIpsMetricName] = strconv.FormatInt(int64(np.LoadBalancerProfile.ManagedOutboundIPs.Count), 10)
	} else {
		log.Warnf("%s %s", metricFailToCollectErr, networkProfileManagedOutboundIpsMetricName)
		dimensions[networkProfileManagedOutboundIpsMetricName] = unknown
	}

	switch np.PreconfiguredNSG {
	case api.PreconfiguredNSGEnabled, api.PreconfiguredNSGDisabled:
		dimensions[networkProfilePreConfiguredNSGMetricName] = string(np.PreconfiguredNSG)
	default:
		log.Warnf("%s %s", metricFailToCollectErr, networkProfilePreConfiguredNSGMetricName)
		dimensions[networkProfilePreConfiguredNSGMetricName] = unknown
	}

	if doc.OpenShiftCluster.Properties.FeatureProfile.GatewayEnabled {
		dimensions[featureProfileGatewayEnabledMetricName] = enabled
	} else {
		dimensions[featureProfileGatewayEnabledMetricName] = disabled
	}
}
78 changes: 78 additions & 0 deletions pkg/backend/metrics_const.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package backend

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

// Metric names, dimension keys and placeholder dimension values used when
// emitting cluster operation metrics from the backend.
const (
// metricPackage prefixes every emitted metric name; metricValue is the
// constant gauge value sent with each emission.
metricPackage = "backend.openshiftcluster"
metricValue int64 = 1
// Placeholder dimension values. "empty" stands in for a string that could
// not be collected; "unknown" for an enum-like field holding none of the
// recognised values.
enabled = "Enabled"
disabled = "Disabled"
custom = "Custom"
defaultSet = "Default"
unknown = "unknown"
empty = "empty"
managed = "managed"
// Prefix of the warning logged when a metric value cannot be collected.
metricFailToCollectErr = "failed to collect metric:"

// Suffixes shared by the master and worker profile dimension keys.
encryptionAtHostMetricName = "encryptionathost"
diskSizeMetricName = "disksize"
vmSizeMetricName = "vmsize"
countMetricName = "count"

// Worker profile dimension keys.
// NOTE(review): "workprofile" looks like a typo for "workerprofile"; it is
// part of the emitted dimension keys, so confirm with consumers before
// changing it.
workerProfileMetricName = "workprofile"
workerVmSizeMetricName = workerProfileMetricName + "." + vmSizeMetricName
workerVmDiskSizeMetricName = workerProfileMetricName + "." + diskSizeMetricName
workerEncryptionAtHostMetricName = workerProfileMetricName + "." + encryptionAtHostMetricName
workerProfileCountMetricName = workerProfileMetricName + "." + countMetricName

// Master profile dimension keys.
masterProfileMetricName = "masterprofile"
masterEncryptionAtHostMetricName = masterProfileMetricName + "." + encryptionAtHostMetricName
masterProfileEncryptionSetIdMetricName = masterProfileMetricName + "." + "diskencryptionsetid"
masterProfileVmSizeMetricName = masterProfileMetricName + "." + vmSizeMetricName

// FIPS, cluster identity and pull secret dimension keys. The two
// clusterIdentity*MetricName constants below are used as dimension values
// for clusterIdentityMetricName rather than as keys.
fipsMetricName = "fips"
clusterIdentityMetricName = "clusteridentity"
clusterIdentityManagedIdMetricName = managed + "id"
clusterIdentityServicePrincipalMetricName = "serviceprincipal"
pullSecretMetricName = "pullsecret"

// Network dimension keys, and the CIDR values that the cluster's pod and
// service CIDRs are compared against to decide between "Default" and
// "Custom".
ingressProfileMetricName = "ingressprofile"
networkProfileMetricName = "networkprofile"
networkProfileOutboundTypeMetricName = networkProfileMetricName + "." + "outboundtype"
networkProfileManagedOutboundIpsMetricName = networkProfileMetricName + "." + "managedoutboundips"
networkProfilePreConfiguredNSGMetricName = networkProfileMetricName + "." + "preconfigurednsg"
podCidrMetricName = networkProfileMetricName + "." + "podcidr"
serviceCidrMetricName = networkProfileMetricName + "." + "servicecidr"
podCidrDefaultValue = "10.128.0.0/14"
serviceCidrDefaultValue = "172.30.0.0/16"

// Feature profile dimension keys.
featureProfileMetricName = "featureprofile"
featureProfileGatewayEnabledMetricName = featureProfileMetricName + "." + "gatewayenabled"

// Cluster profile dimension keys.
clusterProfileMetricName = "clusterprofile"
clusterProfileDomainMetricName = clusterProfileMetricName + "." + "domain"

// Tag presence key and the prefix for per-flag operator flag keys
// ("operatorflags-<flag>").
tagsMetricName = "tags"
operatorFlagsMetricName = "operatorflags"

// Identifying dimension keys for the cluster and its request.
// NOTE(review): resourecGroupMetricName is misspelled ("resourec"); renaming
// it requires updating every reference in the package at the same time.
asyncOperationsIdMetricName = "async_operationsid"
openshiftClusterMetricName = "openshiftcluster"
rpVersionMetricName = openshiftClusterMetricName + "." + "rpversion"
ocpVersionMetricName = openshiftClusterMetricName + "." + "ocpversion"
clusterNameMetricName = openshiftClusterMetricName + "." + "clustername"
clusterIdMetricName = openshiftClusterMetricName + "." + "clusterid"
resourecGroupMetricName = openshiftClusterMetricName + "." + "resourcegroup"
locationMetricName = openshiftClusterMetricName + "." + "location"
resourceIdMetricName = "resourceid"
subscriptionIdMetricName = "subscriptionid"

// Correlation data dimension keys.
correlationDataMetricName = "correlationdata"
correlationDataRequestIdMetricName = correlationDataMetricName + "." + "requestid"
correlationDataClientRequestIdMetricName = correlationDataMetricName + "." + "client_requestid"
correlationDataIdMetricName = correlationDataMetricName + "." + "correlationid"

// Operation outcome dimension keys.
operationTypeMetricName = "operationtype"
provisioningStateMetricName = "provisioningstate"
resultTypeMetricName = "resulttype"
)
28 changes: 28 additions & 0 deletions pkg/backend/metrics_provisioning.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package backend

import (
"time"

"github.com/Azure/ARO-RP/pkg/api"
)

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

// emitProvisioningMetrics emits two gauges describing a provisioning state
// transition: the time elapsed since the request time, in milliseconds, and
// a count of 1. Both carry the old and new provisioning states as
// dimensions.
//
// Nothing is emitted when the document has no correlation data, because the
// request time is read from it.
func (ocb *openShiftClusterBackend) emitProvisioningMetrics(doc *api.OpenShiftClusterDocument, provisioningState api.ProvisioningState) {
	correlation := doc.CorrelationData
	if correlation == nil {
		return
	}

	elapsedMs := time.Since(correlation.RequestTime).Milliseconds()
	oldState := string(doc.OpenShiftCluster.Properties.ProvisioningState)
	newState := string(provisioningState)

	// A fresh map is built for each emission so the two gauges never share
	// a dimensions map.
	stateDimensions := func() map[string]string {
		return map[string]string{
			"oldProvisioningState": oldState,
			"newProvisioningState": newState,
		}
	}

	ocb.m.EmitGauge("backend.openshiftcluster.duration", elapsedMs, stateDimensions())
	ocb.m.EmitGauge("backend.openshiftcluster.count", 1, stateDimensions())
}
Loading
Loading