diff --git a/pkg/api/admin/openshiftcluster.go b/pkg/api/admin/openshiftcluster.go index d8e6fdb1f40..5992258432f 100644 --- a/pkg/api/admin/openshiftcluster.go +++ b/pkg/api/admin/openshiftcluster.go @@ -54,6 +54,7 @@ type OpenShiftClusterProperties struct { ImageRegistryStorageAccountName string `json:"imageRegistryStorageAccountName,omitempty"` InfraID string `json:"infraId,omitempty"` HiveProfile HiveProfile `json:"hiveProfile,omitempty"` + PucmPending bool `json:"pucmPending,omitempty"` } // ProvisioningState represents a provisioning state. @@ -81,9 +82,10 @@ const ( type MaintenanceTask string const ( - MaintenanceTaskEverything MaintenanceTask = "Everything" - MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate" - MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal" + MaintenanceTaskEverything MaintenanceTask = "Everything" + MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate" + MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal" + MaintenanceTaskPucmPending MaintenanceTask = "PucmPending" ) // Operator feature flags diff --git a/pkg/api/admin/openshiftcluster_validatestatic.go b/pkg/api/admin/openshiftcluster_validatestatic.go index cad10cbf2c9..2b21bc125c3 100644 --- a/pkg/api/admin/openshiftcluster_validatestatic.go +++ b/pkg/api/admin/openshiftcluster_validatestatic.go @@ -29,7 +29,15 @@ func (sv openShiftClusterStaticValidator) validateDelta(oc, current *OpenShiftCl return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodePropertyChangeNotAllowed, err.Target, err.Message) } - if !(oc.Properties.MaintenanceTask == "" || oc.Properties.MaintenanceTask == MaintenanceTaskEverything || oc.Properties.MaintenanceTask == MaintenanceTaskOperator || oc.Properties.MaintenanceTask == MaintenanceTaskRenewCerts) { + return validateMaintenanceTask(oc.Properties.MaintenanceTask) +} + +func validateMaintenanceTask(task MaintenanceTask) error { + if !(task == "" || + task == MaintenanceTaskEverything || + task == MaintenanceTaskOperator || + task == MaintenanceTaskRenewCerts || + task == MaintenanceTaskPucmPending) { return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidParameter, "properties.maintenanceTask", "Invalid enum parameter.") } diff --git a/pkg/api/admin/openshiftcluster_validatestatic_test.go b/pkg/api/admin/openshiftcluster_validatestatic_test.go index a9ce5135655..0eff0ddbd52 100644 --- a/pkg/api/admin/openshiftcluster_validatestatic_test.go +++ b/pkg/api/admin/openshiftcluster_validatestatic_test.go @@ -676,6 +676,19 @@ func TestOpenShiftClusterStaticValidateDelta(t *testing.T) { oc.Properties.MaintenanceTask = "" }, }, + { + name: "maintenanceTask change to PucmPending allowed", + oc: func() *OpenShiftCluster { + return &OpenShiftCluster{ + Properties: OpenShiftClusterProperties{ + MaintenanceTask: MaintenanceTaskPucmPending, + }, + } + }, + modify: func(oc *OpenShiftCluster) { + oc.Properties.MaintenanceTask = "" + }, + }, { name: "maintenanceTask change to other values is disallowed", oc: func() *OpenShiftCluster { diff --git a/pkg/api/openshiftcluster.go b/pkg/api/openshiftcluster.go index 4201c544f15..cb5e938cf28 100644 --- a/pkg/api/openshiftcluster.go +++ b/pkg/api/openshiftcluster.go @@ -157,6 +157,8 @@ type OpenShiftClusterProperties struct { RegistryProfiles []*RegistryProfile `json:"registryProfiles,omitempty"` HiveProfile HiveProfile `json:"hiveProfile,omitempty"` + + PucmPending bool `json:"pucmPending,omitempty"` } // ProvisioningState represents a provisioning state @@ -175,9 +177,10 @@ const ( type MaintenanceTask string const ( - MaintenanceTaskEverything MaintenanceTask = "Everything" - MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate" - MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal" + MaintenanceTaskEverything MaintenanceTask = "Everything" + MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate" + MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal" + MaintenanceTaskPucmPending MaintenanceTask = "PucmPending" ) // Cluster-scoped flags diff --git a/pkg/backend/openshiftcluster.go b/pkg/backend/openshiftcluster.go index 4c258699fe7..560b61c772c 100644 --- a/pkg/backend/openshiftcluster.go +++ b/pkg/backend/openshiftcluster.go @@ -161,6 +161,10 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr if err != nil { return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) } + doc, err = ocb.setNoPucmPending(ctx, doc) + if err != nil { + return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err) + } return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil) case api.ProvisioningStateUpdating: @@ -361,3 +365,10 @@ func (ocb *openShiftClusterBackend) emitMetrics(doc *api.OpenShiftClusterDocumen "newProvisioningState": string(provisioningState), }) } + +func (ocb *openShiftClusterBackend) setNoPucmPending(ctx context.Context, doc *api.OpenShiftClusterDocument) (*api.OpenShiftClusterDocument, error) { + return ocb.dbOpenShiftClusters.Patch(ctx, doc.Key, func(doc *api.OpenShiftClusterDocument) error { + doc.OpenShiftCluster.Properties.PucmPending = false + return nil + }) +} diff --git a/pkg/frontend/openshiftcluster_putorpatch.go b/pkg/frontend/openshiftcluster_putorpatch.go index 047118cbb69..0ce1146f24e 100644 --- a/pkg/frontend/openshiftcluster_putorpatch.go +++ b/pkg/frontend/openshiftcluster_putorpatch.go @@ -205,14 +205,7 @@ func (f *frontend) _putOrPatchOpenShiftCluster(ctx context.Context, log *logrus. } } else { doc.OpenShiftCluster.Properties.LastProvisioningState = doc.OpenShiftCluster.Properties.ProvisioningState - - // TODO: Get rid of the special case - if apiVersion == admin.APIVersion { - doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateAdminUpdating - doc.OpenShiftCluster.Properties.LastAdminUpdateError = "" - } else { - doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateUpdating - } + setUpdateProvisioningState(doc, apiVersion) doc.Dequeues = 0 } @@ -312,3 +305,22 @@ func (f *frontend) ValidateNewCluster(ctx context.Context, subscription *api.Sub return nil } + +// setUpdateProvisioningState Sets either the admin update or update provisioning state +func setUpdateProvisioningState(doc *api.OpenShiftClusterDocument, apiVersion string) { + switch apiVersion { + case admin.APIVersion: + // For PUCM pending update, we don't want to set ProvisioningStateAdminUpdating + // The cluster monitoring stack uses that value to determine if PUCM is ongoing + if doc.OpenShiftCluster.Properties.MaintenanceTask != api.MaintenanceTaskPucmPending { + doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateAdminUpdating + doc.OpenShiftCluster.Properties.LastAdminUpdateError = "" + } else { + doc.OpenShiftCluster.Properties.PucmPending = true + doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateUpdating + } + default: + // Non-admin update (ex: customer cluster update) + doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateUpdating + } +} diff --git a/pkg/frontend/openshiftcluster_putorpatch_test.go b/pkg/frontend/openshiftcluster_putorpatch_test.go index 02b329ff8dd..0755554bff3 100644 --- a/pkg/frontend/openshiftcluster_putorpatch_test.go +++ b/pkg/frontend/openshiftcluster_putorpatch_test.go @@ -557,6 +557,91 @@ func TestPutOrPatchOpenShiftClusterAdminAPI(t *testing.T) { wantStatusCode: http.StatusBadRequest, wantError: `400: PropertyChangeNotAllowed: properties.registryProfiles: Changing property 'properties.registryProfiles' is not allowed.`, }, + { + name: "patch a cluster with pucm pending request", + request: func(oc *admin.OpenShiftCluster) { + oc.Properties.MaintenanceTask = admin.MaintenanceTaskPucmPending + }, + isPatch: true, + fixture: func(f *testdatabase.Fixture) { + f.AddSubscriptionDocuments(&api.SubscriptionDocument{ + ID: mockSubID, + Subscription: &api.Subscription{ + State: api.SubscriptionStateRegistered, + }, + }) + f.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{ + Key: strings.ToLower(testdatabase.GetResourcePath(mockSubID, "resourceName")), + OpenShiftCluster: &api.OpenShiftCluster{ + ID: testdatabase.GetResourcePath(mockSubID, "resourceName"), + Type: "Microsoft.RedHatOpenShift/openShiftClusters", + Tags: map[string]string{"tag": "will-be-kept"}, + Properties: api.OpenShiftClusterProperties{ + ProvisioningState: api.ProvisioningStateSucceeded, + MaintenanceTask: "", + }, + }, + }) + }, + wantSystemDataEnriched: true, + wantEnriched: []string{testdatabase.GetResourcePath(mockSubID, "resourceName")}, + wantDocuments: func(c *testdatabase.Checker) { + c.AddAsyncOperationDocuments(&api.AsyncOperationDocument{ + OpenShiftClusterKey: strings.ToLower(testdatabase.GetResourcePath(mockSubID, "resourceName")), + AsyncOperation: &api.AsyncOperation{ + InitialProvisioningState: api.ProvisioningStateUpdating, + ProvisioningState: api.ProvisioningStateUpdating, + }, + }) + c.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{ + Key: strings.ToLower(testdatabase.GetResourcePath(mockSubID, "resourceName")), + OpenShiftCluster: &api.OpenShiftCluster{ + ID: testdatabase.GetResourcePath(mockSubID, "resourceName"), + Type: "Microsoft.RedHatOpenShift/openShiftClusters", + Tags: map[string]string{"tag": "will-be-kept"}, + Properties: api.OpenShiftClusterProperties{ + ProvisioningState: api.ProvisioningStateUpdating, + LastProvisioningState: api.ProvisioningStateSucceeded, + ClusterProfile: api.ClusterProfile{ + FipsValidatedModules: api.FipsValidatedModulesDisabled, + }, + MaintenanceTask: api.MaintenanceTaskPucmPending, + NetworkProfile: api.NetworkProfile{ + OutboundType: api.OutboundTypeLoadbalancer, + PreconfiguredNSG: api.PreconfiguredNSGDisabled, + }, + MasterProfile: api.MasterProfile{ + EncryptionAtHost: api.EncryptionAtHostDisabled, + }, + PucmPending: true, + OperatorFlags: api.DefaultOperatorFlags(), + }, + }, + }) + }, + wantAsync: true, + wantStatusCode: http.StatusOK, + wantResponse: &admin.OpenShiftCluster{ + ID: testdatabase.GetResourcePath(mockSubID, "resourceName"), + Type: "Microsoft.RedHatOpenShift/openShiftClusters", + Tags: map[string]string{"tag": "will-be-kept"}, + Properties: admin.OpenShiftClusterProperties{ + ProvisioningState: admin.ProvisioningStateUpdating, + LastProvisioningState: admin.ProvisioningStateSucceeded, + ClusterProfile: admin.ClusterProfile{ + FipsValidatedModules: admin.FipsValidatedModulesDisabled, + }, + MaintenanceTask: admin.MaintenanceTaskPucmPending, + NetworkProfile: admin.NetworkProfile{ + OutboundType: admin.OutboundTypeLoadbalancer, + }, + MasterProfile: admin.MasterProfile{ + EncryptionAtHost: admin.EncryptionAtHostDisabled, + }, + OperatorFlags: admin.OperatorFlags(api.DefaultOperatorFlags()), + }, + }, + }, } { t.Run(tt.name, func(t *testing.T) { ti := newTestInfra(t). diff --git a/pkg/monitor/cluster/cluster.go b/pkg/monitor/cluster/cluster.go index 228405061b7..209a550ead9 100644 --- a/pkg/monitor/cluster/cluster.go +++ b/pkg/monitor/cluster/cluster.go @@ -174,6 +174,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (errs []error) { mon.emitSummary, mon.emitHiveRegistrationStatus, mon.emitOperatorFlagsAndSupportBanner, + mon.emitPucmState, mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable } { err = f(ctx) diff --git a/pkg/monitor/cluster/maintenance.go b/pkg/monitor/cluster/maintenance.go new file mode 100644 index 00000000000..c70e1ed4662 --- /dev/null +++ b/pkg/monitor/cluster/maintenance.go @@ -0,0 +1,84 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + + "github.com/Azure/ARO-RP/pkg/api" +) + +/************************************************************** + Possible PUCM states: + + (1) PUCM pending + - We will do PUCM, so emit a maintenance pending signal + - Conditions: + * Field pucmPending is true + * Don't meet below conditions for in progress maintenance + + (2) Planned PUCM in progress + - Emit a planned maintenance in progress signal. + - If first PUCM attempt fails, leave cluster in this state + because we will need to retry PUCM in at a later time. + - Conditions: + * Field pucmPending is true + * One of: (a) provisoning state AdminUpdate or (2) AdminUpdate err is not nil + + (3) Unplanned PUCM in progress + - Emit an unplanned maintenance in progress signal. + - If first PUCM attempt fails, leave cluster in this state + because we will need to retry PUCM in at a later time. + - Conditions: + * Field pucmPending is false + * One of: (a) provisoning state AdminUpdate or (2) AdminUpdate err is not nil + + (4) No ongoinig or scheduled PUCM + - Don't emit a signal + - Conditions: + * Field pucmPending is false + * Provisioning state is not AdminUpdate and AdminUpdate err is not nil +**************************************************************/ + +type pucmState string + +func (p pucmState) String() string { + return string(p) +} + +const ( + pucmNone pucmState = "none" + pucmPending pucmState = "pending" + pucmPlanned pucmState = "planned" + pucmUnplanned pucmState = "unplanned" +) + +func (mon *Monitor) emitPucmState(ctx context.Context) error { + state := getPucmState(mon.oc.Properties) + mon.emitGauge("cluster.maintenance.pucm", 1, map[string]string{ + "state": state.String(), + }) + + return nil +} + +func getPucmState(clusterProperties api.OpenShiftClusterProperties) pucmState { + if pucmOngoing(clusterProperties) { + if clusterProperties.PucmPending { + return pucmPlanned + } + return pucmUnplanned + } + + if clusterProperties.PucmPending { + return pucmPending + } + + return pucmNone +} + +func pucmOngoing(clusterProperties api.OpenShiftClusterProperties) bool { + return clusterProperties.ProvisioningState == api.ProvisioningStateAdminUpdating || + clusterProperties.LastAdminUpdateError != "" +} diff --git a/pkg/monitor/cluster/maintenance_test.go b/pkg/monitor/cluster/maintenance_test.go new file mode 100644 index 00000000000..d9f79ee1fe9 --- /dev/null +++ b/pkg/monitor/cluster/maintenance_test.go @@ -0,0 +1,89 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "testing" + + "github.com/golang/mock/gomock" + + "github.com/Azure/ARO-RP/pkg/api" + mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics" +) + +func TestEmitPucmState(t *testing.T) { + for _, tt := range []struct { + name string + provisioningState api.ProvisioningState + pucmPending bool + adminUpdateErr string + expectedPucmState pucmState + }{ + { + name: "state none", + provisioningState: api.ProvisioningStateSucceeded, + expectedPucmState: pucmNone, + }, + { + name: "state pending", + provisioningState: api.ProvisioningStateSucceeded, + pucmPending: true, + expectedPucmState: pucmPending, + }, + { + name: "state unplanned - admin updating in flight and no admin update error", + provisioningState: api.ProvisioningStateAdminUpdating, + expectedPucmState: pucmUnplanned, + }, + { + name: "state planned - admin updating in flight and no admin update error", + provisioningState: api.ProvisioningStateAdminUpdating, + pucmPending: true, + expectedPucmState: pucmPlanned, + }, + { + name: "state unplanned - not admin updating but admin update error", + provisioningState: api.ProvisioningStateFailed, + adminUpdateErr: "PUCM failed", + expectedPucmState: pucmUnplanned, + }, + { + name: "state planned - not admin updating but admin update error", + provisioningState: api.ProvisioningStateFailed, + pucmPending: true, + adminUpdateErr: "PUCM failed", + expectedPucmState: pucmPlanned, + }, + } { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + + controller := gomock.NewController(t) + defer controller.Finish() + + m := mock_metrics.NewMockEmitter(controller) + oc := &api.OpenShiftCluster{ + Properties: api.OpenShiftClusterProperties{ + ProvisioningState: tt.provisioningState, + PucmPending: tt.pucmPending, + LastAdminUpdateError: tt.adminUpdateErr, + }, + } + mon := &Monitor{ + m: m, + oc: oc, + } + + m.EXPECT().EmitGauge("cluster.maintenance.pucm", int64(1), map[string]string{ + "state": tt.expectedPucmState.String(), + }) + + err := mon.emitPucmState(ctx) + if err != nil { + t.Fatal(err) + } + }) + } +}