Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PUCM Maintenance Signals #3021

Merged
merged 28 commits into from
Jul 18, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pkg/api/admin/openshiftcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type OpenShiftClusterProperties struct {
ImageRegistryStorageAccountName string `json:"imageRegistryStorageAccountName,omitempty"`
InfraID string `json:"infraId,omitempty"`
HiveProfile HiveProfile `json:"hiveProfile,omitempty"`
PucmPending bool `json:"pucmPending,omitempty"`
}

// ProvisioningState represents a provisioning state.
Expand Down Expand Up @@ -81,9 +82,10 @@ const (
type MaintenanceTask string

const (
MaintenanceTaskEverything MaintenanceTask = "Everything"
MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate"
MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal"
MaintenanceTaskEverything MaintenanceTask = "Everything"
MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate"
MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal"
MaintenanceTaskPucmPending MaintenanceTask = "PucmPending"
)

// Operator feature flags
Expand Down
6 changes: 5 additions & 1 deletion pkg/api/admin/openshiftcluster_validatestatic.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ func (sv openShiftClusterStaticValidator) validateDelta(oc, current *OpenShiftCl
return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodePropertyChangeNotAllowed, err.Target, err.Message)
}

if !(oc.Properties.MaintenanceTask == "" || oc.Properties.MaintenanceTask == MaintenanceTaskEverything || oc.Properties.MaintenanceTask == MaintenanceTaskOperator || oc.Properties.MaintenanceTask == MaintenanceTaskRenewCerts) {
if !(oc.Properties.MaintenanceTask == "" ||
oc.Properties.MaintenanceTask == MaintenanceTaskEverything ||
oc.Properties.MaintenanceTask == MaintenanceTaskOperator ||
oc.Properties.MaintenanceTask == MaintenanceTaskRenewCerts ||
oc.Properties.MaintenanceTask == MaintenanceTaskPucmPending) {
return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidParameter, "properties.maintenanceTask", "Invalid enum parameter.")
niontive marked this conversation as resolved.
Show resolved Hide resolved
}

Expand Down
13 changes: 13 additions & 0 deletions pkg/api/admin/openshiftcluster_validatestatic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,19 @@ func TestOpenShiftClusterStaticValidateDelta(t *testing.T) {
oc.Properties.MaintenanceTask = ""
},
},
{
name: "maintenanceTask change to PucmPending allowed",
oc: func() *OpenShiftCluster {
return &OpenShiftCluster{
Properties: OpenShiftClusterProperties{
MaintenanceTask: MaintenanceTaskPucmPending,
},
}
},
modify: func(oc *OpenShiftCluster) {
oc.Properties.MaintenanceTask = ""
},
},
{
name: "maintenanceTask change to other values is disallowed",
oc: func() *OpenShiftCluster {
Expand Down
9 changes: 6 additions & 3 deletions pkg/api/openshiftcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ type OpenShiftClusterProperties struct {
RegistryProfiles []*RegistryProfile `json:"registryProfiles,omitempty"`

HiveProfile HiveProfile `json:"hiveProfile,omitempty"`

PucmPending bool `json:"pucmPending,omitempty"`
}

// ProvisioningState represents a provisioning state
Expand All @@ -175,9 +177,10 @@ const (
type MaintenanceTask string

const (
MaintenanceTaskEverything MaintenanceTask = "Everything"
MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate"
MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal"
MaintenanceTaskEverything MaintenanceTask = "Everything"
MaintenanceTaskOperator MaintenanceTask = "OperatorUpdate"
MaintenanceTaskRenewCerts MaintenanceTask = "CertificatesRenewal"
MaintenanceTaskPucmPending MaintenanceTask = "PucmPending"
)

// Cluster-scoped flags
Expand Down
11 changes: 11 additions & 0 deletions pkg/backend/openshiftcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr
if err != nil {
return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err)
}
doc, err = ocb.setNoPucmPending(ctx, doc)
AldoFusterTurpin marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err)
}
return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil)

case api.ProvisioningStateUpdating:
Expand Down Expand Up @@ -361,3 +365,10 @@ func (ocb *openShiftClusterBackend) emitMetrics(doc *api.OpenShiftClusterDocumen
"newProvisioningState": string(provisioningState),
})
}

// setNoPucmPending patches the cluster document stored under doc.Key so that
// Properties.PucmPending is false. It returns the document and error produced
// by the database Patch call unchanged.
func (ocb *openShiftClusterBackend) setNoPucmPending(ctx context.Context, doc *api.OpenShiftClusterDocument) (*api.OpenShiftClusterDocument, error) {
	// The mutator receives the document handed to it by Patch, which is not
	// necessarily the same instance as the doc argument above.
	clearPending := func(stored *api.OpenShiftClusterDocument) error {
		stored.OpenShiftCluster.Properties.PucmPending = false
		return nil
	}

	return ocb.dbOpenShiftClusters.Patch(ctx, doc.Key, clearPending)
}
niontive marked this conversation as resolved.
Show resolved Hide resolved
28 changes: 20 additions & 8 deletions pkg/frontend/openshiftcluster_putorpatch.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,14 +205,7 @@ func (f *frontend) _putOrPatchOpenShiftCluster(ctx context.Context, log *logrus.
}
} else {
doc.OpenShiftCluster.Properties.LastProvisioningState = doc.OpenShiftCluster.Properties.ProvisioningState

// TODO: Get rid of the special case
if apiVersion == admin.APIVersion {
doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateAdminUpdating
doc.OpenShiftCluster.Properties.LastAdminUpdateError = ""
} else {
doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateUpdating
}
setUpdateProvisioningState(doc, apiVersion)
doc.Dequeues = 0
}

Expand Down Expand Up @@ -312,3 +305,22 @@ func (f *frontend) ValidateNewCluster(ctx context.Context, subscription *api.Sub

return nil
}

// Sets either the admin update or update provisioning state
func setUpdateProvisioningState(doc *api.OpenShiftClusterDocument, apiVersion string) {
niontive marked this conversation as resolved.
Show resolved Hide resolved
switch apiVersion {
case admin.APIVersion:
// For PUCM pending update, we don't want to set ProvisioningStateAdminUpdating
// The cluster monitoring stack uses that value to determine if PUCM is ongoing
if doc.OpenShiftCluster.Properties.MaintenanceTask != api.MaintenanceTaskPucmPending {
doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateAdminUpdating
doc.OpenShiftCluster.Properties.LastAdminUpdateError = ""
} else {
doc.OpenShiftCluster.Properties.PucmPending = true
doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateUpdating
}
default:
// Non-admin update (ex: customer cluster update)
doc.OpenShiftCluster.Properties.ProvisioningState = api.ProvisioningStateUpdating
}
}
85 changes: 85 additions & 0 deletions pkg/frontend/openshiftcluster_putorpatch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,91 @@ func TestPutOrPatchOpenShiftClusterAdminAPI(t *testing.T) {
wantStatusCode: http.StatusBadRequest,
wantError: `400: PropertyChangeNotAllowed: properties.registryProfiles: Changing property 'properties.registryProfiles' is not allowed.`,
},
{
name: "patch a cluster with pucm pending request",
request: func(oc *admin.OpenShiftCluster) {
oc.Properties.MaintenanceTask = admin.MaintenanceTaskPucmPending
},
isPatch: true,
fixture: func(f *testdatabase.Fixture) {
f.AddSubscriptionDocuments(&api.SubscriptionDocument{
ID: mockSubID,
Subscription: &api.Subscription{
State: api.SubscriptionStateRegistered,
},
})
f.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{
Key: strings.ToLower(testdatabase.GetResourcePath(mockSubID, "resourceName")),
OpenShiftCluster: &api.OpenShiftCluster{
ID: testdatabase.GetResourcePath(mockSubID, "resourceName"),
Type: "Microsoft.RedHatOpenShift/openShiftClusters",
Tags: map[string]string{"tag": "will-be-kept"},
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
MaintenanceTask: "",
},
},
})
},
wantSystemDataEnriched: true,
wantEnriched: []string{testdatabase.GetResourcePath(mockSubID, "resourceName")},
wantDocuments: func(c *testdatabase.Checker) {
c.AddAsyncOperationDocuments(&api.AsyncOperationDocument{
OpenShiftClusterKey: strings.ToLower(testdatabase.GetResourcePath(mockSubID, "resourceName")),
AsyncOperation: &api.AsyncOperation{
InitialProvisioningState: api.ProvisioningStateUpdating,
ProvisioningState: api.ProvisioningStateUpdating,
},
})
c.AddOpenShiftClusterDocuments(&api.OpenShiftClusterDocument{
Key: strings.ToLower(testdatabase.GetResourcePath(mockSubID, "resourceName")),
OpenShiftCluster: &api.OpenShiftCluster{
ID: testdatabase.GetResourcePath(mockSubID, "resourceName"),
Type: "Microsoft.RedHatOpenShift/openShiftClusters",
Tags: map[string]string{"tag": "will-be-kept"},
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateUpdating,
LastProvisioningState: api.ProvisioningStateSucceeded,
ClusterProfile: api.ClusterProfile{
FipsValidatedModules: api.FipsValidatedModulesDisabled,
},
MaintenanceTask: api.MaintenanceTaskPucmPending,
NetworkProfile: api.NetworkProfile{
OutboundType: api.OutboundTypeLoadbalancer,
PreconfiguredNSG: api.PreconfiguredNSGDisabled,
},
MasterProfile: api.MasterProfile{
EncryptionAtHost: api.EncryptionAtHostDisabled,
},
PucmPending: true,
OperatorFlags: api.DefaultOperatorFlags(),
},
},
})
},
wantAsync: true,
wantStatusCode: http.StatusOK,
wantResponse: &admin.OpenShiftCluster{
ID: testdatabase.GetResourcePath(mockSubID, "resourceName"),
Type: "Microsoft.RedHatOpenShift/openShiftClusters",
Tags: map[string]string{"tag": "will-be-kept"},
Properties: admin.OpenShiftClusterProperties{
ProvisioningState: admin.ProvisioningStateUpdating,
LastProvisioningState: admin.ProvisioningStateSucceeded,
ClusterProfile: admin.ClusterProfile{
FipsValidatedModules: admin.FipsValidatedModulesDisabled,
},
MaintenanceTask: admin.MaintenanceTaskPucmPending,
NetworkProfile: admin.NetworkProfile{
OutboundType: admin.OutboundTypeLoadbalancer,
},
MasterProfile: admin.MasterProfile{
EncryptionAtHost: admin.EncryptionAtHostDisabled,
},
OperatorFlags: admin.OperatorFlags(api.DefaultOperatorFlags()),
},
},
},
} {
t.Run(tt.name, func(t *testing.T) {
ti := newTestInfra(t).
Expand Down
1 change: 1 addition & 0 deletions pkg/monitor/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (errs []error) {
mon.emitSummary,
mon.emitHiveRegistrationStatus,
mon.emitOperatorFlagsAndSupportBanner,
mon.emitPucmState,
mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable
} {
err = f(ctx)
Expand Down
82 changes: 82 additions & 0 deletions pkg/monitor/cluster/maintenance.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package cluster

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"

"github.com/Azure/ARO-RP/pkg/api"
)

/**************************************************************
Possible PUCM states:

(1) PUCM pending
- We will do PUCM, so emit a maintenance pending signal
- Conditions:
* Field pucmPending is true
* The conditions below for in-progress maintenance are not met

(2) Planned PUCM in progress
- Emit a planned maintenance in progress signal.
- If first PUCM attempt fails, leave cluster in this state
because we will need to retry PUCM at a later time.
- Conditions:
* Field pucmPending is true
* One of: (a) provisioning state is AdminUpdating or (b) last admin update error is not empty

(3) Unplanned PUCM in progress
- Emit an unplanned maintenance in progress signal.
- If first PUCM attempt fails, leave cluster in this state
because we will need to retry PUCM at a later time.
- Conditions:
* Field pucmPending is false
* One of: (a) provisioning state is AdminUpdating or (b) last admin update error is not empty

(4) No ongoing or scheduled PUCM
- Don't emit a signal
- Conditions:
* Field pucmPending is false
* Provisioning state is not AdminUpdating and last admin update error is empty
**************************************************************/

// pucmState names one of the PUCM maintenance states a cluster can be in.
type pucmState string

// String returns the state as a plain string.
func (s pucmState) String() string {
	return string(s)
}
niontive marked this conversation as resolved.
Show resolved Hide resolved

// Values of pucmState, as returned by getPucmState.
const (
// pucmNone: PUCM is neither pending nor ongoing.
pucmNone pucmState = "none"
// pucmPending: PUCM is pending and not ongoing.
pucmPending pucmState = "pending"
// pucmPlannedOngoing: PUCM is ongoing and was marked as pending beforehand.
pucmPlannedOngoing pucmState = "planned_ongoing"
// pucmUnplannedOngoing: PUCM is ongoing without having been marked as pending.
pucmUnplannedOngoing pucmState = "unplanned_ongoing"
)
niontive marked this conversation as resolved.
Show resolved Hide resolved

// emitPucmState emits the "cluster.maintenance.pucm" gauge with value 1,
// tagged with a "state" dimension holding the cluster's current PUCM state.
// It always returns nil.
func (mon *Monitor) emitPucmState(ctx context.Context) error {
	dimensions := map[string]string{
		"state": getPucmState(mon.oc.Properties).String(),
	}
	mon.emitGauge("cluster.maintenance.pucm", 1, dimensions)

	return nil
}

func getPucmState(clusterProperties api.OpenShiftClusterProperties) pucmState {
if pucmOngoing(clusterProperties) {
if clusterProperties.PucmPending {
return pucmPlannedOngoing
}
return pucmUnplannedOngoing
} else if clusterProperties.PucmPending {
niontive marked this conversation as resolved.
Show resolved Hide resolved
return pucmPending
}

return pucmNone
}

// pucmOngoing reports whether maintenance counts as ongoing: either the
// provisioning state is AdminUpdating, or an error from the last admin
// update is still recorded.
func pucmOngoing(clusterProperties api.OpenShiftClusterProperties) bool {
	if clusterProperties.ProvisioningState == api.ProvisioningStateAdminUpdating {
		return true
	}

	return clusterProperties.LastAdminUpdateError != ""
}
Loading
Loading