Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PUCM Maintenance Signals #3021

Merged
merged 28 commits into from
Jul 18, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/api/admin/openshiftcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type OpenShiftClusterProperties struct {
ImageRegistryStorageAccountName string `json:"imageRegistryStorageAccountName,omitempty"`
InfraID string `json:"infraId,omitempty"`
HiveProfile HiveProfile `json:"hiveProfile,omitempty"`
PucmPending bool `json:"pucmPending,omitempty"`
}

// ProvisioningState represents a provisioning state.
Expand Down
2 changes: 2 additions & 0 deletions pkg/api/openshiftcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ type OpenShiftClusterProperties struct {
RegistryProfiles []*RegistryProfile `json:"registryProfiles,omitempty"`

HiveProfile HiveProfile `json:"hiveProfile,omitempty"`

PucmPending bool `json:"pucmPending,omitempty"`
}

// ProvisioningState represents a provisioning state
Expand Down
1 change: 1 addition & 0 deletions pkg/backend/openshiftcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ func (ocb *openShiftClusterBackend) handle(ctx context.Context, log *logrus.Entr
if err != nil {
return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateFailed, err)
}
doc.OpenShiftCluster.Properties.PucmPending = false
return ocb.endLease(ctx, log, stop, doc, api.ProvisioningStateSucceeded, nil)

case api.ProvisioningStateUpdating:
Expand Down
1 change: 1 addition & 0 deletions pkg/monitor/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (errs []error) {
return
}
for _, f := range []func(context.Context) error{
mon.emitPucmState,
niontive marked this conversation as resolved.
Show resolved Hide resolved
mon.emitAroOperatorHeartbeat,
mon.emitAroOperatorConditions,
mon.emitNSGReconciliation,
Expand Down
89 changes: 89 additions & 0 deletions pkg/monitor/cluster/maintenance.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package cluster

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"

"github.com/Azure/ARO-RP/pkg/api"
)

/**************************************************************
Possible PUCM states:

(1) PUCM pending
- We will do PUCM, so emit a maintenance pending signal
- Conditions:
* Field pucmPending is true
* The conditions below for in-progress maintenance are not met

(2) Planned PUCM in progress
- Emit a planned maintenance in progress signal.
- If first PUCM attempt fails, leave cluster in this state
because we will need to retry PUCM at a later time.
- Conditions:
* Field pucmPending is true
* One of: (a) provisioning state is AdminUpdating or (b) LastAdminUpdateError is not empty

(3) Unplanned PUCM in progress
- Emit an unplanned maintenance in progress signal.
- If first PUCM attempt fails, leave cluster in this state
because we will need to retry PUCM at a later time.
- Conditions:
* Field pucmPending is false
* One of: (a) provisioning state is AdminUpdating or (b) LastAdminUpdateError is not empty

(4) No ongoing or scheduled PUCM
- Don't emit a signal
- Conditions:
* Field pucmPending is false
* Provisioning state is not AdminUpdating and LastAdminUpdateError is empty
**************************************************************/

// pucmState is the string label describing a cluster's PUCM maintenance
// state; it is used as the "state" dimension of the maintenance gauge.
type pucmState string

// String returns the underlying label unchanged, so a pucmState can be used
// wherever a plain string (or fmt.Stringer) is expected.
func (s pucmState) String() string {
	label := string(s)
	return label
}
niontive marked this conversation as resolved.
Show resolved Hide resolved

// The set of PUCM states that getPucmState can return. The string value of
// each constant is what emitPucmState reports in the "state" dimension.
const (
	// pucmNone: maintenance is neither ongoing nor pending.
	pucmNone pucmState = "none"
	// pucmPending: PucmPending is set but maintenance is not ongoing.
	pucmPending pucmState = "pending"
	// pucmPlannedOngoing: maintenance is ongoing and PucmPending is set.
	pucmPlannedOngoing pucmState = "planned_ongoing"
	// pucmUnplannedOngoing: maintenance is ongoing and PucmPending is not set.
	pucmUnplannedOngoing pucmState = "unplanned_ongoing"
)
niontive marked this conversation as resolved.
Show resolved Hide resolved

// emitPucmState emits the "cluster.maintenance.pucm" gauge with a value of 1
// and a "state" dimension holding the PUCM state derived from the monitored
// cluster's properties. It always returns nil; ctx is not used.
func (mon *Monitor) emitPucmState(ctx context.Context) error {
	dimensions := map[string]string{
		"state": getPucmState(mon.oc.Properties).String(),
	}

	mon.emitGauge("cluster.maintenance.pucm", 1, dimensions)

	return nil
}

func getPucmState(clusterProperties api.OpenShiftClusterProperties) pucmState {
var state pucmState

if pucmOngoing(clusterProperties) {
if clusterProperties.PucmPending {
state = pucmPlannedOngoing
} else {
state = pucmUnplannedOngoing
}
} else {
if clusterProperties.PucmPending {
state = pucmPending
} else {
state = pucmNone
}
}

return state
niontive marked this conversation as resolved.
Show resolved Hide resolved
}

// pucmOngoing reports whether maintenance counts as in progress: either the
// provisioning state is AdminUpdating, or LastAdminUpdateError is non-empty.
func pucmOngoing(clusterProperties api.OpenShiftClusterProperties) bool {
	if clusterProperties.ProvisioningState == api.ProvisioningStateAdminUpdating {
		return true
	}

	return clusterProperties.LastAdminUpdateError != ""
}
97 changes: 97 additions & 0 deletions pkg/monitor/cluster/maintenance_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package cluster

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"testing"

"github.com/golang/mock/gomock"

"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/metrics"
mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
)

func TestEmitPucmState(t *testing.T) {
niontive marked this conversation as resolved.
Show resolved Hide resolved
ctx := context.Background()

controller := gomock.NewController(t)
defer controller.Finish()

m := mock_metrics.NewMockEmitter(controller)

// Unplanned ongoing
oc := &api.OpenShiftCluster{
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateAdminUpdating,
},
}
mon := getMonitor(oc, m)
m.EXPECT().EmitGauge("cluster.maintenance.pucm", int64(1), map[string]string{
"state": pucmUnplannedOngoing.String(),
})

err := mon.emitPucmState(ctx)
if err != nil {
t.Fatal(err)
}

// Planned ongoing
oc = &api.OpenShiftCluster{
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateAdminUpdating,
PucmPending: true,
},
}
mon = getMonitor(oc, m)
m.EXPECT().EmitGauge("cluster.maintenance.pucm", int64(1), map[string]string{
"state": pucmPlannedOngoing.String(),
})

err = mon.emitPucmState(ctx)
if err != nil {
t.Fatal(err)
}

// Pending
oc = &api.OpenShiftCluster{
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
PucmPending: true,
},
}
mon = getMonitor(oc, m)
m.EXPECT().EmitGauge("cluster.maintenance.pucm", int64(1), map[string]string{
"state": pucmPending.String(),
})

err = mon.emitPucmState(ctx)
if err != nil {
t.Fatal(err)
}

// None
oc = &api.OpenShiftCluster{
Properties: api.OpenShiftClusterProperties{
ProvisioningState: api.ProvisioningStateSucceeded,
},
}
mon = getMonitor(oc, m)
m.EXPECT().EmitGauge("cluster.maintenance.pucm", int64(1), map[string]string{
"state": pucmNone.String(),
})

err = mon.emitPucmState(ctx)
if err != nil {
t.Fatal(err)
}
}

// getMonitor is a test helper that returns a Monitor with only the cluster
// document and the metrics emitter populated; all other fields keep their
// zero values.
func getMonitor(oc *api.OpenShiftCluster, m metrics.Emitter) *Monitor {
	mon := new(Monitor)
	mon.oc = oc
	mon.m = m

	return mon
}