Commit

Changes for retry logic - Min Worker Node Err
LiniSusan committed Mar 15, 2024
1 parent 5115f21 · commit 0ea2892
Showing 13 changed files with 139 additions and 83 deletions.
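
Every condition function in this commit moves from returning (bool, error) to the triple (ok bool, retry bool, err error): ok reports that the condition is satisfied, retry marks a failure as transient, and an error returned with retry == false is terminal. The step runner that consumes these functions is among the five changed files not shown in this excerpt, so the sketch below is only an illustrative stand-in; pollCondition, conditionFunc, and alwaysReady are hypothetical names, not the repository's API.

package main

import (
	"context"
	"time"
)

// conditionFunc mirrors the three-value contract adopted throughout this commit.
type conditionFunc func(ctx context.Context) (ok bool, retry bool, err error)

// pollCondition re-invokes cond until it succeeds, returns a terminal error
// (err != nil with retry == false), or the context expires.
func pollCondition(ctx context.Context, interval time.Duration, cond conditionFunc) error {
	t := time.NewTicker(interval)
	defer t.Stop()
	for {
		ok, retry, err := cond(ctx)
		if err != nil && !retry {
			return err // terminal failure: surface it instead of polling again
		}
		if ok {
			return nil
		}
		// transient failure (retry == true) or simply not ready yet: wait and retry
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-t.C:
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()
	alwaysReady := func(context.Context) (bool, bool, error) { return true, false, nil }
	if err := pollCondition(ctx, 10*time.Second, alwaysReady); err != nil {
		panic(err)
	}
}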
17 changes: 9 additions & 8 deletions pkg/cluster/arooperator.go
@@ -39,26 +39,27 @@ func (m *manager) ensureAROOperator(ctx context.Context) error {
 	return err
 }
 
-func (m *manager) aroDeploymentReady(ctx context.Context) (bool, error) {
+func (m *manager) aroDeploymentReady(ctx context.Context) (ok bool, retry bool, err error) {
 	if !m.isIngressProfileAvailable() {
 		// If the ingress profile is not available, ARO operator update/deploy will fail.
 		m.log.Error("skip aroDeploymentReady")
-		return true, nil
+		return true, false, nil
 	}
-	return m.aroOperatorDeployer.IsReady(ctx)
+	ok, err = m.aroOperatorDeployer.IsReady(ctx)
+	return ok, false, err
 }
 
-func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (bool, error) {
+func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (ok bool, retry bool, err error) {
 	if !m.isIngressProfileAvailable() {
 		// If the ingress profile is not available, ARO operator update/deploy will fail.
 		m.log.Error("skip ensureAROOperatorRunningDesiredVersion")
-		return true, nil
+		return true, false, nil
 	}
-	ok, err := m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
+	ok, err = m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
 	if !ok || err != nil {
-		return false, err
+		return false, false, err
 	}
-	return true, nil
+	return true, false, nil
 }
 
 func (m *manager) ensureCredentialsRequest(ctx context.Context) error {
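
Both conditions above hardwire retry to false: a missing ingress profile short-circuits the check as satisfied, and any deployer error is terminal. A toy condition showing the same skip-versus-pass-through split, assuming it lives next to the pollCondition sketch above (every name here is a stand-in):

// exampleCondition mirrors the guard pattern in arooperator.go: when a
// precondition makes the check meaningless, report success with retry == false
// so the wait loop moves on instead of polling a check that can never pass.
func exampleCondition(ctx context.Context) (bool, bool, error) {
	if !profileAvailable() {
		return true, false, nil // skip: treat as satisfied, never retry
	}
	ready, err := checkDeployment(ctx) // stand-in for aroOperatorDeployer.IsReady
	return ready, false, err           // pass-through: any error is terminal
}

// Stand-ins so the sketch stays self-contained.
func profileAvailable() bool { return true }

func checkDeployment(context.Context) (bool, error) { return true, nil }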
4 changes: 2 additions & 2 deletions pkg/cluster/arooperator_test.go
@@ -215,7 +215,7 @@ func TestAroDeploymentReady(t *testing.T) {
 				aroOperatorDeployer: dep,
 			}
 
-			ok, err := m.aroDeploymentReady(ctx)
+			ok, _, err := m.aroDeploymentReady(ctx)
 			if err != nil || ok != tt.wantRes {
 				t.Error(err)
 			}
@@ -311,7 +311,7 @@ func TestEnsureAROOperatorRunningDesiredVersion(t *testing.T) {
 				aroOperatorDeployer: dep,
 			}
 
-			ok, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
+			ok, _, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
 			if err != nil || ok != tt.wantRes {
 				t.Error(err)
 			}
102 changes: 75 additions & 27 deletions pkg/cluster/condition.go
@@ -6,78 +6,122 @@ package cluster
 import (
 	"context"
 	"errors"
+	"fmt"
+	"net/http"
 	"time"
 
 	configv1 "github.com/openshift/api/config/v1"
 	corev1 "k8s.io/api/core/v1"
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/Azure/ARO-RP/pkg/api"
 )
 
 const minimumWorkerNodes = 2
 
 // condition functions should return an error only if it cannot be retried;
 // if a condition function encounters an error that can be retried, it should return false, true, nil.
 
-func (m *manager) apiServersReady(ctx context.Context) (bool, error) {
+func (m *manager) apiServersReady(ctx context.Context) (operatorCheck bool, retry bool, err error) {
 	apiserver, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "kube-apiserver", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
 	}
-	return isOperatorAvailable(apiserver), nil
+	operatorCheck = isOperatorAvailable(apiserver)
+	if operatorCheck {
+		return operatorCheck, false, nil
+	}
+	return operatorCheck, true, nil
 }
+
+func getErrMessage(err error, messageifany string) error {
+	message := "Minimum number of worker nodes have not been successfully created. Please retry and if the issue persists, raise an Azure support ticket"
+	if err != nil {
+		message = "Error: " + err.Error() + " Message: " + messageifany + message
+	} else {
+		message = messageifany + message
+	}
+	cloudError := api.NewCloudError(
+		http.StatusInternalServerError,
+		api.CloudErrorCodeDeploymentFailed,
+		"",
+		message,
+	)
+	return cloudError
+}
 
-func (m *manager) minimumWorkerNodesReady(ctx context.Context) (bool, error) {
+func (m *manager) minimumWorkerNodesReady(ctx context.Context) (nodeCheck bool, retry bool, err error) {
 	nodes, err := m.kubernetescli.CoreV1().Nodes().List(ctx, metav1.ListOptions{
 		LabelSelector: "node-role.kubernetes.io/worker",
 	})
 	if err != nil {
-		return false, nil
+		return false, true, getErrMessage(err, "")
 	}
 
 	readyWorkers := 0
+	message := ""
 	for _, node := range nodes.Items {
 		for _, cond := range node.Status.Conditions {
 			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
 				readyWorkers++
+			} else {
+				messageString := fmt.Sprintf("%+v - Status:%+v, Message: %+v\n", node, cond.Status, cond.Message)
+				message += messageString
 			}
 		}
 	}
 
-	return readyWorkers >= minimumWorkerNodes, nil
+	minWorkerAchieved := readyWorkers >= minimumWorkerNodes
+	if minWorkerAchieved {
+		return minWorkerAchieved, false, nil
+	} else {
+		if message == "" {
+			message = "Check the config and versions"
+		}
+		return false, true, getErrMessage(err, message)
+	}
 }
 
-func (m *manager) operatorConsoleExists(ctx context.Context) (bool, error) {
-	_, err := m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
-	return err == nil, nil
+func (m *manager) operatorConsoleExists(ctx context.Context) (errorcheck bool, retry bool, err error) {
+	_, err = m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
+	return err == nil, false, nil
 }
 
-func (m *manager) operatorConsoleReady(ctx context.Context) (bool, error) {
+func (m *manager) operatorConsoleReady(ctx context.Context) (consoleOperatorcheck bool, retry bool, err error) {
 	consoleOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "console", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
 	}
-	return isOperatorAvailable(consoleOperator), nil
+	consoleOperatorcheck = isOperatorAvailable(consoleOperator)
+	if consoleOperatorcheck {
+		return consoleOperatorcheck, false, nil
+	}
+	return consoleOperatorcheck, true, nil
 }
 
-func (m *manager) clusterVersionReady(ctx context.Context) (bool, error) {
+func (m *manager) clusterVersionReady(ctx context.Context) (cvcheck bool, retry bool, err error) {
 	cv, err := m.configcli.ConfigV1().ClusterVersions().Get(ctx, "version", metav1.GetOptions{})
 	if err == nil {
 		for _, cond := range cv.Status.Conditions {
 			if cond.Type == configv1.OperatorAvailable && cond.Status == configv1.ConditionTrue {
-				return true, nil
+				return true, false, nil
 			}
 		}
 	}
-	return false, nil
+	return false, true, nil
 }
 
-func (m *manager) ingressControllerReady(ctx context.Context) (bool, error) {
+func (m *manager) ingressControllerReady(ctx context.Context) (ingressOperatorcheck bool, retry bool, err error) {
 	ingressOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "ingress", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
 	}
-	return isOperatorAvailable(ingressOperator), nil
+	ingressOperatorcheck = isOperatorAvailable(ingressOperator)
+	if ingressOperatorcheck {
+		return ingressOperatorcheck, false, nil
	}
+	return ingressOperatorcheck, true, nil
 }
 
 func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
@@ -94,45 +138,49 @@ func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
 // Checking for a change to the lastSyncCloudCredsSecretResourceVersion attribute of the CredentialRequest's status would be a neater way of checking
 // whether it was reconciled, but we would have to save the value prior to updating the kube-system/azure-credentials Secret so that we'd have
 // an old value to compare to.
-func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, error) {
+func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (credcheck bool, retry bool, err error) {
 	// If the CSP hasn't been updated, the CredentialsRequest does not need to be reconciled.
 	secret, err := m.servicePrincipalUpdated(ctx)
 	if err != nil {
-		return false, err
+		return false, false, err
 	} else if secret == nil {
-		return true, nil
+		return true, false, nil
 	}
 
 	u, err := m.dynamiccli.Resource(CredentialsRequestGroupVersionResource).Namespace("openshift-cloud-credential-operator").Get(ctx, "openshift-azure-operator", metav1.GetOptions{})
 	if err != nil {
 		// If the CredentialsRequest is not found, it may have just recently been reconciled.
 		// Return nil to retry until we hit the condition timeout.
 		if kerrors.IsNotFound(err) {
-			return false, nil
+			return false, true, nil
 		}
-		return false, err
+		return false, false, err
 	}
 
 	cr := u.UnstructuredContent()
 	var status map[string]interface{}
 	if s, ok := cr["status"]; ok {
 		status = s.(map[string]interface{})
 	} else {
-		return false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
+		return false, false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
 	}
 
 	var lastSyncTimestamp string
 	if lst, ok := status["lastSyncTimestamp"]; ok {
 		lastSyncTimestamp = lst.(string)
 	} else {
-		return false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
+		return false, false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
 	}
 
 	timestamp, err := time.Parse(time.RFC3339, lastSyncTimestamp)
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
 	timeSinceLastSync := time.Since(timestamp)
-	return timeSinceLastSync.Minutes() < 5, nil
+	timeSinceLastSyncCheck := timeSinceLastSync.Minutes() < 5
+	if timeSinceLastSyncCheck {
+		return true, false, nil
+	}
+	return false, true, nil
 }
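
getErrMessage wraps whatever per-node detail minimumWorkerNodesReady accumulated into an api.CloudError (HTTP 500, CloudErrorCodeDeploymentFailed), so even the retryable path carries a customer-facing message. A hypothetical call, assuming it sits inside pkg/cluster next to the helper; the node detail string is made up:

// exampleWorkerNodeError shows the shape of the wrapped error: the detail
// collected from unready node conditions, followed by the fixed guidance
// string ending in "raise an Azure support ticket".
func exampleWorkerNodeError() error {
	detail := "worker-1 - Status:False, Message: kubelet not ready\n"
	return getErrMessage(nil, detail) // err == nil: message is detail + guidance
}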
10 changes: 5 additions & 5 deletions pkg/cluster/condition_test.go
@@ -53,7 +53,7 @@ func TestOperatorConsoleExists(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.operatorConsoleExists(ctx)
+		ready, _, err := m.operatorConsoleExists(ctx)
 		if err != nil {
 			t.Error(errMustBeNilMsg)
 		}
@@ -182,8 +182,8 @@ func TestMinimumWorkerNodesReady(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.minimumWorkerNodesReady(ctx)
-		if err != nil {
+		ready, retry, err := m.minimumWorkerNodesReady(ctx)
+		if err != nil && !retry {
 			t.Error(errMustBeNilMsg)
 		}
 		if ready != tt.want {
@@ -231,7 +231,7 @@ func TestClusterVersionReady(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.clusterVersionReady(ctx)
+		ready, _, err := m.clusterVersionReady(ctx)
 		if err != nil {
 			t.Error(errMustBeNilMsg)
 		}
@@ -389,7 +389,7 @@ func TestAroCredentialsRequestReconciled(t *testing.T) {
 			},
 		}
 
-		result, err := m.aroCredentialsRequestReconciled(ctx)
+		result, _, err := m.aroCredentialsRequestReconciled(ctx)
 		if result != tt.want {
 			t.Errorf("Result was %v, wanted %v", result, tt.want)
 		}
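
The updated tests mostly discard the new retry value with a blank identifier; TestMinimumWorkerNodesReady is the exception, failing only when an error is not retryable. A test case could also pin down the retry flag itself; the fragment below is a hypothetical addition for a case with too few Ready workers, not part of the commit:

ready, retry, err := m.minimumWorkerNodesReady(ctx)
if ready {
	t.Error("expected the worker-node check to fail")
}
if !retry {
	t.Errorf("expected a retryable failure, got terminal error: %v", err)
}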
10 changes: 6 additions & 4 deletions pkg/cluster/hive.go
@@ -34,14 +34,16 @@ func (m *manager) hiveEnsureResources(ctx context.Context) error {
 	return m.hiveClusterManager.CreateOrUpdate(ctx, m.subscriptionDoc, m.doc)
 }
 
-func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (bool, error) {
+func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (deploymentReady bool, retry bool, err error) {
 	m.log.Info("waiting for cluster deployment to become ready")
-	return m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
+	deploymentReady, err = m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
+	return deploymentReady, false, err
 }
 
-func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (bool, error) {
+func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (installationComplete bool, retry bool, err error) {
 	m.log.Info("waiting for cluster installation to complete")
-	return m.hiveClusterManager.IsClusterInstallationComplete(ctx, m.doc)
+	installationComplete, err = m.hiveClusterManager.IsClusterInstallationComplete(ctx, m.doc)
+	return installationComplete, false, err
 }
 
 func (m *manager) hiveResetCorrelationData(ctx context.Context) error {
2 changes: 1 addition & 1 deletion pkg/cluster/hive_test.go
@@ -63,7 +63,7 @@ func TestHiveClusterDeploymentReady(t *testing.T) {
 			}
 			m.hiveClusterManager = hiveMock
 
-			result, err := m.hiveClusterDeploymentReady(context.Background())
+			result, _, err := m.hiveClusterDeploymentReady(context.Background())
 			utilerror.AssertErrorMessage(t, err, tt.wantErr)
 
 			if tt.wantResult != result {
2 changes: 1 addition & 1 deletion pkg/cluster/install_test.go
@@ -35,7 +35,7 @@ func failingFunc(context.Context) error { return errors.New("oh no!") }
 
 func successfulActionStep(context.Context) error { return nil }
 
-func successfulConditionStep(context.Context) (bool, error) { return true, nil }
+func successfulConditionStep(context.Context) (bool, bool, error) { return true, false, nil }
 
 type fakeMetricsEmitter struct {
 	Metrics map[string]int64
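
successfulConditionStep adopts the widened signature. For completeness, the other two outcomes of the contract could be modeled with helpers in the same style (illustrative, not present in the commit; errors is already imported by this file):

func retryableConditionStep(context.Context) (bool, bool, error) { return false, true, nil }

func terminalConditionStep(context.Context) (bool, bool, error) {
	return false, false, errors.New("oh no!") // retry == false makes this fatal
}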
10 changes: 5 additions & 5 deletions pkg/containerinstall/install.go
@@ -110,22 +110,22 @@ func (m *manager) startContainer(ctx context.Context, version *api.OpenShiftVersion) error {
 	return err
 }
 
-func (m *manager) containerFinished(context.Context) (bool, error) {
+func (m *manager) containerFinished(context.Context) (finished bool, retry bool, err error) {
 	containerName := "installer-" + m.clusterUUID
 	inspectData, err := containers.Inspect(m.conn, containerName, nil)
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
 	if inspectData.State.Status == "exited" || inspectData.State.Status == "stopped" {
 		if inspectData.State.ExitCode != 0 {
 			getContainerLogs(m.conn, m.log, containerName)
-			return true, fmt.Errorf("container exited with %d", inspectData.State.ExitCode)
+			return true, false, fmt.Errorf("container exited with %d", inspectData.State.ExitCode)
 		}
 		m.success = true
-		return true, nil
+		return true, false, nil
 	}
-	return false, nil
+	return false, true, nil
 }
 
 func (m *manager) createSecrets(ctx context.Context, doc *api.OpenShiftClusterDocument, sub *api.SubscriptionDocument) error {
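
containerFinished now reports retry == true while the installer container is still running and retry == false once it has exited, with a nonzero exit code surfacing as a terminal error. Driving it with the pollCondition sketch from the top of this page could look like the following (hypothetical wiring; m is the containerinstall manager):

// Wait up to an hour for the installer container to exit, checking twice a minute.
ctx, cancel := context.WithTimeout(context.Background(), time.Hour)
defer cancel()
if err := pollCondition(ctx, 30*time.Second, m.containerFinished); err != nil {
	m.log.Error(err) // a nonzero container exit code arrives here as a terminal error
}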