retry logic update #3463

Closed · wants to merge 1 commit
17 changes: 9 additions & 8 deletions pkg/cluster/arooperator.go
@@ -39,26 +39,27 @@ func (m *manager) ensureAROOperator(ctx context.Context) error {
 	return err
 }

-func (m *manager) aroDeploymentReady(ctx context.Context) (bool, error) {
+func (m *manager) aroDeploymentReady(ctx context.Context) (ok bool, retry bool, err error) {
 	if !m.isIngressProfileAvailable() {
 		// If the ingress profile is not available, ARO operator update/deploy will fail.
 		m.log.Error("skip aroDeploymentReady")
-		return true, nil
+		return true, false, nil
 	}
-	return m.aroOperatorDeployer.IsReady(ctx)
+	ok, err = m.aroOperatorDeployer.IsReady(ctx)
+	return ok, false, err
 }

-func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (bool, error) {
+func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (ok bool, retry bool, err error) {
 	if !m.isIngressProfileAvailable() {
 		// If the ingress profile is not available, ARO operator update/deploy will fail.
 		m.log.Error("skip ensureAROOperatorRunningDesiredVersion")
-		return true, nil
+		return true, false, nil
 	}
-	ok, err := m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
+	ok, err = m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
 	if !ok || err != nil {
-		return false, err
+		return false, false, err
 	}
-	return true, nil
+	return true, false, nil
 }

 func (m *manager) ensureCredentialsRequest(ctx context.Context) error {
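
Editor's note: the change threads an explicit retry flag through every condition function, so the (ok, retry, err) triple now distinguishes "satisfied", "not yet, poll again", and "fail immediately". Below is a minimal sketch of a consumer of that triple; pollCondition, its ticker, and its timeout handling are illustrative assumptions, not the step runner this repository actually uses.

	// Hypothetical consumer of the (ok, retry, err) condition triple.
	package conditions

	import (
		"context"
		"errors"
		"time"
	)

	// pollCondition polls cond until it reports success, fails terminally
	// (retry == false), or the overall timeout elapses.
	func pollCondition(ctx context.Context, interval, timeout time.Duration,
		cond func(context.Context) (bool, bool, error)) error {
		ctx, cancel := context.WithTimeout(ctx, timeout)
		defer cancel()
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			ok, retry, err := cond(ctx)
			switch {
			case ok:
				return nil // condition satisfied
			case !retry:
				if err == nil {
					err = errors.New("condition failed and is not retryable") // assumed wording
				}
				return err // terminal failure: surface immediately
			}
			select {
			case <-ctx.Done():
				return ctx.Err() // condition timed out
			case <-ticker.C:
				// poll again
			}
		}
	}
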
4 changes: 2 additions & 2 deletions pkg/cluster/arooperator_test.go
@@ -215,7 +215,7 @@ func TestAroDeploymentReady(t *testing.T) {
 				aroOperatorDeployer: dep,
 			}

-			ok, err := m.aroDeploymentReady(ctx)
+			ok, _, err := m.aroDeploymentReady(ctx)
 			if err != nil || ok != tt.wantRes {
 				t.Error(err)
 			}
@@ -311,7 +311,7 @@ func TestEnsureAROOperatorRunningDesiredVersion(t *testing.T) {
 				aroOperatorDeployer: dep,
 			}

-			ok, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
+			ok, _, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
 			if err != nil || ok != tt.wantRes {
 				t.Error(err)
 			}
97 changes: 69 additions & 28 deletions pkg/cluster/condition.go
@@ -6,78 +6,114 @@ package cluster
 import (
 	"context"
 	"errors"
+	"fmt"
+	"net/http"
 	"time"

 	configv1 "github.com/openshift/api/config/v1"
 	corev1 "k8s.io/api/core/v1"
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/Azure/ARO-RP/pkg/api"
 )

 const minimumWorkerNodes = 2

 // condition functions should return an error only if it's not able to be retried
 // if a condition function encounters an error when retrying it should return false, nil.

-func (m *manager) apiServersReady(ctx context.Context) (bool, error) {
+func (m *manager) apiServersReady(ctx context.Context) (ok bool, retry bool, err error) {
 	apiserver, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "kube-apiserver", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
 	}
-	return isOperatorAvailable(apiserver), nil
+	return isOperatorAvailable(apiserver), true, nil
 }

-func (m *manager) minimumWorkerNodesReady(ctx context.Context) (bool, error) {
+func getErrMessage(err error, messageifany string) error {
+	message := "Minimum number of worker nodes have not been successfully created. Please retry and if the issue persists, raise an Azure support ticket"
+	if err != nil {
+		message = "Error: " + err.Error() + "; Message: " + messageifany + message
+	} else {
+		message = messageifany + message
+	}
+	cloudError := api.NewCloudError(
+		http.StatusInternalServerError,
+		api.CloudErrorCodeDeploymentFailed,
+		"",
+		message,
+	)
+	return cloudError
+}
+
+func (m *manager) minimumWorkerNodesReady(ctx context.Context) (nodeCheck bool, retry bool, err error) {
 	nodes, err := m.kubernetescli.CoreV1().Nodes().List(ctx, metav1.ListOptions{
 		LabelSelector: "node-role.kubernetes.io/worker",
 	})
 	if err != nil {
-		return false, nil
+		return false, true, getErrMessage(err, "")
 	}

 	readyWorkers := 0
+	message := ""
 	for _, node := range nodes.Items {
 		for _, cond := range node.Status.Conditions {
 			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
 				readyWorkers++
+			} else {
+				messageString := fmt.Sprintf("%s - Status:%v, Message: %v\n", node.Name, cond.Status, cond.Message)
+				message += messageString
 			}
 		}
 	}

-	return readyWorkers >= minimumWorkerNodes, nil
+	minWorkerAchieved := readyWorkers >= minimumWorkerNodes
+	if minWorkerAchieved {
+		return minWorkerAchieved, false, nil
+	} else {
+		if message == "" {
+			message = "Check the config and versions"
+		}
+		return false, true, getErrMessage(err, message)
+	}
 }

-func (m *manager) operatorConsoleExists(ctx context.Context) (bool, error) {
-	_, err := m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
-	return err == nil, nil
+func (m *manager) operatorConsoleExists(ctx context.Context) (errorcheck bool, retry bool, err error) {
+	_, err = m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
+	return err == nil, false, nil
 }

-func (m *manager) operatorConsoleReady(ctx context.Context) (bool, error) {
+func (m *manager) operatorConsoleReady(ctx context.Context) (consoleOperatorcheck bool, retry bool, err error) {
 	consoleOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "console", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
 	}
-	return isOperatorAvailable(consoleOperator), nil
+	return isOperatorAvailable(consoleOperator), true, nil
 }

-func (m *manager) clusterVersionReady(ctx context.Context) (bool, error) {
+func (m *manager) clusterVersionReady(ctx context.Context) (cvcheck bool, retry bool, err error) {
 	cv, err := m.configcli.ConfigV1().ClusterVersions().Get(ctx, "version", metav1.GetOptions{})
 	if err == nil {
 		for _, cond := range cv.Status.Conditions {
 			if cond.Type == configv1.OperatorAvailable && cond.Status == configv1.ConditionTrue {
-				return true, nil
+				return true, false, nil
 			}
 		}
 	}
-	return false, nil
+	return false, true, nil
 }

-func (m *manager) ingressControllerReady(ctx context.Context) (bool, error) {
+func (m *manager) ingressControllerReady(ctx context.Context) (ingressOperatorcheck bool, retry bool, err error) {
 	ingressOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "ingress", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
 	}
-	return isOperatorAvailable(ingressOperator), nil
+	ingressOperatorcheck = isOperatorAvailable(ingressOperator)
+	if ingressOperatorcheck {
+		return ingressOperatorcheck, false, nil
+	} else {
+		return ingressOperatorcheck, true, nil
+	}
 }

 func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
@@ -94,45 +130,50 @@ func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
 // Checking for a change to the lastSyncCloudCredsSecretResourceVersion attribute of the CredentialRequest's status would be a neater way of checking
 // whether it was reconciled, but we would have to save the value prior to updating the kube-system/azure-credentials Secret so that we'd have
 // an old value to compare to.
-func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, error) {
+func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (credcheck bool, retry bool, err error) {
 	// If the CSP hasn't been updated, the CredentialsRequest does not need to be reconciled.
 	secret, err := m.servicePrincipalUpdated(ctx)
 	if err != nil {
-		return false, err
+		return false, false, err
 	} else if secret == nil {
-		return true, nil
+		return true, false, nil
 	}

 	u, err := m.dynamiccli.Resource(CredentialsRequestGroupVersionResource).Namespace("openshift-cloud-credential-operator").Get(ctx, "openshift-azure-operator", metav1.GetOptions{})
 	if err != nil {
 		// If the CredentialsRequest is not found, it may have just recently been reconciled.
 		// Return nil to retry until we hit the condition timeout.
 		if kerrors.IsNotFound(err) {
-			return false, nil
+			return false, true, nil
 		}
-		return false, err
+		return false, false, err
 	}

 	cr := u.UnstructuredContent()
 	var status map[string]interface{}
 	if s, ok := cr["status"]; ok {
 		status = s.(map[string]interface{})
 	} else {
-		return false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
+		return false, false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
 	}

 	var lastSyncTimestamp string
 	if lst, ok := status["lastSyncTimestamp"]; ok {
 		lastSyncTimestamp = lst.(string)
 	} else {
-		return false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
+		return false, false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
 	}

 	timestamp, err := time.Parse(time.RFC3339, lastSyncTimestamp)
 	if err != nil {
-		return false, err
+		return false, false, err
 	}

 	timeSinceLastSync := time.Since(timestamp)
-	return timeSinceLastSync.Minutes() < 5, nil
+	timeSinceLastSyncCheck := timeSinceLastSync.Minutes() < 5
+	if timeSinceLastSyncCheck {
+		return true, false, nil
+	} else {
+		return false, true, nil
+	}
 }
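
Editor's note: for illustration, this is the message getErrMessage assembles when the initial node listing fails outright. The snippet is a standalone reconstruction of the string concatenation above, with the api.NewCloudError wrapper elided; the error text is an invented example, not a real API response.

	package main

	import (
		"errors"
		"fmt"
	)

	func main() {
		// Simulate the Nodes().List() call failing before any node was inspected.
		err := errors.New("nodes is forbidden") // example error text
		messageifany := ""                      // no per-node detail collected yet
		message := "Minimum number of worker nodes have not been successfully created. Please retry and if the issue persists, raise an Azure support ticket"
		message = "Error: " + err.Error() + "; Message: " + messageifany + message
		fmt.Println(message)
		// Output:
		// Error: nodes is forbidden; Message: Minimum number of worker nodes have not been successfully created. Please retry and if the issue persists, raise an Azure support ticket
	}
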
10 changes: 5 additions & 5 deletions pkg/cluster/condition_test.go
@@ -53,7 +53,7 @@ func TestOperatorConsoleExists(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.operatorConsoleExists(ctx)
+		ready, _, err := m.operatorConsoleExists(ctx)
 		if err != nil {
 			t.Error(errMustBeNilMsg)
 		}
@@ -182,8 +182,8 @@ func TestMinimumWorkerNodesReady(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.minimumWorkerNodesReady(ctx)
-		if err != nil {
+		ready, retry, err := m.minimumWorkerNodesReady(ctx)
+		if err != nil && !retry {
 			t.Error(errMustBeNilMsg)
 		}
 		if ready != tt.want {
@@ -231,7 +231,7 @@ func TestClusterVersionReady(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.clusterVersionReady(ctx)
+		ready, _, err := m.clusterVersionReady(ctx)
 		if err != nil {
 			t.Error(errMustBeNilMsg)
 		}
@@ -389,7 +389,7 @@ func TestAroCredentialsRequestReconciled(t *testing.T) {
 			},
 		}

-		result, err := m.aroCredentialsRequestReconciled(ctx)
+		result, _, err := m.aroCredentialsRequestReconciled(ctx)
 		if result != tt.want {
 			t.Errorf("Result was %v, wanted %v", result, tt.want)
 		}
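
Editor's note: the updated tests discard the new retry value everywhere except TestMinimumWorkerNodesReady, where a retryable failure must not be reported as fatal. Below is a standalone sketch of that contract; the closure and the test name are inventions for illustration, not part of the PR.

	package conditions

	import (
		"context"
		"testing"
	)

	// TestRetryContract exercises the convention the condition functions follow:
	// a transient failure reports retry=true, and the caller keeps polling
	// instead of failing the step.
	func TestRetryContract(t *testing.T) {
		attempts := 0
		cond := func(ctx context.Context) (ok bool, retry bool, err error) {
			attempts++
			if attempts < 3 {
				return false, true, nil // not ready yet: retryable
			}
			return true, false, nil // ready: stop polling
		}

		for {
			ok, retry, err := cond(context.Background())
			if err != nil && !retry {
				t.Fatal(err) // terminal error fails the step
			}
			if ok {
				break
			}
			if !retry {
				t.Fatal("condition neither succeeded nor asked for a retry")
			}
		}
		if attempts != 3 {
			t.Errorf("attempts = %d, want 3", attempts)
		}
	}
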
26 changes: 22 additions & 4 deletions pkg/cluster/hive.go
@@ -34,14 +34,32 @@ func (m *manager) hiveEnsureResources(ctx context.Context) error {
 	return m.hiveClusterManager.CreateOrUpdate(ctx, m.subscriptionDoc, m.doc)
 }

-func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (bool, error) {
+func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (deploymentReady bool, retry bool, err error) {
 	m.log.Info("waiting for cluster deployment to become ready")
-	return m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
+	deploymentReady, err = m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
+	if err != nil {
+		return deploymentReady, false, err
+	} else {
+		if !deploymentReady {
+			return deploymentReady, true, err
+		} else {
+			return deploymentReady, false, err
+		}
+	}
 }

-func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (bool, error) {
+func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (installationComplete bool, retry bool, err error) {
 	m.log.Info("waiting for cluster installation to complete")
-	return m.hiveClusterManager.IsClusterInstallationComplete(ctx, m.doc)
+	installationComplete, err = m.hiveClusterManager.IsClusterInstallationComplete(ctx, m.doc)
+	if err != nil {
+		return installationComplete, false, err
+	} else {
+		if !installationComplete {
+			return installationComplete, true, err
+		} else {
+			return installationComplete, false, err
+		}
+	}
 }

 func (m *manager) hiveResetCorrelationData(ctx context.Context) error {
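
Editor's note: both functions now map a two-value readiness result onto the three-value contract through the same nested if/else, and the error branch can never actually carry retry=true. A sketch of a helper that would collapse the duplication follows; toRetriable is an invented name, not something this PR or the repository defines.

	package conditions

	// toRetriable maps a (ready, err) pair onto the (ok, retry, err) contract:
	// any error is terminal, and "not ready yet" is retryable.
	func toRetriable(ready bool, err error) (bool, bool, error) {
		if err != nil {
			return ready, false, err
		}
		return ready, !ready, nil
	}

With that helper, each wrapper reduces to a single line, e.g. return toRetriable(m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)).
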
2 changes: 1 addition & 1 deletion pkg/cluster/hive_test.go
@@ -63,7 +63,7 @@ func TestHiveClusterDeploymentReady(t *testing.T) {
 			}
 			m.hiveClusterManager = hiveMock

-			result, err := m.hiveClusterDeploymentReady(context.Background())
+			result, _, err := m.hiveClusterDeploymentReady(context.Background())
 			utilerror.AssertErrorMessage(t, err, tt.wantErr)

 			if tt.wantResult != result {