Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes for retry logic - Min Worker Node Err #3464

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions pkg/cluster/arooperator.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,27 @@ func (m *manager) ensureAROOperator(ctx context.Context) error {
return err
}

func (m *manager) aroDeploymentReady(ctx context.Context) (bool, error) {
func (m *manager) aroDeploymentReady(ctx context.Context) (ok bool, retry bool, err error) {
if !m.isIngressProfileAvailable() {
// If the ingress profile is not available, ARO operator update/deploy will fail.
m.log.Error("skip aroDeploymentReady")
return true, nil
return true, false, nil
}
return m.aroOperatorDeployer.IsReady(ctx)
ok, err = m.aroOperatorDeployer.IsReady(ctx)
return ok, false, err
}

func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (bool, error) {
func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (ok bool, retry bool, err error) {
if !m.isIngressProfileAvailable() {
// If the ingress profile is not available, ARO operator update/deploy will fail.
m.log.Error("skip ensureAROOperatorRunningDesiredVersion")
return true, nil
return true, false, nil
}
ok, err := m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
ok, err = m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
if !ok || err != nil {
return false, err
return false, false, err
}
return true, nil
return true, false, nil
}

func (m *manager) ensureCredentialsRequest(ctx context.Context) error {
Expand Down
4 changes: 2 additions & 2 deletions pkg/cluster/arooperator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ func TestAroDeploymentReady(t *testing.T) {
aroOperatorDeployer: dep,
}

ok, err := m.aroDeploymentReady(ctx)
ok, _, err := m.aroDeploymentReady(ctx)
if err != nil || ok != tt.wantRes {
t.Error(err)
}
Expand Down Expand Up @@ -311,7 +311,7 @@ func TestEnsureAROOperatorRunningDesiredVersion(t *testing.T) {
aroOperatorDeployer: dep,
}

ok, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
ok, _, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
if err != nil || ok != tt.wantRes {
t.Error(err)
}
Expand Down
102 changes: 75 additions & 27 deletions pkg/cluster/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,78 +6,122 @@ package cluster
import (
"context"
"errors"
"fmt"
"net/http"
"time"

configv1 "github.com/openshift/api/config/v1"
corev1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/Azure/ARO-RP/pkg/api"
)

const minimumWorkerNodes = 2

// Condition functions should return a non-nil error only when the failure
// cannot be retried. If a condition function encounters an error while
// retrying, it should return ok=false, retry=true (optionally with an
// informational error) so the caller polls again until the condition timeout.

func (m *manager) apiServersReady(ctx context.Context) (bool, error) {
func (m *manager) apiServersReady(ctx context.Context) (operatorCheck bool, retry bool, err error) {
apiserver, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "kube-apiserver", metav1.GetOptions{})
if err != nil {
return false, nil
return false, true, nil
}
operatorCheck = isOperatorAvailable(apiserver)
if operatorCheck {
return operatorCheck, false, nil
}
return operatorCheck, true, nil
}

func getErrMessage(err error, messageifany string) error {
message := "Minimum number of worker nodes have not been successfully created. Please retry and if the issue persists, raise an Azure support ticket"
if err != nil {
message = "Error: " + err.Error() + "Message: " + messageifany + message
} else {
message = messageifany + message
}
return isOperatorAvailable(apiserver), nil
cloudError := api.NewCloudError(
http.StatusInternalServerError,
api.CloudErrorCodeDeploymentFailed,
"",
message,
)
return cloudError
}

func (m *manager) minimumWorkerNodesReady(ctx context.Context) (bool, error) {
func (m *manager) minimumWorkerNodesReady(ctx context.Context) (nodeCheck bool, retry bool, err error) {
nodes, err := m.kubernetescli.CoreV1().Nodes().List(ctx, metav1.ListOptions{
LabelSelector: "node-role.kubernetes.io/worker",
})
if err != nil {
return false, nil
return false, true, getErrMessage(err, "")
}

readyWorkers := 0
message := ""
for _, node := range nodes.Items {
for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
readyWorkers++
} else {
messageString := fmt.Sprintf("%+v - Status:%+v, Message: %+v\n", node, cond.Status, cond.Message)
message += messageString
}
}
}

return readyWorkers >= minimumWorkerNodes, nil
minWorkerAchieved := readyWorkers >= minimumWorkerNodes
if minWorkerAchieved {
return minWorkerAchieved, false, nil
} else {
if message == "" {
message = "Check the config and versions"
}
return false, true, getErrMessage(err, message)
}
}

func (m *manager) operatorConsoleExists(ctx context.Context) (bool, error) {
_, err := m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
return err == nil, nil
func (m *manager) operatorConsoleExists(ctx context.Context) (errorcheck bool, retry bool, err error) {
_, err = m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
return err == nil, false, nil
}

func (m *manager) operatorConsoleReady(ctx context.Context) (bool, error) {
func (m *manager) operatorConsoleReady(ctx context.Context) (consoleOperatorcheck bool, retry bool, err error) {
consoleOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "console", metav1.GetOptions{})
if err != nil {
return false, nil
return false, true, nil
}
return isOperatorAvailable(consoleOperator), nil
consoleOperatorcheck = isOperatorAvailable(consoleOperator)
if consoleOperatorcheck {
return consoleOperatorcheck, false, nil
}
return consoleOperatorcheck, true, nil
}

func (m *manager) clusterVersionReady(ctx context.Context) (bool, error) {
func (m *manager) clusterVersionReady(ctx context.Context) (cvcheck bool, retry bool, err error) {
cv, err := m.configcli.ConfigV1().ClusterVersions().Get(ctx, "version", metav1.GetOptions{})
if err == nil {
for _, cond := range cv.Status.Conditions {
if cond.Type == configv1.OperatorAvailable && cond.Status == configv1.ConditionTrue {
return true, nil
return true, false, nil
}
}
}
return false, nil
return false, true, nil
}

func (m *manager) ingressControllerReady(ctx context.Context) (bool, error) {
func (m *manager) ingressControllerReady(ctx context.Context) (ingressOperatorcheck bool, retry bool, err error) {
ingressOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "ingress", metav1.GetOptions{})
if err != nil {
return false, nil
return false, true, nil
}
ingressOperatorcheck = isOperatorAvailable(ingressOperator)
if ingressOperatorcheck {
return ingressOperatorcheck, false, nil
}
return isOperatorAvailable(ingressOperator), nil
return ingressOperatorcheck, true, nil
}

func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
Expand All @@ -94,45 +138,49 @@ func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
// Checking for a change to the lastSyncCloudCredsSecretResourceVersion attribute of the CredentialRequest's status would be a neater way of checking
// whether it was reconciled, but we would would have to save the value prior to updating the kube-system/azure-credentials Secret so that we'd have
// an old value to compare to.
func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, error) {
func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (credcheck bool, retry bool, err error) {
// If the CSP hasn't been updated, the CredentialsRequest does not need to be reconciled.
secret, err := m.servicePrincipalUpdated(ctx)
if err != nil {
return false, err
return false, false, err
} else if secret == nil {
return true, nil
return true, false, nil
}

u, err := m.dynamiccli.Resource(CredentialsRequestGroupVersionResource).Namespace("openshift-cloud-credential-operator").Get(ctx, "openshift-azure-operator", metav1.GetOptions{})
if err != nil {
// If the CredentialsRequest is not found, it may have just recently been reconciled.
// Return nil to retry until we hit the condition timeout.
if kerrors.IsNotFound(err) {
return false, nil
return false, true, nil
}
return false, err
return false, false, err
}

cr := u.UnstructuredContent()
var status map[string]interface{}
if s, ok := cr["status"]; ok {
status = s.(map[string]interface{})
} else {
return false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
return false, false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
}

var lastSyncTimestamp string
if lst, ok := status["lastSyncTimestamp"]; ok {
lastSyncTimestamp = lst.(string)
} else {
return false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
return false, false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
}

timestamp, err := time.Parse(time.RFC3339, lastSyncTimestamp)
if err != nil {
return false, err
return false, false, err
}

timeSinceLastSync := time.Since(timestamp)
return timeSinceLastSync.Minutes() < 5, nil
timeSinceLastSyncCheck := timeSinceLastSync.Minutes() < 5
if timeSinceLastSyncCheck {
return true, false, nil
}
return false, true, nil
}
10 changes: 5 additions & 5 deletions pkg/cluster/condition_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func TestOperatorConsoleExists(t *testing.T) {
},
}),
}
ready, err := m.operatorConsoleExists(ctx)
ready, _, err := m.operatorConsoleExists(ctx)
if err != nil {
t.Error(errMustBeNilMsg)
}
Expand Down Expand Up @@ -182,8 +182,8 @@ func TestMinimumWorkerNodesReady(t *testing.T) {
},
}),
}
ready, err := m.minimumWorkerNodesReady(ctx)
if err != nil {
ready, retry, err := m.minimumWorkerNodesReady(ctx)
if err != nil && !retry {
t.Error(errMustBeNilMsg)
}
if ready != tt.want {
Expand Down Expand Up @@ -231,7 +231,7 @@ func TestClusterVersionReady(t *testing.T) {
},
}),
}
ready, err := m.clusterVersionReady(ctx)
ready, _, err := m.clusterVersionReady(ctx)
if err != nil {
t.Error(errMustBeNilMsg)
}
Expand Down Expand Up @@ -389,7 +389,7 @@ func TestAroCredentialsRequestReconciled(t *testing.T) {
},
}

result, err := m.aroCredentialsRequestReconciled(ctx)
result, _, err := m.aroCredentialsRequestReconciled(ctx)
if result != tt.want {
t.Errorf("Result was %v, wanted %v", result, tt.want)
}
Expand Down
10 changes: 6 additions & 4 deletions pkg/cluster/hive.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,16 @@ func (m *manager) hiveEnsureResources(ctx context.Context) error {
return m.hiveClusterManager.CreateOrUpdate(ctx, m.subscriptionDoc, m.doc)
}

func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (bool, error) {
func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (deploymentReady bool, retry bool, err error) {
m.log.Info("waiting for cluster deployment to become ready")
return m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
deploymentReady, err = m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
return deploymentReady, false, err
}

func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (bool, error) {
func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (installationComplete bool, retry bool, err error) {
m.log.Info("waiting for cluster installation to complete")
return m.hiveClusterManager.IsClusterInstallationComplete(ctx, m.doc)
installationComplete, err = m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
return installationComplete, false, err
}

func (m *manager) hiveResetCorrelationData(ctx context.Context) error {
Expand Down
2 changes: 1 addition & 1 deletion pkg/cluster/hive_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func TestHiveClusterDeploymentReady(t *testing.T) {
}
m.hiveClusterManager = hiveMock

result, err := m.hiveClusterDeploymentReady(context.Background())
result, _, err := m.hiveClusterDeploymentReady(context.Background())
utilerror.AssertErrorMessage(t, err, tt.wantErr)

if tt.wantResult != result {
Expand Down
2 changes: 1 addition & 1 deletion pkg/cluster/install_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func failingFunc(context.Context) error { return errors.New("oh no!") }

func successfulActionStep(context.Context) error { return nil }

func successfulConditionStep(context.Context) (bool, error) { return true, nil }
func successfulConditionStep(context.Context) (bool, bool, error) { return true, false, nil }

type fakeMetricsEmitter struct {
Metrics map[string]int64
Expand Down
10 changes: 5 additions & 5 deletions pkg/containerinstall/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,22 +110,22 @@ func (m *manager) startContainer(ctx context.Context, version *api.OpenShiftVers
return err
}

func (m *manager) containerFinished(context.Context) (bool, error) {
func (m *manager) containerFinished(context.Context) (bool, retry bool, err error) {
containerName := "installer-" + m.clusterUUID
inspectData, err := containers.Inspect(m.conn, containerName, nil)
if err != nil {
return false, err
return false, false, err
}

if inspectData.State.Status == "exited" || inspectData.State.Status == "stopped" {
if inspectData.State.ExitCode != 0 {
getContainerLogs(m.conn, m.log, containerName)
return true, fmt.Errorf("container exited with %d", inspectData.State.ExitCode)
return true, false, fmt.Errorf("container exited with %d", inspectData.State.ExitCode)
}
m.success = true
return true, nil
return true, false, nil
}
return false, nil
return false, true, nil
}

func (m *manager) createSecrets(ctx context.Context, doc *api.OpenShiftClusterDocument, sub *api.SubscriptionDocument) error {
Expand Down
Loading
Loading