diff --git a/pkg/cluster/arooperator.go b/pkg/cluster/arooperator.go
index 7afbb5b3c9d..89f5420129b 100644
--- a/pkg/cluster/arooperator.go
+++ b/pkg/cluster/arooperator.go
@@ -39,26 +39,27 @@ func (m *manager) ensureAROOperator(ctx context.Context) error {
 	return err
 }
 
-func (m *manager) aroDeploymentReady(ctx context.Context) (bool, error) {
+func (m *manager) aroDeploymentReady(ctx context.Context) (ok bool, retry bool, err error) {
 	if !m.isIngressProfileAvailable() {
 		// If the ingress profile is not available, ARO operator update/deploy will fail.
 		m.log.Error("skip aroDeploymentReady")
-		return true, nil
+		return true, false, nil
 	}
-	return m.aroOperatorDeployer.IsReady(ctx)
+	ok, err = m.aroOperatorDeployer.IsReady(ctx)
+	return ok, false, err
 }
 
-func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (bool, error) {
+func (m *manager) ensureAROOperatorRunningDesiredVersion(ctx context.Context) (ok bool, retry bool, err error) {
 	if !m.isIngressProfileAvailable() {
 		// If the ingress profile is not available, ARO operator update/deploy will fail.
 		m.log.Error("skip ensureAROOperatorRunningDesiredVersion")
-		return true, nil
+		return true, false, nil
 	}
 
-	ok, err := m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
+	ok, err = m.aroOperatorDeployer.IsRunningDesiredVersion(ctx)
 	if !ok || err != nil {
-		return false, err
+		return false, false, err
 	}
-	return true, nil
+	return true, false, nil
 }
 
 func (m *manager) ensureCredentialsRequest(ctx context.Context) error {
diff --git a/pkg/cluster/arooperator_test.go b/pkg/cluster/arooperator_test.go
index 34bf6bc167d..570e3dff63d 100644
--- a/pkg/cluster/arooperator_test.go
+++ b/pkg/cluster/arooperator_test.go
@@ -215,7 +215,7 @@ func TestAroDeploymentReady(t *testing.T) {
 				aroOperatorDeployer: dep,
 			}
 
-			ok, err := m.aroDeploymentReady(ctx)
+			ok, _, err := m.aroDeploymentReady(ctx)
 			if err != nil || ok != tt.wantRes {
 				t.Error(err)
 			}
@@ -311,7 +311,7 @@ func TestEnsureAROOperatorRunningDesiredVersion(t *testing.T) {
 				aroOperatorDeployer: dep,
 			}
 
-			ok, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
+			ok, _, err := m.ensureAROOperatorRunningDesiredVersion(ctx)
 			if err != nil || ok != tt.wantRes {
 				t.Error(err)
 			}
diff --git a/pkg/cluster/condition.go b/pkg/cluster/condition.go
index a5f1fbd9140..61001831606 100644
--- a/pkg/cluster/condition.go
+++ b/pkg/cluster/condition.go
@@ -6,12 +6,16 @@ package cluster
 
 import (
 	"context"
 	"errors"
+	"fmt"
+	"net/http"
 	"time"
 
 	configv1 "github.com/openshift/api/config/v1"
 	corev1 "k8s.io/api/core/v1"
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/Azure/ARO-RP/pkg/api"
 )
 
 const minimumWorkerNodes = 2
@@ -19,65 +23,105 @@ const minimumWorkerNodes = 2
 
 // condition functions should return an error only if it's not able to be retried
 // if a condition function encounters a error when retrying it should return false, nil.
-func (m *manager) apiServersReady(ctx context.Context) (bool, error) {
+func (m *manager) apiServersReady(ctx context.Context) (operatorCheck bool, retry bool, err error) {
 	apiserver, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "kube-apiserver", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
+	}
+	operatorCheck = isOperatorAvailable(apiserver)
+	if operatorCheck {
+		return operatorCheck, false, nil
+	}
+	return operatorCheck, true, nil
+}
+
+func getErrMessage(err error, messageifany string) error {
+	message := "Minimum number of worker nodes has not been successfully created. Please retry and if the issue persists, raise an Azure support ticket"
+	if err != nil {
+		message = "Error: " + err.Error() + " Message: " + messageifany + message
+	} else {
+		message = messageifany + message
 	}
-	return isOperatorAvailable(apiserver), nil
+	cloudError := api.NewCloudError(
+		http.StatusInternalServerError,
+		api.CloudErrorCodeDeploymentFailed,
+		"",
+		message,
+	)
+	return cloudError
 }
 
-func (m *manager) minimumWorkerNodesReady(ctx context.Context) (bool, error) {
+func (m *manager) minimumWorkerNodesReady(ctx context.Context) (nodeCheck bool, retry bool, err error) {
 	nodes, err := m.kubernetescli.CoreV1().Nodes().List(ctx, metav1.ListOptions{
 		LabelSelector: "node-role.kubernetes.io/worker",
 	})
 	if err != nil {
-		return false, nil
+		return false, true, getErrMessage(err, "")
 	}
 
 	readyWorkers := 0
+	message := ""
 	for _, node := range nodes.Items {
 		for _, cond := range node.Status.Conditions {
 			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
 				readyWorkers++
+			} else {
+				messageString := fmt.Sprintf("%+v - Status:%+v, Message: %+v\n", node, cond.Status, cond.Message)
+				message += messageString
 			}
 		}
 	}
 
-	return readyWorkers >= minimumWorkerNodes, nil
+	minWorkerAchieved := readyWorkers >= minimumWorkerNodes
+	if minWorkerAchieved {
+		return minWorkerAchieved, false, nil
+	} else {
+		if message == "" {
+			message = "Check the config and versions"
+		}
+		return false, true, getErrMessage(err, message)
+	}
 }
 
-func (m *manager) operatorConsoleExists(ctx context.Context) (bool, error) {
-	_, err := m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
-	return err == nil, nil
+func (m *manager) operatorConsoleExists(ctx context.Context) (errorcheck bool, retry bool, err error) {
+	_, err = m.operatorcli.OperatorV1().Consoles().Get(ctx, consoleConfigResourceName, metav1.GetOptions{})
+	return err == nil, false, nil
 }
 
-func (m *manager) operatorConsoleReady(ctx context.Context) (bool, error) {
+func (m *manager) operatorConsoleReady(ctx context.Context) (consoleOperatorcheck bool, retry bool, err error) {
 	consoleOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "console", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
 	}
-	return isOperatorAvailable(consoleOperator), nil
+	consoleOperatorcheck = isOperatorAvailable(consoleOperator)
+	if consoleOperatorcheck {
+		return consoleOperatorcheck, false, nil
+	}
+	return consoleOperatorcheck, true, nil
 }
 
-func (m *manager) clusterVersionReady(ctx context.Context) (bool, error) {
+func (m *manager) clusterVersionReady(ctx context.Context) (cvcheck bool, retry bool, err error) {
 	cv, err := m.configcli.ConfigV1().ClusterVersions().Get(ctx, "version", metav1.GetOptions{})
 	if err == nil {
 		for _, cond := range cv.Status.Conditions {
 			if cond.Type == configv1.OperatorAvailable && cond.Status == configv1.ConditionTrue {
-				return true, nil
+				return true, false, nil
 			}
 		}
 	}
 
-	return false, nil
+	return false, true, nil
 }
 
-func (m *manager) ingressControllerReady(ctx context.Context) (bool, error) {
+func (m *manager) ingressControllerReady(ctx context.Context) (ingressOperatorcheck bool, retry bool, err error) {
 	ingressOperator, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "ingress", metav1.GetOptions{})
 	if err != nil {
-		return false, nil
+		return false, true, nil
+	}
+	ingressOperatorcheck = isOperatorAvailable(ingressOperator)
+	if ingressOperatorcheck {
+		return ingressOperatorcheck, false, nil
 	}
-	return isOperatorAvailable(ingressOperator), nil
+	return ingressOperatorcheck, true, nil
 }
 
 func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
@@ -94,13 +138,13 @@ func isOperatorAvailable(operator *configv1.ClusterOperator) bool {
 // Checking for a change to the lastSyncCloudCredsSecretResourceVersion attribute of the CredentialRequest's status would be a neater way of checking
 // whether it was reconciled, but we would would have to save the value prior to updating the kube-system/azure-credentials Secret so that we'd have
 // an old value to compare to.
-func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, error) {
+func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (credcheck bool, retry bool, err error) {
 	// If the CSP hasn't been updated, the CredentialsRequest does not need to be reconciled.
 	secret, err := m.servicePrincipalUpdated(ctx)
 	if err != nil {
-		return false, err
+		return false, false, err
 	} else if secret == nil {
-		return true, nil
+		return true, false, nil
 	}
 
 	u, err := m.dynamiccli.Resource(CredentialsRequestGroupVersionResource).Namespace("openshift-cloud-credential-operator").Get(ctx, "openshift-azure-operator", metav1.GetOptions{})
@@ -108,9 +152,9 @@ func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, er
 		// If the CredentialsRequest is not found, it may have just recently been reconciled.
 		// Return nil to retry until we hit the condition timeout.
 		if kerrors.IsNotFound(err) {
-			return false, nil
+			return false, true, nil
 		}
-		return false, err
+		return false, false, err
 	}
 
 	cr := u.UnstructuredContent()
@@ -118,21 +162,25 @@ func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, er
 	if s, ok := cr["status"]; ok {
 		status = s.(map[string]interface{})
 	} else {
-		return false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
+		return false, false, errors.New("unable to access status of openshift-azure-operator CredentialsRequest")
 	}
 
 	var lastSyncTimestamp string
 	if lst, ok := status["lastSyncTimestamp"]; ok {
 		lastSyncTimestamp = lst.(string)
 	} else {
-		return false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
+		return false, false, errors.New("unable to access status.lastSyncTimestamp of openshift-azure-operator CredentialsRequest")
 	}
 
 	timestamp, err := time.Parse(time.RFC3339, lastSyncTimestamp)
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
 	timeSinceLastSync := time.Since(timestamp)
-	return timeSinceLastSync.Minutes() < 5, nil
+	timeSinceLastSyncCheck := timeSinceLastSync.Minutes() < 5
+	if timeSinceLastSyncCheck {
+		return true, false, nil
+	}
+	return false, true, nil
 }
diff --git a/pkg/cluster/condition_test.go b/pkg/cluster/condition_test.go
index 74ded8740b6..f73bcb26a77 100644
--- a/pkg/cluster/condition_test.go
+++ b/pkg/cluster/condition_test.go
@@ -53,7 +53,7 @@ func TestOperatorConsoleExists(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.operatorConsoleExists(ctx)
+		ready, _, err := m.operatorConsoleExists(ctx)
 		if err != nil {
 			t.Error(errMustBeNilMsg)
 		}
@@ -182,8 +182,8 @@ func TestMinimumWorkerNodesReady(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.minimumWorkerNodesReady(ctx)
-		if err != nil {
+		ready, retry, err := m.minimumWorkerNodesReady(ctx)
+		if err != nil && !retry {
 			t.Error(errMustBeNilMsg)
 		}
 		if ready != tt.want {
@@ -231,7 +231,7 @@ func TestClusterVersionReady(t *testing.T) {
 				},
 			}),
 		}
-		ready, err := m.clusterVersionReady(ctx)
+		ready, _, err := m.clusterVersionReady(ctx)
 		if err != nil {
 			t.Error(errMustBeNilMsg)
 		}
@@ -389,7 +389,7 @@ func TestAroCredentialsRequestReconciled(t *testing.T) {
 				},
 			}
 
-			result, err := m.aroCredentialsRequestReconciled(ctx)
+			result, _, err := m.aroCredentialsRequestReconciled(ctx)
 			if result != tt.want {
 				t.Errorf("Result was %v, wanted %v", result, tt.want)
 			}
diff --git a/pkg/cluster/hive.go b/pkg/cluster/hive.go
index c19a2e09792..7949bdd7103 100644
--- a/pkg/cluster/hive.go
+++ b/pkg/cluster/hive.go
@@ -34,14 +34,16 @@ func (m *manager) hiveEnsureResources(ctx context.Context) error {
 	return m.hiveClusterManager.CreateOrUpdate(ctx, m.subscriptionDoc, m.doc)
 }
 
-func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (bool, error) {
+func (m *manager) hiveClusterDeploymentReady(ctx context.Context) (deploymentReady bool, retry bool, err error) {
 	m.log.Info("waiting for cluster deployment to become ready")
-	return m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
+	deploymentReady, err = m.hiveClusterManager.IsClusterDeploymentReady(ctx, m.doc)
+	return deploymentReady, false, err
 }
 
-func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (bool, error) {
+func (m *manager) hiveClusterInstallationComplete(ctx context.Context) (installationComplete bool, retry bool, err error) {
 	m.log.Info("waiting for cluster installation to complete")
-	return m.hiveClusterManager.IsClusterInstallationComplete(ctx, m.doc)
+	installationComplete, err = m.hiveClusterManager.IsClusterInstallationComplete(ctx, m.doc)
+	return installationComplete, false, err
 }
 
 func (m *manager) hiveResetCorrelationData(ctx context.Context) error {
diff --git a/pkg/cluster/hive_test.go b/pkg/cluster/hive_test.go
index 5f7021f726e..ad0dc944fdc 100644
--- a/pkg/cluster/hive_test.go
+++ b/pkg/cluster/hive_test.go
@@ -63,7 +63,7 @@ func TestHiveClusterDeploymentReady(t *testing.T) {
 			}
 			m.hiveClusterManager = hiveMock
 
-			result, err := m.hiveClusterDeploymentReady(context.Background())
+			result, _, err := m.hiveClusterDeploymentReady(context.Background())
 			utilerror.AssertErrorMessage(t, err, tt.wantErr)
 
 			if tt.wantResult != result {
diff --git a/pkg/cluster/install_test.go b/pkg/cluster/install_test.go
index 9d8507780e5..f54100fe955 100644
--- a/pkg/cluster/install_test.go
+++ b/pkg/cluster/install_test.go
@@ -35,7 +35,7 @@ func failingFunc(context.Context) error { return errors.New("oh no!") }
 
 func successfulActionStep(context.Context) error { return nil }
-func successfulConditionStep(context.Context) (bool, error) { return true, nil }
+func successfulConditionStep(context.Context) (bool, bool, error) { return true, false, nil }
 
 type fakeMetricsEmitter struct {
 	Metrics map[string]int64
diff --git a/pkg/containerinstall/install.go b/pkg/containerinstall/install.go
index 40eca69e8ab..caa7ed47f10 100644
--- a/pkg/containerinstall/install.go
+++ b/pkg/containerinstall/install.go
@@ -110,22 +110,22 @@ func (m *manager) startContainer(ctx context.Context, version *api.OpenShiftVers
 	return err
 }
 
-func (m *manager) containerFinished(context.Context) (bool, error) {
+func (m *manager) containerFinished(context.Context) (finished bool, retry bool, err error) {
 	containerName := "installer-" + m.clusterUUID
 	inspectData, err := containers.Inspect(m.conn, containerName, nil)
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
 	if inspectData.State.Status == "exited" || inspectData.State.Status == "stopped" {
 		if inspectData.State.ExitCode != 0 {
 			getContainerLogs(m.conn, m.log, containerName)
-			return true, fmt.Errorf("container exited with %d", inspectData.State.ExitCode)
+			return true, false, fmt.Errorf("container exited with %d", inspectData.State.ExitCode)
 		}
 		m.success = true
-		return true, nil
+		return true, false, nil
 	}
-	return false, nil
+	return false, true, nil
 }
 
 func (m *manager) createSecrets(ctx context.Context, doc *api.OpenShiftClusterDocument, sub *api.SubscriptionDocument) error {
diff --git a/pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go b/pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go
index 8889655651b..0c817dc881d 100644
--- a/pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go
+++ b/pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go
@@ -436,24 +436,24 @@ func (e *etcdrenew) deleteEtcdSecrets(ctx context.Context) error {
 }
 
 // Checks if the new revision is put on the etcd and validates if all the nodes are running the same revision
-func (e *etcdrenew) isEtcdRevised(ctx context.Context) (bool, error) {
+func (e *etcdrenew) isEtcdRevised(ctx context.Context) (etcdCheck bool, retry bool, err error) {
 	isAtRevision := true
 	rawEtcd, err := e.k.KubeGet(ctx, "etcd.operator.openshift.io", "", "cluster")
 	if err != nil {
 		e.log.Warnf(err.Error())
-		return false, nil
+		return false, true, nil
 	}
 	etcd := &operatorv1.Etcd{}
 	err = codec.NewDecoderBytes(rawEtcd, &codec.JsonHandle{}).Decode(etcd)
 	if err != nil {
 		e.log.Warnf(err.Error())
-		return false, nil
+		return false, true, nil
 	}
 	// no new revision is observed.
 	if e.lastRevision == etcd.Status.LatestAvailableRevision {
 		e.log.Infof("last revision is %d, latest available revision is %d", e.lastRevision, etcd.Status.LatestAvailableRevision)
-		return false, nil
+		return false, true, nil
 	}
 
 	for _, s := range etcd.Status.NodeStatuses {
 		e.log.Infof("Current Revision for node %s is %d, expected revision is %d", s.NodeName, s.CurrentRevision, etcd.Status.LatestAvailableRevision)
@@ -462,8 +462,10 @@ func (e *etcdrenew) isEtcdRevised(ctx context.Context) (bool, error) {
 			break
 		}
 	}
-
-	return isAtRevision, nil
+	if isAtRevision {
+		return isAtRevision, false, nil
+	}
+	return isAtRevision, true, nil
 }
 
 // Applies the backedup etcd secret and applies them on the cluster
diff --git a/pkg/util/mocks/env/core.go b/pkg/util/mocks/env/core.go
index 8129b642cad..844bc378538 100644
--- a/pkg/util/mocks/env/core.go
+++ b/pkg/util/mocks/env/core.go
@@ -8,12 +8,13 @@ import (
 	context "context"
 	reflect "reflect"
 
-	azureclient "github.com/Azure/ARO-RP/pkg/util/azureclient"
-	liveconfig "github.com/Azure/ARO-RP/pkg/util/liveconfig"
 	azcore "github.com/Azure/azure-sdk-for-go/sdk/azcore"
 	autorest "github.com/Azure/go-autorest/autorest"
 	gomock "github.com/golang/mock/gomock"
 	logrus "github.com/sirupsen/logrus"
+
+	azureclient "github.com/Azure/ARO-RP/pkg/util/azureclient"
+	liveconfig "github.com/Azure/ARO-RP/pkg/util/liveconfig"
 )
 
 // MockCore is a mock of Core interface.
diff --git a/pkg/util/steps/condition.go b/pkg/util/steps/condition.go
index d4af64ea77a..7a449ef96a6 100644
--- a/pkg/util/steps/condition.go
+++ b/pkg/util/steps/condition.go
@@ -40,7 +40,7 @@ var timeoutConditionErrors = map[string]string{
 // condition has been met and an error.
 //
 // Suitable for polling external sources for readiness.
-type conditionFunction func(context.Context) (bool, error)
+type conditionFunction func(context.Context) (bool, bool, error)
 
 // Condition returns a Step suitable for checking whether subsequent Steps can
 // be executed.
@@ -86,8 +86,8 @@ func (c conditionStep) run(ctx context.Context, log *logrus.Entry) error {
 		// We use the outer context, not the timeout context, as we do not want
 		// to time out the condition function itself, only stop retrying once
 		// timeoutCtx's timeout has fired.
-		cnd, cndErr := c.f(ctx)
-		if errors.Is(cndErr, wait.ErrWaitTimeout) {
+		cnd, retry, cndErr := c.f(ctx)
+		if errors.Is(cndErr, wait.ErrWaitTimeout) && !retry {
 			return cnd, fmt.Errorf("condition encountered internal timeout: %w", cndErr)
 		}
 
diff --git a/pkg/util/steps/condition_test.go b/pkg/util/steps/condition_test.go
index f6466b59b43..f37bb007f36 100644
--- a/pkg/util/steps/condition_test.go
+++ b/pkg/util/steps/condition_test.go
@@ -11,17 +11,19 @@ import (
 
 // functionnames that will be used in the conditionFunction below
 // All the keys of map timeoutConditionErrors
-func attachNSGs(context.Context) (bool, error) { return false, nil }
-func apiServersReady(context.Context) (bool, error) { return false, nil }
-func minimumWorkerNodesReady(context.Context) (bool, error) { return false, nil }
-func operatorConsoleExists(context.Context) (bool, error) { return false, nil }
-func operatorConsoleReady(context.Context) (bool, error) { return false, nil }
-func clusterVersionReady(context.Context) (bool, error) { return false, nil }
-func ingressControllerReady(context.Context) (bool, error) { return false, nil }
-func aroDeploymentReady(context.Context) (bool, error) { return false, nil }
-func ensureAROOperatorRunningDesiredVersion(context.Context) (bool, error) { return false, nil }
-func hiveClusterDeploymentReady(context.Context) (bool, error) { return false, nil }
-func hiveClusterInstallationComplete(context.Context) (bool, error) { return false, nil }
+func attachNSGs(context.Context) (bool, bool, error) { return false, false, nil }
+func apiServersReady(context.Context) (bool, bool, error) { return false, false, nil }
+func minimumWorkerNodesReady(context.Context) (bool, bool, error) { return false, false, nil }
+func operatorConsoleExists(context.Context) (bool, bool, error) { return false, false, nil }
+func operatorConsoleReady(context.Context) (bool, bool, error) { return false, false, nil }
+func clusterVersionReady(context.Context) (bool, bool, error) { return false, false, nil }
+func ingressControllerReady(context.Context) (bool, bool, error) { return false, false, nil }
+func aroDeploymentReady(context.Context) (bool, bool, error) { return false, false, nil }
+func ensureAROOperatorRunningDesiredVersion(context.Context) (bool, bool, error) {
+	return false, false, nil
+}
+func hiveClusterDeploymentReady(context.Context) (bool, bool, error) { return false, false, nil }
+func hiveClusterInstallationComplete(context.Context) (bool, bool, error) { return false, false, nil }
 
 func TestEnrichConditionTimeoutError(t *testing.T) {
 	for _, tt := range []struct {
diff --git a/pkg/util/steps/runner_test.go b/pkg/util/steps/runner_test.go
index 4b3cf30ec3f..eff434336db 100644
--- a/pkg/util/steps/runner_test.go
+++ b/pkg/util/steps/runner_test.go
@@ -19,16 +19,16 @@ import (
 	testlog "github.com/Azure/ARO-RP/test/util/log"
 )
 
-func successfulFunc(context.Context) error { return nil }
-func failingFunc(context.Context) error { return errors.New("oh no!") }
-func alwaysFalseCondition(context.Context) (bool, error) { return false, nil }
-func alwaysTrueCondition(context.Context) (bool, error) { return true, nil }
-func timingOutCondition(ctx context.Context) (bool, error) {
+func successfulFunc(context.Context) error { return nil }
+func failingFunc(context.Context) error { return errors.New("oh no!") }
+func alwaysFalseCondition(context.Context) (bool, bool, error) { return false, true, nil }
+func alwaysTrueCondition(context.Context) (bool, bool, error) { return true, false, nil }
+func timingOutCondition(ctx context.Context) (bool, bool, error) {
 	time.Sleep(60 * time.Millisecond)
-	return false, nil
+	return false, true, nil
 }
-func internalTimeoutCondition(ctx context.Context) (bool, error) {
-	return false, wait.ErrWaitTimeout
+func internalTimeoutCondition(ctx context.Context) (bool, bool, error) {
+	return false, false, wait.ErrWaitTimeout
 }
 
 func currentTimeFunc() time.Time {
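
Reading note on the contract changed above: a condition function now returns (done, retry, err), where retry=true asks the caller to keep polling instead of failing fast, and a non-nil error with retry=false is terminal. The sketch below is illustrative only; pollCondition and its fixed-interval loop are assumptions made for the example, not the runner in pkg/util/steps, but it shows how a consumer is expected to interpret the extra boolean.

package main

import (
	"context"
	"fmt"
	"time"
)

// conditionFunction mirrors the signature introduced in this patch:
// (done, retry, err). retry=true means "not done yet, keep polling";
// a non-nil err together with retry=false is treated as terminal.
type conditionFunction func(context.Context) (bool, bool, error)

// pollCondition is a hypothetical consumer of that contract. It re-invokes f
// on a fixed interval until f reports done, returns a non-retryable error,
// or the context expires.
func pollCondition(ctx context.Context, interval time.Duration, f conditionFunction) error {
	var lastErr error
	for {
		done, retry, err := f(ctx)
		if done {
			return err
		}
		if err != nil {
			if !retry {
				return err // terminal failure: stop polling immediately
			}
			lastErr = err // remember the latest retryable error for the timeout message
		}
		select {
		case <-ctx.Done():
			if lastErr != nil {
				return fmt.Errorf("%w (last condition error: %v)", ctx.Err(), lastErr)
			}
			return ctx.Err()
		case <-time.After(interval):
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	start := time.Now()
	// Toy condition: becomes ready after one second; until then it asks to retry.
	err := pollCondition(ctx, 200*time.Millisecond, func(context.Context) (bool, bool, error) {
		return time.Since(start) > time.Second, true, nil
	})
	fmt.Println("poll finished, err =", err)
}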