From 1ec2b3dcfec9eb0dc47b1ca4098a9034a404efdd Mon Sep 17 00:00:00 2001
From: Sebastian Widmer
Date: Wed, 17 Apr 2024 13:56:03 +0200
Subject: [PATCH] Ensure pools without upgrades are unpaused (#73)

We immediately set the upgrade job to the final state "success" if no
pool needed an upgrade. The pause/unpause functionality ignores jobs in
final states so that a full reconcile does not interfere with running
jobs. This left pools that were paused but needed no upgrade stuck in
the paused state.

This PR ensures we unpause all pools before setting the final "success"
state.
---
 controllers/upgradejob_controller.go      |  12 ++-
 controllers/upgradejob_controller_test.go | 103 ++++++++++++++++++++++
 2 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/controllers/upgradejob_controller.go b/controllers/upgradejob_controller.go
index 311dcc9..3ce9f51 100644
--- a/controllers/upgradejob_controller.go
+++ b/controllers/upgradejob_controller.go
@@ -176,7 +176,7 @@ func (r *UpgradeJobReconciler) reconcileStartedJob(ctx context.Context, uj *mana
 		return ctrl.Result{}, fmt.Errorf("failed to lock cluster version: %w", err)
 	}
 
-	if err := r.pauseUnpauseMachineConfigPools(ctx, uj); err != nil {
+	if err := r.pauseUnpauseMachineConfigPools(ctx, uj, false); err != nil {
 		return ctrl.Result{}, fmt.Errorf("failed to pause machine config pools: %w", err)
 	}
 
@@ -300,6 +300,11 @@ func (r *UpgradeJobReconciler) reconcileStartedJob(ctx context.Context, uj *mana
 		return ctrl.Result{}, nil
 	}
 
+	// Ensure pools that were paused but did not need an upgrade are unpaused
+	if err := r.pauseUnpauseMachineConfigPools(ctx, uj, true); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to ensure machine config pools are unpaused: %w", err)
+	}
+
 	// Set the upgrade as successful
 	r.setStatusCondition(&uj.Status.Conditions, metav1.Condition{
 		Type:    managedupgradev1beta1.UpgradeJobConditionSucceeded,
@@ -846,16 +851,17 @@ func findTrackedHookJob(ujhookName, event string, uj managedupgradev1beta1.Upgra
 // pauseUnpauseMachineConfigPools pauses or unpauses the machine config pools that match the given selectors in .Spec.MachineConfigPools and have a delay set.
 // The decision to pause or unpause is based on `pool.DelayUpgrade.DelayMin` relative to the startAfter time of the upgrade job.
+// If ensureUnpause is true, it will unpause the pools even if the delay has not expired.
 // It sets a timeout condition and returns an error if the delay is expired.
 // It also returns an error if the machine config pools cannot be listed or updated.
-func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Context, uj *managedupgradev1beta1.UpgradeJob) error {
+func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Context, uj *managedupgradev1beta1.UpgradeJob, ensureUnpause bool) error {
 	var controllerManagesPools bool
 	var controllerPausedPools bool
 	for _, pool := range uj.Spec.MachineConfigPools {
 		if pool.DelayUpgrade == (managedupgradev1beta1.UpgradeJobMachineConfigPoolDelayUpgradeSpec{}) {
 			continue
 		}
-		shouldPause := r.timeSinceStartAfter(uj) < pool.DelayUpgrade.DelayMin.Duration
+		shouldPause := !ensureUnpause && r.timeSinceStartAfter(uj) < pool.DelayUpgrade.DelayMin.Duration
 		sel, err := metav1.LabelSelectorAsSelector(pool.MatchLabels)
 		if err != nil {
 			return fmt.Errorf("failed to parse machine config pool selector: %w", err)
diff --git a/controllers/upgradejob_controller_test.go b/controllers/upgradejob_controller_test.go
index e79205c..d90070d 100644
--- a/controllers/upgradejob_controller_test.go
+++ b/controllers/upgradejob_controller_test.go
@@ -1289,6 +1289,109 @@ func Test_UpgradeJobReconciler_Reconcile_PausedMachineConfigPools_UnpauseExpire(
 	require.Equal(t, managedupgradev1beta1.UpgradeJobReasonUnpausingPoolsExpired, failedCond.Reason, "should set reason to unpausing pools expired")
 }
 
+// Test_UpgradeJobReconciler_Reconcile_PausedMachineConfigPools_EnsureUnpause tests that the upgrade job reconciler
+// will unpause machine config pools at the end of an upgrade even if they did not require any upgrades.
+func Test_UpgradeJobReconciler_Reconcile_PausedMachineConfigPools_EnsureUnpause(t *testing.T) {
+	ctx := context.Background()
+	clock := mockClock{now: time.Date(2022, 12, 4, 22, 45, 0, 0, time.UTC)}
+
+	ucv := &configv1.ClusterVersion{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "version",
+		},
+		Status: configv1.ClusterVersionStatus{
+			AvailableUpdates: []configv1.Release{
+				{Version: "4.5.13"},
+			},
+			Conditions: []configv1.ClusterOperatorStatusCondition{
+				{
+					Type:   configv1.OperatorDegraded,
+					Status: configv1.ConditionFalse,
+				},
+			},
+		},
+	}
+
+	workerPool := &machineconfigurationv1.MachineConfigPool{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "worker",
+			Labels: map[string]string{"name": "worker"},
+		},
+		Status: machineconfigurationv1.MachineConfigPoolStatus{
+			MachineCount:        3,
+			UpdatedMachineCount: 3,
+		},
+	}
+
+	upgradeJob := &managedupgradev1beta1.UpgradeJob{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "upgrade-1234-4-5-13",
+			Namespace: "appuio-openshift-upgrade-controller",
+		},
+		Spec: managedupgradev1beta1.UpgradeJobSpec{
+			StartBefore: metav1.NewTime(clock.Now().Add(3 * time.Hour)),
+			StartAfter:  metav1.NewTime(clock.Now().Add(-time.Minute)),
+			DesiredVersion: &configv1.Update{
+				Version: "4.5.13",
+			},
+			UpgradeJobConfig: managedupgradev1beta1.UpgradeJobConfig{
+				UpgradeTimeout: metav1.Duration{Duration: 12 * time.Hour},
+				MachineConfigPools: []managedupgradev1beta1.UpgradeJobMachineConfigPoolSpec{
+					{
+						MatchLabels: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"name": "worker"},
+						},
+						DelayUpgrade: managedupgradev1beta1.UpgradeJobMachineConfigPoolDelayUpgradeSpec{
+							DelayMin: metav1.Duration{Duration: 1 * time.Hour},
+							DelayMax: metav1.Duration{Duration: 2 * time.Hour},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	client := controllerClient(t, ucv, upgradeJob, workerPool)
+
+	subject := &UpgradeJobReconciler{
+		Client: client,
+		Scheme: client.Scheme(),
+
+		Clock: &clock,
+
+		ManagedUpstreamClusterVersionName: "version",
+	}
+
+	t.Log("check that upgrade job is started and machine config pools are paused")
+	reconcileNTimes(t, subject, ctx, requestForObject(upgradeJob), 10)
+	require.NoError(t, client.Get(ctx, requestForObject(upgradeJob).NamespacedName, upgradeJob))
+	startedCond := apimeta.FindStatusCondition(upgradeJob.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionStarted)
+	require.NotNil(t, startedCond, "should have started upgrade")
+	pausedCond := apimeta.FindStatusCondition(upgradeJob.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionMachineConfigPoolsPaused)
+	require.NotNil(t, pausedCond, "should have paused mcp upgrades")
+	require.Equal(t, metav1.ConditionTrue, pausedCond.Status)
+	require.NoError(t, client.Get(ctx, requestForObject(workerPool).NamespacedName, workerPool))
+	require.True(t, workerPool.Spec.Paused, "should have paused worker mcp")
+
+	t.Log("finish the upgrade")
+	require.NoError(t, client.Get(ctx, requestForObject(ucv).NamespacedName, ucv))
+	ucv.Status.History = append(ucv.Status.History, configv1.UpdateHistory{
+		State:   configv1.CompletedUpdate,
+		Version: upgradeJob.Spec.DesiredVersion.Version,
+		Image:   upgradeJob.Spec.DesiredVersion.Image,
+	})
+	require.NoError(t, client.Status().Update(ctx, ucv))
+	reconcileNTimes(t, subject, ctx, requestForObject(upgradeJob), 5)
+
+	t.Log("check that job is done and ensure machine config pools are unpaused")
+	require.NoError(t, client.Get(ctx, requestForObject(upgradeJob).NamespacedName, upgradeJob))
+	succeededCond := apimeta.FindStatusCondition(upgradeJob.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionSucceeded)
+	require.NotNil(t, succeededCond)
+	require.Equal(t, metav1.ConditionTrue, succeededCond.Status)
+	require.NoError(t, client.Get(ctx, requestForObject(workerPool).NamespacedName, workerPool))
+	require.False(t, workerPool.Spec.Paused, "should have unpaused worker mcp for completed job")
+}
+
 func Test_JobFromClusterVersionHandler(t *testing.T) {
 	ucv := &configv1.ClusterVersion{
 		ObjectMeta: metav1.ObjectMeta{
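
A minimal, self-contained sketch of the patched pause decision, for illustration only. `shouldPausePool` is a hypothetical standalone form of the `shouldPause` expression changed above; the names and durations here are assumptions, not part of the commit:

package main

import (
	"fmt"
	"time"
)

// shouldPausePool mirrors the patched expression
// `!ensureUnpause && r.timeSinceStartAfter(uj) < pool.DelayUpgrade.DelayMin.Duration`:
// a delayed pool stays paused only while the time elapsed since the job's
// startAfter is below DelayMin, and never once ensureUnpause is set for the
// final "success" transition.
func shouldPausePool(ensureUnpause bool, sinceStartAfter, delayMin time.Duration) bool {
	return !ensureUnpause && sinceStartAfter < delayMin
}

func main() {
	delayMin := time.Hour

	// Mid-upgrade, 30 minutes after startAfter: the delayed pool stays paused.
	fmt.Println(shouldPausePool(false, 30*time.Minute, delayMin)) // true

	// Final "success" transition at the same instant: ensureUnpause overrides
	// the delay window, so the pool is unpaused before the job is finalized.
	fmt.Println(shouldPausePool(true, 30*time.Minute, delayMin)) // false
}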