From 846f0de17892c99fa21a64a315f6765ecc0aac3b Mon Sep 17 00:00:00 2001 From: allenxu404 Date: Wed, 2 Aug 2023 17:46:56 +0800 Subject: [PATCH] enhance uninstall and backup deletion for restore finalizer Signed-off-by: allenxu404 --- pkg/cmd/cli/uninstall/uninstall.go | 130 ++++++++++++++---- pkg/controller/backup_deletion_controller.go | 37 ++++- .../backup_deletion_controller_test.go | 4 - 3 files changed, 137 insertions(+), 34 deletions(-) diff --git a/pkg/cmd/cli/uninstall/uninstall.go b/pkg/cmd/cli/uninstall/uninstall.go index ec2de648fc..feff423a4b 100644 --- a/pkg/cmd/cli/uninstall/uninstall.go +++ b/pkg/cmd/cli/uninstall/uninstall.go @@ -25,23 +25,31 @@ import ( "github.com/spf13/cobra" "github.com/spf13/pflag" + appsv1api "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apiextv1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/labels" kubeerrs "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/wait" kbclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" "github.com/vmware-tanzu/velero/pkg/client" "github.com/vmware-tanzu/velero/pkg/cmd" "github.com/vmware-tanzu/velero/pkg/cmd/cli" + "github.com/vmware-tanzu/velero/pkg/controller" "github.com/vmware-tanzu/velero/pkg/install" + kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube" ) +var gracefulDeletionMaximumDuration = 1 * time.Minute + // uninstallOptions collects all the options for uninstalling Velero from a Kubernetes cluster. type uninstallOptions struct { wait bool // deprecated @@ -117,7 +125,6 @@ func Run(ctx context.Context, kbClient kbclient.Client, namespace string) error } // CRDs - veleroLabelSelector := labels.SelectorFromSet(install.Labels()) opts := []kbclient.DeleteAllOfOption{ kbclient.InNamespace(namespace), @@ -160,8 +167,7 @@ func Run(ctx context.Context, kbClient kbclient.Client, namespace string) error } func deleteNamespace(ctx context.Context, kbClient kbclient.Client, namespace string) error { - // delete resources with finalizer attached first to ensure finalizer can be handled by corresponding controller and resources can be deleted successfully before controller's pod is deleted. - // otherwise the process of deleting namespace will get stuck in deleting those resources forever. + // Deal with resources with attached finalizers to ensure proper handling of those finalizers. if err := deleteResourcesWithFinalizer(ctx, kbClient, namespace); err != nil { return errors.Wrap(err, "Fail to remove finalizer from restores") } @@ -211,57 +217,135 @@ func deleteNamespace(ctx context.Context, kbClient kbclient.Client, namespace st return nil } +// A few things needed to be noticed here: +// 1. When we delete resources with attached finalizers, the corresponding controller will deal with the finalizer then resources can be deleted successfully. +// So it is important to delete these resources before deleting the pod that runs that controller. +// 2. The controller may encounter errors while handling the finalizer, in such case, the controller will keep trying until it succeeds. +// So it is important to set a timeout, once the process exceed the timeout, we will forcedly delete the resources by removing the finalizer from them, +// otherwise the deletion process may get stuck indefinitely. +// 3. There is only restore finalizer supported as of v1.12. If any new finalizers are added in the future, the corresponding deletion logic can be +// incorporated into this function. func deleteResourcesWithFinalizer(ctx context.Context, kbClient kbclient.Client, namespace string) error { - //check if restore crd exists + fmt.Println("Waiting for resource with attached finalizer to be deleted") + return deleteRestore(ctx, kbClient, namespace) +} + +func deleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error { + // Check if restore crd exists, if it does not exist, return immediately. + var err error v1crd := &apiextv1.CustomResourceDefinition{} key := kbclient.ObjectKey{Name: "restores.velero.io"} - if err := kbClient.Get(ctx, key, v1crd); err != nil { + if err = kbClient.Get(ctx, key, v1crd); err != nil { if apierrors.IsNotFound(err) { return nil } else { - return err + return errors.Wrap(err, "Error getting restore crd") } } - // delete all the restores + // First attempt to gracefully delete all the restore within a specified time frame, If the process exceeds the timeout limit, + // it is likely that there may be errors during the finalization of restores. In such cases, we should proceed with forcefully deleting the restores. + err = gracefullyDeleteRestore(ctx, kbClient, namespace) + if err != nil && err != wait.ErrWaitTimeout { + return errors.Wrap(err, "Error deleting restores") + } + if err == wait.ErrWaitTimeout { + err = forcedlyDeleteRestore(ctx, kbClient, namespace) + if err != nil { + return errors.Wrap(err, "Error deleting restores") + } + } + + return nil +} + +func gracefullyDeleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error { + var err error restoreList := &velerov1api.RestoreList{} - if err := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil { - return err + if err = kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil { + return errors.Wrap(err, "Error getting restores during graceful deletion") } for i := range restoreList.Items { - if err := kbClient.Delete(ctx, &restoreList.Items[i]); err != nil { + if err = kbClient.Delete(ctx, &restoreList.Items[i]); err != nil { if apierrors.IsNotFound(err) { continue } - return err + return errors.Wrap(err, "Error deleting restores during graceful deletion") } } - fmt.Println("Waiting for resource with finalizer attached to be deleted") - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - var err error - checkFunc := func() { + // Wait for the deletion of all the restores within a specified time frame + err = wait.PollImmediate(time.Second, gracefulDeletionMaximumDuration, func() (bool, error) { restoreList := &velerov1api.RestoreList{} - if err = kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil { - cancel() - return + if errList := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); errList != nil { + return false, errList } if len(restoreList.Items) > 0 { fmt.Print(".") + return false, nil } else { + return true, nil + } + }) + + return err +} + +func forcedlyDeleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error { + // Delete velero deployment first in case: + // 1. finalizers will be added back by restore controller after they are removed at next step; + // 2. new restores attached with finalizer will be created by restore controller after we remove all the restores' finalizer at next step; + deploy := &appsv1api.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "velero", + Name: namespace, + }, + } + + err := kbClient.Delete(ctx, deploy) + if err != nil && !apierrors.IsNotFound(err) { + return errors.Wrap(err, "Error deleting velero deployment during force deletion") + } + + ctxc, cancel := context.WithCancel(ctx) + defer cancel() + + checkFunc := func() { + deploy := &appsv1api.Deployment{} + key := kbclient.ObjectKey{Namespace: namespace, Name: "velero"} + + if err = kbClient.Get(ctxc, key, deploy); err != nil { + if apierrors.IsNotFound(err) { + err = nil + } cancel() return } } - // wait until all the restores are deleted - wait.Until(checkFunc, 100*time.Millisecond, ctx.Done()) + // Wait until velero deployment are deleted. + wait.Until(checkFunc, 100*time.Millisecond, ctxc.Done()) if err != nil { - return err + return errors.Wrap(err, "Error deleting velero deployment during force deletion") + } + + // Remove all the restores' finalizer so they can be deleted during the deletion of velero namespace. + restoreList := &velerov1api.RestoreList{} + if err := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil { + return errors.Wrap(err, "Error getting restores during force deletion") + } + + for i := range restoreList.Items { + if controllerutil.ContainsFinalizer(&restoreList.Items[i], controller.ExternalResourcesFinalizer) { + update := &restoreList.Items[i] + original := update.DeepCopy() + controllerutil.RemoveFinalizer(update, controller.ExternalResourcesFinalizer) + if err := kubeutil.PatchResource(original, update, kbClient); err != nil { + return errors.Wrap(err, "Error removing restore finalizer during force deletion") + } + } } return nil diff --git a/pkg/controller/backup_deletion_controller.go b/pkg/controller/backup_deletion_controller.go index a654958be0..5c9168a294 100644 --- a/pkg/controller/backup_deletion_controller.go +++ b/pkg/controller/backup_deletion_controller.go @@ -34,7 +34,9 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" kubeerrs "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/utils/clock" + ctrl "sigs.k8s.io/controller-runtime" "github.com/vmware-tanzu/velero/internal/credentials" @@ -363,24 +365,45 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque }); err != nil { log.WithError(errors.WithStack(err)).Error("Error listing restore API objects") } else { + // Restore files in object storage will be handled by restore finalizer, so we simply need to initiate a delete request on restores here. for i, restore := range restoreList.Items { if restore.Spec.BackupName != backup.Name { continue } restoreLog := log.WithField("restore", kube.NamespaceAndName(&restoreList.Items[i])) - restoreLog.Info("Deleting restore log/results from backup storage") - if err := backupStore.DeleteRestore(restore.Name); err != nil { - errs = append(errs, err.Error()) - // if we couldn't delete the restore files, don't delete the API object - continue - } - restoreLog.Info("Deleting restore referencing backup") if err := r.Delete(ctx, &restoreList.Items[i]); err != nil { errs = append(errs, errors.Wrapf(err, "error deleting restore %s", kube.NamespaceAndName(&restoreList.Items[i])).Error()) } } + + // Wait for the deletion of restores within certain amount of time. + // Notice that there could be potential errors during the finalization process, which may result in the failure to delete the restore. + // Therefore, it is advisable to set a timeout period for waiting. + err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) { + restoreList := &velerov1api.RestoreList{} + if err := r.List(ctx, restoreList, &client.ListOptions{Namespace: backup.Namespace, LabelSelector: selector}); err != nil { + return false, err + } + cnt := 0 + for _, restore := range restoreList.Items { + if restore.Spec.BackupName != backup.Name { + continue + } + cnt++ + } + + if cnt > 0 { + return false, nil + } else { + return true, nil + } + }) + if err != nil { + log.WithError(err).Error("Error polling for deletion of restores") + errs = append(errs, errors.Wrapf(err, "error deleting restore %s", err).Error()) + } } if len(errs) == 0 { diff --git a/pkg/controller/backup_deletion_controller_test.go b/pkg/controller/backup_deletion_controller_test.go index edf9d08794..c808f249d4 100644 --- a/pkg/controller/backup_deletion_controller_test.go +++ b/pkg/controller/backup_deletion_controller_test.go @@ -322,8 +322,6 @@ func TestBackupDeletionControllerReconcile(t *testing.T) { td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil) td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(io.NopCloser(bytes.NewReader([]byte("hello world"))), nil) td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil) - td.backupStore.On("DeleteRestore", "restore-1").Return(nil) - td.backupStore.On("DeleteRestore", "restore-2").Return(nil) _, err := td.controller.Reconcile(context.TODO(), td.req) require.NoError(t, err) @@ -363,8 +361,6 @@ func TestBackupDeletionControllerReconcile(t *testing.T) { assert.Nil(t, err) td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName) - td.backupStore.AssertCalled(t, "DeleteRestore", "restore-1") - td.backupStore.AssertCalled(t, "DeleteRestore", "restore-2") // Make sure snapshot was deleted assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())