Skip to content

Commit

Permalink
enhance uninstall and backup deletion for restore finalizer
Browse files Browse the repository at this point in the history
Signed-off-by: allenxu404 <[email protected]>
  • Loading branch information
allenxu404 committed Aug 3, 2023
1 parent 3e631ca commit 846f0de
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 34 deletions.
130 changes: 107 additions & 23 deletions pkg/cmd/cli/uninstall/uninstall.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,31 @@ import (
"github.com/spf13/cobra"
"github.com/spf13/pflag"

appsv1api "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
apiextv1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/labels"
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"
kbclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/client"
"github.com/vmware-tanzu/velero/pkg/cmd"
"github.com/vmware-tanzu/velero/pkg/cmd/cli"
"github.com/vmware-tanzu/velero/pkg/controller"
"github.com/vmware-tanzu/velero/pkg/install"
kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube"
)

var gracefulDeletionMaximumDuration = 1 * time.Minute

// uninstallOptions collects all the options for uninstalling Velero from a Kubernetes cluster.
type uninstallOptions struct {
wait bool // deprecated
Expand Down Expand Up @@ -117,7 +125,6 @@ func Run(ctx context.Context, kbClient kbclient.Client, namespace string) error
}

// CRDs

veleroLabelSelector := labels.SelectorFromSet(install.Labels())
opts := []kbclient.DeleteAllOfOption{
kbclient.InNamespace(namespace),
Expand Down Expand Up @@ -160,8 +167,7 @@ func Run(ctx context.Context, kbClient kbclient.Client, namespace string) error
}

func deleteNamespace(ctx context.Context, kbClient kbclient.Client, namespace string) error {
// delete resources with finalizer attached first to ensure finalizer can be handled by corresponding controller and resources can be deleted successfully before controller's pod is deleted.
// otherwise the process of deleting namespace will get stuck in deleting those resources forever.
// Deal with resources with attached finalizers to ensure proper handling of those finalizers.
if err := deleteResourcesWithFinalizer(ctx, kbClient, namespace); err != nil {
return errors.Wrap(err, "Fail to remove finalizer from restores")
}
Expand Down Expand Up @@ -211,57 +217,135 @@ func deleteNamespace(ctx context.Context, kbClient kbclient.Client, namespace st
return nil
}

// A few things needed to be noticed here:
// 1. When we delete resources with attached finalizers, the corresponding controller will deal with the finalizer then resources can be deleted successfully.
// So it is important to delete these resources before deleting the pod that runs that controller.
// 2. The controller may encounter errors while handling the finalizer, in such case, the controller will keep trying until it succeeds.
// So it is important to set a timeout, once the process exceed the timeout, we will forcedly delete the resources by removing the finalizer from them,
// otherwise the deletion process may get stuck indefinitely.
// 3. There is only restore finalizer supported as of v1.12. If any new finalizers are added in the future, the corresponding deletion logic can be
// incorporated into this function.
func deleteResourcesWithFinalizer(ctx context.Context, kbClient kbclient.Client, namespace string) error {
//check if restore crd exists
fmt.Println("Waiting for resource with attached finalizer to be deleted")
return deleteRestore(ctx, kbClient, namespace)
}

func deleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error {
// Check if restore crd exists, if it does not exist, return immediately.
var err error
v1crd := &apiextv1.CustomResourceDefinition{}
key := kbclient.ObjectKey{Name: "restores.velero.io"}
if err := kbClient.Get(ctx, key, v1crd); err != nil {
if err = kbClient.Get(ctx, key, v1crd); err != nil {
if apierrors.IsNotFound(err) {
return nil
} else {
return err
return errors.Wrap(err, "Error getting restore crd")
}
}

// delete all the restores
// First attempt to gracefully delete all the restore within a specified time frame, If the process exceeds the timeout limit,
// it is likely that there may be errors during the finalization of restores. In such cases, we should proceed with forcefully deleting the restores.
err = gracefullyDeleteRestore(ctx, kbClient, namespace)
if err != nil && err != wait.ErrWaitTimeout {
return errors.Wrap(err, "Error deleting restores")
}
if err == wait.ErrWaitTimeout {
err = forcedlyDeleteRestore(ctx, kbClient, namespace)
if err != nil {
return errors.Wrap(err, "Error deleting restores")
}
}

return nil
}

func gracefullyDeleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error {
var err error
restoreList := &velerov1api.RestoreList{}
if err := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
return err
if err = kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
return errors.Wrap(err, "Error getting restores during graceful deletion")
}

for i := range restoreList.Items {
if err := kbClient.Delete(ctx, &restoreList.Items[i]); err != nil {
if err = kbClient.Delete(ctx, &restoreList.Items[i]); err != nil {
if apierrors.IsNotFound(err) {
continue
}
return err
return errors.Wrap(err, "Error deleting restores during graceful deletion")
}
}

fmt.Println("Waiting for resource with finalizer attached to be deleted")
ctx, cancel := context.WithCancel(ctx)
defer cancel()

var err error
checkFunc := func() {
// Wait for the deletion of all the restores within a specified time frame
err = wait.PollImmediate(time.Second, gracefulDeletionMaximumDuration, func() (bool, error) {
restoreList := &velerov1api.RestoreList{}
if err = kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
cancel()
return
if errList := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); errList != nil {
return false, errList
}

if len(restoreList.Items) > 0 {
fmt.Print(".")
return false, nil
} else {
return true, nil
}
})

return err
}

func forcedlyDeleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error {
// Delete velero deployment first in case:
// 1. finalizers will be added back by restore controller after they are removed at next step;
// 2. new restores attached with finalizer will be created by restore controller after we remove all the restores' finalizer at next step;
deploy := &appsv1api.Deployment{
ObjectMeta: metav1.ObjectMeta{
Namespace: "velero",
Name: namespace,
},
}

err := kbClient.Delete(ctx, deploy)
if err != nil && !apierrors.IsNotFound(err) {
return errors.Wrap(err, "Error deleting velero deployment during force deletion")
}

ctxc, cancel := context.WithCancel(ctx)
defer cancel()

checkFunc := func() {
deploy := &appsv1api.Deployment{}
key := kbclient.ObjectKey{Namespace: namespace, Name: "velero"}

if err = kbClient.Get(ctxc, key, deploy); err != nil {
if apierrors.IsNotFound(err) {
err = nil
}
cancel()
return
}
}

// wait until all the restores are deleted
wait.Until(checkFunc, 100*time.Millisecond, ctx.Done())
// Wait until velero deployment are deleted.
wait.Until(checkFunc, 100*time.Millisecond, ctxc.Done())
if err != nil {
return err
return errors.Wrap(err, "Error deleting velero deployment during force deletion")
}

// Remove all the restores' finalizer so they can be deleted during the deletion of velero namespace.
restoreList := &velerov1api.RestoreList{}
if err := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
return errors.Wrap(err, "Error getting restores during force deletion")
}

for i := range restoreList.Items {
if controllerutil.ContainsFinalizer(&restoreList.Items[i], controller.ExternalResourcesFinalizer) {
update := &restoreList.Items[i]
original := update.DeepCopy()
controllerutil.RemoveFinalizer(update, controller.ExternalResourcesFinalizer)
if err := kubeutil.PatchResource(original, update, kbClient); err != nil {
return errors.Wrap(err, "Error removing restore finalizer during force deletion")
}
}
}

return nil
Expand Down
37 changes: 30 additions & 7 deletions pkg/controller/backup_deletion_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ import (
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/utils/clock"

ctrl "sigs.k8s.io/controller-runtime"

"github.com/vmware-tanzu/velero/internal/credentials"
Expand Down Expand Up @@ -363,24 +365,45 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}); err != nil {
log.WithError(errors.WithStack(err)).Error("Error listing restore API objects")
} else {
// Restore files in object storage will be handled by restore finalizer, so we simply need to initiate a delete request on restores here.
for i, restore := range restoreList.Items {
if restore.Spec.BackupName != backup.Name {
continue
}
restoreLog := log.WithField("restore", kube.NamespaceAndName(&restoreList.Items[i]))

restoreLog.Info("Deleting restore log/results from backup storage")
if err := backupStore.DeleteRestore(restore.Name); err != nil {
errs = append(errs, err.Error())
// if we couldn't delete the restore files, don't delete the API object
continue
}

restoreLog.Info("Deleting restore referencing backup")
if err := r.Delete(ctx, &restoreList.Items[i]); err != nil {
errs = append(errs, errors.Wrapf(err, "error deleting restore %s", kube.NamespaceAndName(&restoreList.Items[i])).Error())
}
}

// Wait for the deletion of restores within certain amount of time.
// Notice that there could be potential errors during the finalization process, which may result in the failure to delete the restore.
// Therefore, it is advisable to set a timeout period for waiting.
err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
restoreList := &velerov1api.RestoreList{}
if err := r.List(ctx, restoreList, &client.ListOptions{Namespace: backup.Namespace, LabelSelector: selector}); err != nil {
return false, err
}
cnt := 0
for _, restore := range restoreList.Items {
if restore.Spec.BackupName != backup.Name {
continue
}
cnt++
}

if cnt > 0 {
return false, nil
} else {
return true, nil
}
})
if err != nil {
log.WithError(err).Error("Error polling for deletion of restores")
errs = append(errs, errors.Wrapf(err, "error deleting restore %s", err).Error())
}
}

if len(errs) == 0 {
Expand Down
4 changes: 0 additions & 4 deletions pkg/controller/backup_deletion_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,6 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(io.NopCloser(bytes.NewReader([]byte("hello world"))), nil)
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)
td.backupStore.On("DeleteRestore", "restore-1").Return(nil)
td.backupStore.On("DeleteRestore", "restore-2").Return(nil)

_, err := td.controller.Reconcile(context.TODO(), td.req)
require.NoError(t, err)
Expand Down Expand Up @@ -363,8 +361,6 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
assert.Nil(t, err)

td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteRestore", "restore-1")
td.backupStore.AssertCalled(t, "DeleteRestore", "restore-2")

// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
Expand Down

0 comments on commit 846f0de

Please sign in to comment.