Merge pull request #8518 from Lyndon-Li/fail-fs-backup-on-windows-nodes
fs-backup for clusters with Windows nodes
Lyndon-Li authored Dec 24, 2024
2 parents 9dcfe16 + 4e0a0e0 commit 78c97d9
Showing 11 changed files with 329 additions and 22 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/8518-Lyndon-Li
@@ -0,0 +1 @@
Make fs-backup work on linux nodes with the new Velero deployment and disable fs-backup if the source/target pod is running on a non-linux node (#8424)
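
The changelog describes the gating mechanism: fs-backup is allowed or refused based on the OS of the node the pod is scheduled on. A minimal sketch of such a check, assuming the standard kubernetes.io/os node label (this is not Velero's exact kube.IsLinuxNode implementation):

    package nodecheck

    import (
        "context"
        "fmt"

        corev1 "k8s.io/api/core/v1"
        "sigs.k8s.io/controller-runtime/pkg/client"
    )

    // isLinuxNode fails when the named node is not labeled as linux.
    // Kubernetes sets the well-known kubernetes.io/os label
    // (corev1.LabelOSStable) on every node, so no kubelet call is needed.
    func isLinuxNode(ctx context.Context, c client.Client, nodeName string) error {
        node := &corev1.Node{}
        if err := c.Get(ctx, client.ObjectKey{Name: nodeName}, node); err != nil {
            return fmt.Errorf("getting node %s: %w", nodeName, err)
        }
        if os := node.Labels[corev1.LabelOSStable]; os != "linux" {
            return fmt.Errorf("os type %s for node %s is not linux", os, nodeName)
        }
        return nil
    }

The "os type ... is not linux" message mirrors the error text the restorer test below asserts against.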
2 changes: 2 additions & 0 deletions pkg/cmd/cli/install/install.go
@@ -398,7 +398,9 @@ func (o *Options) Run(c *cobra.Command, f client.Factory) error {
if _, err = install.NodeAgentIsReady(dynamicFactory, o.Namespace); err != nil {
return errors.Wrap(err, errorMsg)
}
}

if o.UseNodeAgentWindows {
fmt.Println("Waiting for node-agent-windows daemonset to be ready.")
if _, err = install.NodeAgentWindowsIsReady(dynamicFactory, o.Namespace); err != nil {
return errors.Wrap(err, errorMsg)
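
The installer now waits for the node-agent-windows daemonset exactly as it waits for node-agent. The readiness helpers used above (install.NodeAgentIsReady, install.NodeAgentWindowsIsReady) belong to Velero's install package; as a rough illustration of what such a gate typically does, here is a hedged poll loop (the interval and timeout values are assumptions):

    package installwait

    import (
        "context"
        "time"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/apimachinery/pkg/util/wait"
        "k8s.io/client-go/kubernetes"
    )

    // waitDaemonSetReady polls until the daemonset reports at least one
    // available pod, then returns; it gives up after the timeout.
    func waitDaemonSetReady(ctx context.Context, kc kubernetes.Interface, ns, name string) error {
        return wait.PollUntilContextTimeout(ctx, time.Second, 3*time.Minute, true,
            func(ctx context.Context) (bool, error) {
                ds, err := kc.AppsV1().DaemonSets(ns).Get(ctx, name, metav1.GetOptions{})
                if err != nil {
                    return false, nil // not created yet; keep polling
                }
                return ds.Status.NumberAvailable > 0, nil
            })
    }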
19 changes: 15 additions & 4 deletions pkg/cmd/server/server.go
Expand Up @@ -82,6 +82,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/restore"
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/kube"
"github.com/vmware-tanzu/velero/pkg/util/logging"
)

@@ -471,10 +472,20 @@ func (s *server) veleroResourcesExist() error {

func (s *server) checkNodeAgent() {
// warn if node agent does not exist
if err := nodeagent.IsRunning(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found; pod volume backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent")
if kube.WithLinuxNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnLinux(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for linux nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for linux nodes")
}
}

if kube.WithWindowsNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnWindows(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for Windows nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for Windows nodes")
}
}
}

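
checkNodeAgent now warns per OS flavor: the linux daemonset is only expected when the cluster actually has linux nodes, and likewise for Windows. A sketch of how a WithLinuxNode/WithWindowsNode-style probe can be built (the real helpers live in pkg/util/kube; this list-by-label approach is an assumption):

    package nodeos

    import (
        "context"

        corev1 "k8s.io/api/core/v1"
        "sigs.k8s.io/controller-runtime/pkg/client"
    )

    // hasNodeWithOS reports whether at least one node carries the given
    // kubernetes.io/os label value ("linux" or "windows").
    func hasNodeWithOS(ctx context.Context, c client.Client, os string) (bool, error) {
        nodes := &corev1.NodeList{}
        if err := c.List(ctx, nodes,
            client.MatchingLabels{corev1.LabelOSStable: os}, client.Limit(1)); err != nil {
            return false, err
        }
        return len(nodes.Items) > 0, nil
    }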
18 changes: 14 additions & 4 deletions pkg/nodeagent/node_agent.go
@@ -33,9 +33,12 @@ import (
)

const (
// daemonSet is the name of the Velero node agent daemonset.
// daemonSet is the name of the Velero node agent daemonset on linux nodes.
daemonSet = "node-agent"

// daemonsetWindows is the name of the Velero node agent daemonset on Windows nodes.
daemonsetWindows = "node-agent-windows"

// nodeAgentRole marks pods with node-agent role on all nodes.
nodeAgentRole = "node-agent"
)
@@ -92,9 +95,16 @@ type Configs struct {
PodResources *kube.PodResources `json:"podResources,omitempty"`
}

// IsRunning checks if the node agent daemonset is running properly. If not, return the error found
func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonSet, metav1.GetOptions{}); apierrors.IsNotFound(err) {
func IsRunningOnLinux(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonSet)
}

func IsRunningOnWindows(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonsetWindows)
}

func isRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string, daemonset string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonset, metav1.GetOptions{}); apierrors.IsNotFound(err) {
return ErrDaemonSetNotFound
} else if err != nil {
return err
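
With this refactor both exported checks funnel into one private isRunning helper that differs only in the daemonset name it looks up. A small usage sketch of the new pair (the log strings are illustrative):

    package agentcheck

    import (
        "context"
        "errors"
        "fmt"

        "k8s.io/client-go/kubernetes"

        "github.com/vmware-tanzu/velero/pkg/nodeagent"
    )

    // checkAgents probes each daemonset flavor and compares against the
    // shared ErrDaemonSetNotFound sentinel.
    func checkAgents(ctx context.Context, kc kubernetes.Interface, ns string) {
        if err := nodeagent.IsRunningOnLinux(ctx, kc, ns); errors.Is(err, nodeagent.ErrDaemonSetNotFound) {
            fmt.Println("node-agent daemonset (linux) not found")
        }
        if err := nodeagent.IsRunningOnWindows(ctx, kc, ns); errors.Is(err, nodeagent.ErrDaemonSetNotFound) {
            fmt.Println("node-agent-windows daemonset not found")
        }
    }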
6 changes: 3 additions & 3 deletions pkg/nodeagent/node_agent_test.go
@@ -40,7 +40,7 @@ type reactor struct {
}

func TestIsRunning(t *testing.T) {
daemonSet := &appsv1.DaemonSet{
ds := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Namespace: "fake-ns",
Name: "node-agent",
@@ -80,7 +80,7 @@
name: "succeed",
namespace: "fake-ns",
kubeClientObj: []runtime.Object{
daemonSet,
ds,
},
},
}
@@ -93,7 +93,7 @@ func TestIsRunning(t *testing.T) {
fakeKubeClient.Fake.PrependReactor(reactor.verb, reactor.resource, reactor.reactorFunc)
}

err := IsRunning(context.TODO(), fakeKubeClient, test.namespace)
err := isRunning(context.TODO(), fakeKubeClient, test.namespace, daemonSet)
if test.expectErr == "" {
assert.NoError(t, err)
} else {
6 changes: 6 additions & 0 deletions pkg/podvolume/backupper.go
@@ -206,6 +206,12 @@ func (b *backupper) BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api.
return nil, pvcSummary, nil
}

if err := kube.IsLinuxNode(b.ctx, pod.Spec.NodeName, b.crClient); err != nil {
err := errors.Wrapf(err, "Pod %s/%s is not running in linux node(%s), skip", pod.Namespace, pod.Name, pod.Spec.NodeName)
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
return nil, pvcSummary, []error{err}
}

err := nodeagent.IsRunningInNode(b.ctx, backup.Namespace, pod.Spec.NodeName, b.crClient)
if err != nil {
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
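
The OS guard deliberately runs before nodeagent.IsRunningInNode, so a pod on a Windows node is skipped with a clear OS error rather than a misleading "no node-agent" error. For context, an illustrative stand-in for that second check (the label selector and listing details here are assumptions, not Velero's exact code):

    package agentprobe

    import (
        "context"

        "github.com/pkg/errors"
        corev1 "k8s.io/api/core/v1"
        "sigs.k8s.io/controller-runtime/pkg/client"
    )

    // nodeAgentRunningOnNode approximates nodeagent.IsRunningInNode: find a
    // running node-agent pod scheduled on the workload pod's node.
    func nodeAgentRunningOnNode(ctx context.Context, c client.Client, veleroNS, nodeName string) error {
        pods := &corev1.PodList{}
        if err := c.List(ctx, pods, client.InNamespace(veleroNS),
            client.MatchingLabels{"role": "node-agent"}); err != nil {
            return err
        }
        for i := range pods.Items {
            p := &pods.Items[i]
            if p.Spec.NodeName == nodeName && p.Status.Phase == corev1.PodRunning {
                return nil
            }
        }
        return errors.Errorf("daemonset pod not found in running state in node %s", nodeName)
    }

The error text matches the expectation in the "node-agent pod is not running in node" test case below.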
44 changes: 41 additions & 3 deletions pkg/podvolume/backupper_test.go
@@ -303,6 +303,14 @@ func createPVBObj(fail bool, withSnapshot bool, index int, uploaderType string)
return pvbObj
}

func createNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
}

func createWindowsNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
}

func TestBackupPodVolumes(t *testing.T) {
scheme := runtime.NewScheme()
velerov1api.AddToScheme(scheme)
@@ -358,13 +366,32 @@
uploaderType: "kopia",
bsl: "fake-bsl",
},
{
name: "pod is not running on Linux node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createWindowsNodeObj(),
},
sourcePod: createPodObj(false, false, false, 2),
uploaderType: "kopia",
errs: []string{
"Pod fake-ns/fake-pod is not running in linux node(fake-node-name), skip",
},
},
{
name: "node-agent pod is not running in node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
"daemonset pod not found in running state in node fake-node-name",
@@ -379,6 +406,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
mockGetRepositoryType: true,
@@ -395,6 +423,7 @@
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
@@ -410,6 +439,7 @@
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
@@ -427,6 +457,7 @@
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
@@ -448,6 +479,7 @@
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
},
@@ -471,6 +503,7 @@
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, true),
@@ -482,6 +515,7 @@
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "volume not mounted by pod should be skipped",
@@ -492,6 +526,7 @@
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, false),
@@ -503,6 +538,7 @@
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "return completed pvbs",
@@ -512,6 +548,7 @@
sourcePod: createPodObj(true, true, true, 1),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVObj(1, false),
},
@@ -522,6 +559,7 @@
uploaderType: "kopia",
bsl: "fake-bsl",
pvbs: 1,
errs: []string{},
},
}
// TODO add more verification around PVCBackupSummary returned by "BackupPodVolumes"
@@ -568,8 +606,8 @@ func TestBackupPodVolumes(t *testing.T) {

pvbs, _, errs := bp.BackupPodVolumes(backupObj, test.sourcePod, test.volumes, nil, velerotest.NewLogger())

if errs == nil {
assert.Nil(t, test.errs)
if test.errs == nil {
assert.NoError(t, err)
} else {
for i := 0; i < len(errs); i++ {
assert.EqualError(t, errs[i], test.errs[i])
8 changes: 7 additions & 1 deletion pkg/podvolume/restorer.go
@@ -122,7 +122,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
return nil
}

if err := nodeagent.IsRunning(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
if err := nodeagent.IsRunningOnLinux(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
return []error{errors.Wrapf(err, "error to check node agent status")}
}

@@ -213,6 +213,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
} else if err != nil {
r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
} else {
if err := kube.IsLinuxNode(checkCtx, nodeName, r.crClient); err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("Restored pod is not running in linux node")
r.nodeAgentCheck <- errors.Wrapf(err, "restored pod %s/%s is not running in linux node(%s)", data.Pod.Namespace, data.Pod.Name, nodeName)
return
}

err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.crClient)
if err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")
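
The restorer reports the OS failure through its nodeAgentCheck channel using errors.Wrapf, and github.com/pkg/errors joins the wrap message and the underlying cause with ": ". That is exactly the string shape the restorer test asserts; a runnable illustration:

    package main

    import (
        "fmt"

        "github.com/pkg/errors"
    )

    func main() {
        cause := fmt.Errorf("os type windows for node fake-node-name is not linux")
        err := errors.Wrapf(cause, "restored pod %s/%s is not running in linux node(%s)",
            "fake-ns", "fake-pod", "fake-node-name")
        // Prints: restored pod fake-ns/fake-pod is not running in linux
        // node(fake-node-name): os type windows for node fake-node-name is not linux
        fmt.Println(err)
    }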
35 changes: 28 additions & 7 deletions pkg/podvolume/restorer_test.go
@@ -33,7 +33,6 @@ import (
"k8s.io/client-go/kubernetes"
kubefake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/cache"
ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake"

"github.com/vmware-tanzu/velero/internal/volume"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
@@ -314,13 +313,38 @@ func TestRestorePodVolumes(t *testing.T) {
},
},
},
{
name: "pod is not running on linux nodes",
pvbs: []*velerov1api.PodVolumeBackup{
createPVBObj(true, true, 1, "kopia"),
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createWindowsNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
},
restoredPod: createPodObj(true, true, true, 1),
sourceNamespace: "fake-ns",
bsl: "fake-bsl",
runtimeScheme: scheme,
errs: []expectError{
{
err: "restored pod fake-ns/fake-pod is not running in linux node(fake-node-name): os type windows for node fake-node-name is not linux",
},
},
},
{
name: "node-agent pod is not running",
pvbs: []*velerov1api.PodVolumeBackup{
createPVBObj(true, true, 1, "kopia"),
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
@@ -344,6 +368,7 @@
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
createNodeAgentPodObj(true),
@@ -368,11 +393,6 @@
ctx = test.ctx
}

fakeClientBuilder := ctrlfake.NewClientBuilder()
if test.runtimeScheme != nil {
fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme)
}

objClient := append(test.ctlClientObj, test.kubeClientObj...)
objClient = append(objClient, test.veleroClientObj...)

@@ -438,7 +458,8 @@
for i := 0; i < len(errs); i++ {
j := 0
for ; j < len(test.errs); j++ {
if errs[i].Error() == test.errs[j].err {
err := errs[i].Error()
if err == test.errs[j].err {
break
}
}
(2 of the 11 changed files are not shown here)
