diff --git a/changelogs/unreleased/8518-Lyndon-Li b/changelogs/unreleased/8518-Lyndon-Li new file mode 100644 index 0000000000..94a8a01587 --- /dev/null +++ b/changelogs/unreleased/8518-Lyndon-Li @@ -0,0 +1 @@ +Make fs-backup work on linux nodes with the new Velero deployment and disable fs-backup if the source/target pod is running in non-linux node (#8424) \ No newline at end of file diff --git a/pkg/cmd/cli/install/install.go b/pkg/cmd/cli/install/install.go index c5d5feb339..f455d6d4f3 100644 --- a/pkg/cmd/cli/install/install.go +++ b/pkg/cmd/cli/install/install.go @@ -398,7 +398,9 @@ func (o *Options) Run(c *cobra.Command, f client.Factory) error { if _, err = install.NodeAgentIsReady(dynamicFactory, o.Namespace); err != nil { return errors.Wrap(err, errorMsg) } + } + if o.UseNodeAgentWindows { fmt.Println("Waiting for node-agent-windows daemonset to be ready.") if _, err = install.NodeAgentWindowsIsReady(dynamicFactory, o.Namespace); err != nil { return errors.Wrap(err, errorMsg) diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go index a0828e69ef..d9f6960999 100644 --- a/pkg/cmd/server/server.go +++ b/pkg/cmd/server/server.go @@ -82,6 +82,7 @@ import ( "github.com/vmware-tanzu/velero/pkg/restore" "github.com/vmware-tanzu/velero/pkg/uploader" "github.com/vmware-tanzu/velero/pkg/util/filesystem" + "github.com/vmware-tanzu/velero/pkg/util/kube" "github.com/vmware-tanzu/velero/pkg/util/logging" ) @@ -471,10 +472,20 @@ func (s *server) veleroResourcesExist() error { func (s *server) checkNodeAgent() { // warn if node agent does not exist - if err := nodeagent.IsRunning(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound { - s.logger.Warn("Velero node agent not found; pod volume backups/restores will not work until it's created") - } else if err != nil { - s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent") + if kube.WithLinuxNode(s.ctx, s.crClient, s.logger) { + if err := 
nodeagent.IsRunningOnLinux(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound { + s.logger.Warn("Velero node agent not found for linux nodes; pod volume backups/restores and data mover backups/restores will not work until it's created") + } else if err != nil { + s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for linux nodes") + } + } + + if kube.WithWindowsNode(s.ctx, s.crClient, s.logger) { + if err := nodeagent.IsRunningOnWindows(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound { + s.logger.Warn("Velero node agent not found for Windows nodes; pod volume backups/restores and data mover backups/restores will not work until it's created") + } else if err != nil { + s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for Windows nodes") + } } } diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go index 898ea1e018..8ed6aacdd2 100644 --- a/pkg/nodeagent/node_agent.go +++ b/pkg/nodeagent/node_agent.go @@ -33,9 +33,12 @@ import ( ) const ( - // daemonSet is the name of the Velero node agent daemonset. + // daemonSet is the name of the Velero node agent daemonset on linux nodes. daemonSet = "node-agent" + // daemonsetWindows is the name of the Velero node agent daemonset on Windows nodes. + daemonsetWindows = "node-agent-windows" + // nodeAgentRole marks pods with node-agent role on all nodes. nodeAgentRole = "node-agent" ) @@ -92,9 +95,16 @@ type Configs struct { PodResources *kube.PodResources `json:"podResources,omitempty"` } -// IsRunning checks if the node agent daemonset is running properly. 
If not, return the error found -func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error { - if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonSet, metav1.GetOptions{}); apierrors.IsNotFound(err) { +func IsRunningOnLinux(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error { + return isRunning(ctx, kubeClient, namespace, daemonSet) +} + +func IsRunningOnWindows(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error { + return isRunning(ctx, kubeClient, namespace, daemonsetWindows) +} + +func isRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string, daemonset string) error { + if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonset, metav1.GetOptions{}); apierrors.IsNotFound(err) { return ErrDaemonSetNotFound } else if err != nil { return err diff --git a/pkg/nodeagent/node_agent_test.go b/pkg/nodeagent/node_agent_test.go index 700acdec8f..11cb833594 100644 --- a/pkg/nodeagent/node_agent_test.go +++ b/pkg/nodeagent/node_agent_test.go @@ -40,7 +40,7 @@ type reactor struct { } func TestIsRunning(t *testing.T) { - daemonSet := &appsv1.DaemonSet{ + ds := &appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{ Namespace: "fake-ns", Name: "node-agent", @@ -80,7 +80,7 @@ func TestIsRunning(t *testing.T) { name: "succeed", namespace: "fake-ns", kubeClientObj: []runtime.Object{ - daemonSet, + ds, }, }, } @@ -93,7 +93,7 @@ func TestIsRunning(t *testing.T) { fakeKubeClient.Fake.PrependReactor(reactor.verb, reactor.resource, reactor.reactorFunc) } - err := IsRunning(context.TODO(), fakeKubeClient, test.namespace) + err := isRunning(context.TODO(), fakeKubeClient, test.namespace, daemonSet) if test.expectErr == "" { assert.NoError(t, err) } else { diff --git a/pkg/podvolume/backupper.go b/pkg/podvolume/backupper.go index 0a0c63eff1..29452344e2 100644 --- a/pkg/podvolume/backupper.go +++ b/pkg/podvolume/backupper.go @@ -206,6 +206,12 @@ func 
(b *backupper) BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api. return nil, pvcSummary, nil } + if err := kube.IsLinuxNode(b.ctx, pod.Spec.NodeName, b.crClient); err != nil { + err := errors.Wrapf(err, "Pod %s/%s is not running in linux node(%s), skip", pod.Namespace, pod.Name, pod.Spec.NodeName) + skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log) + return nil, pvcSummary, []error{err} + } + err := nodeagent.IsRunningInNode(b.ctx, backup.Namespace, pod.Spec.NodeName, b.crClient) if err != nil { skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log) diff --git a/pkg/podvolume/backupper_test.go b/pkg/podvolume/backupper_test.go index c21ad5ebb9..9414368307 100644 --- a/pkg/podvolume/backupper_test.go +++ b/pkg/podvolume/backupper_test.go @@ -303,6 +303,14 @@ func createPVBObj(fail bool, withSnapshot bool, index int, uploaderType string) return pvbObj } +func createNodeObj() *corev1api.Node { + return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result() +} + +func createWindowsNodeObj() *corev1api.Node { + return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result() +} + func TestBackupPodVolumes(t *testing.T) { scheme := runtime.NewScheme() velerov1api.AddToScheme(scheme) @@ -358,13 +366,32 @@ func TestBackupPodVolumes(t *testing.T) { uploaderType: "kopia", bsl: "fake-bsl", }, + { + name: "pod is not running on Linux node", + volumes: []string{ + "fake-volume-1", + "fake-volume-2", + }, + kubeClientObj: []runtime.Object{ + createNodeAgentPodObj(true), + createWindowsNodeObj(), + }, + sourcePod: createPodObj(false, false, false, 2), + uploaderType: "kopia", + errs: []string{ + "Pod fake-ns/fake-pod is not running in linux node(fake-node-name), skip", + }, + }, { name: "node-agent pod is not running in node", volumes: []string{ "fake-volume-1", "fake-volume-2", }, - sourcePod: createPodObj(true, false, false, 2), + sourcePod: createPodObj(true, 
false, false, 2), + kubeClientObj: []runtime.Object{ + createNodeObj(), + }, uploaderType: "kopia", errs: []string{ "daemonset pod not found in running state in node fake-node-name", @@ -379,6 +406,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, false, false, 2), kubeClientObj: []runtime.Object{ createNodeAgentPodObj(true), + createNodeObj(), }, uploaderType: "kopia", mockGetRepositoryType: true, @@ -395,6 +423,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, false, false, 2), kubeClientObj: []runtime.Object{ createNodeAgentPodObj(true), + createNodeObj(), }, uploaderType: "kopia", errs: []string{ @@ -410,6 +439,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, false, false, 2), kubeClientObj: []runtime.Object{ createNodeAgentPodObj(true), + createNodeObj(), }, ctlClientObj: []runtime.Object{ createBackupRepoObj(), @@ -427,6 +457,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, true, false, 2), kubeClientObj: []runtime.Object{ createNodeAgentPodObj(true), + createNodeObj(), }, ctlClientObj: []runtime.Object{ createBackupRepoObj(), @@ -448,6 +479,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, true, false, 2), kubeClientObj: []runtime.Object{ createNodeAgentPodObj(true), + createNodeObj(), createPVCObj(1), createPVCObj(2), }, @@ -471,6 +503,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, true, false, 2), kubeClientObj: []runtime.Object{ createNodeAgentPodObj(true), + createNodeObj(), createPVCObj(1), createPVCObj(2), createPVObj(1, true), @@ -482,6 +515,7 @@ func TestBackupPodVolumes(t *testing.T) { runtimeScheme: scheme, uploaderType: "kopia", bsl: "fake-bsl", + errs: []string{}, }, { name: "volume not mounted by pod should be skipped", @@ -492,6 +526,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, true, false, 2), kubeClientObj: []runtime.Object{ 
createNodeAgentPodObj(true), + createNodeObj(), createPVCObj(1), createPVCObj(2), createPVObj(1, false), @@ -503,6 +538,7 @@ func TestBackupPodVolumes(t *testing.T) { runtimeScheme: scheme, uploaderType: "kopia", bsl: "fake-bsl", + errs: []string{}, }, { name: "return completed pvbs", @@ -512,6 +548,7 @@ func TestBackupPodVolumes(t *testing.T) { sourcePod: createPodObj(true, true, true, 1), kubeClientObj: []runtime.Object{ createNodeAgentPodObj(true), + createNodeObj(), createPVCObj(1), createPVObj(1, false), }, @@ -522,6 +559,7 @@ func TestBackupPodVolumes(t *testing.T) { uploaderType: "kopia", bsl: "fake-bsl", pvbs: 1, + errs: []string{}, }, } // TODO add more verification around PVCBackupSummary returned by "BackupPodVolumes" @@ -568,8 +606,8 @@ func TestBackupPodVolumes(t *testing.T) { pvbs, _, errs := bp.BackupPodVolumes(backupObj, test.sourcePod, test.volumes, nil, velerotest.NewLogger()) - if errs == nil { - assert.Nil(t, test.errs) + if test.errs == nil { + assert.NoError(t, err) } else { for i := 0; i < len(errs); i++ { assert.EqualError(t, errs[i], test.errs[i]) diff --git a/pkg/podvolume/restorer.go b/pkg/podvolume/restorer.go index 4b3e4354dd..18e7717631 100644 --- a/pkg/podvolume/restorer.go +++ b/pkg/podvolume/restorer.go @@ -122,7 +122,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo return nil } - if err := nodeagent.IsRunning(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil { + if err := nodeagent.IsRunningOnLinux(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil { return []error{errors.Wrapf(err, "error to check node agent status")} } @@ -213,6 +213,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo } else if err != nil { r.log.WithError(err).Error("Failed to check node-agent pod status, disengage") } else { + if err := kube.IsLinuxNode(checkCtx, nodeName, r.crClient); err != nil { + r.log.WithField("node", nodeName).WithError(err).Error("Restored pod is not 
running in linux node") + r.nodeAgentCheck <- errors.Wrapf(err, "restored pod %s/%s is not running in linux node(%s)", data.Pod.Namespace, data.Pod.Name, nodeName) + return + } + err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.crClient) if err != nil { r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore") diff --git a/pkg/podvolume/restorer_test.go b/pkg/podvolume/restorer_test.go index 5d52cf21d2..1af4da4294 100644 --- a/pkg/podvolume/restorer_test.go +++ b/pkg/podvolume/restorer_test.go @@ -33,7 +33,6 @@ import ( "k8s.io/client-go/kubernetes" kubefake "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/tools/cache" - ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/vmware-tanzu/velero/internal/volume" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" @@ -314,6 +313,30 @@ func TestRestorePodVolumes(t *testing.T) { }, }, }, + { + name: "pod is not running on linux nodes", + pvbs: []*velerov1api.PodVolumeBackup{ + createPVBObj(true, true, 1, "kopia"), + }, + kubeClientObj: []runtime.Object{ + createNodeAgentDaemonset(), + createWindowsNodeObj(), + createPVCObj(1), + createPodObj(true, true, true, 1), + }, + ctlClientObj: []runtime.Object{ + createBackupRepoObj(), + }, + restoredPod: createPodObj(true, true, true, 1), + sourceNamespace: "fake-ns", + bsl: "fake-bsl", + runtimeScheme: scheme, + errs: []expectError{ + { + err: "restored pod fake-ns/fake-pod is not running in linux node(fake-node-name): os type windows for node fake-node-name is not linux", + }, + }, + }, { name: "node-agent pod is not running", pvbs: []*velerov1api.PodVolumeBackup{ @@ -321,6 +344,7 @@ func TestRestorePodVolumes(t *testing.T) { }, kubeClientObj: []runtime.Object{ createNodeAgentDaemonset(), + createNodeObj(), createPVCObj(1), createPodObj(true, true, true, 1), }, @@ -344,6 +368,7 @@ func TestRestorePodVolumes(t *testing.T) { }, kubeClientObj: []runtime.Object{ 
createNodeAgentDaemonset(), + createNodeObj(), createPVCObj(1), createPodObj(true, true, true, 1), createNodeAgentPodObj(true), @@ -368,11 +393,6 @@ func TestRestorePodVolumes(t *testing.T) { ctx = test.ctx } - fakeClientBuilder := ctrlfake.NewClientBuilder() - if test.runtimeScheme != nil { - fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme) - } - objClient := append(test.ctlClientObj, test.kubeClientObj...) objClient = append(objClient, test.veleroClientObj...) @@ -438,7 +458,8 @@ func TestRestorePodVolumes(t *testing.T) { for i := 0; i < len(errs); i++ { j := 0 for ; j < len(test.errs); j++ { - if errs[i].Error() == test.errs[j].err { + err := errs[i].Error() + if err == test.errs[j].err { break } } diff --git a/pkg/util/kube/node.go b/pkg/util/kube/node.go new file mode 100644 index 0000000000..30eaefb13d --- /dev/null +++ b/pkg/util/kube/node.go @@ -0,0 +1,80 @@ +/* +Copyright The Velero Contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+package kube
+
+import (
+	"context"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	corev1api "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// IsLinuxNode returns nil only if the node named nodeName is labeled kubernetes.io/os=linux.
+// It returns an error if the node cannot be retrieved, has no os label, or reports another OS.
+func IsLinuxNode(ctx context.Context, nodeName string, client client.Client) error {
+	node := &corev1api.Node{}
+	if err := client.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
+		return errors.Wrapf(err, "error getting node %s", nodeName)
+	}
+
+	nodeOS, found := node.Labels[corev1api.LabelOSStable]
+	if !found {
+		return errors.Errorf("no os type label for node %s", nodeName)
+	}
+	if nodeOS != "linux" {
+		return errors.Errorf("os type %s for node %s is not linux", nodeOS, nodeName)
+	}
+
+	return nil
+}
+
+// WithLinuxNode reports whether the cluster has at least one node labeled as linux.
+func WithLinuxNode(ctx context.Context, client client.Client, log logrus.FieldLogger) bool {
+	return withOSNode(ctx, client, "linux", log)
+}
+
+// WithWindowsNode reports whether the cluster has at least one node labeled as windows.
+func WithWindowsNode(ctx context.Context, client client.Client, log logrus.FieldLogger) bool {
+	return withOSNode(ctx, client, "windows", log)
+}
+
+// withOSNode reports whether any node carries the kubernetes.io/os label with value osType.
+// It returns false if listing fails or no node matches; it warns on list errors and unlabeled nodes.
+func withOSNode(ctx context.Context, client client.Client, osType string, log logrus.FieldLogger) bool {
+	nodeList := new(corev1api.NodeList)
+	if err := client.List(ctx, nodeList); err != nil {
+		log.WithError(err).Warnf("Failed to list nodes, cannot decide existence of nodes of OS %s", osType)
+		return false
+	}
+
+	allNodeLabeled := true
+	for i := range nodeList.Items {
+		// Index into the slice to avoid copying each Node value per iteration.
+		nodeOS, found := nodeList.Items[i].Labels[corev1api.LabelOSStable]
+		if nodeOS == osType {
+			return true
+		}
+		allNodeLabeled = allNodeLabeled && found
+	}
+
+	if !allNodeLabeled {
+		log.Warnf("Not all nodes have os type label, cannot decide existence of nodes of OS %s", osType)
+	}
+	return false
+}
diff --git a/pkg/util/kube/node_test.go b/pkg/util/kube/node_test.go
new file mode 100644
index 0000000000..9463938eb6
--- /dev/null
+++ b/pkg/util/kube/node_test.go
@@ -0,0 +1,132 @@
+/*
+Copyright The Velero Contributors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kube
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+
+	"github.com/vmware-tanzu/velero/pkg/builder"
+
+	clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	velerotest "github.com/vmware-tanzu/velero/pkg/test"
+)
+
+// TestIsLinuxNode checks IsLinuxNode against a missing node and against nodes that carry
+// no os label, a windows os label, and a linux os label.
+func TestIsLinuxNode(t *testing.T) {
+	nodeNoOSLabel := builder.ForNode("fake-node").Result()
+	nodeWindows := builder.ForNode("fake-node").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
+	nodeLinux := builder.ForNode("fake-node").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
+
+	scheme := runtime.NewScheme()
+	assert.NoError(t, corev1.AddToScheme(scheme))
+
+	tests := []struct {
+		name          string
+		kubeClientObj []runtime.Object
+		err           string
+	}{
+		{
+			name: "error getting node",
+			err:  "error getting node fake-node: nodes \"fake-node\" not found",
+		},
+		{
+			name: "no os label",
+			kubeClientObj: []runtime.Object{
+				nodeNoOSLabel,
+			},
+			err: "no os type label for node fake-node",
+		},
+		{
+			name: "os label does not match",
+			kubeClientObj: []runtime.Object{
+				nodeWindows,
+			},
+			err: "os type windows for node fake-node is not linux",
+		},
+		{
+			name: "succeed",
+			kubeClientObj: []runtime.Object{
+				nodeLinux,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			fakeClient := clientFake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(test.kubeClientObj...).Build()
+
+			err := IsLinuxNode(context.TODO(), "fake-node", fakeClient)
+			// Branch on the expectation, not on the result, so a missing error fails the test.
+			if test.err != "" {
+				assert.EqualError(t, err, test.err)
+			} else {
+				assert.NoError(t, err)
+			}
+		})
+	}
+}
+
+// TestWithLinuxNode checks withOSNode for the "linux" OS type against clusters with no node,
+// with only a windows node, and with both a windows and a linux node.
+func TestWithLinuxNode(t *testing.T) {
+	nodeWindows := builder.ForNode("fake-node-1").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
+	nodeLinux := builder.ForNode("fake-node-2").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
+
+	scheme := runtime.NewScheme()
+	assert.NoError(t, corev1.AddToScheme(scheme))
+
+	tests := []struct {
+		name          string
+		kubeClientObj []runtime.Object
+		result        bool
+	}{
+		{
+			// No node object is registered, so no node can match the requested OS type.
+			name: "no node in cluster",
+		},
+		{
+			name: "with node of other type",
+			kubeClientObj: []runtime.Object{
+				nodeWindows,
+			},
+		},
+		{
+			name: "with node of the same type",
+			kubeClientObj: []runtime.Object{
+				nodeWindows,
+				nodeLinux,
+			},
+			result: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			fakeClient := clientFake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(test.kubeClientObj...).Build()
+
+			result := withOSNode(context.TODO(), fakeClient, "linux", velerotest.NewLogger())
+			assert.Equal(t, test.result, result)
+		})
+	}
+}