From 8861740988afdc3ca455b1a69419120f79aede17 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 11 Mar 2024 16:57:55 -0700 Subject: [PATCH] feat: Drop the hostname requirement when handling PV topology for release-v0.32.x (#1018) (#1081) Co-authored-by: Jason Deal --- hack/toolchain.sh | 9 ++ pkg/controllers/disruption/suite_test.go | 100 ++++++++++++++++ pkg/controllers/provisioning/nodepool_test.go | 111 ++++++++++++++++++ .../provisioning/provisioner_test.go | 111 ++++++++++++++++++ .../provisioning/scheduling/volumetopology.go | 9 +- pkg/test/statefulsets.go | 64 ++++++++++ pkg/test/storage.go | 14 +++ 7 files changed, 417 insertions(+), 1 deletion(-) create mode 100644 pkg/test/statefulsets.go diff --git a/hack/toolchain.sh b/hack/toolchain.sh index 3cb6c2eacf..9a7a5cbef0 100755 --- a/hack/toolchain.sh +++ b/hack/toolchain.sh @@ -37,6 +37,15 @@ kubebuilder() { fi ln -sf $(setup-envtest use -p path "${K8S_VERSION}" --arch="${arch}" --bin-dir="${KUBEBUILDER_ASSETS}")/* ${KUBEBUILDER_ASSETS} find $KUBEBUILDER_ASSETS + + # Install latest binaries for 1.25.x (contains CEL fix) + if [[ "${K8S_VERSION}" = "1.25.x" ]] && [[ "$OSTYPE" == "linux"* ]]; then + for binary in 'kube-apiserver' 'kubectl'; do + rm $KUBEBUILDER_ASSETS/$binary + wget -P $KUBEBUILDER_ASSETS dl.k8s.io/v1.25.16/bin/linux/${arch}/${binary} + chmod +x $KUBEBUILDER_ASSETS/$binary + done + fi } main "$@" diff --git a/pkg/controllers/disruption/suite_test.go b/pkg/controllers/disruption/suite_test.go index b3b612c155..b88d075588 100644 --- a/pkg/controllers/disruption/suite_test.go +++ b/pkg/controllers/disruption/suite_test.go @@ -130,6 +130,106 @@ var _ = AfterEach(func() { ExpectCleanedUp(ctx, env.Client) }) +var _ = Describe("Simulate Scheduling", func() { + var nodePool *v1beta1.NodePool + BeforeEach(func() { + nodePool = test.NodePool() + }) + It("can replace node with a local PV (ignoring hostname affinity)", func() { + nodeClaim, node := test.NodeClaimAndNode(v1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1beta1.NodePoolLabelKey: nodePool.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name, + v1beta1.CapacityTypeLabelKey: mostExpensiveOffering.CapacityType, + v1.LabelTopologyZone: mostExpensiveOffering.Zone, + }, + }, + Status: v1beta1.NodeClaimStatus{ + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }, + }, + }) + labels := map[string]string{ + "app": "test", + } + // create our RS so we can link a pod to it + ss := test.StatefulSet() + ExpectApplied(ctx, env.Client, ss) + + // StorageClass that references "no-provisioner" and is used for local volume storage + storageClass := test.StorageClass(test.StorageClassOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "local-path", + }, + Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"), + }) + persistentVolume := test.PersistentVolume(test.PersistentVolumeOptions{UseLocal: true}) + persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{ + Required: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + // This PV is only valid for use against this node + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelHostname, + Operator: v1.NodeSelectorOpIn, + Values: []string{node.Name}, + }, + }, + }, + }, + }, + } + persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name}) + pod := test.Pod(test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "StatefulSet", + Name: ss.Name, + UID: ss.UID, + Controller: ptr.Bool(true), + BlockOwnerDeletion: ptr.Bool(true), + }, + }, + }, + PersistentVolumeClaims: []string{persistentVolumeClaim.Name}, + }) + ExpectApplied(ctx, env.Client, ss, pod, nodeClaim, node, nodePool, storageClass, persistentVolume, persistentVolumeClaim) + + // bind pods to node + ExpectManualBinding(ctx, env.Client, pod, node) + + // inform cluster state about nodes and nodeclaims + ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim}) + + // disruption won't delete the old node until the new node is ready + var wg sync.WaitGroup + ExpectTriggerVerifyAction(&wg) + ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) + ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{}) + wg.Wait() + + // Cascade any deletion of the nodeClaim to the node + ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim) + + // Expect that the new nodeClaim was created, and it's different than the original + // We should succeed in getting a replacement, since we assume that the node affinity requirement will be invalid + // once we spin-down the old node + ExpectNotFound(ctx, env.Client, nodeClaim, node) + nodeclaims := ExpectNodeClaims(ctx, env.Client) + nodes := ExpectNodes(ctx, env.Client) + Expect(nodeclaims).To(HaveLen(1)) + Expect(nodes).To(HaveLen(1)) + Expect(nodeclaims[0].Name).ToNot(Equal(nodeClaim.Name)) + Expect(nodes[0].Name).ToNot(Equal(node.Name)) + }) +}) + var _ = Describe("Disruption Taints", func() { var provisioner *v1alpha5.Provisioner var machine *v1alpha5.Machine diff --git a/pkg/controllers/provisioning/nodepool_test.go b/pkg/controllers/provisioning/nodepool_test.go index 068504764f..1d00d4dedf 100644 --- a/pkg/controllers/provisioning/nodepool_test.go +++ b/pkg/controllers/provisioning/nodepool_test.go @@ -1247,6 +1247,117 @@ var _ = Describe("NodePool/Provisioning", func() { node := ExpectScheduled(ctx, env.Client, pod) Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, "test-zone-3")) }) + DescribeTable("should ignore hostname affinity scheduling when using local path volumes", + func(volumeOptions test.PersistentVolumeOptions) { + // StorageClass that references "no-provisioner" and is used for local volume storage + storageClass = test.StorageClass(test.StorageClassOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "local-path", + }, + Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"), + }) + // Create a PersistentVolume that is using a random node name for its affinity + persistentVolume := test.PersistentVolume(volumeOptions) + persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{ + Required: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelHostname, + Operator: v1.NodeSelectorOpIn, + Values: []string{test.RandomName()}, + }, + }, + }, + }, + }, + } + persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name}) + ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, persistentVolumeClaim, persistentVolume) + pod := test.UnschedulablePod(test.PodOptions{ + PersistentVolumeClaims: []string{persistentVolumeClaim.Name}, + }) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) + // Expect that we are still able to schedule this pod to a node, even though we had a hostname affinity on it + ExpectScheduled(ctx, env.Client, pod) + }, + Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}), + Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}), + ) + DescribeTable("should ignore hostname affinity scheduling when using local path volumes (ephemeral volume)", + func(volumeOptions test.PersistentVolumeOptions) { + // StorageClass that references "no-provisioner" and is used for local volume storage + storageClass = test.StorageClass(test.StorageClassOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "local-path", + }, + Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"), + }) + pod := test.UnschedulablePod(test.PodOptions{ + EphemeralVolumeTemplates: []test.EphemeralVolumeTemplateOptions{ + { + StorageClassName: &storageClass.Name, + }, + }, + }) + persistentVolume := test.PersistentVolume(volumeOptions) + persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{ + Required: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelHostname, + Operator: v1.NodeSelectorOpIn, + Values: []string{test.RandomName()}, + }, + }, + }, + }, + }, + } + persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-%s", pod.Name, pod.Spec.Volumes[0].Name), + }, + VolumeName: persistentVolume.Name, + StorageClassName: &storageClass.Name, + }) + ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, pod, persistentVolumeClaim, persistentVolume) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) + ExpectScheduled(ctx, env.Client, pod) + }, + Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}), + Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}), + ) + It("should not ignore hostname affinity when using non-local path volumes", func() { + // This PersistentVolume is going to use a standard CSI volume for provisioning + persistentVolume := test.PersistentVolume() + persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{ + Required: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelHostname, + Operator: v1.NodeSelectorOpIn, + Values: []string{test.RandomName()}, + }, + }, + }, + }, + }, + } + persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name}) + ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, persistentVolumeClaim, persistentVolume) + pod := test.UnschedulablePod(test.PodOptions{ + PersistentVolumeClaims: []string{persistentVolumeClaim.Name}, + }) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) + // Expect that this pod can't schedule because we have a hostname affinity, and we don't currently have a pod that we can schedule to + ExpectNotScheduled(ctx, env.Client, pod) + }) It("should not schedule if volume zones are incompatible", func() { persistentVolume := test.PersistentVolume(test.PersistentVolumeOptions{Zones: []string{"test-zone-3"}}) persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name}) diff --git a/pkg/controllers/provisioning/provisioner_test.go b/pkg/controllers/provisioning/provisioner_test.go index b3b8366400..65a78c0525 100644 --- a/pkg/controllers/provisioning/provisioner_test.go +++ b/pkg/controllers/provisioning/provisioner_test.go @@ -1194,6 +1194,117 @@ var _ = Describe("Provisioner/Provisioning", func() { node := ExpectScheduled(ctx, env.Client, pod) Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, "test-zone-3")) }) + DescribeTable("should ignore hostname affinity scheduling when using local path volumes", + func(volumeOptions test.PersistentVolumeOptions) { + // StorageClass that references "no-provisioner" and is used for local volume storage + storageClass = test.StorageClass(test.StorageClassOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "local-path", + }, + Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"), + }) + // Create a PersistentVolume that is using a random node name for its affinity + persistentVolume := test.PersistentVolume(volumeOptions) + persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{ + Required: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelHostname, + Operator: v1.NodeSelectorOpIn, + Values: []string{test.RandomName()}, + }, + }, + }, + }, + }, + } + persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name}) + ExpectApplied(ctx, env.Client, test.Provisioner(), storageClass, persistentVolumeClaim, persistentVolume) + pod := test.UnschedulablePod(test.PodOptions{ + PersistentVolumeClaims: []string{persistentVolumeClaim.Name}, + }) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) + // Expect that we are still able to schedule this pod to a node, even though we had a hostname affinity on it + ExpectScheduled(ctx, env.Client, pod) + }, + Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}), + Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}), + ) + DescribeTable("should ignore hostname affinity scheduling when using local path volumes (ephemeral volume)", + func(volumeOptions test.PersistentVolumeOptions) { + // StorageClass that references "no-provisioner" and is used for local volume storage + storageClass = test.StorageClass(test.StorageClassOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "local-path", + }, + Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"), + }) + pod := test.UnschedulablePod(test.PodOptions{ + EphemeralVolumeTemplates: []test.EphemeralVolumeTemplateOptions{ + { + StorageClassName: &storageClass.Name, + }, + }, + }) + persistentVolume := test.PersistentVolume(volumeOptions) + persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{ + Required: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelHostname, + Operator: v1.NodeSelectorOpIn, + Values: []string{test.RandomName()}, + }, + }, + }, + }, + }, + } + persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-%s", pod.Name, pod.Spec.Volumes[0].Name), + }, + VolumeName: persistentVolume.Name, + StorageClassName: &storageClass.Name, + }) + ExpectApplied(ctx, env.Client, test.Provisioner(), storageClass, pod, persistentVolumeClaim, persistentVolume) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) + ExpectScheduled(ctx, env.Client, pod) + }, + Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}), + Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}), + ) + It("should not ignore hostname affinity when using non-local path volumes", func() { + // This PersistentVolume is going to use a standard CSI volume for provisioning + persistentVolume := test.PersistentVolume() + persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{ + Required: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelHostname, + Operator: v1.NodeSelectorOpIn, + Values: []string{test.RandomName()}, + }, + }, + }, + }, + }, + } + persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name}) + ExpectApplied(ctx, env.Client, test.Provisioner(), storageClass, persistentVolumeClaim, persistentVolume) + pod := test.UnschedulablePod(test.PodOptions{ + PersistentVolumeClaims: []string{persistentVolumeClaim.Name}, + }) + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) + // Expect that this pod can't schedule because we have a hostname affinity, and we don't currently have a pod that we can schedule to + ExpectNotScheduled(ctx, env.Client, pod) + }) It("should not schedule if volume zones are incompatible", func() { persistentVolume := test.PersistentVolume(test.PersistentVolumeOptions{Zones: []string{"test-zone-3"}}) persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name}) diff --git a/pkg/controllers/provisioning/scheduling/volumetopology.go b/pkg/controllers/provisioning/scheduling/volumetopology.go index 11d6986e67..fa7305b76d 100644 --- a/pkg/controllers/provisioning/scheduling/volumetopology.go +++ b/pkg/controllers/provisioning/scheduling/volumetopology.go @@ -148,7 +148,14 @@ func (v *VolumeTopology) getPersistentVolumeRequirements(ctx context.Context, po var requirements []v1.NodeSelectorRequirement if len(pv.Spec.NodeAffinity.Required.NodeSelectorTerms) > 0 { // Terms are ORed, only use the first term - requirements = append(requirements, pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions...) + requirements = pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions + // If we are using a Local volume or a HostPath volume, then we should ignore the Hostname affinity + // on it because re-scheduling this pod to a new node means not using the same Hostname affinity that we currently have + if pv.Spec.Local != nil || pv.Spec.HostPath != nil { + requirements = lo.Reject(requirements, func(n v1.NodeSelectorRequirement, _ int) bool { + return n.Key == v1.LabelHostname + }) + } } return requirements, nil } diff --git a/pkg/test/statefulsets.go b/pkg/test/statefulsets.go new file mode 100644 index 0000000000..b521e32e0b --- /dev/null +++ b/pkg/test/statefulsets.go @@ -0,0 +1,64 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package test + +import ( + "fmt" + + "github.com/imdario/mergo" + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "knative.dev/pkg/ptr" +) + +type StatefulSetOptions struct { + metav1.ObjectMeta + Labels map[string]string + Replicas int32 + PodOptions PodOptions +} + +func StatefulSet(overrides ...StatefulSetOptions) *appsv1.StatefulSet { + options := StatefulSetOptions{} + for _, opts := range overrides { + if err := mergo.Merge(&options, opts, mergo.WithOverride); err != nil { + panic(fmt.Sprintf("Failed to merge deployment options: %s", err)) + } + } + + objectMeta := NamespacedObjectMeta(options.ObjectMeta) + + if options.PodOptions.Image == "" { + options.PodOptions.Image = "public.ecr.aws/eks-distro/kubernetes/pause:3.2" + } + if options.PodOptions.Labels == nil { + options.PodOptions.Labels = map[string]string{ + "app": objectMeta.Name, + } + } + pod := Pod(options.PodOptions) + return &appsv1.StatefulSet{ + ObjectMeta: objectMeta, + Spec: appsv1.StatefulSetSpec{ + Replicas: ptr.Int32(options.Replicas), + Selector: &metav1.LabelSelector{MatchLabels: options.PodOptions.Labels}, + Template: v1.PodTemplateSpec{ + ObjectMeta: ObjectMeta(options.PodOptions.ObjectMeta), + Spec: pod.Spec, + }, + }, + } +} diff --git a/pkg/test/storage.go b/pkg/test/storage.go index 544f85aa68..b302ea51d9 100644 --- a/pkg/test/storage.go +++ b/pkg/test/storage.go @@ -32,6 +32,8 @@ type PersistentVolumeOptions struct { StorageClassName string Driver string UseAWSInTreeDriver bool + UseLocal bool + UseHostPath bool } func PersistentVolume(overrides ...PersistentVolumeOptions) *v1.PersistentVolume { @@ -44,6 +46,18 @@ func PersistentVolume(overrides ...PersistentVolumeOptions) *v1.PersistentVolume // Determine the PersistentVolumeSource based on the options var source v1.PersistentVolumeSource switch { + case options.UseLocal: + source = v1.PersistentVolumeSource{ + Local: &v1.LocalVolumeSource{ + Path: "/mnt/local-disks", + }, + } + case options.UseHostPath: + source = v1.PersistentVolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/mnt/local-disks", + }, + } case options.UseAWSInTreeDriver: source = v1.PersistentVolumeSource{ AWSElasticBlockStore: &v1.AWSElasticBlockStoreVolumeSource{