
Commit ee99fe5

correctly handle domains from NodePools when honoring taints
1 parent 835cd98 commit ee99fe5

File tree

5 files changed: +250 -57 lines

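The diff below swaps the scheduler's flat per-key domain sets (sets.Set[string]) for a new scheduler.TopologyDomainGroup that also records the taints of the NodePools offering each domain. That is what lets topology spread constraints with NodeTaintsPolicy: Honor count only the domains a pod actually tolerates. The type itself is defined in one of the changed files not expanded in this view, so the following is only a minimal sketch of what such a structure could look like, assuming a map from domain to the taint sets of its providing NodePools; names such as ForEachToleratedDomain and toleratesAll are illustrative, not the committed API:

package scheduling

import (
  v1 "k8s.io/api/core/v1"
)

// TopologyDomainGroup (sketch): for one topology key, tracks every known domain
// together with the taints of each NodePool that can provide that domain.
type TopologyDomainGroup map[string][][]v1.Taint

func NewTopologyDomainGroup() TopologyDomainGroup { return TopologyDomainGroup{} }

// Insert records that domain is offered by a NodePool carrying the given taints.
// An entry with zero taints means the domain is reachable without any toleration.
func (t TopologyDomainGroup) Insert(domain string, taints ...v1.Taint) {
  t[domain] = append(t[domain], taints)
}

// ForEachToleratedDomain calls f for each domain offered by at least one NodePool
// whose taints the pod tolerates (the honor-policy view of the domain universe).
func (t TopologyDomainGroup) ForEachToleratedDomain(pod *v1.Pod, f func(domain string)) {
  for domain, taintGroups := range t {
    for _, taints := range taintGroups {
      if toleratesAll(pod, taints) {
        f(domain)
        break
      }
    }
  }
}

// toleratesAll reports whether the pod tolerates every taint in the slice.
func toleratesAll(pod *v1.Pod, taints []v1.Taint) bool {
  for i := range taints {
    tolerated := false
    for j := range pod.Spec.Tolerations {
      if pod.Spec.Tolerations[j].ToleratesTaint(&taints[i]) {
        tolerated = true
        break
      }
    }
    if !tolerated {
      return false
    }
  }
  return true
}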

pkg/controllers/provisioning/provisioner.go

+14-17
@@ -30,7 +30,6 @@ import (
   appsv1 "k8s.io/api/apps/v1"
   v1 "k8s.io/api/core/v1"
   "k8s.io/apimachinery/pkg/types"
-  "k8s.io/apimachinery/pkg/util/sets"
   "k8s.io/client-go/util/workqueue"
   "k8s.io/klog/v2"
   controllerruntime "sigs.k8s.io/controller-runtime"
@@ -223,7 +222,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
   nodePoolList.OrderByWeight()

   instanceTypes := map[string][]*cloudprovider.InstanceType{}
-  domains := map[string]sets.Set[string]{}
+  domainGroups := map[string]scheduler.TopologyDomainGroup{}
   for _, nodePool := range nodePoolList.Items {
     // Get instance type options
     instanceTypeOptions, err := p.cloudProvider.GetInstanceTypes(ctx, lo.ToPtr(nodePool))
@@ -239,6 +238,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
       continue
     }
     instanceTypes[nodePool.Name] = append(instanceTypes[nodePool.Name], instanceTypeOptions...)
+    nodePoolTaints := nodePool.Spec.Template.Spec.Taints

     // Construct Topology Domains
     for _, instanceType := range instanceTypeOptions {
@@ -248,15 +248,12 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
       requirements.Add(scheduling.NewLabelRequirements(nodePool.Spec.Template.Labels).Values()...)
       requirements.Add(instanceType.Requirements.Values()...)

-      for key, requirement := range requirements {
-        // This code used to execute a Union between domains[key] and requirement.Values().
-        // The downside of this is that Union is immutable and takes a copy of the set it is executed upon.
-        // This resulted in a lot of memory pressure on the heap and poor performance
-        // https://github.com/aws/karpenter/issues/3565
-        if domains[key] == nil {
-          domains[key] = sets.New(requirement.Values()...)
-        } else {
-          domains[key].Insert(requirement.Values()...)
+      for topologyKey, requirement := range requirements {
+        if _, ok := domainGroups[topologyKey]; !ok {
+          domainGroups[topologyKey] = scheduler.NewTopologyDomainGroup()
+        }
+        for _, domain := range requirement.Values() {
+          domainGroups[topologyKey].Insert(domain, nodePoolTaints...)
         }
       }
     }
@@ -265,11 +262,11 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
     requirements.Add(scheduling.NewLabelRequirements(nodePool.Spec.Template.Labels).Values()...)
     for key, requirement := range requirements {
       if requirement.Operator() == v1.NodeSelectorOpIn {
-        // The following is a performance optimisation, for the explanation see the comment above
-        if domains[key] == nil {
-          domains[key] = sets.New(requirement.Values()...)
-        } else {
-          domains[key].Insert(requirement.Values()...)
+        if _, ok := domainGroups[key]; !ok {
+          domainGroups[key] = scheduler.NewTopologyDomainGroup()
+        }
+        for _, value := range requirement.Values() {
+          domainGroups[key].Insert(value, nodePoolTaints...)
         }
       }
     }
@@ -279,7 +276,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
   pods = p.injectVolumeTopologyRequirements(ctx, pods)

   // Calculate cluster topology
-  topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, domains, pods)
+  topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, domainGroups, pods)
   if err != nil {
     return nil, fmt.Errorf("tracking topology counts, %w", err)
   }
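Net effect in NewScheduler: domains are no longer pooled anonymously across NodePools; every Insert call carries the owning NodePool's taints. A purely illustrative usage of the sketch above, mirroring the scenario the new tests exercise (an untainted NodePool offering domain "foo", a tainted one offering "bar"):

// Illustrative only; builds on the TopologyDomainGroup sketch above.
func exampleHonoredDomains() []string {
  group := NewTopologyDomainGroup()
  group.Insert("foo") // from an untainted NodePool
  group.Insert("bar", v1.Taint{Key: "taint-key", Value: "taint-value", Effect: v1.TaintEffectNoSchedule})

  pod := &v1.Pod{} // carries no tolerations
  var visible []string
  group.ForEachToleratedDomain(pod, func(domain string) {
    visible = append(visible, domain)
  })
  return visible // ["foo"]: "bar" stays hidden behind the untolerated taint
}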

pkg/controllers/provisioning/scheduling/topology.go

+10-6
@@ -52,18 +52,18 @@ type Topology struct {
   // in some cases.
   inverseTopologies map[uint64]*TopologyGroup
   // The universe of domains by topology key
-  domains map[string]sets.Set[string]
+  domainGroups map[string]TopologyDomainGroup
   // excludedPods are the pod UIDs of pods that are excluded from counting. This is used so we can simulate
   // moving pods to prevent them from being double counted.
   excludedPods sets.Set[string]
   cluster *state.Cluster
 }

-func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, domains map[string]sets.Set[string], pods []*v1.Pod) (*Topology, error) {
+func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, domainGroups map[string]TopologyDomainGroup, pods []*v1.Pod) (*Topology, error) {
   t := &Topology{
     kubeClient: kubeClient,
     cluster: cluster,
-    domains: domains,
+    domainGroups: domainGroups,
     topologies: map[uint64]*TopologyGroup{},
     inverseTopologies: map[uint64]*TopologyGroup{},
     excludedPods: sets.New[string](),
@@ -233,7 +233,7 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *v1.Pod, d
     return err
   }

-  tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domains[term.TopologyKey])
+  tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey])

   hash := tg.Hash()
   if existing, ok := t.inverseTopologies[hash]; !ok {
@@ -269,6 +269,10 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
   // capture new domain values from existing nodes that may not have any pods selected by the topology group
   // scheduled to them already
   t.cluster.ForEachNode(func(n *state.StateNode) bool {
+    // ignore state nodes which are tracking in-flight NodeClaims
+    if n.Node == nil {
+      return true
+    }
     // ignore the node if it doesn't match the topology group
     if !tg.nodeFilter.Matches(n.Node) {
       return true
@@ -330,7 +334,7 @@ func (t *Topology) newForTopologies(p *v1.Pod) []*TopologyGroup {
   var topologyGroups []*TopologyGroup
   for _, cs := range p.Spec.TopologySpreadConstraints {
     topologyGroups = append(topologyGroups, NewTopologyGroup(TopologyTypeSpread, cs.TopologyKey, p, sets.New(p.Namespace),
-      cs.LabelSelector, cs.MaxSkew, cs.MinDomains, cs.NodeTaintsPolicy, cs.NodeAffinityPolicy, t.domains[cs.TopologyKey]))
+      cs.LabelSelector, cs.MaxSkew, cs.MinDomains, cs.NodeTaintsPolicy, cs.NodeAffinityPolicy, t.domainGroups[cs.TopologyKey]))
   }
   return topologyGroups
 }
@@ -367,7 +371,7 @@ func (t *Topology) newForAffinities(ctx context.Context, p *v1.Pod) ([]*Topology
       if err != nil {
         return nil, err
       }
-      topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domains[term.TopologyKey]))
+      topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey]))
     }
   }
   return topologyGroups, nil
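Two things to note in topology.go: the nil check added to countDomains skips state nodes that only track in-flight NodeClaims (they have no Node object yet, so its labels and taints cannot be inspected), and every TopologyGroup now receives a TopologyDomainGroup instead of a plain set, letting it seed its domain counts according to the constraint's NodeTaintsPolicy. NewTopologyGroup itself changes in topologygroup.go, which is not expanded in this view, so the following is only a hedged sketch of how that seeding might look, reusing the TopologyDomainGroup sketch from above:

// Sketch, not the committed NewTopologyGroup: seed the set of domains a spread
// constraint should balance over, either honoring or ignoring NodePool taints.
func seedDomains(pod *v1.Pod, taintsPolicy *v1.NodeInclusionPolicy, group TopologyDomainGroup) map[string]int32 {
  domains := map[string]int32{}
  if taintsPolicy != nil && *taintsPolicy == v1.NodeInclusionPolicyHonor {
    // Only domains the pod can actually reach, given the providing NodePools' taints.
    group.ForEachToleratedDomain(pod, func(domain string) { domains[domain] = 0 })
    return domains
  }
  // NodeTaintsPolicy=Ignore (or unset, as for affinity terms): every known domain counts.
  for domain := range group {
    domains[domain] = 0
  }
  return domains
}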

pkg/controllers/provisioning/scheduling/topology_test.go

+147-29
@@ -17,6 +17,7 @@ limitations under the License.
 package scheduling_test

 import (
+  "fmt"
   "time"

   . "github.com/onsi/gomega"
@@ -1132,7 +1133,7 @@ var _ = Describe("Topology", func() {
       Skip("NodeTaintsPolicy ony enabled by default for K8s >= 1.26.x")
     }

-    const spreadLabel = "karpenter.sh/fake-label"
+    const spreadLabel = "fake-label"
     nodePool.Spec.Template.Labels = map[string]string{
       spreadLabel: "baz",
     }
@@ -1206,7 +1207,7 @@ var _ = Describe("Topology", func() {
     if env.Version.Minor() < 26 {
       Skip("NodeTaintsPolicy ony enabled by default for K8s >= 1.26.x")
     }
-    const spreadLabel = "karpenter.sh/fake-label"
+    const spreadLabel = "fake-label"
     nodePool.Spec.Template.Labels = map[string]string{
       spreadLabel: "baz",
     }
@@ -1275,23 +1276,83 @@ var _ = Describe("Topology", func() {
     // and should schedule all of the pods on the same node
     ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(5))
   })
-  FIt("should balance pods across a label when discovered from the provisioner (NodeTaintsPolicy=honor)", func() {
+  It("should balance pods across a label when discovered from the provisioner (NodeTaintsPolicy=ignore)", func() {
+    const spreadLabel = "fake-label"
+    const taintKey = "taint-key"
+    nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+      Key: spreadLabel,
+      Operator: v1.NodeSelectorOpIn,
+      Values: []string{"foo"},
+    })
+    taintedNodePool := test.NodePool(v1beta1.NodePool{
+      Spec: v1beta1.NodePoolSpec{
+        Template: v1beta1.NodeClaimTemplate{
+          Spec: v1beta1.NodeClaimSpec{
+            Taints: []v1.Taint{
+              {
+                Key: taintKey,
+                Value: "taint-value",
+                Effect: v1.TaintEffectNoSchedule,
+              },
+            },
+            Requirements: []v1.NodeSelectorRequirement{
+              {
+                Key: v1beta1.CapacityTypeLabelKey,
+                Operator: v1.NodeSelectorOpExists,
+              },
+              {
+                Key: spreadLabel,
+                Operator: v1.NodeSelectorOpIn,
+                Values: []string{"bar"},
+              },
+            },
+          },
+        },
+      },
+    })
+
+    honor := v1.NodeInclusionPolicyIgnore
+    topology := []v1.TopologySpreadConstraint{{
+      TopologyKey: spreadLabel,
+      WhenUnsatisfiable: v1.DoNotSchedule,
+      LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
+      MaxSkew: 1,
+      NodeTaintsPolicy: &honor,
+    }}
+
+    pods := test.UnschedulablePods(test.PodOptions{
+      ObjectMeta: metav1.ObjectMeta{
+        Labels: labels,
+      },
+      TopologySpreadConstraints: topology,
+    }, 2)
+
+    ExpectApplied(ctx, env.Client, nodePool, taintedNodePool)
+    ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
+
+    // should fail to schedule both pods, one pod is scheduled to domain "foo" but the other can't be scheduled to domain "bar"
+    ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1))
+  })
+  It("should balance pods across a label when discovered from the provisioner (NodeTaintsPolicy=honor)", func() {
     if env.Version.Minor() < 26 {
       Skip("NodeTaintsPolicy ony enabled by default for K8s >= 1.26.x")
     }
-    const spreadLabel = "karpenter.sh/fake-label"
-    nodePool.Spec.Template.Labels = map[string]string{
-      spreadLabel: "baz",
-    }

-    nodePoolTainted := test.NodePool(v1beta1.NodePool{
+    const spreadLabel = "fake-label"
+    const taintKey = "taint-key"
+    nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+      Key: spreadLabel,
+      Operator: v1.NodeSelectorOpIn,
+      Values: []string{"foo"},
+    })
+    taintedNodePool := test.NodePool(v1beta1.NodePool{
       Spec: v1beta1.NodePoolSpec{
         Template: v1beta1.NodeClaimTemplate{
           Spec: v1beta1.NodeClaimSpec{
             Taints: []v1.Taint{
               {
-                Key: "taintname",
-                Value: "taintvalue",
+                Key: taintKey,
+                Value: "taint-value",
                 Effect: v1.TaintEffectNoSchedule,
               },
             },
@@ -1303,17 +1364,14 @@ var _ = Describe("Topology", func() {
               {
                 Key: spreadLabel,
                 Operator: v1.NodeSelectorOpIn,
-                Values: []string{"foo", "bar"},
+                Values: []string{"bar"},
               },
             },
           },
         },
       },
     })

-    ExpectApplied(ctx, env.Client, nodePool, nodePoolTainted)
-
-    ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov)
     honor := v1.NodeInclusionPolicyHonor
     topology := []v1.TopologySpreadConstraint{{
       TopologyKey: spreadLabel,
@@ -1323,20 +1381,80 @@ var _ = Describe("Topology", func() {
       NodeTaintsPolicy: &honor,
     }}

-    ExpectApplied(ctx, env.Client, nodePool)
-    ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov,
-      test.UnschedulablePods(test.PodOptions{
-        ObjectMeta: metav1.ObjectMeta{Labels: labels},
-        ResourceRequirements: v1.ResourceRequirements{
-          Requests: v1.ResourceList{
-            v1.ResourceCPU: resource.MustParse("1"),
+    pods := test.UnschedulablePods(test.PodOptions{
+      ObjectMeta: metav1.ObjectMeta{
+        Labels: labels,
+      },
+      TopologySpreadConstraints: topology,
+    }, 2)
+
+    ExpectApplied(ctx, env.Client, nodePool, taintedNodePool)
+    ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
+
+    // should schedule all pods to domain "foo", ignoring bar since pods don't tolerate
+    ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2))
+  })
+  It("should balance pods across a label when mutually exclusive NodePools (by taints) share domains (NodeTaintsPolicy=honor)", func() {
+    const spreadLabel = "fake-label"
+    const taintKey = "taint-key"
+
+    nodePools := lo.Map([][]string{{"foo", "bar"}, {"foo", "baz"}}, func(domains []string, i int) *v1beta1.NodePool {
+      return test.NodePool(v1beta1.NodePool{
+        Spec: v1beta1.NodePoolSpec{
+          Template: v1beta1.NodeClaimTemplate{
+            Spec: v1beta1.NodeClaimSpec{
+              Taints: []v1.Taint{
+                {
+                  Key: taintKey,
+                  Value: fmt.Sprintf("nodepool-%d", i),
+                  Effect: v1.TaintEffectNoSchedule,
+                },
+              },
+              Requirements: []v1.NodeSelectorRequirement{
+                {
+                  Key: v1beta1.CapacityTypeLabelKey,
+                  Operator: v1.NodeSelectorOpExists,
+                },
+                {
+                  Key: spreadLabel,
+                  Operator: v1.NodeSelectorOpIn,
+                  Values: domains,
+                },
+              },
+            },
           },
         },
+      })
+    })
+
+    honor := v1.NodeInclusionPolicyHonor
+    topology := []v1.TopologySpreadConstraint{{
+      TopologyKey: spreadLabel,
+      WhenUnsatisfiable: v1.DoNotSchedule,
+      LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
+      MaxSkew: 1,
+      NodeTaintsPolicy: &honor,
+    }}
+
+    pods := lo.Flatten(lo.Map(nodePools, func(np *v1beta1.NodePool, _ int) []*v1.Pod {
+      return test.UnschedulablePods(test.PodOptions{
+        ObjectMeta: metav1.ObjectMeta{
+          Labels: labels,
+        },
         TopologySpreadConstraints: topology,
-      }, 5)...,
-    )
-    // and should schedule all of the pods on the same node
-    ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(5))
+        Tolerations: []v1.Toleration{{
+          Key: taintKey,
+          Effect: v1.TaintEffectNoSchedule,
+          Value: np.Spec.Template.Spec.Taints[0].Value,
+        }},
+      }, 2)
+    }))
+
+    ExpectApplied(ctx, env.Client, nodePools[0], nodePools[1])
+    ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
+
+    // Expect 3 total nodes provisioned, 2 pods schedule to foo, 1 to bar, and 1 to baz
+    ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 2, 1))
   })
 })

@@ -1345,8 +1463,8 @@ var _ = Describe("Topology", func() {
     if env.Version.Minor() < 26 {
       Skip("NodeAffinityPolicy ony enabled by default for K8s >= 1.26.x")
     }
-    const spreadLabel = "karpenter.sh/fake-label"
-    const affinityLabel = "karpenter.sh/selector"
+    const spreadLabel = "fake-label"
+    const affinityLabel = "selector"
     const affinityMismatch = "mismatch"
     const affinityMatch = "value"

@@ -1416,8 +1534,8 @@ var _ = Describe("Topology", func() {
     if env.Version.Minor() < 26 {
       Skip("NodeAffinityPolicy ony enabled by default for K8s >= 1.26.x")
     }
-    const spreadLabel = "karpenter.sh/fake-label"
-    const affinityLabel = "karpenter.sh/selector"
+    const spreadLabel = "fake-label"
+    const affinityLabel = "selector"
     const affinityMismatch = "mismatch"
     const affinityMatch = "value"
