Commit de534bc

correctly handle domains from NodePools when honoring taints

1 parent aff0218 · commit de534bc

6 files changed: +251 −58 lines changed

pkg/controllers/provisioning/provisioner.go (+14, −17)

@@ -29,7 +29,6 @@ import (
 	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
-	"k8s.io/apimachinery/pkg/util/sets"
 	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
 	"k8s.io/client-go/util/workqueue"
 	"knative.dev/pkg/logging"
@@ -205,7 +204,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
 	// Build node templates
 	var nodeClaimTemplates []*scheduler.NodeClaimTemplate
 	instanceTypes := map[string][]*cloudprovider.InstanceType{}
-	domains := map[string]sets.Set[string]{}
+	domainGroups := map[string]scheduler.TopologyDomainGroup{}
 
 	nodePoolList, err := nodepoolutil.List(ctx, p.kubeClient)
 	if err != nil {
@@ -244,6 +243,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
 			continue
 		}
 		instanceTypes[nodePool.Name] = append(instanceTypes[nodePool.Name], instanceTypeOptions...)
+		nodePoolTaints := nodePool.Spec.Template.Spec.Taints
 
 		// Construct Topology Domains
 		for _, instanceType := range instanceTypeOptions {
@@ -253,15 +253,12 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
 			requirements.Add(scheduling.NewLabelRequirements(nodePool.Spec.Template.Labels).Values()...)
 			requirements.Add(instanceType.Requirements.Values()...)
 
-			for key, requirement := range requirements {
-				// This code used to execute a Union between domains[key] and requirement.Values().
-				// The downside of this is that Union is immutable and takes a copy of the set it is executed upon.
-				// This resulted in a lot of memory pressure on the heap and poor performance
-				// https://github.com/aws/karpenter/issues/3565
-				if domains[key] == nil {
-					domains[key] = sets.New(requirement.Values()...)
-				} else {
-					domains[key].Insert(requirement.Values()...)
+			for topologyKey, requirement := range requirements {
+				if _, ok := domainGroups[topologyKey]; !ok {
+					domainGroups[topologyKey] = scheduler.NewTopologyDomainGroup()
+				}
+				for _, domain := range requirement.Values() {
+					domainGroups[topologyKey].Insert(domain, nodePoolTaints...)
 				}
 			}
 		}
@@ -270,11 +267,11 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
 		requirements.Add(scheduling.NewLabelRequirements(nodePool.Spec.Template.Labels).Values()...)
 		for key, requirement := range requirements {
 			if requirement.Operator() == v1.NodeSelectorOpIn {
-				//The following is a performance optimisation, for the explanation see the comment above
-				if domains[key] == nil {
-					domains[key] = sets.New(requirement.Values()...)
-				} else {
-					domains[key].Insert(requirement.Values()...)
+				if _, ok := domainGroups[key]; !ok {
+					domainGroups[key] = scheduler.NewTopologyDomainGroup()
+				}
+				for _, value := range requirement.Values() {
+					domainGroups[key].Insert(value, nodePoolTaints...)
 				}
 			}
 		}
@@ -284,7 +281,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, stateNod
 	pods = p.injectTopology(ctx, pods)
 
 	// Calculate cluster topology
-	topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, domains, pods)
+	topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, domainGroups, pods)
 	if err != nil {
 		return nil, fmt.Errorf("tracking topology counts, %w", err)
 	}
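The provisioner now tags every topology domain it discovers with the taints of the NodePool that offered it, via the new `scheduler.TopologyDomainGroup` type. That type is defined in one of the commit's other files and is not shown in this excerpt; the following is a minimal sketch of what such a structure could look like, assuming it maps each domain to the taint sets of the NodePools that provide it. All names and method signatures below (including `ForEachToleratedDomain` and `toleratesAll`) are assumptions for illustration, not the commit's actual code.

```go
// Hypothetical sketch only: the real TopologyDomainGroup lives in a file not shown
// in this excerpt, so the shape and method names here are assumptions.
package scheduling

import v1 "k8s.io/api/core/v1"

// TopologyDomainGroup tracks, for one topology key, every domain discovered across
// NodePools, together with the taint sets of the NodePools that can provide it.
type TopologyDomainGroup map[string][][]v1.Taint

func NewTopologyDomainGroup() TopologyDomainGroup {
	return TopologyDomainGroup{}
}

// Insert records that a NodePool with the given taints can create nodes in this domain.
// A NodePool with no taints contributes an empty taint set, making the domain
// reachable by any pod.
func (t TopologyDomainGroup) Insert(domain string, taints ...v1.Taint) {
	t[domain] = append(t[domain], taints)
}

// ForEachToleratedDomain visits every domain backed by at least one NodePool whose
// taints the pod tolerates.
func (t TopologyDomainGroup) ForEachToleratedDomain(pod *v1.Pod, fn func(domain string)) {
	for domain, taintSets := range t {
		for _, taints := range taintSets {
			if toleratesAll(pod, taints) {
				fn(domain)
				break
			}
		}
	}
}

// toleratesAll reports whether the pod tolerates every taint in the slice.
func toleratesAll(pod *v1.Pod, taints []v1.Taint) bool {
	for i := range taints {
		tolerated := false
		for j := range pod.Spec.Tolerations {
			if pod.Spec.Tolerations[j].ToleratesTaint(&taints[i]) {
				tolerated = true
				break
			}
		}
		if !tolerated {
			return false
		}
	}
	return true
}
```

Under that assumption, the `domainGroups[topologyKey].Insert(domain, nodePoolTaints...)` calls in the diff above simply record which taints guard each domain, so the scheduler can later decide whether a given pod can actually reach it.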

pkg/controllers/provisioning/scheduling/topology.go (+10, −6)

@@ -51,18 +51,18 @@ type Topology struct {
 	// in some cases.
 	inverseTopologies map[uint64]*TopologyGroup
 	// The universe of domains by topology key
-	domains map[string]sets.Set[string]
+	domainGroups map[string]TopologyDomainGroup
 	// excludedPods are the pod UIDs of pods that are excluded from counting. This is used so we can simulate
 	// moving pods to prevent them from being double counted.
 	excludedPods sets.Set[string]
 	cluster      *state.Cluster
 }
 
-func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, domains map[string]sets.Set[string], pods []*v1.Pod) (*Topology, error) {
+func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, domainGroups map[string]TopologyDomainGroup, pods []*v1.Pod) (*Topology, error) {
 	t := &Topology{
 		kubeClient:        kubeClient,
 		cluster:           cluster,
-		domains:           domains,
+		domainGroups:      domainGroups,
 		topologies:        map[uint64]*TopologyGroup{},
 		inverseTopologies: map[uint64]*TopologyGroup{},
 		excludedPods:      sets.New[string](),
@@ -215,7 +215,7 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *v1.Pod, d
 		return err
 	}
 
-	tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domains[term.TopologyKey])
+	tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey])
 
 	hash := tg.Hash()
 	if existing, ok := t.inverseTopologies[hash]; !ok {
@@ -251,6 +251,10 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
 	// capture new domain values from existing nodes that may not have any pods selected by the topology group
 	// scheduled to them already
 	t.cluster.ForEachNode(func(n *state.StateNode) bool {
+		// ignore state nodes which are tracking in-flight NodeClaims
+		if n.Node == nil {
+			return true
+		}
 		// ignore the node if it doesn't match the topology group
 		if !tg.nodeFilter.Matches(n.Node) {
 			return true
@@ -312,7 +316,7 @@ func (t *Topology) newForTopologies(p *v1.Pod) []*TopologyGroup {
 	var topologyGroups []*TopologyGroup
 	for _, cs := range p.Spec.TopologySpreadConstraints {
 		topologyGroups = append(topologyGroups, NewTopologyGroup(TopologyTypeSpread, cs.TopologyKey, p, sets.New(p.Namespace),
-			cs.LabelSelector, cs.MaxSkew, cs.MinDomains, cs.NodeTaintsPolicy, cs.NodeAffinityPolicy, t.domains[cs.TopologyKey]))
+			cs.LabelSelector, cs.MaxSkew, cs.MinDomains, cs.NodeTaintsPolicy, cs.NodeAffinityPolicy, t.domainGroups[cs.TopologyKey]))
 	}
 	return topologyGroups
 }
@@ -349,7 +353,7 @@ func (t *Topology) newForAffinities(ctx context.Context, p *v1.Pod) ([]*Topology
 			if err != nil {
 				return nil, err
 			}
-			topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domains[term.TopologyKey]))
+			topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey]))
 		}
 	}
 	return topologyGroups, nil
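`NewTopologyGroup` itself is changed in a file outside this excerpt; the call sites above only show that it now takes a `TopologyDomainGroup` where it previously took a `sets.Set[string]`. A hedged sketch of how a topology group might resolve its per-pod domain universe from that input, reusing the hypothetical helpers from the previous sketch and switching on `NodeTaintsPolicy`; this is a guess at the mechanism, not the commit's implementation:

```go
// Hypothetical sketch, continuing the assumed package above. Only the call sites of
// NewTopologyGroup appear in this diff; the logic below is an assumption.
package scheduling

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
)

// domainsForPod resolves which domains a topology group should count for a pod,
// given the taint-aware domain group and the constraint's NodeTaintsPolicy.
func domainsForPod(pod *v1.Pod, group TopologyDomainGroup, taintsPolicy *v1.NodeInclusionPolicy) sets.Set[string] {
	domains := sets.New[string]()
	if taintsPolicy != nil && *taintsPolicy == v1.NodeInclusionPolicyHonor {
		// Honor: only domains backed by at least one NodePool whose taints the pod
		// tolerates count toward the spread.
		group.ForEachToleratedDomain(pod, func(domain string) {
			domains.Insert(domain)
		})
		return domains
	}
	// Ignore, or nil as passed for the affinity/anti-affinity groups above:
	// every discovered domain counts, regardless of taints.
	for domain := range group {
		domains.Insert(domain)
	}
	return domains
}
```

This is the behavior the new tests pin down: with `NodeTaintsPolicy: Honor`, a domain only counts toward skew if the pod tolerates some NodePool that can provide it; with `Ignore`, every discovered domain counts.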

pkg/controllers/provisioning/scheduling/topology_test.go (+147, −29)

@@ -17,6 +17,7 @@ limitations under the License.
 package scheduling_test
 
 import (
+	"fmt"
	"time"
 
	. "github.com/onsi/gomega"
@@ -1120,7 +1121,7 @@ var _ = Describe("Topology", func() {
				Skip("NodeTaintsPolicy ony enabled by default for K8s >= 1.26.x")
			}
 
-			const spreadLabel = "karpenter.sh/fake-label"
+			const spreadLabel = "fake-label"
			nodePool.Spec.Template.Labels = map[string]string{
				spreadLabel: "baz",
			}
@@ -1194,7 +1195,7 @@ var _ = Describe("Topology", func() {
			if env.Version.Minor() < 26 {
				Skip("NodeTaintsPolicy ony enabled by default for K8s >= 1.26.x")
			}
-			const spreadLabel = "karpenter.sh/fake-label"
+			const spreadLabel = "fake-label"
			nodePool.Spec.Template.Labels = map[string]string{
				spreadLabel: "baz",
			}
@@ -1263,23 +1264,83 @@ var _ = Describe("Topology", func() {
			// and should schedule all of the pods on the same node
			ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(5))
		})
-		FIt("should balance pods across a label when discovered from the provisioner (NodeTaintsPolicy=honor)", func() {
+		It("should balance pods across a label when discovered from the provisioner (NodeTaintsPolicy=ignore)", func() {
+			const spreadLabel = "fake-label"
+			const taintKey = "taint-key"
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+				Key:      spreadLabel,
+				Operator: v1.NodeSelectorOpIn,
+				Values:   []string{"foo"},
+			})
+			taintedNodePool := test.NodePool(v1beta1.NodePool{
+				Spec: v1beta1.NodePoolSpec{
+					Template: v1beta1.NodeClaimTemplate{
+						Spec: v1beta1.NodeClaimSpec{
+							Taints: []v1.Taint{
+								{
+									Key:    taintKey,
+									Value:  "taint-value",
+									Effect: v1.TaintEffectNoSchedule,
+								},
+							},
+							Requirements: []v1.NodeSelectorRequirement{
+								{
+									Key:      v1beta1.CapacityTypeLabelKey,
+									Operator: v1.NodeSelectorOpExists,
+								},
+								{
+									Key:      spreadLabel,
+									Operator: v1.NodeSelectorOpIn,
+									Values:   []string{"bar"},
+								},
+							},
+						},
+					},
+				},
+			})
+
+			honor := v1.NodeInclusionPolicyIgnore
+			topology := []v1.TopologySpreadConstraint{{
+				TopologyKey:       spreadLabel,
+				WhenUnsatisfiable: v1.DoNotSchedule,
+				LabelSelector:     &metav1.LabelSelector{MatchLabels: labels},
+				MaxSkew:           1,
+				NodeTaintsPolicy:  &honor,
+			}}
+
+			pods := test.UnschedulablePods(test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: labels,
+				},
+				TopologySpreadConstraints: topology,
+			}, 2)
+
+			ExpectApplied(ctx, env.Client, nodePool, taintedNodePool)
+			ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
+
+			// should fail to schedule both pods, one pod is scheduled to domain "foo" but the other can't be scheduled to domain "bar"
+			ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1))
+		})
+		It("should balance pods across a label when discovered from the provisioner (NodeTaintsPolicy=honor)", func() {
			if env.Version.Minor() < 26 {
				Skip("NodeTaintsPolicy ony enabled by default for K8s >= 1.26.x")
			}
-			const spreadLabel = "karpenter.sh/fake-label"
-			nodePool.Spec.Template.Labels = map[string]string{
-				spreadLabel: "baz",
-			}
 
-			nodePoolTainted := test.NodePool(v1beta1.NodePool{
+			const spreadLabel = "fake-label"
+			const taintKey = "taint-key"
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+				Key:      spreadLabel,
+				Operator: v1.NodeSelectorOpIn,
+				Values:   []string{"foo"},
+			})
+			taintedNodePool := test.NodePool(v1beta1.NodePool{
				Spec: v1beta1.NodePoolSpec{
					Template: v1beta1.NodeClaimTemplate{
						Spec: v1beta1.NodeClaimSpec{
							Taints: []v1.Taint{
								{
-									Key:    "taintname",
-									Value:  "taintvalue",
+									Key:    taintKey,
+									Value:  "taint-value",
									Effect: v1.TaintEffectNoSchedule,
								},
							},
@@ -1291,17 +1352,14 @@ var _ = Describe("Topology", func() {
								{
									Key:      spreadLabel,
									Operator: v1.NodeSelectorOpIn,
-									Values:   []string{"foo", "bar"},
+									Values:   []string{"bar"},
								},
							},
						},
					},
				},
			})
 
-			ExpectApplied(ctx, env.Client, nodePool, nodePoolTainted)
-
-			ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov)
			honor := v1.NodeInclusionPolicyHonor
			topology := []v1.TopologySpreadConstraint{{
				TopologyKey:       spreadLabel,
@@ -1311,20 +1369,80 @@ var _ = Describe("Topology", func() {
				NodeTaintsPolicy:  &honor,
			}}
 
-			ExpectApplied(ctx, env.Client, nodePool)
-			ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov,
-				test.UnschedulablePods(test.PodOptions{
-					ObjectMeta: metav1.ObjectMeta{Labels: labels},
-					ResourceRequirements: v1.ResourceRequirements{
-						Requests: v1.ResourceList{
-							v1.ResourceCPU: resource.MustParse("1"),
+			pods := test.UnschedulablePods(test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: labels,
+				},
+				TopologySpreadConstraints: topology,
+			}, 2)
+
+			ExpectApplied(ctx, env.Client, nodePool, taintedNodePool)
+			ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
+
+			// should schedule all pods to domain "foo", ignoring bar since pods don't tolerate
+			ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2))
+		})
+		It("should balance pods across a label when mutually exclusive NodePools (by taints) share domains (NodeTaintsPolicy=honor)", func() {
+			const spreadLabel = "fake-label"
+			const taintKey = "taint-key"
+
+			nodePools := lo.Map([][]string{{"foo", "bar"}, {"foo", "baz"}}, func(domains []string, i int) *v1beta1.NodePool {
+				return test.NodePool(v1beta1.NodePool{
+					Spec: v1beta1.NodePoolSpec{
+						Template: v1beta1.NodeClaimTemplate{
+							Spec: v1beta1.NodeClaimSpec{
+								Taints: []v1.Taint{
+									{
+										Key:    taintKey,
+										Value:  fmt.Sprintf("nodepool-%d", i),
+										Effect: v1.TaintEffectNoSchedule,
+									},
+								},
+								Requirements: []v1.NodeSelectorRequirement{
+									{
+										Key:      v1beta1.CapacityTypeLabelKey,
+										Operator: v1.NodeSelectorOpExists,
+									},
+									{
+										Key:      spreadLabel,
+										Operator: v1.NodeSelectorOpIn,
+										Values:   domains,
+									},
+								},
+							},
						},
					},
+				})
+			})
+
+			honor := v1.NodeInclusionPolicyHonor
+			topology := []v1.TopologySpreadConstraint{{
+				TopologyKey:       spreadLabel,
+				WhenUnsatisfiable: v1.DoNotSchedule,
+				LabelSelector:     &metav1.LabelSelector{MatchLabels: labels},
+				MaxSkew:           1,
+				NodeTaintsPolicy:  &honor,
+			}}
+
+			pods := lo.Flatten(lo.Map(nodePools, func(np *v1beta1.NodePool, _ int) []*v1.Pod {
+				return test.UnschedulablePods(test.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{
+						Labels: labels,
+					},
					TopologySpreadConstraints: topology,
-				}, 5)...,
-			)
-			// and should schedule all of the pods on the same node
-			ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(5))
+					Tolerations: []v1.Toleration{{
+						Key:    taintKey,
+						Effect: v1.TaintEffectNoSchedule,
+						Value:  np.Spec.Template.Spec.Taints[0].Value,
+					}},
+				}, 2)
+			}))
+
+			ExpectApplied(ctx, env.Client, nodePools[0], nodePools[1])
+			ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
+
+			// Expect 3 total nodes provisioned, 2 pods schedule to foo, 1 to bar, and 1 to baz
+			ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 2, 1))
		})
	})
 
@@ -1333,8 +1451,8 @@ var _ = Describe("Topology", func() {
			if env.Version.Minor() < 26 {
				Skip("NodeAffinityPolicy ony enabled by default for K8s >= 1.26.x")
			}
-			const spreadLabel = "karpenter.sh/fake-label"
-			const affinityLabel = "karpenter.sh/selector"
+			const spreadLabel = "fake-label"
+			const affinityLabel = "selector"
			const affinityMismatch = "mismatch"
			const affinityMatch = "value"
 
@@ -1404,8 +1522,8 @@ var _ = Describe("Topology", func() {
			if env.Version.Minor() < 26 {
				Skip("NodeAffinityPolicy ony enabled by default for K8s >= 1.26.x")
			}
-			const spreadLabel = "karpenter.sh/fake-label"
-			const affinityLabel = "karpenter.sh/selector"
+			const spreadLabel = "fake-label"
+			const affinityLabel = "selector"
			const affinityMismatch = "mismatch"
			const affinityMatch = "value"
 
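The new test cases turn on whether a pod's tolerations match a NodePool's taints (for example, each pod in the last test tolerates only its own NodePool's taint value). That matching rule comes straight from the core API's `Toleration.ToleratesTaint`; a small standalone illustration, separate from this commit:

```go
// Standalone illustration of the upstream taint/toleration matching rule.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	taint := v1.Taint{Key: "taint-key", Value: "nodepool-0", Effect: v1.TaintEffectNoSchedule}

	// Empty operator defaults to Equal: key, value, and effect all match.
	matching := v1.Toleration{Key: "taint-key", Value: "nodepool-0", Effect: v1.TaintEffectNoSchedule}
	// Value targets the other NodePool, so the taint is not tolerated.
	mismatched := v1.Toleration{Key: "taint-key", Value: "nodepool-1", Effect: v1.TaintEffectNoSchedule}
	// Exists on the key tolerates any value.
	anyValue := v1.Toleration{Key: "taint-key", Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoSchedule}

	fmt.Println(matching.ToleratesTaint(&taint))   // true
	fmt.Println(mismatched.ToleratesTaint(&taint)) // false
	fmt.Println(anyValue.ToleratesTaint(&taint))   // true
}
```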
