From b8ff343e2cb196c19cf9dbd60eeeaefdb8bdc893 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 15 Oct 2025 20:20:12 +0200 Subject: [PATCH 01/24] scheduler: perform feasibility checks for system canaries before computing placements Canaries for system jobs are placed on a tg.update.canary percent of eligible nodes. Some of these nodes may not be feasible, and until now we removed infeasible nodes during placement computation. However, if it happens to be that the first eligible node we picked to place a canary on is infeasible, this will lead to the scheduler halting deployment. The solution presented here simplifies canary deployments: initially, system jobs that use canary updates get allocations placed on all eligible nodes, but before we start computing actual placements, a method called `evictCanaries` is called (much like `evictAndPlace` is for honoring MaxParallel), and performs a feasibility check on each node up to the amount of required canaries per task group. Feasibility checks are expensive, but this way we only check all the nodes in the worst case scenario (with canary=100), otherwise we stop checks once we know we are ready to place enough canaries. --- scheduler/reconciler/reconcile_node.go | 223 +-------- .../reconciler/reconcile_node_prop_test.go | 4 +- scheduler/reconciler/reconcile_node_test.go | 459 +----------------- scheduler/scheduler_system.go | 367 ++++++++++++-- scheduler/scheduler_system_test.go | 4 +- 5 files changed, 370 insertions(+), 687 deletions(-) diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go index 2c516474bc9..9d76ef6c631 100644 --- a/scheduler/reconciler/reconcile_node.go +++ b/scheduler/reconciler/reconcile_node.go @@ -5,11 +5,10 @@ package reconciler import ( "fmt" - "maps" - "math" "slices" "time" + "github.com/hashicorp/go-set/v3" "github.com/hashicorp/nomad/nomad/structs" sstructs "github.com/hashicorp/nomad/scheduler/structs" ) @@ -39,6 +38,7 @@ func (nr *NodeReconciler) Compute( readyNodes []*structs.Node, // list of nodes in the ready state notReadyNodes map[string]struct{}, // list of nodes in DC but not ready, e.g. draining taintedNodes map[string]*structs.Node, // nodes which are down or drain mode (by node id) + feasibleNodes map[string]*set.Set[string], // nodes that are eligible and feasible, per TG live []*structs.Allocation, // non-terminal allocations terminal structs.TerminalByNodeByName, // latest terminal allocations (by node id) serverSupportsDisconnectedClients bool, // flag indicating whether to apply disconnected client logic @@ -61,25 +61,14 @@ func (nr *NodeReconciler) Compute( // Create the required task groups. required := materializeSystemTaskGroups(job) - // Canary deployments deploy to the TaskGroup.UpdateStrategy.Canary - // percentage of eligible nodes, so we create a mapping of task group name - // to a list of nodes that canaries should be placed on. 
- canaryNodes, canariesPerTG := nr.computeCanaryNodes(required, nodeAllocs, terminal, eligibleNodes) - compatHadExistingDeployment := nr.DeploymentCurrent != nil result := new(NodeReconcileResult) - var deploymentComplete bool for nodeID, allocs := range nodeAllocs { - diff, deploymentCompleteForNode := nr.computeForNode(job, nodeID, eligibleNodes, - notReadyNodes, taintedNodes, canaryNodes[nodeID], canariesPerTG, required, - allocs, terminal, serverSupportsDisconnectedClients) + diff := nr.computeForNode(job, nodeID, eligibleNodes, + notReadyNodes, taintedNodes, feasibleNodes, required, allocs, terminal, + serverSupportsDisconnectedClients) result.Append(diff) - - deploymentComplete = deploymentCompleteForNode - if deploymentComplete { - break - } } // COMPAT(1.14.0) prevent a new deployment from being created in the case @@ -90,93 +79,9 @@ func (nr *NodeReconciler) Compute( nr.DeploymentCurrent = nil } - nr.DeploymentUpdates = append(nr.DeploymentUpdates, nr.setDeploymentStatusAndUpdates(deploymentComplete, job)...) - return result } -// computeCanaryNodes is a helper function that, given required task groups, -// mappings of nodes to their live allocs and terminal allocs, and a map of -// eligible nodes, outputs a map[nodeID] -> map[TG] -> bool which indicates -// which TGs this node is a canary for, and a map[TG] -> int to indicate how -// many total canaries are to be placed for a TG. -func (nr *NodeReconciler) computeCanaryNodes(required map[string]*structs.TaskGroup, - liveAllocs map[string][]*structs.Allocation, terminalAllocs structs.TerminalByNodeByName, - eligibleNodes map[string]*structs.Node) (map[string]map[string]bool, map[string]int) { - - canaryNodes := map[string]map[string]bool{} - eligibleNodesList := slices.Collect(maps.Values(eligibleNodes)) - canariesPerTG := map[string]int{} - - for _, tg := range required { - if tg.Update.IsEmpty() || tg.Update.Canary == 0 { - continue - } - - // round up to the nearest integer - numberOfCanaryNodes := int(math.Ceil(float64(tg.Update.Canary) * float64(len(eligibleNodes)) / 100)) - canariesPerTG[tg.Name] = numberOfCanaryNodes - - // check if there are any live allocations on any nodes that are/were - // canaries. 
- for nodeID, allocs := range liveAllocs { - for _, a := range allocs { - eligibleNodesList, numberOfCanaryNodes = nr.findOldCanaryNodes( - eligibleNodesList, numberOfCanaryNodes, a, tg, canaryNodes, nodeID) - } - } - - // check if there are any terminal allocations that were canaries - for nodeID, terminalAlloc := range terminalAllocs { - for _, a := range terminalAlloc { - eligibleNodesList, numberOfCanaryNodes = nr.findOldCanaryNodes( - eligibleNodesList, numberOfCanaryNodes, a, tg, canaryNodes, nodeID) - } - } - - for i, n := range eligibleNodesList { - if i > numberOfCanaryNodes-1 { - break - } - - if _, ok := canaryNodes[n.ID]; !ok { - canaryNodes[n.ID] = map[string]bool{} - } - - canaryNodes[n.ID][tg.Name] = true - } - } - - return canaryNodes, canariesPerTG -} - -func (nr *NodeReconciler) findOldCanaryNodes(nodesList []*structs.Node, numberOfCanaryNodes int, - a *structs.Allocation, tg *structs.TaskGroup, canaryNodes map[string]map[string]bool, nodeID string) ([]*structs.Node, int) { - - if a.DeploymentStatus == nil || a.DeploymentStatus.Canary == false || - nr.DeploymentCurrent == nil { - return nodesList, numberOfCanaryNodes - } - - nodes := nodesList - numberOfCanaries := numberOfCanaryNodes - if a.TaskGroup == tg.Name { - if _, ok := canaryNodes[nodeID]; !ok { - canaryNodes[nodeID] = map[string]bool{} - } - canaryNodes[nodeID][tg.Name] = true - - // this node should no longer be considered when searching - // for canary nodes - numberOfCanaries -= 1 - nodes = slices.DeleteFunc( - nodes, - func(n *structs.Node) bool { return n.ID == nodeID }, - ) - } - return nodes, numberOfCanaries -} - // computeForNode is used to do a set difference between the target // allocations and the existing allocations for a particular node. This returns // 8 sets of results: @@ -191,21 +96,19 @@ func (nr *NodeReconciler) findOldCanaryNodes(nodesList []*structs.Node, numberOf // 8. those that may still be running on a node that has resumed reconnected. // // This method mutates the NodeReconciler fields, and returns a new -// NodeReconcilerResult object and a boolean to indicate wither the deployment -// is complete or not. +// NodeReconcilerResult object. func (nr *NodeReconciler) computeForNode( job *structs.Job, // job whose allocs are going to be diff-ed nodeID string, eligibleNodes map[string]*structs.Node, notReadyNodes map[string]struct{}, // nodes that are not ready, e.g. 
draining taintedNodes map[string]*structs.Node, // nodes which are down (by node id) - canaryNode map[string]bool, // indicates whether this node is a canary node for tg - canariesPerTG map[string]int, // indicates how many canary placements we expect per tg + feasibleNodes map[string]*set.Set[string], // nodes that are eligible and feasible, per TG required map[string]*structs.TaskGroup, // set of allocations that must exist liveAllocs []*structs.Allocation, // non-terminal allocations that exist terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, id) serverSupportsDisconnectedClients bool, // flag indicating whether to apply disconnected client logic -) (*NodeReconcileResult, bool) { +) *NodeReconcileResult { result := new(NodeReconcileResult) // cancel deployments that aren't needed anymore @@ -225,9 +128,6 @@ func (nr *NodeReconciler) computeForNode( deploymentFailed = nr.DeploymentCurrent.Status == structs.DeploymentStatusFailed } - // Track desired total and desired canaries across all loops - desiredCanaries := map[string]int{} - // Track whether we're during a canary update isCanarying := map[string]bool{} @@ -255,7 +155,7 @@ func (nr *NodeReconciler) computeForNode( // deployment var dstate = new(structs.DeploymentState) if nr.DeploymentCurrent != nil { - dstate, _ = nr.DeploymentCurrent.TaskGroups[tg.Name] + dstate = nr.DeploymentCurrent.TaskGroups[tg.Name] } supportsDisconnectedClients := alloc.SupportsDisconnectedClients(serverSupportsDisconnectedClients) @@ -388,17 +288,14 @@ func (nr *NodeReconciler) computeForNode( // If the definition is updated we need to update if job.JobModifyIndex != alloc.Job.JobModifyIndex { - if canariesPerTG[tg.Name] > 0 && dstate != nil && !dstate.Promoted { + if !tg.Update.IsEmpty() && tg.Update.Canary > 0 && dstate != nil && !dstate.Promoted { isCanarying[tg.Name] = true - if canaryNode[tg.Name] { - result.Update = append(result.Update, AllocTuple{ - Name: name, - TaskGroup: tg, - Alloc: alloc, - Canary: true, - }) - desiredCanaries[tg.Name] += 1 - } + result.Update = append(result.Update, AllocTuple{ + Name: name, + TaskGroup: tg, + Alloc: alloc, + Canary: true, + }) } else { result.Update = append(result.Update, AllocTuple{ Name: name, @@ -419,10 +316,6 @@ func (nr *NodeReconciler) computeForNode( }) } - // as we iterate over require groups, we'll keep track of whether the - // deployment is complete or not - deploymentComplete := false - // Scan the required groups for name, tg := range required { @@ -440,11 +333,17 @@ func (nr *NodeReconciler) computeForNode( dstate.AutoPromote = tg.Update.AutoPromote dstate.ProgressDeadline = tg.Update.ProgressDeadline } - dstate.DesiredTotal = len(eligibleNodes) } - if isCanarying[tg.Name] && !dstate.Promoted { - dstate.DesiredCanaries = canariesPerTG[tg.Name] + if feasibleCount, ok := feasibleNodes[tg.Name]; ok { + dstate.DesiredTotal = feasibleCount.Size() + + // if we're canarying, we initially set the value of desired canaries to all + // feasible nodes, and at a later stage we evict those placements that aren't + // needed + if isCanarying[tg.Name] { + dstate.DesiredCanaries = feasibleNodes[tg.Name].Size() + } } // Check for an existing allocation @@ -506,12 +405,10 @@ func (nr *NodeReconciler) computeForNode( // check if deployment is place ready or complete deploymentPlaceReady := !deploymentPaused && !deploymentFailed - deploymentComplete = nr.isDeploymentComplete(tg.Name, result, isCanarying[tg.Name]) // check if perhaps there's nothing else to do for this TG if 
existingDeployment || tg.Update.IsEmpty() || - (dstate.DesiredTotal == 0 && dstate.DesiredCanaries == 0) || !deploymentPlaceReady { continue } @@ -527,7 +424,7 @@ func (nr *NodeReconciler) computeForNode( } } - return result, deploymentComplete + return result } func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGroup, @@ -586,74 +483,6 @@ func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGro nr.DeploymentCurrent.TaskGroups[tg.Name] = dstate } -func (nr *NodeReconciler) isDeploymentComplete(groupName string, buckets *NodeReconcileResult, isCanarying bool) bool { - complete := len(buckets.Place)+len(buckets.Migrate)+len(buckets.Update) == 0 - - if !complete || nr.DeploymentCurrent == nil || isCanarying { - return false - } - - // ensure everything is healthy - if dstate, ok := nr.DeploymentCurrent.TaskGroups[groupName]; ok { - if dstate.HealthyAllocs < dstate.DesiredTotal { // Make sure we have enough healthy allocs - complete = false - } - } - - return complete -} - -func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, job *structs.Job) []*structs.DeploymentStatusUpdate { - statusUpdates := []*structs.DeploymentStatusUpdate{} - - if d := nr.DeploymentCurrent; d != nil { - - // Deployments that require promotion should have appropriate status set - // immediately, no matter their completness. - if d.RequiresPromotion() { - if d.HasAutoPromote() { - d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion - } else { - d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion - } - return statusUpdates - } - - // Mark the deployment as complete if possible - if deploymentComplete { - if job.IsMultiregion() { - // the unblocking/successful states come after blocked, so we - // need to make sure we don't revert those states - if d.Status != structs.DeploymentStatusUnblocking && - d.Status != structs.DeploymentStatusSuccessful { - statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: nr.DeploymentCurrent.ID, - Status: structs.DeploymentStatusBlocked, - StatusDescription: structs.DeploymentStatusDescriptionBlocked, - }) - } - } else { - statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: nr.DeploymentCurrent.ID, - Status: structs.DeploymentStatusSuccessful, - StatusDescription: structs.DeploymentStatusDescriptionSuccessful, - }) - } - } - - // Mark the deployment as pending since its state is now computed. - if d.Status == structs.DeploymentStatusInitializing { - statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: nr.DeploymentCurrent.ID, - Status: structs.DeploymentStatusPending, - StatusDescription: structs.DeploymentStatusDescriptionPendingForPeer, - }) - } - } - - return statusUpdates -} - // materializeSystemTaskGroups is used to materialize all the task groups // a system or sysbatch job requires. 
func materializeSystemTaskGroups(job *structs.Job) map[string]*structs.TaskGroup { diff --git a/scheduler/reconciler/reconcile_node_prop_test.go b/scheduler/reconciler/reconcile_node_prop_test.go index ca94e17db00..48e14ec2fb6 100644 --- a/scheduler/reconciler/reconcile_node_prop_test.go +++ b/scheduler/reconciler/reconcile_node_prop_test.go @@ -101,7 +101,7 @@ func TestNodeReconciler_PropTest(t *testing.T) { nr := genNodeReconciler(structs.JobTypeSystem, &idGenerator{}).Draw(t, "input") n := NewNodeReconciler(nr.deployment) results := n.Compute(nr.job, nr.readyNodes, - nr.notReadyNodes, nr.taintedNodes, nr.allocs, nr.terminal, + nr.notReadyNodes, nr.taintedNodes, nil, nr.allocs, nr.terminal, nr.serverSupportsDisconnectedClients) must.NotNil(t, results, must.Sprint("results should never be nil")) perTaskGroup := collectExpectedAndResults(nr, results) @@ -113,7 +113,7 @@ func TestNodeReconciler_PropTest(t *testing.T) { nr := genNodeReconciler(structs.JobTypeSysBatch, &idGenerator{}).Draw(t, "input") n := NewNodeReconciler(nr.deployment) results := n.Compute(nr.job, nr.readyNodes, - nr.notReadyNodes, nr.taintedNodes, nr.allocs, nr.terminal, + nr.notReadyNodes, nr.taintedNodes, nil, nr.allocs, nr.terminal, nr.serverSupportsDisconnectedClients) must.NotNil(t, results, must.Sprint("results should never be nil")) perTaskGroup := collectExpectedAndResults(nr, results) diff --git a/scheduler/reconciler/reconcile_node_test.go b/scheduler/reconciler/reconcile_node_test.go index 724b05e9776..926596d2c44 100644 --- a/scheduler/reconciler/reconcile_node_test.go +++ b/scheduler/reconciler/reconcile_node_test.go @@ -73,7 +73,7 @@ func TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) { } nr := NewNodeReconciler(nil) - diff, _ := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, nil, required, live, terminal, true) + diff := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, required, live, terminal, true) assertDiffCount(t, diffResultCount{ignore: 1, place: 1}, diff) if len(diff.Ignore) > 0 { @@ -96,7 +96,7 @@ func TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) { } nr := NewNodeReconciler(nil) - diff, _ := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, nil, required, live, terminal, true) + diff := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, required, live, terminal, true) assertDiffCount(t, diffResultCount{update: 1, place: 1}, diff) }) @@ -158,9 +158,9 @@ func TestDiffSystemAllocsForNode_Placements(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { nr := NewNodeReconciler(nil) - diff, _ := nr.computeForNode( + diff := nr.computeForNode( job, tc.nodeID, eligible, nil, - tainted, nil, nil, required, allocsForNode, terminal, true) + tainted, nil, required, allocsForNode, terminal, true) assertDiffCount(t, tc.expected, diff) }) @@ -217,8 +217,8 @@ func TestDiffSystemAllocsForNode_Stops(t *testing.T) { terminal := structs.TerminalByNodeByName{} nr := NewNodeReconciler(nil) - diff, _ := nr.computeForNode( - job, node.ID, eligible, nil, tainted, nil, nil, required, allocs, terminal, true) + diff := nr.computeForNode( + job, node.ID, eligible, nil, tainted, nil, required, allocs, terminal, true) assertDiffCount(t, diffResultCount{ignore: 1, stop: 1, update: 1}, diff) if len(diff.Update) > 0 { @@ -287,8 +287,8 @@ func TestDiffSystemAllocsForNode_IneligibleNode(t *testing.T) { } nr := NewNodeReconciler(nil) - diff, _ := nr.computeForNode( - job, tc.nodeID, eligible, ineligible, tainted, nil, nil, + diff := 
nr.computeForNode( + job, tc.nodeID, eligible, ineligible, tainted, nil, required, []*structs.Allocation{alloc}, terminal, true, ) assertDiffCount(t, tc.expect, diff) @@ -344,9 +344,9 @@ func TestDiffSystemAllocsForNode_DrainingNode(t *testing.T) { } nr := NewNodeReconciler(nil) - diff, _ := nr.computeForNode( + diff := nr.computeForNode( job, drainNode.ID, map[string]*structs.Node{}, nil, - tainted, nil, nil, required, allocs, terminal, true) + tainted, nil, required, allocs, terminal, true) assertDiffCount(t, diffResultCount{migrate: 1, ignore: 1}, diff) if len(diff.Migrate) > 0 { @@ -396,9 +396,9 @@ func TestDiffSystemAllocsForNode_LostNode(t *testing.T) { } nr := NewNodeReconciler(nil) - diff, _ := nr.computeForNode( + diff := nr.computeForNode( job, deadNode.ID, map[string]*structs.Node{}, nil, - tainted, nil, nil, required, allocs, terminal, true) + tainted, nil, required, allocs, terminal, true) assertDiffCount(t, diffResultCount{lost: 2}, diff) if len(diff.Migrate) > 0 { @@ -522,8 +522,8 @@ func TestDiffSystemAllocsForNode_DisconnectedNode(t *testing.T) { } nr := NewNodeReconciler(nil) - got, _ := nr.computeForNode( - job, tc.node.ID, eligibleNodes, nil, taintedNodes, nil, nil, + got := nr.computeForNode( + job, tc.node.ID, eligibleNodes, nil, taintedNodes, nil, required, []*structs.Allocation{alloc}, terminal, true, ) assertDiffCount(t, tc.expect, got) @@ -611,7 +611,7 @@ func TestDiffSystemAllocs(t *testing.T) { } nr := NewNodeReconciler(nil) - diff := nr.Compute(job, nodes, nil, tainted, allocs, terminal, true) + diff := nr.Compute(job, nodes, nil, tainted, nil, allocs, terminal, true) assertDiffCount(t, diffResultCount{ update: 1, ignore: 1, migrate: 1, lost: 1, place: 6}, diff) @@ -766,7 +766,7 @@ func TestNodeDeployments(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { nr := NewNodeReconciler(tc.existingDeployment) - nr.Compute(tc.job, nodes, nil, nil, tc.liveAllocs, tc.terminalAllocs, true) + nr.Compute(tc.job, nodes, nil, nil, nil, tc.liveAllocs, tc.terminalAllocs, true) if tc.newDeployment { must.NotNil(t, nr.DeploymentCurrent, must.Sprintf("expected a non-nil deployment")) must.Eq(t, nr.DeploymentCurrent.Status, tc.expectedNewDeploymentStatus) @@ -781,430 +781,3 @@ func TestNodeDeployments(t *testing.T) { }) } } - -func Test_computeCanaryNodes(t *testing.T) { - ci.Parallel(t) - - // generate an odd number of nodes - fiveEligibleNodes := map[string]*structs.Node{} - // name them so we can refer to their names while testing pre-existing - // canary allocs - fiveEligibleNodeNames := []string{"node1", "node2", "node3", "node4", "node5"} - for _, name := range fiveEligibleNodeNames { - node := mock.Node() - node.ID = name - fiveEligibleNodes[name] = node - } - - // generate an even number of nodes - fourEligibleNodes := map[string]*structs.Node{} - for range 4 { - nodeID := uuid.Generate() - node := mock.Node() - node.ID = nodeID - fourEligibleNodes[nodeID] = node - } - - testCases := []struct { - name string - nodes map[string]*structs.Node - liveAllocs map[string][]*structs.Allocation - terminalAllocs structs.TerminalByNodeByName - required map[string]*structs.TaskGroup - existingDeployment *structs.Deployment - expectedCanaryNodes map[string]int // number of nodes per tg - expectedCanaryNodeID map[string]string // sometimes we want to make sure a particular node ID is a canary - }{ - { - name: "no required task groups", - nodes: fourEligibleNodes, - liveAllocs: nil, - terminalAllocs: nil, - required: nil, - existingDeployment: nil, - 
expectedCanaryNodes: map[string]int{}, - expectedCanaryNodeID: nil, - }, - { - name: "one task group with no update strategy", - nodes: fourEligibleNodes, - liveAllocs: nil, - terminalAllocs: nil, - required: map[string]*structs.TaskGroup{ - "foo": { - Name: "foo", - }}, - existingDeployment: nil, - expectedCanaryNodes: map[string]int{}, - expectedCanaryNodeID: nil, - }, - { - name: "one task group with 33% canary deployment", - nodes: fourEligibleNodes, - liveAllocs: nil, - terminalAllocs: nil, - required: map[string]*structs.TaskGroup{ - "foo": { - Name: "foo", - Update: &structs.UpdateStrategy{ - Canary: 33, - MaxParallel: 1, // otherwise the update strategy will be considered nil - }, - }, - }, - existingDeployment: nil, - expectedCanaryNodes: map[string]int{ - "foo": 2, // we always round up - }, - expectedCanaryNodeID: nil, - }, - { - name: "one task group with 100% canary deployment, four nodes", - nodes: fourEligibleNodes, - liveAllocs: nil, - terminalAllocs: nil, - required: map[string]*structs.TaskGroup{ - "foo": { - Name: "foo", - Update: &structs.UpdateStrategy{ - Canary: 100, - MaxParallel: 1, // otherwise the update strategy will be considered nil - }, - }, - }, - existingDeployment: nil, - expectedCanaryNodes: map[string]int{ - "foo": 4, - }, - expectedCanaryNodeID: nil, - }, - { - name: "one task group with 50% canary deployment, even nodes", - nodes: fourEligibleNodes, - liveAllocs: nil, - terminalAllocs: nil, - required: map[string]*structs.TaskGroup{ - "foo": { - Name: "foo", - Update: &structs.UpdateStrategy{ - Canary: 50, - MaxParallel: 1, // otherwise the update strategy will be considered nil - }, - }, - }, - existingDeployment: nil, - expectedCanaryNodes: map[string]int{ - "foo": 2, - }, - expectedCanaryNodeID: nil, - }, - { - name: "two task groups: one with 50% canary deploy, second one with 2% canary deploy, pre-existing canary alloc", - nodes: fiveEligibleNodes, - liveAllocs: map[string][]*structs.Allocation{ - "foo": {mock.Alloc()}, // should be disregarded since it's not one of our nodes - fiveEligibleNodeNames[0]: { - {DeploymentStatus: nil}, - {DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}}, - {DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "foo"}, - }, - fiveEligibleNodeNames[1]: { - {DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "bar"}, - }, - }, - terminalAllocs: structs.TerminalByNodeByName{ - fiveEligibleNodeNames[2]: map[string]*structs.Allocation{ - "foo": { - DeploymentStatus: &structs.AllocDeploymentStatus{ - Canary: true, - }, - TaskGroup: "foo", - }, - }, - }, - required: map[string]*structs.TaskGroup{ - "foo": { - Name: "foo", - Update: &structs.UpdateStrategy{ - Canary: 50, - MaxParallel: 1, // otherwise the update strategy will be considered nil - }, - }, - "bar": { - Name: "bar", - Update: &structs.UpdateStrategy{ - Canary: 2, - MaxParallel: 1, // otherwise the update strategy will be considered nil - }, - }, - }, - existingDeployment: structs.NewDeployment(mock.SystemJob(), 100, time.Now().Unix()), - expectedCanaryNodes: map[string]int{ - "foo": 3, // we always round up - "bar": 1, // we always round up - }, - expectedCanaryNodeID: map[string]string{ - fiveEligibleNodeNames[0]: "foo", - fiveEligibleNodeNames[1]: "bar", - fiveEligibleNodeNames[2]: "foo", - }, - }, - { - name: "task group with 100% canary deploy, 1 eligible node", - nodes: map[string]*structs.Node{"foo": mock.Node()}, - liveAllocs: nil, - terminalAllocs: nil, - required: map[string]*structs.TaskGroup{ - 
"foo": { - Name: "foo", - Update: &structs.UpdateStrategy{ - Canary: 100, - MaxParallel: 1, - }, - }, - }, - existingDeployment: nil, - expectedCanaryNodes: map[string]int{ - "foo": 1, - }, - expectedCanaryNodeID: nil, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - nr := NewNodeReconciler(tc.existingDeployment) - canaryNodes, canariesPerTG := nr.computeCanaryNodes(tc.required, tc.liveAllocs, tc.terminalAllocs, tc.nodes) - must.Eq(t, tc.expectedCanaryNodes, canariesPerTG) - if tc.liveAllocs != nil { - for nodeID, tgName := range tc.expectedCanaryNodeID { - must.True(t, canaryNodes[nodeID][tgName]) - } - } - }) - } -} - -// Tests the reconciler creates new canaries when the job changes -func TestNodeReconciler_NewCanaries(t *testing.T) { - ci.Parallel(t) - - job := mock.SystemJob() - job.TaskGroups[0].Update = &structs.UpdateStrategy{ - Canary: 20, // deploy to 20% of eligible nodes - MaxParallel: 1, // otherwise the update strategy will be considered nil - } - job.JobModifyIndex = 1 - - // Create 10 nodes - nodes := []*structs.Node{} - for i := range 10 { - node := mock.Node() - node.ID = fmt.Sprintf("node_%d", i) - node.Name = fmt.Sprintf("node_%d", i) - nodes = append(nodes, node) - } - - allocs := []*structs.Allocation{} - for _, n := range nodes { - a := mock.Alloc() - a.Job = job - a.Name = "my-job.web[0]" - a.NodeID = n.ID - a.NodeName = n.Name - a.TaskGroup = job.TaskGroups[0].Name - - allocs = append(allocs, a) - } - - // bump the job version up - newJobVersion := job.Copy() - newJobVersion.Version = job.Version + 1 - newJobVersion.JobModifyIndex = job.JobModifyIndex + 1 - - // bump the version and add a new TG - newJobWithNewTaskGroup := newJobVersion.Copy() - newJobWithNewTaskGroup.Version = newJobVersion.Version + 1 - newJobWithNewTaskGroup.JobModifyIndex = newJobVersion.JobModifyIndex + 1 - tg := newJobVersion.TaskGroups[0].Copy() - tg.Name = "other" - tg.Update = &structs.UpdateStrategy{MaxParallel: 1} - newJobWithNewTaskGroup.TaskGroups = append(newJobWithNewTaskGroup.TaskGroups, tg) - - // new job with no previous allocs and no canary update strategy - jobWithNoUpdates := mock.SystemJob() - jobWithNoUpdates.Name = "i-am-a-brand-new-job" - jobWithNoUpdates.TaskGroups[0].Name = "i-am-a-brand-new-tg" - jobWithNoUpdates.TaskGroups[0].Update = structs.DefaultUpdateStrategy - - // additional test to make sure there are no canaries being placed for v0 - // jobs - freshJob := mock.SystemJob() - freshJob.TaskGroups[0].Update = structs.DefaultUpdateStrategy - freshNodes := []*structs.Node{} - for range 2 { - node := mock.Node() - freshNodes = append(freshNodes, node) - } - - testCases := []struct { - name string - job *structs.Job - nodes []*structs.Node - existingDeployment *structs.Deployment - expectedDesiredCanaries map[string]int - expectedDesiredTotal map[string]int - expectedDeploymentStatusDescription string - expectedPlaceCount int - expectedUpdateCount int - }{ - { - name: "new job version", - job: newJobVersion, - nodes: nodes, - existingDeployment: nil, - expectedDesiredCanaries: map[string]int{newJobVersion.TaskGroups[0].Name: 2}, - expectedDesiredTotal: map[string]int{newJobVersion.TaskGroups[0].Name: 10}, - expectedDeploymentStatusDescription: structs.DeploymentStatusDescriptionRunningNeedsPromotion, - expectedPlaceCount: 0, - expectedUpdateCount: 2, - }, - { - name: "new job version with a new TG (no existing allocs, no canaries)", - job: newJobWithNewTaskGroup, - nodes: nodes, - existingDeployment: nil, - expectedDesiredCanaries: 
map[string]int{ - newJobWithNewTaskGroup.TaskGroups[0].Name: 2, - newJobWithNewTaskGroup.TaskGroups[1].Name: 0, - }, - expectedDesiredTotal: map[string]int{ - newJobWithNewTaskGroup.TaskGroups[0].Name: 10, - newJobWithNewTaskGroup.TaskGroups[1].Name: 10, - }, - expectedDeploymentStatusDescription: structs.DeploymentStatusDescriptionRunningNeedsPromotion, - expectedPlaceCount: 10, - expectedUpdateCount: 2, - }, - { - name: "brand new job with no update block", - job: jobWithNoUpdates, - nodes: nodes, - existingDeployment: nil, - expectedDesiredCanaries: map[string]int{ - jobWithNoUpdates.TaskGroups[0].Name: 0, - }, - expectedDesiredTotal: map[string]int{ - jobWithNoUpdates.TaskGroups[0].Name: 10, - }, - expectedDeploymentStatusDescription: structs.DeploymentStatusDescriptionRunning, - expectedPlaceCount: 10, - expectedUpdateCount: 0, - }, - { - name: "fresh job with no updates, empty nodes", - job: freshJob, - nodes: freshNodes, - existingDeployment: nil, - expectedDesiredCanaries: map[string]int{ - freshJob.TaskGroups[0].Name: 0, - }, - expectedDesiredTotal: map[string]int{ - freshJob.TaskGroups[0].Name: 2, - }, - expectedDeploymentStatusDescription: structs.DeploymentStatusDescriptionRunning, - expectedPlaceCount: 2, - expectedUpdateCount: 0, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - reconciler := NewNodeReconciler(tc.existingDeployment) - r := reconciler.Compute(tc.job, tc.nodes, nil, nil, allocs, nil, false) - must.NotNil(t, reconciler.DeploymentCurrent) - must.Eq(t, tc.expectedPlaceCount, len(r.Place), must.Sprint("incorrect amount of r.Place")) - must.Eq(t, tc.expectedUpdateCount, len(r.Update), must.Sprint("incorrect amount of r.Update")) - must.Eq(t, tc.expectedDeploymentStatusDescription, reconciler.DeploymentCurrent.StatusDescription) - for _, tg := range tc.job.TaskGroups { - must.Eq(t, tc.expectedDesiredCanaries[tg.Name], - reconciler.DeploymentCurrent.TaskGroups[tg.Name].DesiredCanaries, - must.Sprintf("incorrect number of DesiredCanaries for %s", tg.Name)) - must.Eq(t, tc.expectedDesiredTotal[tg.Name], - reconciler.DeploymentCurrent.TaskGroups[tg.Name].DesiredTotal, - must.Sprintf("incorrect number of DesiredTotal for %s", tg.Name)) - } - }) - } -} - -// Tests the reconciler correctly promotes canaries -func TestNodeReconciler_CanaryPromotion(t *testing.T) { - ci.Parallel(t) - - job := mock.SystemJob() - job.TaskGroups[0].Update = &structs.UpdateStrategy{ - Canary: 20, // deploy to 20% of eligible nodes - MaxParallel: 1, // otherwise the update strategy will be considered nil - } - job.JobModifyIndex = 1 - - // bump the job version up - newJobVersion := job.Copy() - newJobVersion.Version = job.Version + 1 - newJobVersion.JobModifyIndex = job.JobModifyIndex + 1 - - // Create 5 nodes - nodes := []*structs.Node{} - for i := range 5 { - node := mock.Node() - node.ID = fmt.Sprintf("node_%d", i) - node.Name = fmt.Sprintf("node_%d", i) - nodes = append(nodes, node) - } - - // Create v0 allocs on 2 of the nodes, and v1 (canary) allocs on 3 nodes - allocs := []*structs.Allocation{} - for _, n := range nodes[0:3] { - a := mock.Alloc() - a.Job = job - a.Name = "my-job.web[0]" - a.NodeID = n.ID - a.NodeName = n.Name - a.TaskGroup = job.TaskGroups[0].Name - - allocs = append(allocs, a) - } - for _, n := range nodes[3:] { - a := mock.Alloc() - a.Job = job - a.Name = "my-job.web[0]" - a.NodeID = n.ID - a.NodeName = n.Name - a.TaskGroup = job.TaskGroups[0].Name - a.DeploymentStatus = &structs.AllocDeploymentStatus{Canary: true} - a.Job.Version = 
newJobVersion.Version - a.Job.JobModifyIndex = newJobVersion.JobModifyIndex - - allocs = append(allocs, a) - } - - // promote canaries - deployment := structs.NewDeployment(newJobVersion, 10, time.Now().Unix()) - deployment.TaskGroups[newJobVersion.TaskGroups[0].Name] = &structs.DeploymentState{ - Promoted: true, - HealthyAllocs: 5, - DesiredTotal: 5, - DesiredCanaries: 0, - } - - // reconcile - reconciler := NewNodeReconciler(deployment) - reconciler.Compute(newJobVersion, nodes, nil, nil, allocs, nil, false) - - must.NotNil(t, reconciler.DeploymentCurrent) - must.Eq(t, 5, reconciler.DeploymentCurrent.TaskGroups[newJobVersion.TaskGroups[0].Name].DesiredTotal) - must.SliceContainsFunc(t, reconciler.DeploymentUpdates, structs.DeploymentStatusSuccessful, - func(a *structs.DeploymentStatusUpdate, b string) bool { return a.Status == b }, - ) -} diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 9e40aa72e51..e3510350581 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -5,10 +5,12 @@ package scheduler import ( "fmt" + "math" "runtime/debug" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" + "github.com/hashicorp/go-set/v3" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/scheduler/feasible" @@ -49,7 +51,8 @@ type SystemScheduler struct { notReadyNodes map[string]struct{} nodesByDC map[string]int - deployment *structs.Deployment + deployment *structs.Deployment + feasibleNodesForTG map[string]*set.Set[string] // used to track which nodes passed the feasibility check for TG limitReached bool @@ -264,40 +267,40 @@ func (s *SystemScheduler) computeJobAllocs() error { // Split out terminal allocations live, term := structs.SplitTerminalAllocs(allocs) + // Find which of the eligible nodes are actually feasible for which TG. This way + // we get correct DesiredTotal and DesiredCanaries counts in the reconciler. + s.feasibleNodesForTG = s.findIgnorableNodes(live) + // Diff the required and existing allocations nr := reconciler.NewNodeReconciler(s.deployment) - r := nr.Compute(s.job, s.nodes, s.notReadyNodes, tainted, live, term, - s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true)) + reconciliationResult := nr.Compute(s.job, s.nodes, s.notReadyNodes, tainted, s.feasibleNodesForTG, + live, term, s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true)) if s.logger.IsDebug() { - s.logger.Debug("reconciled current state with desired state", r.Fields()...) + s.logger.Debug("reconciled current state with desired state", reconciliationResult.Fields()...) } - // Add the deployment changes to the plan - s.plan.Deployment = nr.DeploymentCurrent - s.plan.DeploymentUpdates = nr.DeploymentUpdates - // Update the stored deployment if nr.DeploymentCurrent != nil { s.deployment = nr.DeploymentCurrent } // Add all the allocs to stop - for _, e := range r.Stop { + for _, e := range reconciliationResult.Stop { s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocNotNeeded, "", "") } // Add all the allocs to migrate - for _, e := range r.Migrate { + for _, e := range reconciliationResult.Migrate { s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocNodeTainted, "", "") } // Lost allocations should be transitioned to desired status stop and client // status lost. 
- for _, e := range r.Lost { + for _, e := range reconciliationResult.Lost { s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocLost, structs.AllocClientStatusLost, "") } - for _, e := range r.Disconnecting { + for _, e := range reconciliationResult.Disconnecting { s.plan.AppendUnknownAlloc(e.Alloc) } @@ -305,45 +308,148 @@ func (s *SystemScheduler) computeJobAllocs() error { // Attempt to do the upgrades in place. // Reconnecting allocations need to be updated to persists alloc state // changes. - updates := make([]reconciler.AllocTuple, 0, len(r.Update)+len(r.Reconnecting)) - updates = append(updates, r.Update...) - updates = append(updates, r.Reconnecting...) + updates := make([]reconciler.AllocTuple, 0, len(reconciliationResult.Update)+len(reconciliationResult.Reconnecting)) + updates = append(updates, reconciliationResult.Update...) + updates = append(updates, reconciliationResult.Reconnecting...) destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, updates) - r.Update = destructiveUpdates + reconciliationResult.Update = destructiveUpdates for _, inplaceUpdate := range inplaceUpdates { allocExistsForTaskGroup[inplaceUpdate.TaskGroup.Name] = true } s.planAnnotations = &structs.PlanAnnotations{ - DesiredTGUpdates: desiredUpdates(r, inplaceUpdates, destructiveUpdates), + DesiredTGUpdates: desiredUpdates(reconciliationResult, inplaceUpdates, destructiveUpdates), + } + + // any further logic depends on whether we're canarying or not + isCanarying := map[string]bool{} + if s.job != nil && s.deployment != nil { + for _, tg := range s.job.TaskGroups { + dstate, ok := s.deployment.TaskGroups[tg.Name] + if !ok { + continue + } + // a system job is canarying if: + // - it has a non-empty update block (just a sanity check, all + // submitted jobs should have a non-empty update block as part of + // canonicalization) + // - canary parameter in the update block has to be positive + // - deployment has to be non-nil and it cannot have been promoted + // - this cannot be the initial job version + isCanarying[tg.Name] = !tg.Update.IsEmpty() && + tg.Update.Canary > 0 && + dstate != nil && + !dstate.Promoted && + s.job.Version != 0 + } } + // find feasible nodes for each TG before we do any maxParallel evictions + s.findFeasibleNodesForTG(reconciliationResult.Update) + // Treat non in-place updates as an eviction and new placement, which will // be limited by max_parallel - s.limitReached = evictAndPlace(s.ctx, s.job, r, sstructs.StatusAllocUpdating) + s.limitReached = evictAndPlace(s.ctx, s.job, reconciliationResult, sstructs.StatusAllocUpdating) - // Nothing remaining to do if placement is not required - if len(r.Place) == 0 { - if !s.job.Stopped() { - for _, tg := range s.job.TaskGroups { - s.queuedAllocs[tg.Name] = 0 - } + if !s.job.Stopped() { + for _, tg := range s.job.TaskGroups { + s.queuedAllocs[tg.Name] = 0 } - return nil } // Record the number of allocations that needs to be placed per Task Group - for _, allocTuple := range r.Place { + for _, allocTuple := range reconciliationResult.Place { s.queuedAllocs[allocTuple.TaskGroup.Name] += 1 } - for _, ignoredAlloc := range r.Ignore { + for _, ignoredAlloc := range reconciliationResult.Ignore { allocExistsForTaskGroup[ignoredAlloc.TaskGroup.Name] = true } // Compute the placements - return s.computePlacements(r.Place, allocExistsForTaskGroup) + if err := s.computePlacements(reconciliationResult, allocExistsForTaskGroup); err != nil { + return err + } + + for k, v := range s.feasibleNodesForTG { + 
fmt.Printf("found %d feasible nodes for tg %v: %v\n", v.Size(), k, v.String()) + } + + // if there is not deployment we're done at this point + if s.deployment == nil { + return nil + } + + // we only know the total amount of placements once we filter out infeasible + // nodes, so for system jobs we do it backwards a bit: the "desired" total + // is the total we were able to place. + // track if any of the task groups is doing a canary update now + for _, tg := range s.job.TaskGroups { + feasibleNodes, ok := s.feasibleNodesForTG[tg.Name] + if !ok { + // this will happen if we're seeing a TG that shouldn't be placed; we only ever + // get feasible node counts for placements. These TGs get their DesiredTotal set + // in the reconciler and we don't touch it. + continue + } + + s.deployment.TaskGroups[tg.Name].DesiredTotal = feasibleNodes.Size() + + // if this TG isn't canarying, we're done + if !isCanarying[tg.Name] { + continue + } + + // Initially, if the job requires canaries, we place all of them on + // all eligible nodes. At this point we know which nodes are + // feasible, so we evict unnedded canaries. + placedCanaries, err := s.evictUnneededCanaries( + s.feasibleNodesForTG[tg.Name].Size(), + tg.Update.Canary, + ) + if err != nil { + return fmt.Errorf("failed to evict canaries for job '%s': %v", s.eval.JobID, err) + } + + s.deployment.TaskGroups[tg.Name].DesiredCanaries = len(placedCanaries) + s.deployment.TaskGroups[tg.Name].PlacedCanaries = placedCanaries + + for nodeID, allocs := range s.plan.NodeUpdate { + filtered := []*structs.Allocation{} + for _, a := range allocs { + if a.DesiredDescription == sstructs.StatusAllocNotNeeded { + filtered = append(filtered, a) + continue + } + + // we only keep allocs that are in the plan.NodeAllocation for + // this node + if _, ok := s.plan.NodeAllocation[a.NodeID]; ok { + filtered = append(filtered, a) + } + } + + s.plan.NodeUpdate[nodeID] = filtered + } + + } + + // check if the deployment is complete + deploymentComplete := true + for _, tg := range s.job.TaskGroups { + groupComplete := s.isDeploymentComplete(tg.Name, reconciliationResult, isCanarying[tg.Name]) + deploymentComplete = deploymentComplete && groupComplete + } + + // adjust the deployment updates and set the right deployment status + nr.DeploymentUpdates = append(nr.DeploymentUpdates, s.setDeploymentStatusAndUpdates(deploymentComplete, s.job)...) 
+ + // Add the deployment changes to the plan + s.plan.Deployment = s.deployment + s.plan.DeploymentUpdates = nr.DeploymentUpdates + + return nil } func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric { @@ -370,8 +476,66 @@ func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric { return acc } +// findIgnorableNodes checks if there are allocations deployed to nodes that are +// from the same job version as ours, and can thus be omitted from feasibility +// checks +func (s *SystemScheduler) findIgnorableNodes(allocs []*structs.Allocation) map[string]*set.Set[string] { + if s.job == nil { + return nil + } + + feasibleNodes := make(map[string]*set.Set[string]) + + for _, a := range allocs { + if a.Job == nil { + continue + } + + // if there's an existing alloc for this version of the job, there + // must've been an eval that checked its feasibility already + if a.Job.Version == s.job.Version { + // count this node as feasible + if feasibleNodes[a.TaskGroup] == nil { + feasibleNodes[a.TaskGroup] = set.New[string](0) + } + + feasibleNodes[a.TaskGroup].Insert(a.NodeID) + } + } + + return feasibleNodes +} + +func (s *SystemScheduler) findFeasibleNodesForTG(updates []reconciler.AllocTuple) { + for _, a := range updates { + tgName := a.TaskGroup.Name + fmt.Printf("looking for feasible node for tg %s\n", tgName) + + s.stack.SetNodes(s.nodes) + + // Attempt to match the task group + option := s.stack.Select(a.TaskGroup, &feasible.SelectOptions{AllocName: a.Name}) + + if option == nil { + fmt.Printf("no feasible node found for %v!\n", a.Alloc) + continue + } + + fmt.Printf("found feasible node %v for tg %v\n", option.Node.ID, tgName) + // count this node as feasible + if s.feasibleNodesForTG[tgName] == nil { + s.feasibleNodesForTG[tgName] = set.New[string](0) + } + + s.feasibleNodesForTG[tgName].Insert(option.Node.ID) + } +} + // computePlacements computes placements for allocations -func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, existingByTaskGroup map[string]bool) error { +func (s *SystemScheduler) computePlacements( + reconcilerResult *reconciler.NodeReconcileResult, existingByTaskGroup map[string]bool, +) error { + nodeByID := make(map[string]*structs.Node, len(s.nodes)) for _, node := range s.nodes { nodeByID[node.ID] = node @@ -385,8 +549,12 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist deploymentID = s.deployment.ID } + for _, r := range reconcilerResult.Place { + fmt.Println(r.Name) + } + nodes := make([]*structs.Node, 1) - for _, missing := range place { + for _, missing := range reconcilerResult.Place { tgName := missing.TaskGroup.Name node, ok := nodeByID[missing.Alloc.NodeID] @@ -435,10 +603,6 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist s.planAnnotations.DesiredTGUpdates[tgName].Place -= 1 } - if s.plan.Deployment != nil { - s.deployment.TaskGroups[tgName].DesiredTotal -= 1 - } - // Filtered nodes are not reported to users, just omitted from the job status continue } @@ -551,6 +715,13 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist } s.plan.AppendAlloc(alloc, nil) + + // count this node as feasible + if s.feasibleNodesForTG[tgName] == nil { + s.feasibleNodesForTG[tgName] = set.New[string](0) + } + + s.feasibleNodesForTG[tgName].Insert(alloc.NodeID) } return nil @@ -601,13 +772,13 @@ func (s *SystemScheduler) canHandle(trigger string) bool { } // evictAndPlace is used to mark allocations for evicts and add 
them to the
-// placement queue. evictAndPlace modifies the diffResult. It returns true if
-// the limit has been reached for any task group.
-func evictAndPlace(ctx feasible.Context, job *structs.Job, diff *reconciler.NodeReconcileResult, desc string) bool {
+// placement queue. evictAndPlace modifies the reconciler result. It returns
+// true if the limit has been reached for any task group.
+func evictAndPlace(ctx feasible.Context, job *structs.Job, reconciled *reconciler.NodeReconcileResult, desc string) bool {
 	limits := map[string]int{} // per task group limits
 	if !job.Stopped() {
-		jobLimit := len(diff.Update)
+		jobLimit := len(reconciled.Update)
 		if job.Update.MaxParallel > 0 {
 			jobLimit = job.Update.MaxParallel
 		}
@@ -621,16 +792,126 @@
 	}
 	limited := false
-	for _, a := range diff.Update {
+	for _, a := range reconciled.Update {
 		if limit := limits[a.Alloc.TaskGroup]; limit > 0 {
 			ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "")
-			diff.Place = append(diff.Place, a)
-			if !a.Canary {
-				limits[a.Alloc.TaskGroup]--
-			}
+			reconciled.Place = append(reconciled.Place, a)
+			limits[a.Alloc.TaskGroup]--
 		} else {
 			limited = true
 		}
 	}
 	return limited
 }
+
+// evictUnneededCanaries checks how many canaries are needed against the number
+// of feasible nodes, and removes unnecessary placements from the plan.
+func (s *SystemScheduler) evictUnneededCanaries(feasibleNodes int, canary int) ([]string, error) {
+
+	desiredCanaries := []string{} // FIXME: make this better
+
+	// calculate how many canary placements we expect each task group to have: it
+	// should be the tg.update.canary percentage of eligible nodes, rounded up
+	// to the nearest integer
+	requiredCanaries := int(math.Ceil(float64(canary) * float64(feasibleNodes) / 100))
+
+	// no canaries to consider, quit early
+	if requiredCanaries == 0 {
+		return desiredCanaries, nil
+	}
+
+	// iterate over node allocations to find canary allocs
+	for node, allocations := range s.plan.NodeAllocation {
+		n := 0
+		for _, alloc := range allocations {
+			if alloc.DeploymentStatus == nil {
+				continue
+			}
+			if alloc.DeploymentStatus.Canary {
+				if requiredCanaries != 0 {
+					requiredCanaries -= 1
+
+					desiredCanaries = append(desiredCanaries, alloc.ID)
+					allocations[n] = alloc // we do this in order to avoid allocating another slice
+					n += 1
+				}
+			}
+		}
+		s.plan.NodeAllocation[node] = allocations[:n]
+	}
+
+	return desiredCanaries, nil
+}
+
+func (s *SystemScheduler) isDeploymentComplete(groupName string, buckets *reconciler.NodeReconcileResult, isCanarying bool) bool {
+	complete := len(buckets.Place)+len(buckets.Migrate)+len(buckets.Update) == 0
+
+	// fmt.Printf("complete? %v\n", complete)
+	// fmt.Printf("buckets.Place: %v buckets.Migrate: %v buckets.Update: %v\n", len(buckets.Place), len(buckets.Migrate), len(buckets.Update))
+	// fmt.Printf("s.deployment == nil? %v\n", s.deployment == nil)
+	// fmt.Printf("isCanarying? 
%v\n", isCanarying) + + if !complete || s.deployment == nil || isCanarying { + return false + } + + // ensure everything is healthy + if dstate, ok := s.deployment.TaskGroups[groupName]; ok { + if dstate.HealthyAllocs < dstate.DesiredTotal { // Make sure we have enough healthy allocs + complete = false + } + } + + return complete +} + +func (s *SystemScheduler) setDeploymentStatusAndUpdates(deploymentComplete bool, job *structs.Job) []*structs.DeploymentStatusUpdate { + statusUpdates := []*structs.DeploymentStatusUpdate{} + + if d := s.deployment; d != nil { + + // Deployments that require promotion should have appropriate status set + // immediately, no matter their completness. + if d.RequiresPromotion() { + if d.HasAutoPromote() { + d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion + } else { + d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion + } + return statusUpdates + } + + // Mark the deployment as complete if possible + if deploymentComplete { + if job.IsMultiregion() { + // the unblocking/successful states come after blocked, so we + // need to make sure we don't revert those states + if d.Status != structs.DeploymentStatusUnblocking && + d.Status != structs.DeploymentStatusSuccessful { + statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ + DeploymentID: s.deployment.ID, + Status: structs.DeploymentStatusBlocked, + StatusDescription: structs.DeploymentStatusDescriptionBlocked, + }) + } + } else { + statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ + DeploymentID: s.deployment.ID, + Status: structs.DeploymentStatusSuccessful, + StatusDescription: structs.DeploymentStatusDescriptionSuccessful, + }) + } + } + + // Mark the deployment as pending since its state is now computed. + if d.Status == structs.DeploymentStatusInitializing { + statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ + DeploymentID: s.deployment.ID, + Status: structs.DeploymentStatusPending, + StatusDescription: structs.DeploymentStatusDescriptionPendingForPeer, + }) + } + } + + return statusUpdates +} diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index eeed3d7827a..cef49c1daa4 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -3575,8 +3575,8 @@ func TestSystemSched_UpdateBlock(t *testing.T) { tg1: { DesiredTotal: 10, DesiredCanaries: 3, - PlacedCanaries: []string{"0", "1", "2"}, - PlacedAllocs: 3, + // PlacedCanaries: []string{"0", "1", "2"}, + PlacedAllocs: 2, }, tg2: {DesiredTotal: 10, PlacedAllocs: 5}, }, From edc1bb9edc0dee49685c8772b67db10df0312c84 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:32:37 +0100 Subject: [PATCH 02/24] scheduler: perform feasibility checks for system canaries before computing placements Canaries for system jobs are placed on a tg.update.canary percent of eligible nodes. Some of these nodes may not be feasible, and until now we removed infeasible nodes during placement computation. However, if it happens to be that the first eligible node we picked to place a canary on is infeasible, this will lead to the scheduler halting deployment. 
The solution presented here simplifies canary deployments: initially, system jobs that use canary updates get allocations placed on all eligible nodes, but before we start computing actual placements, a method called `evictCanaries` is called (much like `evictAndPlace` is for honoring MaxParallel), and performs a feasibility check on each node up to the amount of required canaries per task group. Feasibility checks are expensive, but this way we only check all the nodes in the worst case scenario (with canary=100), otherwise we stop checks once we know we are ready to place enough canaries. --- nomad/mock/alloc.go | 70 ++++++++++++++++++++++----------------------- nomad/mock/job.go | 12 ++++---- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/nomad/mock/alloc.go b/nomad/mock/alloc.go index 170c3b54b27..14b676b0d5e 100644 --- a/nomad/mock/alloc.go +++ b/nomad/mock/alloc.go @@ -24,29 +24,29 @@ func Alloc() *structs.Allocation { CPU: 500, MemoryMB: 256, DiskMB: 150, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, - MBits: 50, - DynamicPorts: []structs.Port{{Label: "http"}}, - }, - }, + // Networks: []*structs.NetworkResource{ + // { + // Device: "eth0", + // IP: "192.168.0.100", + // ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, + // MBits: 50, + // DynamicPorts: []structs.Port{{Label: "http"}}, + // }, + // }, }, TaskResources: map[string]*structs.Resources{ "web": { CPU: 500, MemoryMB: 256, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, - MBits: 50, - DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, - }, - }, + // Networks: []*structs.NetworkResource{ + // { + // Device: "eth0", + // IP: "192.168.0.100", + // ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, + // MBits: 50, + // DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, + // }, + // }, }, }, SharedResources: &structs.Resources{ @@ -62,15 +62,15 @@ func Alloc() *structs.Allocation { Memory: structs.AllocatedMemoryResources{ MemoryMB: 256, }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, - MBits: 50, - DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, - }, - }, + // Networks: []*structs.NetworkResource{ + // { + // Device: "eth0", + // IP: "192.168.0.100", + // ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, + // MBits: 50, + // DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, + // }, + // }, }, }, Shared: structs.AllocatedSharedResources{ @@ -131,22 +131,22 @@ func AllocWithoutReservedPort() *structs.Allocation { } func AllocForNode(n *structs.Node) *structs.Allocation { - nodeIP := n.NodeResources.NodeNetworks[0].Addresses[0].Address + // nodeIP := n.NodeResources.NodeNetworks[0].Addresses[0].Address - dynamicPortRange := structs.DefaultMaxDynamicPort - structs.DefaultMinDynamicPort - randomDynamicPort := rand.Intn(dynamicPortRange) + structs.DefaultMinDynamicPort + // dynamicPortRange := structs.DefaultMaxDynamicPort - structs.DefaultMinDynamicPort + // randomDynamicPort := rand.Intn(dynamicPortRange) + structs.DefaultMinDynamicPort alloc := Alloc() alloc.NodeID = n.ID // Set node IP address. 
- alloc.Resources.Networks[0].IP = nodeIP - alloc.TaskResources["web"].Networks[0].IP = nodeIP - alloc.AllocatedResources.Tasks["web"].Networks[0].IP = nodeIP + // alloc.Resources.Networks[0].IP = nodeIP + // alloc.TaskResources["web"].Networks[0].IP = nodeIP + // alloc.AllocatedResources.Tasks["web"].Networks[0].IP = nodeIP // Set dynamic port to a random value. - alloc.TaskResources["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} - alloc.AllocatedResources.Tasks["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} + // alloc.TaskResources["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} + // alloc.AllocatedResources.Tasks["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} return alloc diff --git a/nomad/mock/job.go b/nomad/mock/job.go index cd60cb181e4..18a6daaf57e 100644 --- a/nomad/mock/job.go +++ b/nomad/mock/job.go @@ -474,12 +474,12 @@ func SystemJob() *structs.Job { Resources: &structs.Resources{ CPU: 500, MemoryMB: 256, - Networks: []*structs.NetworkResource{ - { - MBits: 50, - DynamicPorts: []structs.Port{{Label: "http"}}, - }, - }, + // Networks: []*structs.NetworkResource{ + // { + // MBits: 50, + // DynamicPorts: []structs.Port{{Label: "http"}}, + // }, + // }, }, LogConfig: structs.DefaultLogConfig(), }, From c8d7a9af391e76e064fa382ad4b5bc456e719a9d Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:33:03 +0100 Subject: [PATCH 03/24] system scheduler: get the right desiredTotal values --- scheduler/reconciler/reconcile_node.go | 16 +--- scheduler/scheduler_system.go | 114 ++++++++++++------------- 2 files changed, 55 insertions(+), 75 deletions(-) diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go index 9d76ef6c631..8f7778738d5 100644 --- a/scheduler/reconciler/reconcile_node.go +++ b/scheduler/reconciler/reconcile_node.go @@ -8,7 +8,6 @@ import ( "slices" "time" - "github.com/hashicorp/go-set/v3" "github.com/hashicorp/nomad/nomad/structs" sstructs "github.com/hashicorp/nomad/scheduler/structs" ) @@ -38,7 +37,6 @@ func (nr *NodeReconciler) Compute( readyNodes []*structs.Node, // list of nodes in the ready state notReadyNodes map[string]struct{}, // list of nodes in DC but not ready, e.g. draining taintedNodes map[string]*structs.Node, // nodes which are down or drain mode (by node id) - feasibleNodes map[string]*set.Set[string], // nodes that are eligible and feasible, per TG live []*structs.Allocation, // non-terminal allocations terminal structs.TerminalByNodeByName, // latest terminal allocations (by node id) serverSupportsDisconnectedClients bool, // flag indicating whether to apply disconnected client logic @@ -66,7 +64,7 @@ func (nr *NodeReconciler) Compute( result := new(NodeReconcileResult) for nodeID, allocs := range nodeAllocs { diff := nr.computeForNode(job, nodeID, eligibleNodes, - notReadyNodes, taintedNodes, feasibleNodes, required, allocs, terminal, + notReadyNodes, taintedNodes, required, allocs, terminal, serverSupportsDisconnectedClients) result.Append(diff) } @@ -103,7 +101,6 @@ func (nr *NodeReconciler) computeForNode( eligibleNodes map[string]*structs.Node, notReadyNodes map[string]struct{}, // nodes that are not ready, e.g. 
draining taintedNodes map[string]*structs.Node, // nodes which are down (by node id) - feasibleNodes map[string]*set.Set[string], // nodes that are eligible and feasible, per TG required map[string]*structs.TaskGroup, // set of allocations that must exist liveAllocs []*structs.Allocation, // non-terminal allocations that exist terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, id) @@ -335,17 +332,6 @@ func (nr *NodeReconciler) computeForNode( } } - if feasibleCount, ok := feasibleNodes[tg.Name]; ok { - dstate.DesiredTotal = feasibleCount.Size() - - // if we're canarying, we initially set the value of desired canaries to all - // feasible nodes, and at a later stage we evict those placements that aren't - // needed - if isCanarying[tg.Name] { - dstate.DesiredCanaries = feasibleNodes[tg.Name].Size() - } - } - // Check for an existing allocation if _, ok := existing[name]; !ok { diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index e3510350581..2abf2a7f66f 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -7,10 +7,10 @@ import ( "fmt" "math" "runtime/debug" + "slices" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" - "github.com/hashicorp/go-set/v3" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/scheduler/feasible" @@ -52,7 +52,7 @@ type SystemScheduler struct { nodesByDC map[string]int deployment *structs.Deployment - feasibleNodesForTG map[string]*set.Set[string] // used to track which nodes passed the feasibility check for TG + feasibleNodesForTG map[string][]*feasible.RankedNode // used to track which nodes passed the feasibility check for TG limitReached bool @@ -267,13 +267,9 @@ func (s *SystemScheduler) computeJobAllocs() error { // Split out terminal allocations live, term := structs.SplitTerminalAllocs(allocs) - // Find which of the eligible nodes are actually feasible for which TG. This way - // we get correct DesiredTotal and DesiredCanaries counts in the reconciler. - s.feasibleNodesForTG = s.findIgnorableNodes(live) - // Diff the required and existing allocations nr := reconciler.NewNodeReconciler(s.deployment) - reconciliationResult := nr.Compute(s.job, s.nodes, s.notReadyNodes, tainted, s.feasibleNodesForTG, + reconciliationResult := nr.Compute(s.job, s.nodes, s.notReadyNodes, tainted, live, term, s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true)) if s.logger.IsDebug() { s.logger.Debug("reconciled current state with desired state", reconciliationResult.Fields()...) 
@@ -322,6 +318,17 @@ func (s *SystemScheduler) computeJobAllocs() error { DesiredTGUpdates: desiredUpdates(reconciliationResult, inplaceUpdates, destructiveUpdates), } + // find feasible nodes for all the task groups + s.feasibleNodesForTG = s.findFeasibleNodesForTG(reconciliationResult.Update, s.ctx, sstructs.StatusAllocUpdating) + + fmt.Printf("found the following feasible nodes:\n") + for k, v := range s.feasibleNodesForTG { + for _, node := range v { + fmt.Printf("tg %s has nodes: %v\n", k, node.Node.ID) + } + fmt.Printf("in total, tg %s has %d feasible nodes\n", k, len(v)) + } + // any further logic depends on whether we're canarying or not isCanarying := map[string]bool{} if s.job != nil && s.deployment != nil { @@ -345,9 +352,6 @@ func (s *SystemScheduler) computeJobAllocs() error { } } - // find feasible nodes for each TG before we do any maxParallel evictions - s.findFeasibleNodesForTG(reconciliationResult.Update) - // Treat non in-place updates as an eviction and new placement, which will // be limited by max_parallel s.limitReached = evictAndPlace(s.ctx, s.job, reconciliationResult, sstructs.StatusAllocUpdating) @@ -372,10 +376,6 @@ func (s *SystemScheduler) computeJobAllocs() error { return err } - for k, v := range s.feasibleNodesForTG { - fmt.Printf("found %d feasible nodes for tg %v: %v\n", v.Size(), k, v.String()) - } - // if there is not deployment we're done at this point if s.deployment == nil { return nil @@ -394,7 +394,7 @@ func (s *SystemScheduler) computeJobAllocs() error { continue } - s.deployment.TaskGroups[tg.Name].DesiredTotal = feasibleNodes.Size() + s.deployment.TaskGroups[tg.Name].DesiredTotal = len(feasibleNodes) // if this TG isn't canarying, we're done if !isCanarying[tg.Name] { @@ -405,7 +405,7 @@ func (s *SystemScheduler) computeJobAllocs() error { // all eligible nodes. At this point we know which nodes are // feasible, so we evict unnedded canaries. 
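// A minimal standalone sketch of the canary-count rule described above, assuming
// only the standard library's math package and an illustrative helper name (this
// function is not part of the patch): a task group needs update.canary percent of
// its feasible nodes as canaries, rounded up. With canary=30 and 7 feasible nodes
// that is ceil(2.1) = 3 canaries.
func requiredCanariesFor(canaryPercent, feasibleNodes int) int {
	return int(math.Ceil(float64(canaryPercent) * float64(feasibleNodes) / 100))
}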
placedCanaries, err := s.evictUnneededCanaries( - s.feasibleNodesForTG[tg.Name].Size(), + len(s.feasibleNodesForTG[tg.Name]), tg.Update.Canary, ) if err != nil { @@ -476,59 +476,52 @@ func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric { return acc } -// findIgnorableNodes checks if there are allocations deployed to nodes that are -// from the same job version as ours, and can thus be omitted from feasibility -// checks -func (s *SystemScheduler) findIgnorableNodes(allocs []*structs.Allocation) map[string]*set.Set[string] { - if s.job == nil { - return nil +func (s *SystemScheduler) findFeasibleNodesForTG(updates []reconciler.AllocTuple, ctx feasible.Context, desc string) map[string][]*feasible.RankedNode { + nodeByID := make(map[string]*structs.Node, len(s.nodes)) + for _, node := range s.nodes { + nodeByID[node.ID] = node } - feasibleNodes := make(map[string]*set.Set[string]) - - for _, a := range allocs { - if a.Job == nil { - continue - } + feasibleNodes := make(map[string][]*feasible.RankedNode) - // if there's an existing alloc for this version of the job, there - // must've been an eval that checked its feasibility already - if a.Job.Version == s.job.Version { - // count this node as feasible - if feasibleNodes[a.TaskGroup] == nil { - feasibleNodes[a.TaskGroup] = set.New[string](0) - } + nodes := make([]*structs.Node, 1) + for _, a := range updates { - feasibleNodes[a.TaskGroup].Insert(a.NodeID) - } - } + // stop everything here + ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "") - return feasibleNodes -} - -func (s *SystemScheduler) findFeasibleNodesForTG(updates []reconciler.AllocTuple) { - for _, a := range updates { tgName := a.TaskGroup.Name - fmt.Printf("looking for feasible node for tg %s\n", tgName) - s.stack.SetNodes(s.nodes) + node, ok := nodeByID[a.Alloc.NodeID] + if !ok { + s.logger.Debug("could not find node", "node", a.Alloc.NodeID) + continue + } + + // Update the set of placement nodes + nodes[0] = node + s.stack.SetNodes(nodes) // Attempt to match the task group option := s.stack.Select(a.TaskGroup, &feasible.SelectOptions{AllocName: a.Name}) - if option == nil { - fmt.Printf("no feasible node found for %v!\n", a.Alloc) continue } - fmt.Printf("found feasible node %v for tg %v\n", option.Node.ID, tgName) // count this node as feasible - if s.feasibleNodesForTG[tgName] == nil { - s.feasibleNodesForTG[tgName] = set.New[string](0) + if feasibleNodes[tgName] == nil { + feasibleNodes[tgName] = []*feasible.RankedNode{option} + } else { + if !slices.ContainsFunc( + feasibleNodes[tgName], + func(rn *feasible.RankedNode) bool { return rn.Node.ID == option.Node.ID }, + ) { + feasibleNodes[tgName] = append(feasibleNodes[tgName], option) + } } - - s.feasibleNodesForTG[tgName].Insert(option.Node.ID) } + + return feasibleNodes } // computePlacements computes placements for allocations @@ -557,18 +550,26 @@ func (s *SystemScheduler) computePlacements( for _, missing := range reconcilerResult.Place { tgName := missing.TaskGroup.Name + var option *feasible.RankedNode + node, ok := nodeByID[missing.Alloc.NodeID] if !ok { s.logger.Debug("could not find node", "node", missing.Alloc.NodeID) continue } + // if we've already seen a feasible node for this tg, skip feasibility + // checks for it + // option = s.feasibleNodesForTG[tgName] + + // if option == nil { // Update the set of placement nodes nodes[0] = node s.stack.SetNodes(nodes) // Attempt to match the task group - option := s.stack.Select(missing.TaskGroup, &feasible.SelectOptions{AllocName: 
missing.Name}) + option = s.stack.Select(missing.TaskGroup, &feasible.SelectOptions{AllocName: missing.Name}) + // } if option == nil { // If the task can't be placed on this node, update reporting data @@ -715,13 +716,6 @@ func (s *SystemScheduler) computePlacements( } s.plan.AppendAlloc(alloc, nil) - - // count this node as feasible - if s.feasibleNodesForTG[tgName] == nil { - s.feasibleNodesForTG[tgName] = set.New[string](0) - } - - s.feasibleNodesForTG[tgName].Insert(alloc.NodeID) } return nil From 9c662031b723b9562c10c008c949cde7f59c6a93 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:51:40 +0100 Subject: [PATCH 04/24] system scheduler: fixes to computeJobAllocs --- nomad/mock/alloc.go | 70 +++---- nomad/mock/job.go | 12 +- .../reconciler/reconcile_node_prop_test.go | 4 +- scheduler/reconciler/reconcile_node_test.go | 20 +- scheduler/scheduler_system.go | 190 ++++++++---------- scheduler/scheduler_system_test.go | 7 +- 6 files changed, 137 insertions(+), 166 deletions(-) diff --git a/nomad/mock/alloc.go b/nomad/mock/alloc.go index 14b676b0d5e..170c3b54b27 100644 --- a/nomad/mock/alloc.go +++ b/nomad/mock/alloc.go @@ -24,29 +24,29 @@ func Alloc() *structs.Allocation { CPU: 500, MemoryMB: 256, DiskMB: 150, - // Networks: []*structs.NetworkResource{ - // { - // Device: "eth0", - // IP: "192.168.0.100", - // ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, - // MBits: 50, - // DynamicPorts: []structs.Port{{Label: "http"}}, - // }, - // }, + Networks: []*structs.NetworkResource{ + { + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, + MBits: 50, + DynamicPorts: []structs.Port{{Label: "http"}}, + }, + }, }, TaskResources: map[string]*structs.Resources{ "web": { CPU: 500, MemoryMB: 256, - // Networks: []*structs.NetworkResource{ - // { - // Device: "eth0", - // IP: "192.168.0.100", - // ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, - // MBits: 50, - // DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, - // }, - // }, + Networks: []*structs.NetworkResource{ + { + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, + MBits: 50, + DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, + }, + }, }, }, SharedResources: &structs.Resources{ @@ -62,15 +62,15 @@ func Alloc() *structs.Allocation { Memory: structs.AllocatedMemoryResources{ MemoryMB: 256, }, - // Networks: []*structs.NetworkResource{ - // { - // Device: "eth0", - // IP: "192.168.0.100", - // ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, - // MBits: 50, - // DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, - // }, - // }, + Networks: []*structs.NetworkResource{ + { + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "admin", Value: 5000}}, + MBits: 50, + DynamicPorts: []structs.Port{{Label: "http", Value: 9876}}, + }, + }, }, }, Shared: structs.AllocatedSharedResources{ @@ -131,22 +131,22 @@ func AllocWithoutReservedPort() *structs.Allocation { } func AllocForNode(n *structs.Node) *structs.Allocation { - // nodeIP := n.NodeResources.NodeNetworks[0].Addresses[0].Address + nodeIP := n.NodeResources.NodeNetworks[0].Addresses[0].Address - // dynamicPortRange := structs.DefaultMaxDynamicPort - structs.DefaultMinDynamicPort - // randomDynamicPort := rand.Intn(dynamicPortRange) + structs.DefaultMinDynamicPort + dynamicPortRange := structs.DefaultMaxDynamicPort - 
structs.DefaultMinDynamicPort + randomDynamicPort := rand.Intn(dynamicPortRange) + structs.DefaultMinDynamicPort alloc := Alloc() alloc.NodeID = n.ID // Set node IP address. - // alloc.Resources.Networks[0].IP = nodeIP - // alloc.TaskResources["web"].Networks[0].IP = nodeIP - // alloc.AllocatedResources.Tasks["web"].Networks[0].IP = nodeIP + alloc.Resources.Networks[0].IP = nodeIP + alloc.TaskResources["web"].Networks[0].IP = nodeIP + alloc.AllocatedResources.Tasks["web"].Networks[0].IP = nodeIP // Set dynamic port to a random value. - // alloc.TaskResources["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} - // alloc.AllocatedResources.Tasks["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} + alloc.TaskResources["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} + alloc.AllocatedResources.Tasks["web"].Networks[0].DynamicPorts = []structs.Port{{Label: "http", Value: randomDynamicPort}} return alloc diff --git a/nomad/mock/job.go b/nomad/mock/job.go index 18a6daaf57e..cd60cb181e4 100644 --- a/nomad/mock/job.go +++ b/nomad/mock/job.go @@ -474,12 +474,12 @@ func SystemJob() *structs.Job { Resources: &structs.Resources{ CPU: 500, MemoryMB: 256, - // Networks: []*structs.NetworkResource{ - // { - // MBits: 50, - // DynamicPorts: []structs.Port{{Label: "http"}}, - // }, - // }, + Networks: []*structs.NetworkResource{ + { + MBits: 50, + DynamicPorts: []structs.Port{{Label: "http"}}, + }, + }, }, LogConfig: structs.DefaultLogConfig(), }, diff --git a/scheduler/reconciler/reconcile_node_prop_test.go b/scheduler/reconciler/reconcile_node_prop_test.go index 48e14ec2fb6..ca94e17db00 100644 --- a/scheduler/reconciler/reconcile_node_prop_test.go +++ b/scheduler/reconciler/reconcile_node_prop_test.go @@ -101,7 +101,7 @@ func TestNodeReconciler_PropTest(t *testing.T) { nr := genNodeReconciler(structs.JobTypeSystem, &idGenerator{}).Draw(t, "input") n := NewNodeReconciler(nr.deployment) results := n.Compute(nr.job, nr.readyNodes, - nr.notReadyNodes, nr.taintedNodes, nil, nr.allocs, nr.terminal, + nr.notReadyNodes, nr.taintedNodes, nr.allocs, nr.terminal, nr.serverSupportsDisconnectedClients) must.NotNil(t, results, must.Sprint("results should never be nil")) perTaskGroup := collectExpectedAndResults(nr, results) @@ -113,7 +113,7 @@ func TestNodeReconciler_PropTest(t *testing.T) { nr := genNodeReconciler(structs.JobTypeSysBatch, &idGenerator{}).Draw(t, "input") n := NewNodeReconciler(nr.deployment) results := n.Compute(nr.job, nr.readyNodes, - nr.notReadyNodes, nr.taintedNodes, nil, nr.allocs, nr.terminal, + nr.notReadyNodes, nr.taintedNodes, nr.allocs, nr.terminal, nr.serverSupportsDisconnectedClients) must.NotNil(t, results, must.Sprint("results should never be nil")) perTaskGroup := collectExpectedAndResults(nr, results) diff --git a/scheduler/reconciler/reconcile_node_test.go b/scheduler/reconciler/reconcile_node_test.go index 926596d2c44..5c370576077 100644 --- a/scheduler/reconciler/reconcile_node_test.go +++ b/scheduler/reconciler/reconcile_node_test.go @@ -73,7 +73,7 @@ func TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) { } nr := NewNodeReconciler(nil) - diff := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, required, live, terminal, true) + diff := nr.computeForNode(job, "node1", eligible, nil, tainted, required, live, terminal, true) assertDiffCount(t, diffResultCount{ignore: 1, place: 1}, diff) if len(diff.Ignore) > 0 { @@ -96,7 +96,7 @@ func 
TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) { } nr := NewNodeReconciler(nil) - diff := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, required, live, terminal, true) + diff := nr.computeForNode(job, "node1", eligible, nil, tainted, required, live, terminal, true) assertDiffCount(t, diffResultCount{update: 1, place: 1}, diff) }) @@ -160,7 +160,7 @@ func TestDiffSystemAllocsForNode_Placements(t *testing.T) { nr := NewNodeReconciler(nil) diff := nr.computeForNode( job, tc.nodeID, eligible, nil, - tainted, nil, required, allocsForNode, terminal, true) + tainted, required, allocsForNode, terminal, true) assertDiffCount(t, tc.expected, diff) }) @@ -218,7 +218,7 @@ func TestDiffSystemAllocsForNode_Stops(t *testing.T) { nr := NewNodeReconciler(nil) diff := nr.computeForNode( - job, node.ID, eligible, nil, tainted, nil, required, allocs, terminal, true) + job, node.ID, eligible, nil, tainted, required, allocs, terminal, true) assertDiffCount(t, diffResultCount{ignore: 1, stop: 1, update: 1}, diff) if len(diff.Update) > 0 { @@ -288,7 +288,7 @@ func TestDiffSystemAllocsForNode_IneligibleNode(t *testing.T) { nr := NewNodeReconciler(nil) diff := nr.computeForNode( - job, tc.nodeID, eligible, ineligible, tainted, nil, + job, tc.nodeID, eligible, ineligible, tainted, required, []*structs.Allocation{alloc}, terminal, true, ) assertDiffCount(t, tc.expect, diff) @@ -346,7 +346,7 @@ func TestDiffSystemAllocsForNode_DrainingNode(t *testing.T) { nr := NewNodeReconciler(nil) diff := nr.computeForNode( job, drainNode.ID, map[string]*structs.Node{}, nil, - tainted, nil, required, allocs, terminal, true) + tainted, required, allocs, terminal, true) assertDiffCount(t, diffResultCount{migrate: 1, ignore: 1}, diff) if len(diff.Migrate) > 0 { @@ -398,7 +398,7 @@ func TestDiffSystemAllocsForNode_LostNode(t *testing.T) { nr := NewNodeReconciler(nil) diff := nr.computeForNode( job, deadNode.ID, map[string]*structs.Node{}, nil, - tainted, nil, required, allocs, terminal, true) + tainted, required, allocs, terminal, true) assertDiffCount(t, diffResultCount{lost: 2}, diff) if len(diff.Migrate) > 0 { @@ -523,7 +523,7 @@ func TestDiffSystemAllocsForNode_DisconnectedNode(t *testing.T) { nr := NewNodeReconciler(nil) got := nr.computeForNode( - job, tc.node.ID, eligibleNodes, nil, taintedNodes, nil, + job, tc.node.ID, eligibleNodes, nil, taintedNodes, required, []*structs.Allocation{alloc}, terminal, true, ) assertDiffCount(t, tc.expect, got) @@ -611,7 +611,7 @@ func TestDiffSystemAllocs(t *testing.T) { } nr := NewNodeReconciler(nil) - diff := nr.Compute(job, nodes, nil, tainted, nil, allocs, terminal, true) + diff := nr.Compute(job, nodes, nil, tainted, allocs, terminal, true) assertDiffCount(t, diffResultCount{ update: 1, ignore: 1, migrate: 1, lost: 1, place: 6}, diff) @@ -766,7 +766,7 @@ func TestNodeDeployments(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { nr := NewNodeReconciler(tc.existingDeployment) - nr.Compute(tc.job, nodes, nil, nil, nil, tc.liveAllocs, tc.terminalAllocs, true) + nr.Compute(tc.job, nodes, nil, nil, tc.liveAllocs, tc.terminalAllocs, true) if tc.newDeployment { must.NotNil(t, nr.DeploymentCurrent, must.Sprintf("expected a non-nil deployment")) must.Eq(t, nr.DeploymentCurrent.Status, tc.expectedNewDeploymentStatus) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 2abf2a7f66f..5f98c6215fa 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -319,42 +319,11 @@ func (s 
*SystemScheduler) computeJobAllocs() error { } // find feasible nodes for all the task groups - s.feasibleNodesForTG = s.findFeasibleNodesForTG(reconciliationResult.Update, s.ctx, sstructs.StatusAllocUpdating) - - fmt.Printf("found the following feasible nodes:\n") - for k, v := range s.feasibleNodesForTG { - for _, node := range v { - fmt.Printf("tg %s has nodes: %v\n", k, node.Node.ID) - } - fmt.Printf("in total, tg %s has %d feasible nodes\n", k, len(v)) - } - - // any further logic depends on whether we're canarying or not - isCanarying := map[string]bool{} - if s.job != nil && s.deployment != nil { - for _, tg := range s.job.TaskGroups { - dstate, ok := s.deployment.TaskGroups[tg.Name] - if !ok { - continue - } - // a system job is canarying if: - // - it has a non-empty update block (just a sanity check, all - // submitted jobs should have a non-empty update block as part of - // canonicalization) - // - canary parameter in the update block has to be positive - // - deployment has to be non-nil and it cannot have been promoted - // - this cannot be the initial job version - isCanarying[tg.Name] = !tg.Update.IsEmpty() && - tg.Update.Canary > 0 && - dstate != nil && - !dstate.Promoted && - s.job.Version != 0 - } - } + s.feasibleNodesForTG = s.findFeasibleNodesForTG(reconciliationResult) // Treat non in-place updates as an eviction and new placement, which will // be limited by max_parallel - s.limitReached = evictAndPlace(s.ctx, s.job, reconciliationResult, sstructs.StatusAllocUpdating) + s.limitReached = s.evictAndPlace(reconciliationResult, sstructs.StatusAllocUpdating) if !s.job.Stopped() { for _, tg := range s.job.TaskGroups { @@ -376,7 +345,7 @@ func (s *SystemScheduler) computeJobAllocs() error { return err } - // if there is not deployment we're done at this point + // if there is no deployment we're done at this point if s.deployment == nil { return nil } @@ -385,6 +354,7 @@ func (s *SystemScheduler) computeJobAllocs() error { // nodes, so for system jobs we do it backwards a bit: the "desired" total // is the total we were able to place. 
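// To make the "backwards" bookkeeping above concrete, an illustrative scenario
// (not taken from the patch): a single task group with update.canary = 20 and
// 10 feasible nodes ends up with a deployment state of
//
//	&structs.DeploymentState{
//		DesiredTotal:    10, // one allocation per feasible node
//		DesiredCanaries: 2,  // ceil(20% of 10 feasible nodes)
//	}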
// track if any of the task groups is doing a canary update now + deploymentComplete := true for _, tg := range s.job.TaskGroups { feasibleNodes, ok := s.feasibleNodesForTG[tg.Name] if !ok { @@ -394,51 +364,47 @@ func (s *SystemScheduler) computeJobAllocs() error { continue } + // we can set the desired total now, it's always the amount of all + // feasible nodes s.deployment.TaskGroups[tg.Name].DesiredTotal = len(feasibleNodes) + dstate, ok := s.deployment.TaskGroups[tg.Name] + if !ok { + continue + } + // a system job is canarying if: + // - it has a non-empty update block (just a sanity check, all + // submitted jobs should have a non-empty update block as part of + // canonicalization) + // - canary parameter in the update block has to be positive + // - deployment has to be non-nil and it cannot have been promoted + // - this cannot be the initial job version + isCanarying := !tg.Update.IsEmpty() && + tg.Update.Canary > 0 && + dstate != nil && + !dstate.Promoted && + s.job.Version != 0 + // if this TG isn't canarying, we're done - if !isCanarying[tg.Name] { + if !isCanarying { continue } + // we can now also set the desired canaries: it's the tg.Update.Canary + // percent of all the feasible nodes, rounded up to the nearest int + requiredCanaries := int(math.Ceil(float64(tg.Update.Canary) * float64(len(feasibleNodes)) / 100)) + s.deployment.TaskGroups[tg.Name].DesiredCanaries = requiredCanaries + // Initially, if the job requires canaries, we place all of them on // all eligible nodes. At this point we know which nodes are // feasible, so we evict unnedded canaries. - placedCanaries, err := s.evictUnneededCanaries( - len(s.feasibleNodesForTG[tg.Name]), - tg.Update.Canary, - ) + placedCanaries, err := s.evictUnneededCanaries(requiredCanaries) if err != nil { return fmt.Errorf("failed to evict canaries for job '%s': %v", s.eval.JobID, err) } - - s.deployment.TaskGroups[tg.Name].DesiredCanaries = len(placedCanaries) s.deployment.TaskGroups[tg.Name].PlacedCanaries = placedCanaries - for nodeID, allocs := range s.plan.NodeUpdate { - filtered := []*structs.Allocation{} - for _, a := range allocs { - if a.DesiredDescription == sstructs.StatusAllocNotNeeded { - filtered = append(filtered, a) - continue - } - - // we only keep allocs that are in the plan.NodeAllocation for - // this node - if _, ok := s.plan.NodeAllocation[a.NodeID]; ok { - filtered = append(filtered, a) - } - } - - s.plan.NodeUpdate[nodeID] = filtered - } - - } - - // check if the deployment is complete - deploymentComplete := true - for _, tg := range s.job.TaskGroups { - groupComplete := s.isDeploymentComplete(tg.Name, reconciliationResult, isCanarying[tg.Name]) + groupComplete := s.isDeploymentComplete(tg.Name, reconciliationResult, isCanarying) deploymentComplete = deploymentComplete && groupComplete } @@ -476,7 +442,7 @@ func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric { return acc } -func (s *SystemScheduler) findFeasibleNodesForTG(updates []reconciler.AllocTuple, ctx feasible.Context, desc string) map[string][]*feasible.RankedNode { +func (s *SystemScheduler) findFeasibleNodesForTG(buckets *reconciler.NodeReconcileResult) map[string][]*feasible.RankedNode { nodeByID := make(map[string]*structs.Node, len(s.nodes)) for _, node := range s.nodes { nodeByID[node.ID] = node @@ -485,10 +451,7 @@ func (s *SystemScheduler) findFeasibleNodesForTG(updates []reconciler.AllocTuple feasibleNodes := make(map[string][]*feasible.RankedNode) nodes := make([]*structs.Node, 1) - for _, a := range updates { - - 
// stop everything here - ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "") + for _, a := range slices.Concat(buckets.Place, buckets.Update) { tgName := a.TaskGroup.Name @@ -502,6 +465,14 @@ func (s *SystemScheduler) findFeasibleNodesForTG(updates []reconciler.AllocTuple nodes[0] = node s.stack.SetNodes(nodes) + if a.Alloc.ID != "" { + // temporarily include the old alloc from a destructive update so + // that we can account for resources that will be freed by that + // allocation. We'll back this change out if we end up needing to + // limit placements by max_parallel or canaries. + s.plan.AppendStoppedAlloc(a.Alloc, sstructs.StatusAllocUpdating, "", "") + } + // Attempt to match the task group option := s.stack.Select(a.TaskGroup, &feasible.SelectOptions{AllocName: a.Name}) if option == nil { @@ -542,34 +513,25 @@ func (s *SystemScheduler) computePlacements( deploymentID = s.deployment.ID } - for _, r := range reconcilerResult.Place { - fmt.Println(r.Name) - } - - nodes := make([]*structs.Node, 1) for _, missing := range reconcilerResult.Place { tgName := missing.TaskGroup.Name - var option *feasible.RankedNode - node, ok := nodeByID[missing.Alloc.NodeID] if !ok { s.logger.Debug("could not find node", "node", missing.Alloc.NodeID) continue } - // if we've already seen a feasible node for this tg, skip feasibility - // checks for it - // option = s.feasibleNodesForTG[tgName] - - // if option == nil { - // Update the set of placement nodes - nodes[0] = node - s.stack.SetNodes(nodes) - - // Attempt to match the task group - option = s.stack.Select(missing.TaskGroup, &feasible.SelectOptions{AllocName: missing.Name}) - // } + // we're already performed feasibility check for all the task groups and + // nodes, so look up + var option *feasible.RankedNode + optionsForTG := s.feasibleNodesForTG[tgName] + if existing := slices.IndexFunc( + optionsForTG, + func(rn *feasible.RankedNode) bool { return rn.Node == node }, + ); existing != -1 { + option = optionsForTG[existing] + } if option == nil { // If the task can't be placed on this node, update reporting data @@ -768,15 +730,15 @@ func (s *SystemScheduler) canHandle(trigger string) bool { // evictAndPlace is used to mark allocations for evicts and add them to the // placement queue. evictAndPlace modifies the reconciler result. It returns // true if the limit has been reached for any task group. -func evictAndPlace(ctx feasible.Context, job *structs.Job, reconciled *reconciler.NodeReconcileResult, desc string) bool { +func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResult, desc string) bool { limits := map[string]int{} // per task group limits - if !job.Stopped() { + if !s.job.Stopped() { jobLimit := len(reconciled.Update) - if job.Update.MaxParallel > 0 { - jobLimit = job.Update.MaxParallel + if s.job.Update.MaxParallel > 0 { + jobLimit = s.job.Update.MaxParallel } - for _, tg := range job.TaskGroups { + for _, tg := range s.job.TaskGroups { if tg.Update != nil && tg.Update.MaxParallel > 0 { limits[tg.Name] = tg.Update.MaxParallel } else { @@ -785,35 +747,50 @@ func evictAndPlace(ctx feasible.Context, job *structs.Job, reconciled *reconcile } } + // findFeasibleNodesForTG method marks all old allocs from a destructive + // update as stopped in order to get an accurate feasible nodes count + // (accounting for resources that would be freed). Now we need to remove all + // the allocs that have StatusAllocUpdating from NodeAllocation, and only + // place the ones that correspond to updates limited by max parallel. 
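// The loop below relies on Go's in-place filter idiom (the "nifty trick" mentioned
// elsewhere in this file): elements to keep are compacted to the front of the
// existing backing array and the slice is re-sliced to the new length, so no extra
// slice is allocated. A generic, illustrative helper (not part of the patch)
// showing the same idiom:
func filterAllocsInPlace(allocs []*structs.Allocation, keep func(*structs.Allocation) bool) []*structs.Allocation {
	n := 0
	for _, a := range allocs {
		if keep(a) {
			allocs[n] = a
			n++
		}
	}
	return allocs[:n]
}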
+ for node, allocations := range s.plan.NodeAllocation { + n := 0 + for _, alloc := range allocations { + if alloc.DesiredDescription == sstructs.StatusAllocUpdating { + allocations[n] = alloc + n += 1 + } + } + s.plan.NodeAllocation[node] = allocations[:n] + } + limited := false for _, a := range reconciled.Update { if limit := limits[a.Alloc.TaskGroup]; limit > 0 { - ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "") + s.ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "") reconciled.Place = append(reconciled.Place, a) + limits[a.Alloc.TaskGroup]-- } else { limited = true } } + return limited } // evictAndPlaceCanaries checks how many canaries are needed against the amount // of feasible nodes, and removes unnecessary placements from the plan. -func (s *SystemScheduler) evictUnneededCanaries(feasibleNodes int, canary int) ([]string, error) { +func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int) ([]string, error) { - desiredCanaries := []string{} // FIXME: make this better - - // calculate how many canary placement we expect each task group to have: it - // should be the tg.update.canary percentage of eligible nodes, rounded up - // to the nearest integer - requiredCanaries := int(math.Ceil(float64(canary) * float64(feasibleNodes) / 100)) + desiredCanaries := make([]string, requiredCanaries) // no canaries to consider, quit early if requiredCanaries == 0 { return desiredCanaries, nil } + canaryCounter := requiredCanaries + // iterate over node allocations to find canary allocs for node, allocations := range s.plan.NodeAllocation { n := 0 @@ -822,8 +799,8 @@ func (s *SystemScheduler) evictUnneededCanaries(feasibleNodes int, canary int) ( continue } if alloc.DeploymentStatus.Canary { - if requiredCanaries != 0 { - requiredCanaries -= 1 + if canaryCounter != 0 { + canaryCounter -= 1 desiredCanaries = append(desiredCanaries, alloc.ID) allocations[n] = alloc // we do this in order to avoid allocating another slice @@ -840,11 +817,6 @@ func (s *SystemScheduler) evictUnneededCanaries(feasibleNodes int, canary int) ( func (s *SystemScheduler) isDeploymentComplete(groupName string, buckets *reconciler.NodeReconcileResult, isCanarying bool) bool { complete := len(buckets.Place)+len(buckets.Migrate)+len(buckets.Update) == 0 - // fmt.Printf("complete? %v\n", complete) - // fmt.Printf("buckets.Place: %v buckets.Migrate: %v buckets.Update: %v\n", len(buckets.Place), len(buckets.Migrate), len(buckets.Update)) - // fmt.Printf("s.deployment == nil? %v\n", s.deployment == nil) - // fmt.Printf("isCanarying? 
%v\n", isCanarying) - if !complete || s.deployment == nil || isCanarying { return false } diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index cef49c1daa4..f2a5bf0508d 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -3569,14 +3569,13 @@ func TestSystemSched_UpdateBlock(t *testing.T) { tg1: {DesiredTotal: 10, PlacedAllocs: 10}, tg2: {DesiredTotal: 10, PlacedAllocs: 10}, }, - expectAllocs: map[string]int{tg1: 3, tg2: 5}, - expectStop: map[string]int{tg1: 3, tg2: 5}, + expectAllocs: map[string]int{tg1: 2, tg2: 5}, + expectStop: map[string]int{tg1: 2, tg2: 5}, expectDState: map[string]*structs.DeploymentState{ tg1: { DesiredTotal: 10, DesiredCanaries: 3, - // PlacedCanaries: []string{"0", "1", "2"}, - PlacedAllocs: 2, + PlacedAllocs: 2, }, tg2: {DesiredTotal: 10, PlacedAllocs: 5}, }, From f96bff6318cc7189b89ac3f995351c8f9d2ad67f Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 29 Oct 2025 09:16:34 -0400 Subject: [PATCH 05/24] system deployment tests: fix and annotate counts (#27006) Fix some previously broken counts in the system deployments test and add comments to most of the placement counts to make it a little easier to read the test and verify it's correct. --- scheduler/scheduler_system_test.go | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index f2a5bf0508d..97d4366818b 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -3473,8 +3473,8 @@ func TestSystemSched_UpdateBlock(t *testing.T) { expectAllocs: map[string]int{tg1: 2, tg2: 3}, expectStop: map[string]int{tg1: 2, tg2: 3}, expectDState: map[string]*structs.DeploymentState{ - tg1: {DesiredTotal: 10, PlacedAllocs: 4}, - tg2: {DesiredTotal: 10, PlacedAllocs: 6}, + tg1: {DesiredTotal: 10, PlacedAllocs: 4}, // 2 previous + 2 destructive + tg2: {DesiredTotal: 10, PlacedAllocs: 6}, // 3 previous + 3 destructive }, }, @@ -3547,8 +3547,8 @@ func TestSystemSched_UpdateBlock(t *testing.T) { expectAllocs: map[string]int{tg1: 7, tg2: 10}, expectStop: map[string]int{tg1: 2, tg2: 3}, expectDState: map[string]*structs.DeploymentState{ - tg1: {DesiredTotal: 10, PlacedAllocs: 7}, - tg2: {DesiredTotal: 10, PlacedAllocs: 10}, + tg1: {DesiredTotal: 10, PlacedAllocs: 7}, // 2 destructive + 5 new + tg2: {DesiredTotal: 10, PlacedAllocs: 10}, // 3 destructive + 7 new }, }, @@ -3563,11 +3563,11 @@ func TestSystemSched_UpdateBlock(t *testing.T) { }, existingPrevious: map[string][]int{ tg1: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - tg2: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + tg2: {0, 1, 2, 3, 4, 5, 6, 7}, // only 8 were previously eligible }, existingOldDState: map[string]*structs.DeploymentState{ tg1: {DesiredTotal: 10, PlacedAllocs: 10}, - tg2: {DesiredTotal: 10, PlacedAllocs: 10}, + tg2: {DesiredTotal: 8, PlacedAllocs: 8}, }, expectAllocs: map[string]int{tg1: 2, tg2: 5}, expectStop: map[string]int{tg1: 2, tg2: 5}, @@ -3575,9 +3575,13 @@ func TestSystemSched_UpdateBlock(t *testing.T) { tg1: { DesiredTotal: 10, DesiredCanaries: 3, - PlacedAllocs: 2, + PlacedCanaries: []string{"0", "1"}, + PlacedAllocs: 2, // want 3 canaries, limited by max_parallel + }, + tg2: { + DesiredTotal: 10, + PlacedAllocs: 7, // 2 new + 5 destructive updates }, - tg2: {DesiredTotal: 10, PlacedAllocs: 5}, }, }, @@ -3623,7 +3627,7 @@ func TestSystemSched_UpdateBlock(t *testing.T) { DesiredTotal: 10, DesiredCanaries: 3, PlacedCanaries: []string{"7", "8", 
"9"}, - PlacedAllocs: 5, + PlacedAllocs: 5, // 2 failed + 2 replacements + 1 new }, tg2: {DesiredTotal: 10, PlacedAllocs: 10}, }, @@ -3668,7 +3672,7 @@ func TestSystemSched_UpdateBlock(t *testing.T) { DesiredTotal: 10, DesiredCanaries: 3, PlacedCanaries: []string{"7", "8", "9"}, - PlacedAllocs: 3, + PlacedAllocs: 3, // 1 existing canary + 2 new canaries }, tg2: {DesiredTotal: 10, PlacedAllocs: 10}, }, @@ -3713,7 +3717,7 @@ func TestSystemSched_UpdateBlock(t *testing.T) { DesiredTotal: 10, DesiredCanaries: 3, PlacedCanaries: []string{"7", "8", "9"}, - PlacedAllocs: 3, + PlacedAllocs: 3, // unchanged because we're not promoted }, tg2: {DesiredTotal: 10, PlacedAllocs: 10}, }, @@ -3755,7 +3759,7 @@ func TestSystemSched_UpdateBlock(t *testing.T) { DesiredTotal: 10, DesiredCanaries: 3, PlacedCanaries: []string{"7", "8", "9"}, - PlacedAllocs: 5, + PlacedAllocs: 5, // 3 running canaries + 2 new limited by max_parallel }, tg2: {DesiredTotal: 10, PlacedAllocs: 10}, }, From 19e5a8bfc4be8871eab2e3a39007c655ae7925c6 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 29 Oct 2025 15:31:41 +0100 Subject: [PATCH 06/24] system scheduler: fixes to computeJobAllocs --- scheduler/scheduler_system.go | 73 +++++++------- scheduler/scheduler_system_test.go | 151 ++++++++++++++++++++++++++++- 2 files changed, 187 insertions(+), 37 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 5f98c6215fa..227f61e455a 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -364,6 +364,11 @@ func (s *SystemScheduler) computeJobAllocs() error { continue } + // no deployment for this TG + if _, ok := s.deployment.TaskGroups[tg.Name]; !ok { + continue + } + // we can set the desired total now, it's always the amount of all // feasible nodes s.deployment.TaskGroups[tg.Name].DesiredTotal = len(feasibleNodes) @@ -385,24 +390,18 @@ func (s *SystemScheduler) computeJobAllocs() error { !dstate.Promoted && s.job.Version != 0 - // if this TG isn't canarying, we're done - if !isCanarying { - continue - } + if isCanarying { + // we can now also set the desired canaries: it's the tg.Update.Canary + // percent of all the feasible nodes, rounded up to the nearest int + requiredCanaries := int(math.Ceil(float64(tg.Update.Canary) * float64(len(feasibleNodes)) / 100)) + s.deployment.TaskGroups[tg.Name].DesiredCanaries = requiredCanaries - // we can now also set the desired canaries: it's the tg.Update.Canary - // percent of all the feasible nodes, rounded up to the nearest int - requiredCanaries := int(math.Ceil(float64(tg.Update.Canary) * float64(len(feasibleNodes)) / 100)) - s.deployment.TaskGroups[tg.Name].DesiredCanaries = requiredCanaries - - // Initially, if the job requires canaries, we place all of them on - // all eligible nodes. At this point we know which nodes are - // feasible, so we evict unnedded canaries. - placedCanaries, err := s.evictUnneededCanaries(requiredCanaries) - if err != nil { - return fmt.Errorf("failed to evict canaries for job '%s': %v", s.eval.JobID, err) + // Initially, if the job requires canaries, we place all of them on + // all eligible nodes. At this point we know which nodes are + // feasible, so we evict unnedded canaries. 
+ placedCanaries := s.evictUnneededCanaries(requiredCanaries, tg.Name) + s.deployment.TaskGroups[tg.Name].PlacedCanaries = placedCanaries } - s.deployment.TaskGroups[tg.Name].PlacedCanaries = placedCanaries groupComplete := s.isDeploymentComplete(tg.Name, reconciliationResult, isCanarying) deploymentComplete = deploymentComplete && groupComplete @@ -451,7 +450,7 @@ func (s *SystemScheduler) findFeasibleNodesForTG(buckets *reconciler.NodeReconci feasibleNodes := make(map[string][]*feasible.RankedNode) nodes := make([]*structs.Node, 1) - for _, a := range slices.Concat(buckets.Place, buckets.Update) { + for _, a := range slices.Concat(buckets.Place, buckets.Update, buckets.Ignore) { tgName := a.TaskGroup.Name @@ -482,13 +481,11 @@ func (s *SystemScheduler) findFeasibleNodesForTG(buckets *reconciler.NodeReconci // count this node as feasible if feasibleNodes[tgName] == nil { feasibleNodes[tgName] = []*feasible.RankedNode{option} - } else { - if !slices.ContainsFunc( - feasibleNodes[tgName], - func(rn *feasible.RankedNode) bool { return rn.Node.ID == option.Node.ID }, - ) { - feasibleNodes[tgName] = append(feasibleNodes[tgName], option) - } + } else if !slices.ContainsFunc( + feasibleNodes[tgName], + func(rn *feasible.RankedNode) bool { return rn.Node.ID == option.Node.ID }, + ) { + feasibleNodes[tgName] = append(feasibleNodes[tgName], option) } } @@ -752,15 +749,15 @@ func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResu // (accounting for resources that would be freed). Now we need to remove all // the allocs that have StatusAllocUpdating from NodeAllocation, and only // place the ones that correspond to updates limited by max parallel. - for node, allocations := range s.plan.NodeAllocation { + for node, allocations := range s.plan.NodeUpdate { n := 0 for _, alloc := range allocations { - if alloc.DesiredDescription == sstructs.StatusAllocUpdating { + if alloc.DesiredDescription != sstructs.StatusAllocUpdating { allocations[n] = alloc n += 1 } } - s.plan.NodeAllocation[node] = allocations[:n] + s.plan.NodeUpdate[node] = allocations[:n] } limited := false @@ -780,13 +777,13 @@ func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResu // evictAndPlaceCanaries checks how many canaries are needed against the amount // of feasible nodes, and removes unnecessary placements from the plan. 
-func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int) ([]string, error) { +func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int, tgName string) []string { - desiredCanaries := make([]string, requiredCanaries) + desiredCanaries := make([]string, 0) // no canaries to consider, quit early if requiredCanaries == 0 { - return desiredCanaries, nil + return desiredCanaries } canaryCounter := requiredCanaries @@ -795,23 +792,31 @@ func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int) ([]string, for node, allocations := range s.plan.NodeAllocation { n := 0 for _, alloc := range allocations { - if alloc.DeploymentStatus == nil { + + // these are the allocs we keep + if alloc.DeploymentStatus == nil || !alloc.DeploymentStatus.Canary || alloc.TaskGroup != tgName { + allocations[n] = alloc + n += 1 continue } + + // if it's a canary, we only keep up to desiredCanaries amount of + // them if alloc.DeploymentStatus.Canary { if canaryCounter != 0 { canaryCounter -= 1 - desiredCanaries = append(desiredCanaries, alloc.ID) - allocations[n] = alloc // we do this in order to avoid allocating another slice + + allocations[n] = alloc n += 1 } } } + // because of this nifty trick we don't need to allocate an extra slice s.plan.NodeAllocation[node] = allocations[:n] } - return desiredCanaries, nil + return desiredCanaries } func (s *SystemScheduler) isDeploymentComplete(groupName string, buckets *reconciler.NodeReconcileResult, isCanarying bool) bool { diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index 97d4366818b..f62151ab95b 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -3352,7 +3352,14 @@ func TestEvictAndPlace(t *testing.T) { diff := &reconciler.NodeReconcileResult{Update: allocs} _, ctx := feasible.MockContext(t) - must.Eq(t, tc.expectLimited, evictAndPlace(ctx, job, diff, ""), + s := SystemScheduler{ctx: ctx, job: job, plan: &structs.Plan{ + EvalID: uuid.Generate(), + NodeUpdate: make(map[string][]*structs.Allocation), + NodeAllocation: make(map[string][]*structs.Allocation), + NodePreemptions: make(map[string][]*structs.Allocation), + }} + + must.Eq(t, tc.expectLimited, s.evictAndPlace(diff, ""), must.Sprintf("limited")) must.Len(t, tc.expectPlace, diff.Place, must.Sprintf( "evictAndReplace() didn't insert into diffResult properly: %v", diff.Place)) @@ -3569,7 +3576,7 @@ func TestSystemSched_UpdateBlock(t *testing.T) { tg1: {DesiredTotal: 10, PlacedAllocs: 10}, tg2: {DesiredTotal: 8, PlacedAllocs: 8}, }, - expectAllocs: map[string]int{tg1: 2, tg2: 5}, + expectAllocs: map[string]int{tg1: 2, tg2: 7}, expectStop: map[string]int{tg1: 2, tg2: 5}, expectDState: map[string]*structs.DeploymentState{ tg1: { @@ -3671,7 +3678,7 @@ func TestSystemSched_UpdateBlock(t *testing.T) { tg1: { DesiredTotal: 10, DesiredCanaries: 3, - PlacedCanaries: []string{"7", "8", "9"}, + PlacedCanaries: []string{"8", "9"}, PlacedAllocs: 3, // 1 existing canary + 2 new canaries }, tg2: {DesiredTotal: 10, PlacedAllocs: 10}, @@ -3958,3 +3965,141 @@ func TestSystemSched_UpdateBlock(t *testing.T) { } } + +func TestSystemSched_evictUnneededCanaries(t *testing.T) { + tests := []struct { + name string + requiredCanaries int + tgName string + nodeAllocation map[string][]*structs.Allocation + expectedDesiredCanaries []string + expectedNodeAllocation map[string][]*structs.Allocation + }{ + { + name: "no required canaries", + requiredCanaries: 0, + tgName: "foo", + nodeAllocation: nil, + expectedDesiredCanaries: 
[]string{}, + expectedNodeAllocation: nil, + }, + { + name: "existing allocs for 2 task groups: tg1 with no canaries, tg2 with canaries, calling for tg1", + requiredCanaries: 1, + tgName: "tg1", + nodeAllocation: map[string][]*structs.Allocation{ + "node1": { + { + ID: "tg1_alloc1", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, + TaskGroup: "tg1", + }, + { + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + "node2": { + { + ID: "tg1_alloc2", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, + TaskGroup: "tg1", + }, + { + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + }, + expectedDesiredCanaries: []string{}, + expectedNodeAllocation: map[string][]*structs.Allocation{ + "node1": { + { + ID: "tg1_alloc1", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, + TaskGroup: "tg1", + }, + { + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + "node2": { + { + ID: "tg1_alloc2", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, + TaskGroup: "tg1", + }, + { + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + }, + }, + { + name: "existing allocs for 2 task groups with canaries", + requiredCanaries: 1, + tgName: "tg1", + nodeAllocation: map[string][]*structs.Allocation{ + "node1": { + { + ID: "tg1_alloc1", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg1", + }, + { + ID: "tg2_alloc1", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + "node2": { + { + ID: "tg1_alloc2", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg1", + }, + { + ID: "tg2_alloc2", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + }, + expectedDesiredCanaries: []string{"tg1_alloc1"}, + expectedNodeAllocation: map[string][]*structs.Allocation{ + "node1": { + { + ID: "tg1_alloc1", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg1", + }, + { + ID: "tg2_alloc1", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + "node2": { + { + ID: "tg2_alloc2", + DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, + TaskGroup: "tg2", + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := SystemScheduler{ + plan: mock.Plan(), + } + s.plan.NodeAllocation = tt.nodeAllocation + + must.Eq(t, tt.expectedDesiredCanaries, s.evictUnneededCanaries(tt.requiredCanaries, tt.tgName), must.Sprint("unexpected desired canaries")) + must.Eq(t, tt.expectedNodeAllocation, s.plan.NodeAllocation, must.Sprintf("unexpected node allocation")) + }) + } +} From 21c770879ee5efd580f883fa734bd2ee420a3d06 Mon Sep 17 00:00:00 2001 From: Chris Roberts Date: Wed, 29 Oct 2025 18:13:01 -0700 Subject: [PATCH 07/24] system scheduler: evictUnneededCanaries fixes --- scheduler/reconciler/reconcile_node.go | 7 ++- scheduler/scheduler_system.go | 59 +++++++++++++++++++++++--- scheduler/scheduler_system_test.go | 4 +- 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go index 8f7778738d5..0366b1d849c 100644 --- a/scheduler/reconciler/reconcile_node.go +++ b/scheduler/reconciler/reconcile_node.go @@ -315,7 +315,6 @@ func (nr *NodeReconciler) computeForNode( // 
Scan the required groups for name, tg := range required { - // populate deployment state for this task group var dstate = new(structs.DeploymentState) var existingDeployment bool @@ -334,7 +333,6 @@ func (nr *NodeReconciler) computeForNode( // Check for an existing allocation if _, ok := existing[name]; !ok { - // Check for a terminal sysbatch allocation, which should be not placed // again unless the job has been updated. if job.Type == structs.JobTypeSysBatch { @@ -379,6 +377,11 @@ func (nr *NodeReconciler) computeForNode( Alloc: termOnNode, } + // If the terminal allocation was a canary, mark it as such. + if termOnNode != nil && termOnNode.DeploymentStatus != nil && termOnNode.DeploymentStatus.Canary { + allocTuple.Canary = true + } + // If the new allocation isn't annotated with a previous allocation // or if the previous allocation isn't from the same node then we // annotate the allocTuple with a new Allocation diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 227f61e455a..8f7109c03c2 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -399,7 +399,7 @@ func (s *SystemScheduler) computeJobAllocs() error { // Initially, if the job requires canaries, we place all of them on // all eligible nodes. At this point we know which nodes are // feasible, so we evict unnedded canaries. - placedCanaries := s.evictUnneededCanaries(requiredCanaries, tg.Name) + placedCanaries := s.evictUnneededCanaries(requiredCanaries, tg.Name, reconciliationResult) s.deployment.TaskGroups[tg.Name].PlacedCanaries = placedCanaries } @@ -777,7 +777,7 @@ func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResu // evictAndPlaceCanaries checks how many canaries are needed against the amount // of feasible nodes, and removes unnecessary placements from the plan. -func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int, tgName string) []string { +func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int, tgName string, buckets *reconciler.NodeReconcileResult) []string { desiredCanaries := make([]string, 0) @@ -788,11 +788,45 @@ func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int, tgName str canaryCounter := requiredCanaries + // Start with finding any existing failed canaries + failedCanaries := map[string]struct{}{} + for _, alloc := range buckets.Place { + if alloc.Alloc != nil && alloc.Alloc.DeploymentStatus != nil && alloc.Alloc.DeploymentStatus.Canary { + failedCanaries[alloc.Alloc.ID] = struct{}{} + } + } + + // Generate a list of preferred allocations for + // canaries. These are existing canary applications + // that are failed. + preferCanary := map[string]struct{}{} + for _, allocations := range s.plan.NodeAllocation { + for _, alloc := range allocations { + if _, ok := failedCanaries[alloc.PreviousAllocation]; ok { + preferCanary[alloc.ID] = struct{}{} + } + } + } + + // Remove the number of preferred canaries found + // from the counter. + canaryCounter -= len(preferCanary) + + // Check for any canaries that are already running. For any + // that are found, add to the desired list and decrement + // the counter. 
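// failedCanaries and preferCanary above use the common Go "set" idiom: a
// map[string]struct{} whose empty values cost nothing, used purely for membership
// tests. A small illustrative helper (not part of the patch) building such a set
// from allocation IDs:
func idSet(ids []string) map[string]struct{} {
	set := make(map[string]struct{}, len(ids))
	for _, id := range ids {
		set[id] = struct{}{}
	}
	return set
}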
+ for _, tuple := range buckets.Ignore { + if tuple.TaskGroup.Name == tgName && tuple.Alloc != nil && + tuple.Alloc.DeploymentStatus != nil && tuple.Alloc.DeploymentStatus.Canary { + desiredCanaries = append(desiredCanaries, tuple.Alloc.ID) + canaryCounter-- + } + } + // iterate over node allocations to find canary allocs for node, allocations := range s.plan.NodeAllocation { n := 0 for _, alloc := range allocations { - // these are the allocs we keep if alloc.DeploymentStatus == nil || !alloc.DeploymentStatus.Canary || alloc.TaskGroup != tgName { allocations[n] = alloc @@ -803,15 +837,30 @@ func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int, tgName str // if it's a canary, we only keep up to desiredCanaries amount of // them if alloc.DeploymentStatus.Canary { - if canaryCounter != 0 { + // Check if this is a preferred allocation for the canary + _, preferred := preferCanary[alloc.ID] + + // If it is a preferred allocation, or the counter is not exhausted, + // keep the allocation + if canaryCounter > 0 || preferred { canaryCounter -= 1 desiredCanaries = append(desiredCanaries, alloc.ID) - allocations[n] = alloc n += 1 + } else { + // If the counter has been exhausted the allocation will not be + // placed, but a stop will have been appended for the update. + // Locate it and remove it. + idx := slices.IndexFunc(s.plan.NodeUpdate[alloc.NodeID], func(a *structs.Allocation) bool { + return a.ID == alloc.PreviousAllocation + }) + if idx > -1 { + s.plan.NodeUpdate[alloc.NodeID] = append(s.plan.NodeUpdate[alloc.NodeID][0:idx], s.plan.NodeUpdate[alloc.NodeID][idx+1:]...) + } } } } + // because of this nifty trick we don't need to allocate an extra slice s.plan.NodeAllocation[node] = allocations[:n] } diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index f62151ab95b..e2e9d565875 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -3678,7 +3678,7 @@ func TestSystemSched_UpdateBlock(t *testing.T) { tg1: { DesiredTotal: 10, DesiredCanaries: 3, - PlacedCanaries: []string{"8", "9"}, + PlacedCanaries: []string{"7", "8", "9"}, PlacedAllocs: 3, // 1 existing canary + 2 new canaries }, tg2: {DesiredTotal: 10, PlacedAllocs: 10}, @@ -4098,7 +4098,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { } s.plan.NodeAllocation = tt.nodeAllocation - must.Eq(t, tt.expectedDesiredCanaries, s.evictUnneededCanaries(tt.requiredCanaries, tt.tgName), must.Sprint("unexpected desired canaries")) + must.Eq(t, tt.expectedDesiredCanaries, s.evictUnneededCanaries(tt.requiredCanaries, tt.tgName, &reconciler.NodeReconcileResult{}), must.Sprint("unexpected desired canaries")) must.Eq(t, tt.expectedNodeAllocation, s.plan.NodeAllocation, must.Sprintf("unexpected node allocation")) }) } From a60950bfd13b56f2655b7b796e1765fdf1c55bf4 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 30 Oct 2025 09:04:12 +0100 Subject: [PATCH 08/24] system scheduler: unit test fixes --- scheduler/scheduler_system.go | 12 ++++---- scheduler/scheduler_system_test.go | 46 +++++++++++------------------- 2 files changed, 22 insertions(+), 36 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 8f7109c03c2..54f0a2db56e 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -481,12 +481,12 @@ func (s *SystemScheduler) findFeasibleNodesForTG(buckets *reconciler.NodeReconci // count this node as feasible if feasibleNodes[tgName] == 
nil { feasibleNodes[tgName] = []*feasible.RankedNode{option} - } else if !slices.ContainsFunc( - feasibleNodes[tgName], - func(rn *feasible.RankedNode) bool { return rn.Node.ID == option.Node.ID }, - ) { + } else { feasibleNodes[tgName] = append(feasibleNodes[tgName], option) } + + // update the plan annotations for this task group + s.planAnnotations.DesiredTGUpdates[tgName].Place = uint64(len(feasibleNodes[tgName])) } return feasibleNodes @@ -747,8 +747,8 @@ func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResu // findFeasibleNodesForTG method marks all old allocs from a destructive // update as stopped in order to get an accurate feasible nodes count // (accounting for resources that would be freed). Now we need to remove all - // the allocs that have StatusAllocUpdating from NodeAllocation, and only - // place the ones that correspond to updates limited by max parallel. + // the allocs that have StatusAllocUpdating from the set we're stopping (NodeUpdate) and + // only place and stop the ones the ones that correspond to updates limited by max parallel. for node, allocations := range s.plan.NodeUpdate { n := 0 for _, alloc := range allocations { diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index e2e9d565875..53b91989262 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -23,6 +23,7 @@ import ( "github.com/hashicorp/nomad/scheduler/tests" "github.com/shoenig/test" "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" ) func TestSystemSched_JobRegister(t *testing.T) { @@ -330,7 +331,7 @@ func TestSystemSched_JobRegister_Annotate(t *testing.T) { h := tests.NewHarness(t) // Create some nodes - for i := 0; i < 10; i++ { + for i := range 10 { node := mock.Node() if i < 9 { node.NodeClass = "foo" @@ -364,15 +365,10 @@ func TestSystemSched_JobRegister_Annotate(t *testing.T) { must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) // Process the evaluation - err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + must.NoError(t, h.Process(NewSystemScheduler, eval)) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + must.SliceLen(t, 1, h.Plans) plan := h.Plans[0] // Ensure the plan allocated @@ -380,9 +376,7 @@ func TestSystemSched_JobRegister_Annotate(t *testing.T) { for _, allocList := range plan.NodeAllocation { planned = append(planned, allocList...) } - if len(planned) != 9 { - t.Fatalf("bad: %#v %d", planned, len(planned)) - } + must.SliceLen(t, 9, planned) // Lookup the allocations by JobID ws := memdb.NewWatchSet() @@ -390,9 +384,7 @@ func TestSystemSched_JobRegister_Annotate(t *testing.T) { must.NoError(t, err) // Ensure all allocations placed - if len(out) != 9 { - t.Fatalf("bad: %#v", out) - } + must.SliceLen(t, 9, out) // Check the available nodes if count, ok := out[0].Metrics.NodesAvailable["dc1"]; !ok || count != 10 { @@ -404,23 +396,14 @@ func TestSystemSched_JobRegister_Annotate(t *testing.T) { h.AssertEvalStatus(t, structs.EvalStatusComplete) // Ensure the plan had annotations. 
- if plan.Annotations == nil { - t.Fatalf("expected annotations") - } + must.NotNil(t, plan.Annotations) desiredTGs := plan.Annotations.DesiredTGUpdates - if l := len(desiredTGs); l != 1 { - t.Fatalf("incorrect number of task groups; got %v; want %v", l, 1) - } + must.MapLen(t, 1, desiredTGs, must.Sprint("incorrect number of task groups")) desiredChanges, ok := desiredTGs["web"] - if !ok { - t.Fatalf("expected task group web to have desired changes") - } - - expected := &structs.DesiredUpdates{Place: 9} - must.Eq(t, desiredChanges, expected) - + must.True(t, ok, must.Sprint("expected task group web to have desired changes")) + must.Eq(t, 9, desiredChanges.Place) } func TestSystemSched_JobRegister_AddNode(t *testing.T) { @@ -1771,7 +1754,10 @@ func TestSystemSched_ConstraintErrors(t *testing.T) { // QueuedAllocations is drained val, ok := h.Evals[0].QueuedAllocations["web"] must.True(t, ok) - must.Eq(t, 0, val) + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + return val == 0 + }))) // The plan has two NodeAllocations must.Eq(t, 1, len(h.Plans)) @@ -3717,8 +3703,8 @@ func TestSystemSched_UpdateBlock(t *testing.T) { }, tg2: {DesiredTotal: 10, PlacedAllocs: 10, HealthyAllocs: 10}, }, - expectAllocs: nil, - expectStop: nil, + expectAllocs: map[string]int{}, + expectStop: map[string]int{}, expectDState: map[string]*structs.DeploymentState{ tg1: { DesiredTotal: 10, From 8a1a40258fe09e8b6e8a07f403ddd7e554c417c4 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:53:28 +0100 Subject: [PATCH 09/24] scheduler: separate the sysbatch scheduler from non-batch system scheduler (#27016) --- scheduler/scheduler_sysbatch.go | 546 +++++++++++++++++++++++++++ scheduler/scheduler_sysbatch_test.go | 6 +- scheduler/scheduler_system.go | 74 +--- scheduler/scheduler_system_test.go | 10 +- scheduler/util.go | 24 ++ 5 files changed, 592 insertions(+), 68 deletions(-) create mode 100644 scheduler/scheduler_sysbatch.go diff --git a/scheduler/scheduler_sysbatch.go b/scheduler/scheduler_sysbatch.go new file mode 100644 index 00000000000..c8c9be7f3d3 --- /dev/null +++ b/scheduler/scheduler_sysbatch.go @@ -0,0 +1,546 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package scheduler + +import ( + "fmt" + "runtime/debug" + + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/scheduler/feasible" + "github.com/hashicorp/nomad/scheduler/reconciler" + sstructs "github.com/hashicorp/nomad/scheduler/structs" +) + +const ( + // maxSysBatchScheduleAttempts is used to limit the number of times we will + // attempt to schedule if we continue to hit conflicts for sysbatch jobs. + maxSysBatchScheduleAttempts = 2 +) + +// SysBatchScheduler is used for 'sysbatch' jobs. This scheduler is designed for +// jobs that should be run on every client. The 'system' mode will ensure those +// jobs continuously run regardless of successful task exits, whereas 'sysbatch' +// considers the task complete on success. 
+type SysBatchScheduler struct { + logger log.Logger + eventsCh chan<- interface{} + state sstructs.State + planner sstructs.Planner + + eval *structs.Evaluation + job *structs.Job + plan *structs.Plan + planResult *structs.PlanResult + ctx *feasible.EvalContext + stack *feasible.SystemStack + + nodes []*structs.Node + notReadyNodes map[string]struct{} + nodesByDC map[string]int + + limitReached bool + + failedTGAllocs map[string]*structs.AllocMetric + queuedAllocs map[string]int + planAnnotations *structs.PlanAnnotations +} + +func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state sstructs.State, planner sstructs.Planner) sstructs.Scheduler { + return &SysBatchScheduler{ + logger: logger.Named("sysbatch_sched"), + eventsCh: eventsCh, + state: state, + planner: planner, + } +} + +// Process is used to handle a single evaluation. +func (s *SysBatchScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + s.logger.Error("processing eval panicked scheduler - please report this as a bug!", "eval_id", eval.ID, "error", r, "stack_trace", string(debug.Stack())) + err = fmt.Errorf("failed to process eval: %v", r) + } + }() + + // Store the evaluation + s.eval = eval + + // Update our logger with the eval's information + s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace) + + // Verify the evaluation trigger reason is understood + if !s.canHandle(eval.TriggeredBy) { + desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) + return setStatus(s.logger, s.planner, s.eval, nil, nil, + s.failedTGAllocs, s.planAnnotations, structs.EvalStatusFailed, desc, + s.queuedAllocs, "") + } + + limit := maxSysBatchScheduleAttempts + + // Retry up to the maxSystemScheduleAttempts and reset if progress is made. + progress := func() bool { return progressMade(s.planResult) } + if err := retryMax(limit, s.process, progress); err != nil { + if statusErr, ok := err.(*SetStatusError); ok { + return setStatus(s.logger, s.planner, s.eval, nil, nil, + s.failedTGAllocs, s.planAnnotations, statusErr.EvalStatus, err.Error(), + s.queuedAllocs, "") + } + return err + } + + // Update the status to complete + return setStatus(s.logger, s.planner, s.eval, nil, nil, + s.failedTGAllocs, s.planAnnotations, structs.EvalStatusComplete, "", + s.queuedAllocs, "") +} + +// process is wrapped in retryMax to iteratively run the handler until we have no +// further work or we've made the maximum number of attempts. 
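Editorial aside: retryMax and the progressMade closure used by Process above are referenced but not shown in this patch. The sketch below only illustrates the contract they are assumed to follow (illustrative names, not the scheduler/util.go implementation; assumes "fmt" is imported):

    // retrySketch runs cb up to max times and resets the attempt budget whenever
    // the previous plan submission made progress, so a scheduler that keeps
    // placing work is not cut off early.
    func retrySketch(max int, cb func() (bool, error), madeProgress func() bool) error {
        attempts := 0
        for attempts < max {
            done, err := cb()
            if err != nil {
                return err
            }
            if done {
                return nil
            }
            if madeProgress != nil && madeProgress() {
                attempts = 0 // progress resets the budget
            } else {
                attempts++
            }
        }
        return fmt.Errorf("maximum attempts reached (%d)", max)
    }

Process wires this up with maxSysBatchScheduleAttempts as the limit and progressMade(s.planResult) as the reset condition.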
+func (s *SysBatchScheduler) process() (bool, error) { + // Lookup the Job by ID + var err error + ws := memdb.NewWatchSet() + s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID) + if err != nil { + return false, fmt.Errorf("failed to get job '%s': %v", s.eval.JobID, err) + } + + numTaskGroups := 0 + if !s.job.Stopped() { + numTaskGroups = len(s.job.TaskGroups) + } + s.queuedAllocs = make(map[string]int, numTaskGroups) + + // Get the ready nodes in the required datacenters + if !s.job.Stopped() { + s.nodes, s.notReadyNodes, s.nodesByDC, err = readyNodesInDCsAndPool( + s.state, s.job.Datacenters, s.job.NodePool) + if err != nil { + return false, fmt.Errorf("failed to get ready nodes: %v", err) + } + } + + // Create a plan + s.plan = s.eval.MakePlan(s.job) + + // Reset the failed allocations + s.failedTGAllocs = nil + + // Create an evaluation context + s.ctx = feasible.NewEvalContext(s.eventsCh, s.state, s.plan, s.logger) + + // Construct the placement stack + s.stack = feasible.NewSystemStack(true, s.ctx) + if !s.job.Stopped() { + s.setJob(s.job) + } + + // Compute the target job allocations + if err := s.computeJobAllocs(); err != nil { + s.logger.Error("failed to compute job allocations", "error", err) + return false, err + } + + // If the plan is a no-op, we can bail. If AnnotatePlan is set submit the plan + // anyways to get the annotations. + if s.plan.IsNoOp() && !s.eval.AnnotatePlan { + return true, nil + } + + // Submit the plan + if s.eval.AnnotatePlan { + s.plan.Annotations = s.planAnnotations + } + result, newState, err := s.planner.SubmitPlan(s.plan) + s.planResult = result + if err != nil { + return false, err + } + + // Decrement the number of allocations pending per task group based on the + // number of allocations successfully placed + adjustQueuedAllocations(s.logger, result, s.queuedAllocs) + + // If we got a state refresh, try again since we have stale data + if newState != nil { + s.logger.Debug("refresh forced") + s.state = newState + return false, nil + } + + // Try again if the plan was not fully committed, potential conflict + fullCommit, expected, actual := result.FullCommit(s.plan) + if !fullCommit { + s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual) + return false, nil + } + + // Success! + return true, nil +} + +// setJob updates the stack with the given job and job's node pool scheduler +// configuration. +func (s *SysBatchScheduler) setJob(job *structs.Job) error { + // Fetch node pool and global scheduler configuration to determine how to + // configure the scheduler. + pool, err := s.state.NodePoolByName(nil, job.NodePool) + if err != nil { + return fmt.Errorf("failed to get job node pool %q: %v", job.NodePool, err) + } + + _, schedConfig, err := s.state.SchedulerConfig() + if err != nil { + return fmt.Errorf("failed to get scheduler configuration: %v", err) + } + + s.stack.SetJob(job) + s.stack.SetSchedulerConfiguration(schedConfig.WithNodePool(pool)) + return nil +} + +// computeJobAllocs is used to reconcile differences between the job, +// existing allocations and node status to update the allocations. 
+func (s *SysBatchScheduler) computeJobAllocs() error { + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true) + if err != nil { + return fmt.Errorf("failed to get allocs for job '%s': %v", s.eval.JobID, err) + } + + // Determine the tainted nodes containing job allocs + tainted, err := taintedNodes(s.state, allocs) + if err != nil { + return fmt.Errorf("failed to get tainted nodes for job '%s': %v", s.eval.JobID, err) + } + + // Update the allocations which are in pending/running state on tainted + // nodes to lost. + updateNonTerminalAllocsToLost(s.plan, tainted, allocs) + + // Split out terminal allocations + live, term := structs.SplitTerminalAllocs(allocs) + + // Diff the required and existing allocations + nr := reconciler.NewNodeReconciler(nil) + r := nr.Compute(s.job, s.nodes, s.notReadyNodes, tainted, live, term, + s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true)) + if s.logger.IsDebug() { + s.logger.Debug("reconciled current state with desired state", r.Fields()...) + } + + // Add all the allocs to stop + for _, e := range r.Stop { + s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocNotNeeded, "", "") + } + + // Add all the allocs to migrate + for _, e := range r.Migrate { + s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocNodeTainted, "", "") + } + + // Lost allocations should be transitioned to desired status stop and client + // status lost. + for _, e := range r.Lost { + s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocLost, structs.AllocClientStatusLost, "") + } + + for _, e := range r.Disconnecting { + s.plan.AppendUnknownAlloc(e.Alloc) + } + + allocExistsForTaskGroup := map[string]bool{} + // Attempt to do the upgrades in place. + // Reconnecting allocations need to be updated to persists alloc state + // changes. + updates := make([]reconciler.AllocTuple, 0, len(r.Update)+len(r.Reconnecting)) + updates = append(updates, r.Update...) + updates = append(updates, r.Reconnecting...) 
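	// (editor's note, illustration only) inplaceUpdate, called next, splits `updates`
	// into allocations whose changes can be applied to the existing allocation in
	// place and those that need a destructive stop-and-replace; only the destructive
	// set is written back to r.Update, which is the set evictAndPlace later throttles
	// with max_parallel.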
+ destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, updates) + r.Update = destructiveUpdates + + for _, inplaceUpdate := range inplaceUpdates { + allocExistsForTaskGroup[inplaceUpdate.TaskGroup.Name] = true + } + + s.planAnnotations = &structs.PlanAnnotations{ + DesiredTGUpdates: desiredUpdates(r, inplaceUpdates, destructiveUpdates), + } + + // Treat non in-place updates as an eviction and new placement, which will + // be limited by max_parallel + s.limitReached = evictAndPlace(s.ctx, s.job, r, sstructs.StatusAllocUpdating) + + // Nothing remaining to do if placement is not required + if len(r.Place) == 0 { + if !s.job.Stopped() { + for _, tg := range s.job.TaskGroups { + s.queuedAllocs[tg.Name] = 0 + } + } + return nil + } + + // Record the number of allocations that needs to be placed per Task Group + for _, allocTuple := range r.Place { + s.queuedAllocs[allocTuple.TaskGroup.Name] += 1 + } + + for _, ignoredAlloc := range r.Ignore { + allocExistsForTaskGroup[ignoredAlloc.TaskGroup.Name] = true + } + + // Compute the placements + return s.computePlacements(r.Place, allocExistsForTaskGroup) +} + +// computePlacements computes placements for allocations +func (s *SysBatchScheduler) computePlacements(place []reconciler.AllocTuple, existingByTaskGroup map[string]bool) error { + nodeByID := make(map[string]*structs.Node, len(s.nodes)) + for _, node := range s.nodes { + nodeByID[node.ID] = node + } + + // track node filtering, to only report an error if all nodes have been filtered + var filteredMetrics map[string]*structs.AllocMetric + + nodes := make([]*structs.Node, 1) + for _, missing := range place { + tgName := missing.TaskGroup.Name + + node, ok := nodeByID[missing.Alloc.NodeID] + if !ok { + s.logger.Debug("could not find node", "node", missing.Alloc.NodeID) + continue + } + + // Update the set of placement nodes + nodes[0] = node + s.stack.SetNodes(nodes) + + // Attempt to match the task group + option := s.stack.Select(missing.TaskGroup, &feasible.SelectOptions{AllocName: missing.Name}) + + if option == nil { + // If the task can't be placed on this node, update reporting data + // and continue to short circuit the loop + + // If this node was filtered because of constraint + // mismatches and we couldn't create an allocation then + // decrement queuedAllocs for that task group. 
+ if s.ctx.Metrics().NodesFiltered > 0 { + queued := s.queuedAllocs[tgName] - 1 + s.queuedAllocs[tgName] = queued + + if filteredMetrics == nil { + filteredMetrics = map[string]*structs.AllocMetric{} + } + filteredMetrics[tgName] = mergeNodeFiltered(filteredMetrics[tgName], s.ctx.Metrics()) + + // If no tasks have been placed and there aren't any previously + // existing (ignored or updated) tasks on the node, mark the alloc as failed to be placed + // if queued <= 0 && !existingByTaskGroup[tgName] { + if queued <= 0 && !existingByTaskGroup[tgName] { + if s.failedTGAllocs == nil { + s.failedTGAllocs = make(map[string]*structs.AllocMetric) + } + s.failedTGAllocs[tgName] = filteredMetrics[tgName] + } + + // If we are annotating the plan, then decrement the desired + // placements based on whether the node meets the constraints + if s.planAnnotations != nil && + s.planAnnotations.DesiredTGUpdates != nil { + s.planAnnotations.DesiredTGUpdates[tgName].Place -= 1 + } + + // Filtered nodes are not reported to users, just omitted from the job status + continue + } + + // Check if this task group has already failed, reported to the user as a count + if metric, ok := s.failedTGAllocs[tgName]; ok { + metric.CoalescedFailures += 1 + metric.ExhaustResources(missing.TaskGroup) + continue + } + + // Store the available nodes by datacenter + s.ctx.Metrics().NodesAvailable = s.nodesByDC + s.ctx.Metrics().NodesInPool = len(s.nodes) + s.ctx.Metrics().NodePool = s.job.NodePool + + // Compute top K scoring node metadata + s.ctx.Metrics().PopulateScoreMetaData() + + // Lazy initialize the failed map + if s.failedTGAllocs == nil { + s.failedTGAllocs = make(map[string]*structs.AllocMetric) + } + + // Update metrics with the resources requested by the task group. + s.ctx.Metrics().ExhaustResources(missing.TaskGroup) + + // Actual failure to start this task on this candidate node, report it individually + s.failedTGAllocs[tgName] = s.ctx.Metrics() + s.addBlocked(node) + + continue + } + + // Store the available nodes by datacenter + s.ctx.Metrics().NodesAvailable = s.nodesByDC + s.ctx.Metrics().NodesInPool = len(s.nodes) + + // Compute top K scoring node metadata + s.ctx.Metrics().PopulateScoreMetaData() + + // Set fields based on if we found an allocation option + resources := &structs.AllocatedResources{ + Tasks: option.TaskResources, + TaskLifecycles: option.TaskLifecycles, + Shared: structs.AllocatedSharedResources{ + DiskMB: int64(missing.TaskGroup.EphemeralDisk.SizeMB), + }, + } + + if option.AllocResources != nil { + resources.Shared.Networks = option.AllocResources.Networks + resources.Shared.Ports = option.AllocResources.Ports + } + + // Create an allocation for this + alloc := &structs.Allocation{ + ID: uuid.Generate(), + Namespace: s.job.Namespace, + EvalID: s.eval.ID, + Name: missing.Name, + JobID: s.job.ID, + TaskGroup: tgName, + Metrics: s.ctx.Metrics(), + NodeID: option.Node.ID, + NodeName: option.Node.Name, + TaskResources: resources.OldTaskResources(), + AllocatedResources: resources, + DesiredStatus: structs.AllocDesiredStatusRun, + ClientStatus: structs.AllocClientStatusPending, + // SharedResources is considered deprecated, will be removed in 0.11. 
+ // It is only set for compat reasons + SharedResources: &structs.Resources{ + DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB, + Networks: resources.Shared.Networks, + }, + } + + // If the new allocation is replacing an older allocation then we record the + // older allocation id so that they are chained + if missing.Alloc != nil { + alloc.PreviousAllocation = missing.Alloc.ID + } + + // If this placement involves preemption, set DesiredState to evict for those allocations + if option.PreemptedAllocs != nil { + var preemptedAllocIDs []string + for _, stop := range option.PreemptedAllocs { + s.plan.AppendPreemptedAlloc(stop, alloc.ID) + + preemptedAllocIDs = append(preemptedAllocIDs, stop.ID) + if s.eval.AnnotatePlan && s.planAnnotations != nil { + s.planAnnotations.PreemptedAllocs = append(s.planAnnotations.PreemptedAllocs, stop.Stub(nil)) + if s.planAnnotations.DesiredTGUpdates != nil { + desired := s.planAnnotations.DesiredTGUpdates[tgName] + desired.Preemptions += 1 + } + } + } + alloc.PreemptedAllocations = preemptedAllocIDs + } + + s.plan.AppendAlloc(alloc, nil) + } + + return nil +} + +// addBlocked creates a new blocked eval for this job on this node +// and submit to the planner (worker.go), which keeps the eval for execution later +func (s *SysBatchScheduler) addBlocked(node *structs.Node) error { + e := s.ctx.Eligibility() + escaped := e.HasEscaped() + + // Only store the eligible classes if the eval hasn't escaped. + var classEligibility map[string]bool + if !escaped { + classEligibility = e.GetClasses() + } + + blocked := s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached(), s.failedTGAllocs) + blocked.StatusDescription = sstructs.DescBlockedEvalFailedPlacements + blocked.NodeID = node.ID + + return s.planner.CreateEval(blocked) +} + +func (s *SysBatchScheduler) canHandle(trigger string) bool { + switch trigger { + case structs.EvalTriggerJobRegister: + case structs.EvalTriggerNodeUpdate: + case structs.EvalTriggerFailedFollowUp: + case structs.EvalTriggerJobDeregister: + case structs.EvalTriggerRollingUpdate: + case structs.EvalTriggerPreemption: + case structs.EvalTriggerDeploymentWatcher: + case structs.EvalTriggerNodeDrain: + case structs.EvalTriggerAllocStop: + case structs.EvalTriggerQueuedAllocs: + case structs.EvalTriggerScaling: + case structs.EvalTriggerReconnect: + default: + return trigger == structs.EvalTriggerPeriodicJob + } + return true +} + +// evictAndPlace is used to mark allocations for evicts and add them to the +// placement queue. evictAndPlace modifies the diffResult. It returns true if +// the limit has been reached for any task group. 
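// For example (illustrative numbers): with a job-level update.max_parallel of 2,
// a task group that sets no max_parallel of its own, and 5 pending destructive
// updates in that group, only the first 2 allocations are stopped and re-queued
// for placement in this pass; the remaining 3 are left untouched and the
// function reports that the limit was reached.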
+func evictAndPlace(ctx feasible.Context, job *structs.Job, diff *reconciler.NodeReconcileResult, desc string) bool { + + limits := map[string]int{} // per task group limits + if !job.Stopped() { + jobLimit := len(diff.Update) + if job.Update.MaxParallel > 0 { + jobLimit = job.Update.MaxParallel + } + for _, tg := range job.TaskGroups { + if tg.Update != nil && tg.Update.MaxParallel > 0 { + limits[tg.Name] = tg.Update.MaxParallel + } else { + limits[tg.Name] = jobLimit + } + } + } + + limited := false + for _, a := range diff.Update { + if limit := limits[a.Alloc.TaskGroup]; limit > 0 { + ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "") + diff.Place = append(diff.Place, a) + if !a.Canary { + limits[a.Alloc.TaskGroup]-- + } + } else { + limited = true + } + } + return limited +} diff --git a/scheduler/scheduler_sysbatch_test.go b/scheduler/scheduler_sysbatch_test.go index 11d82c5dd86..9101acbdd04 100644 --- a/scheduler/scheduler_sysbatch_test.go +++ b/scheduler/scheduler_sysbatch_test.go @@ -1122,7 +1122,7 @@ func TestSysBatch_JobConstraint_RunMultiple(t *testing.T) { must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) // Process the evaluation - err := h.Process(NewSystemScheduler, eval) + err := h.Process(NewSysBatchScheduler, eval) must.NoError(t, err) // Create a mock evaluation to run the job again, which will not place any @@ -1138,7 +1138,7 @@ func TestSysBatch_JobConstraint_RunMultiple(t *testing.T) { } must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval2})) - err = h.Process(NewSystemScheduler, eval2) + err = h.Process(NewSysBatchScheduler, eval2) must.NoError(t, err) // Ensure a single plan @@ -1822,7 +1822,7 @@ func TestSysBatch_Preemption(t *testing.T) { func TestSysBatch_canHandle(t *testing.T) { ci.Parallel(t) - s := SystemScheduler{sysbatch: true} + s := SysBatchScheduler{} t.Run("sysbatch register", func(t *testing.T) { must.True(t, s.canHandle(structs.EvalTriggerJobRegister)) }) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 54f0a2db56e..0e9b1565376 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -23,22 +23,17 @@ const ( // we will attempt to schedule if we continue to hit conflicts for system // jobs. maxSystemScheduleAttempts = 5 - - // maxSysBatchScheduleAttempts is used to limit the number of times we will - // attempt to schedule if we continue to hit conflicts for sysbatch jobs. - maxSysBatchScheduleAttempts = 2 ) -// SystemScheduler is used for 'system' and 'sysbatch' jobs. This scheduler is -// designed for jobs that should be run on every client. The 'system' mode -// will ensure those jobs continuously run regardless of successful task exits, -// whereas 'sysbatch' considers the task complete on success. +// SystemScheduler is used for 'system' jobs. This scheduler is designed for +// jobs that should be run on every client. The 'system' mode will ensure those +// jobs continuously run regardless of successful task exits, whereas 'sysbatch' +// considers the task complete on success. 
type SystemScheduler struct { logger log.Logger eventsCh chan<- interface{} state sstructs.State planner sstructs.Planner - sysbatch bool eval *structs.Evaluation job *structs.Job @@ -69,17 +64,6 @@ func NewSystemScheduler(logger log.Logger, eventsCh chan<- interface{}, state ss eventsCh: eventsCh, state: state, planner: planner, - sysbatch: false, - } -} - -func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state sstructs.State, planner sstructs.Planner) sstructs.Scheduler { - return &SystemScheduler{ - logger: logger.Named("sysbatch_sched"), - eventsCh: eventsCh, - state: state, - planner: planner, - sysbatch: true, } } @@ -108,9 +92,6 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { } limit := maxSystemScheduleAttempts - if s.sysbatch { - limit = maxSysBatchScheduleAttempts - } // Retry up to the maxSystemScheduleAttempts and reset if progress is made. progress := func() bool { return progressMade(s.planResult) } @@ -155,15 +136,13 @@ func (s *SystemScheduler) process() (bool, error) { } } - if !s.sysbatch { - s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID) - if err != nil { - return false, fmt.Errorf("failed to get deployment for job %q: %w", s.eval.JobID, err) - } - // system deployments may be mutated in the reconciler because the node - // count can change between evaluations - s.deployment = s.deployment.Copy() + s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID) + if err != nil { + return false, fmt.Errorf("failed to get deployment for job %q: %w", s.eval.JobID, err) } + // system deployments may be mutated in the reconciler because the node + // count can change between evaluations + s.deployment = s.deployment.Copy() // Create a plan s.plan = s.eval.MakePlan(s.job) @@ -175,7 +154,7 @@ func (s *SystemScheduler) process() (bool, error) { s.ctx = feasible.NewEvalContext(s.eventsCh, s.state, s.plan, s.logger) // Construct the placement stack - s.stack = feasible.NewSystemStack(s.sysbatch, s.ctx) + s.stack = feasible.NewSystemStack(false, s.ctx) if !s.job.Stopped() { s.setJob(s.job) } @@ -417,30 +396,6 @@ func (s *SystemScheduler) computeJobAllocs() error { return nil } -func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric { - if acc == nil { - return curr.Copy() - } - - acc.NodesEvaluated += curr.NodesEvaluated - acc.NodesFiltered += curr.NodesFiltered - - if acc.ClassFiltered == nil { - acc.ClassFiltered = make(map[string]int) - } - for k, v := range curr.ClassFiltered { - acc.ClassFiltered[k] += v - } - if acc.ConstraintFiltered == nil { - acc.ConstraintFiltered = make(map[string]int) - } - for k, v := range curr.ConstraintFiltered { - acc.ConstraintFiltered[k] += v - } - acc.AllocationTime += curr.AllocationTime - return acc -} - func (s *SystemScheduler) findFeasibleNodesForTG(buckets *reconciler.NodeReconcileResult) map[string][]*feasible.RankedNode { nodeByID := make(map[string]*structs.Node, len(s.nodes)) for _, node := range s.nodes { @@ -714,12 +669,7 @@ func (s *SystemScheduler) canHandle(trigger string) bool { case structs.EvalTriggerScaling: case structs.EvalTriggerReconnect: default: - switch s.sysbatch { - case true: - return trigger == structs.EvalTriggerPeriodicJob - case false: - return false - } + return false } return true } diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index 53b91989262..10f6a8abcde 100644 --- a/scheduler/scheduler_system_test.go +++ 
b/scheduler/scheduler_system_test.go @@ -2293,7 +2293,7 @@ func TestSystemSched_Preemption(t *testing.T) { func TestSystemSched_canHandle(t *testing.T) { ci.Parallel(t) - s := SystemScheduler{sysbatch: false} + s := SystemScheduler{} t.Run("system register", func(t *testing.T) { must.True(t, s.canHandle(structs.EvalTriggerJobRegister)) }) @@ -3106,8 +3106,12 @@ func TestSystemSched_NodeDisconnected(t *testing.T) { must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) // Process the evaluation - err := h.Process(NewSystemScheduler, eval) - must.NoError(t, err) + if tc.jobType == structs.JobTypeSystem { + must.NoError(t, h.Process(NewSystemScheduler, eval)) + } + if tc.jobType == structs.JobTypeSysBatch { + must.NoError(t, h.Process(NewSysBatchScheduler, eval)) + } // Ensure a single plan must.Len(t, tc.expectedPlanCount, h.Plans) diff --git a/scheduler/util.go b/scheduler/util.go index 9d7b03cb4c9..2c691ed8d8f 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -918,3 +918,27 @@ func genericAllocUpdateFn(ctx feasible.Context, stack feasible.Stack, evalID str return false, false, newAlloc } } + +func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric { + if acc == nil { + return curr.Copy() + } + + acc.NodesEvaluated += curr.NodesEvaluated + acc.NodesFiltered += curr.NodesFiltered + + if acc.ClassFiltered == nil { + acc.ClassFiltered = make(map[string]int) + } + for k, v := range curr.ClassFiltered { + acc.ClassFiltered[k] += v + } + if acc.ConstraintFiltered == nil { + acc.ConstraintFiltered = make(map[string]int) + } + for k, v := range curr.ConstraintFiltered { + acc.ConstraintFiltered[k] += v + } + acc.AllocationTime += curr.AllocationTime + return acc +} From 0b4de7688b8adc2480c4d70515b205a87422a9db Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 30 Oct 2025 16:18:52 +0100 Subject: [PATCH 10/24] system scheduler: do not leave empty keys in plan.NodeUpdate If we leave empty keys in that map, the plan will not be considered a no-op, which it otherwise should be. --- scheduler/scheduler_system.go | 8 ++++++++ scheduler/scheduler_system_test.go | 16 ++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 0e9b1565376..342c8a71434 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -5,6 +5,7 @@ package scheduler import ( "fmt" + "maps" "math" "runtime/debug" "slices" @@ -722,6 +723,13 @@ func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResu } } + // it may be the case that there are keys in the NodeUpdate that are empty. + // We should delete them, otherwise the plan won't be correctly recognize as + // a no-op. 
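	// (editor's note) Plan.IsNoOp is assumed to check only whether these maps have
	// any keys at all, so a NodeUpdate entry that maps to an empty slice still makes
	// the plan look like it has work to do; pruning such keys below is what lets a
	// genuinely empty plan register as a no-op.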
+ maps.DeleteFunc(s.plan.NodeUpdate, func(k string, v []*structs.Allocation) bool { + return len(v) == 0 + }) + return limited } diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index 10f6a8abcde..f05a651176b 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -23,7 +23,6 @@ import ( "github.com/hashicorp/nomad/scheduler/tests" "github.com/shoenig/test" "github.com/shoenig/test/must" - "github.com/shoenig/test/wait" ) func TestSystemSched_JobRegister(t *testing.T) { @@ -1276,11 +1275,11 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) { must.Zero(t, val) } -// This test ensures that the scheduler correctly ignores ineligible -// nodes when scheduling due to a new node being added. The job has two -// task groups constrained to a particular node class. The desired behavior -// should be that the TaskGroup constrained to the newly added node class is -// added and that the TaskGroup constrained to the ineligible node is ignored. +// This test ensures that the scheduler correctly ignores ineligible nodes when +// scheduling due to a new node being added. The job has two task groups +// constrained to a particular node class. The desired behavior should be that +// the TaskGroup constrained to the newly added node class is added and that the +// TaskGroup constrained to the ineligible node is ignored. func TestSystemSched_JobConstraint_AddNode(t *testing.T) { ci.Parallel(t) @@ -1754,10 +1753,7 @@ func TestSystemSched_ConstraintErrors(t *testing.T) { // QueuedAllocations is drained val, ok := h.Evals[0].QueuedAllocations["web"] must.True(t, ok) - must.Wait(t, wait.InitialSuccess( - wait.BoolFunc(func() bool { - return val == 0 - }))) + must.Eq(t, 0, val) // The plan has two NodeAllocations must.Eq(t, 1, len(h.Plans)) From d318f0fefe1e3aa81e757e03a6b6744c5c0f83e2 Mon Sep 17 00:00:00 2001 From: Chris Roberts Date: Thu, 30 Oct 2025 17:35:00 -0700 Subject: [PATCH 11/24] scheduler: maintain node feasibility information --- scheduler/scheduler_system.go | 137 +++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 53 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 342c8a71434..8217dcf1063 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -47,8 +47,9 @@ type SystemScheduler struct { notReadyNodes map[string]struct{} nodesByDC map[string]int - deployment *structs.Deployment - feasibleNodesForTG map[string][]*feasible.RankedNode // used to track which nodes passed the feasibility check for TG + deployment *structs.Deployment + nodesForTG map[string]taskGroupNodes // used to track node feasibility information for each TG + filteredNodeMetricsForTG map[string]*structs.AllocMetric // used to track filtered node metrics for each TG limitReached bool @@ -57,6 +58,34 @@ type SystemScheduler struct { planAnnotations *structs.PlanAnnotations } +// taskGroupNodes are a collection of taskGroupNode which include +// node feasibility information for a task group. +type taskGroupNodes []*taskGroupNode + +// feasible returns all taskGroupNode that are feasible for placement +func (t taskGroupNodes) feasible() (feasibleNodes []*taskGroupNode) { + for _, tgn := range t { + if tgn.isFeasible() { + feasibleNodes = append(feasibleNodes, tgn) + } + } + + return +} + +// taskGroupNode is a container for holding node feasibility +// information for a task group. 
+type taskGroupNode struct { + NodeID string + RankedNode *feasible.RankedNode + Metrics *structs.AllocMetric +} + +// isFeasible returns if the node is feasible for the task group. +func (t *taskGroupNode) isFeasible() bool { + return t.RankedNode != nil +} + // NewSystemScheduler is a factory function to instantiate a new system // scheduler. func NewSystemScheduler(logger log.Logger, eventsCh chan<- interface{}, state sstructs.State, planner sstructs.Planner) sstructs.Scheduler { @@ -298,8 +327,14 @@ func (s *SystemScheduler) computeJobAllocs() error { DesiredTGUpdates: desiredUpdates(reconciliationResult, inplaceUpdates, destructiveUpdates), } - // find feasible nodes for all the task groups - s.feasibleNodesForTG = s.findFeasibleNodesForTG(reconciliationResult) + // Gather node information for all the task groups. + s.nodesForTG, s.filteredNodeMetricsForTG = s.findNodesForTG(reconciliationResult) + + // Set placement totals for task groups. The totals will be the available + // feasible nodes for each group. + for tgName := range s.nodesForTG { + s.planAnnotations.DesiredTGUpdates[tgName].Place = uint64(len(s.nodesForTG[tgName].feasible())) + } // Treat non in-place updates as an eviction and new placement, which will // be limited by max_parallel @@ -336,8 +371,8 @@ func (s *SystemScheduler) computeJobAllocs() error { // track if any of the task groups is doing a canary update now deploymentComplete := true for _, tg := range s.job.TaskGroups { - feasibleNodes, ok := s.feasibleNodesForTG[tg.Name] - if !ok { + feasibleNodes := s.nodesForTG[tg.Name].feasible() + if len(feasibleNodes) < 1 { // this will happen if we're seeing a TG that shouldn't be placed; we only ever // get feasible node counts for placements. These TGs get their DesiredTotal set // in the reconciler and we don't touch it. @@ -397,18 +432,23 @@ func (s *SystemScheduler) computeJobAllocs() error { return nil } -func (s *SystemScheduler) findFeasibleNodesForTG(buckets *reconciler.NodeReconcileResult) map[string][]*feasible.RankedNode { +// findNodesForTG runs feasibility checks on nodes. The result includes all nodes for each +// task group (feasible and infeasible) along with metrics information on the checks. +func (s *SystemScheduler) findNodesForTG(buckets *reconciler.NodeReconcileResult) (tgNodes map[string]taskGroupNodes, filteredMetrics map[string]*structs.AllocMetric) { + tgNodes = make(map[string]taskGroupNodes) + filteredMetrics = make(map[string]*structs.AllocMetric) + nodeByID := make(map[string]*structs.Node, len(s.nodes)) for _, node := range s.nodes { nodeByID[node.ID] = node } - feasibleNodes := make(map[string][]*feasible.RankedNode) - nodes := make([]*structs.Node, 1) for _, a := range slices.Concat(buckets.Place, buckets.Update, buckets.Ignore) { - tgName := a.TaskGroup.Name + if tgNodes[tgName] == nil { + tgNodes[tgName] = taskGroupNodes{} + } node, ok := nodeByID[a.Alloc.NodeID] if !ok { @@ -430,22 +470,19 @@ func (s *SystemScheduler) findFeasibleNodesForTG(buckets *reconciler.NodeReconci // Attempt to match the task group option := s.stack.Select(a.TaskGroup, &feasible.SelectOptions{AllocName: a.Name}) - if option == nil { - continue - } - // count this node as feasible - if feasibleNodes[tgName] == nil { - feasibleNodes[tgName] = []*feasible.RankedNode{option} - } else { - feasibleNodes[tgName] = append(feasibleNodes[tgName], option) - } + // Always store the results. Keep the metrics that were generated + // for the match attempt so they can be used during placement. 
+ tgNodes[tgName] = append(tgNodes[tgName], &taskGroupNode{node.ID, option, s.ctx.Metrics().Copy()}) - // update the plan annotations for this task group - s.planAnnotations.DesiredTGUpdates[tgName].Place = uint64(len(feasibleNodes[tgName])) + if option == nil { + // When no match is found, merge the filter metrics for the task + // group so proper reporting can be done during placement. + filteredMetrics[tgName] = mergeNodeFiltered(filteredMetrics[tgName], s.ctx.Metrics()) + } } - return feasibleNodes + return } // computePlacements computes placements for allocations @@ -458,9 +495,6 @@ func (s *SystemScheduler) computePlacements( nodeByID[node.ID] = node } - // track node filtering, to only report an error if all nodes have been filtered - var filteredMetrics map[string]*structs.AllocMetric - var deploymentID string if s.deployment != nil && s.deployment.Active() { deploymentID = s.deployment.ID @@ -475,15 +509,24 @@ func (s *SystemScheduler) computePlacements( continue } - // we're already performed feasibility check for all the task groups and + // we've already performed feasibility check for all the task groups and // nodes, so look up var option *feasible.RankedNode - optionsForTG := s.feasibleNodesForTG[tgName] + var metrics *structs.AllocMetric + optionsForTG := s.nodesForTG[tgName] if existing := slices.IndexFunc( optionsForTG, - func(rn *feasible.RankedNode) bool { return rn.Node == node }, + func(rn *taskGroupNode) bool { return rn.NodeID == node.ID }, ); existing != -1 { - option = optionsForTG[existing] + option = optionsForTG[existing].RankedNode + metrics = optionsForTG[existing].Metrics + } else { + // we should have an entry for every node that is looked + // up. if we don't, something must be wrong + s.logger.Error("failed to locate node feasibility information", + "node-id", node.ID, "task_group", tgName) + // provide a stubbed metric to work with + metrics = &structs.AllocMetric{} } if option == nil { @@ -493,15 +536,10 @@ func (s *SystemScheduler) computePlacements( // If this node was filtered because of constraint // mismatches and we couldn't create an allocation then // decrement queuedAllocs for that task group. 
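	// (editor's note) `metrics` here is the per-node AllocMetric snapshot copied in
	// findNodesForTG, so the filtering counters consulted below reflect the
	// feasibility pass for this particular node rather than whatever the shared
	// s.ctx.Metrics() holds by the time placements are computed.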
- if s.ctx.Metrics().NodesFiltered > 0 { + if metrics.NodesFiltered > 0 { queued := s.queuedAllocs[tgName] - 1 s.queuedAllocs[tgName] = queued - if filteredMetrics == nil { - filteredMetrics = map[string]*structs.AllocMetric{} - } - filteredMetrics[tgName] = mergeNodeFiltered(filteredMetrics[tgName], s.ctx.Metrics()) - // If no tasks have been placed and there aren't any previously // existing (ignored or updated) tasks on the node, mark the alloc as failed to be placed // if queued <= 0 && !existingByTaskGroup[tgName] { @@ -509,14 +547,7 @@ func (s *SystemScheduler) computePlacements( if s.failedTGAllocs == nil { s.failedTGAllocs = make(map[string]*structs.AllocMetric) } - s.failedTGAllocs[tgName] = filteredMetrics[tgName] - } - - // If we are annotating the plan, then decrement the desired - // placements based on whether the node meets the constraints - if s.planAnnotations != nil && - s.planAnnotations.DesiredTGUpdates != nil { - s.planAnnotations.DesiredTGUpdates[tgName].Place -= 1 + s.failedTGAllocs[tgName] = s.filteredNodeMetricsForTG[tgName] } // Filtered nodes are not reported to users, just omitted from the job status @@ -531,12 +562,12 @@ func (s *SystemScheduler) computePlacements( } // Store the available nodes by datacenter - s.ctx.Metrics().NodesAvailable = s.nodesByDC - s.ctx.Metrics().NodesInPool = len(s.nodes) - s.ctx.Metrics().NodePool = s.job.NodePool + metrics.NodesAvailable = s.nodesByDC + metrics.NodesInPool = len(s.nodes) + metrics.NodePool = s.job.NodePool // Compute top K scoring node metadata - s.ctx.Metrics().PopulateScoreMetaData() + metrics.PopulateScoreMetaData() // Lazy initialize the failed map if s.failedTGAllocs == nil { @@ -544,21 +575,21 @@ func (s *SystemScheduler) computePlacements( } // Update metrics with the resources requested by the task group. 
- s.ctx.Metrics().ExhaustResources(missing.TaskGroup) + metrics.ExhaustResources(missing.TaskGroup) // Actual failure to start this task on this candidate node, report it individually - s.failedTGAllocs[tgName] = s.ctx.Metrics() + s.failedTGAllocs[tgName] = metrics s.addBlocked(node) continue } // Store the available nodes by datacenter - s.ctx.Metrics().NodesAvailable = s.nodesByDC - s.ctx.Metrics().NodesInPool = len(s.nodes) + metrics.NodesAvailable = s.nodesByDC + metrics.NodesInPool = len(s.nodes) // Compute top K scoring node metadata - s.ctx.Metrics().PopulateScoreMetaData() + metrics.PopulateScoreMetaData() // Set fields based on if we found an allocation option resources := &structs.AllocatedResources{ @@ -582,7 +613,7 @@ func (s *SystemScheduler) computePlacements( Name: missing.Name, JobID: s.job.ID, TaskGroup: tgName, - Metrics: s.ctx.Metrics(), + Metrics: metrics, NodeID: option.Node.ID, NodeName: option.Node.Name, DeploymentID: deploymentID, From e51cce3aa40d2da1f6f27f1388d309d80b274ef0 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Fri, 31 Oct 2025 09:06:35 +0100 Subject: [PATCH 12/24] scheduler: un-flake TestSystemSched_evictUnneededCanaries --- scheduler/scheduler_system_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index f05a651176b..8be928cad17 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -3958,7 +3958,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { requiredCanaries int tgName string nodeAllocation map[string][]*structs.Allocation - expectedDesiredCanaries []string + expectedDesiredCanaries int expectedNodeAllocation map[string][]*structs.Allocation }{ { @@ -3966,7 +3966,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { requiredCanaries: 0, tgName: "foo", nodeAllocation: nil, - expectedDesiredCanaries: []string{}, + expectedDesiredCanaries: 0, expectedNodeAllocation: nil, }, { @@ -3997,7 +3997,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { }, }, }, - expectedDesiredCanaries: []string{}, + expectedDesiredCanaries: 0, expectedNodeAllocation: map[string][]*structs.Allocation{ "node1": { { @@ -4053,7 +4053,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { }, }, }, - expectedDesiredCanaries: []string{"tg1_alloc1"}, + expectedDesiredCanaries: 1, expectedNodeAllocation: map[string][]*structs.Allocation{ "node1": { { @@ -4084,7 +4084,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { } s.plan.NodeAllocation = tt.nodeAllocation - must.Eq(t, tt.expectedDesiredCanaries, s.evictUnneededCanaries(tt.requiredCanaries, tt.tgName, &reconciler.NodeReconcileResult{}), must.Sprint("unexpected desired canaries")) + must.SliceLen(t, tt.expectedDesiredCanaries, s.evictUnneededCanaries(tt.requiredCanaries, tt.tgName, &reconciler.NodeReconcileResult{}), must.Sprint("unexpected desired canaries")) must.Eq(t, tt.expectedNodeAllocation, s.plan.NodeAllocation, must.Sprintf("unexpected node allocation")) }) } From fcbe34e59f218d319d1503b6adcd39f6aa93ddb5 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 3 Oct 2025 16:11:24 -0400 Subject: [PATCH 13/24] system deployments: failing tests An attempt to work up some failing unit tests for #26885 and #26886. 
Ref: https://github.com/hashicorp/nomad/issues/26885 Ref: https://github.com/hashicorp/nomad/issues/26886 --- scheduler/scheduler_system_test.go | 148 +++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index 8be928cad17..1a6b798e9f9 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -4089,3 +4089,151 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { }) } } + +func TestSystemSched_NoOpEvalWithInfeasibleNodes(t *testing.T) { + ci.Parallel(t) + h := tests.NewHarness(t) + + nodes := make([]*structs.Node, 4) + eligible := []string{} + for i := range 4 { + node := mock.Node() + if i%2 == 0 { + node.Attributes["kernel.name"] = "not-linux" + } else { + eligible = append(eligible, node.ID) + } + nodes[i] = node + must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + } + + job := mock.SystemJob() + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job)) + + existingAllocIDs := []string{} + allocs := []*structs.Allocation{} + for i := range 4 { + if i%2 != 0 { + alloc := mock.MinAllocForJob(job) + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.NodeID = nodes[i].ID + alloc.Name = structs.AllocName(job.Name, job.TaskGroups[0].Name, 0) + existingAllocIDs = append(existingAllocIDs, alloc.ID) + allocs = append(allocs, alloc) + } + } + must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + d := mock.Deployment() + d.JobID = job.ID + d.JobVersion = job.Version + d.Status = structs.DeploymentStatusSuccessful + must.NoError(t, h.State.UpsertDeployment(h.NextIndex(), d)) + + eval := &structs.Evaluation{ + Namespace: job.Namespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + AnnotatePlan: true, + } + must.NoError(t, h.State.UpsertEvals( + structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + job = job.Copy() + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job)) + + err := h.Process(NewSystemScheduler, eval) + must.NoError(t, err) + must.Len(t, 1, h.Plans) + plan := h.Plans[0] + must.Nil(t, plan.Deployment, must.Sprintf("expected no new deployment")) + must.Eq(t, 2, plan.Annotations.DesiredTGUpdates["web"].InPlaceUpdate) + must.MapLen(t, 0, plan.NodeUpdate, must.Sprintf("expected no stops")) + must.MapLen(t, 2, plan.NodeAllocation) + for nodeID, allocs := range plan.NodeAllocation { + must.SliceContains(t, eligible, nodeID) + must.Len(t, 1, allocs) + must.SliceContains(t, existingAllocIDs, allocs[0].ID, + must.Sprintf("expected existing alloc to be updated in-place")) + } +} + +func TestSystemSched_CanariesWithInfeasibleNodes(t *testing.T) { + ci.Parallel(t) + h := tests.NewHarness(t) + + nodes := make([]*structs.Node, 4) + eligible := []string{} + for i := range 4 { + node := mock.Node() + if i%2 == 0 { + node.Attributes["kernel.name"] = "not-linux" + } else { + eligible = append(eligible, node.ID) + } + nodes[i] = node + must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + } + + job := mock.SystemJob() + job.TaskGroups[0].Update = &structs.UpdateStrategy{ + MaxParallel: 4, + Canary: 100, // blue-green + } + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job)) + + existingAllocIDs := []string{} + allocs := []*structs.Allocation{} 
+ for i := range 4 { + if i%2 != 0 { + alloc := mock.MinAllocForJob(job) + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.NodeID = nodes[i].ID + alloc.Name = structs.AllocName(job.Name, job.TaskGroups[0].Name, 0) + existingAllocIDs = append(existingAllocIDs, alloc.ID) + allocs = append(allocs, alloc) + } + } + must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + d := mock.Deployment() + d.JobID = job.ID + d.JobVersion = job.Version + d.Status = structs.DeploymentStatusSuccessful + must.NoError(t, h.State.UpsertDeployment(h.NextIndex(), d)) + + // destructively update the job + + job = job.Copy() + job.TaskGroups[0].Tasks[0].Resources.CPU++ + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job)) + + eval := &structs.Evaluation{ + Namespace: job.Namespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + AnnotatePlan: true, + } + must.NoError(t, h.State.UpsertEvals( + structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + err := h.Process(NewSystemScheduler, eval) + must.NoError(t, err) + must.Len(t, 1, h.Plans) + plan := h.Plans[0] + must.NotNil(t, plan.Deployment, must.Sprintf("expected a new deployment")) + + dstate := plan.Deployment.TaskGroups["web"] + test.Len(t, 2, dstate.PlacedCanaries) + test.Eq(t, 2, dstate.DesiredCanaries) + test.Eq(t, 2, dstate.DesiredTotal) + + must.Eq(t, 2, plan.Annotations.DesiredTGUpdates["web"].Canary, + must.Sprintf("expected canaries: %#v", plan.Annotations.DesiredTGUpdates)) +} From 91c7acc94aa271d49d9cf76eb2b72e75b1af27f8 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Fri, 31 Oct 2025 11:10:10 +0100 Subject: [PATCH 14/24] system scheduler: handle empty deployment states correctly --- scheduler/scheduler_system.go | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 8217dcf1063..e8da5858c8c 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -373,9 +373,15 @@ func (s *SystemScheduler) computeJobAllocs() error { for _, tg := range s.job.TaskGroups { feasibleNodes := s.nodesForTG[tg.Name].feasible() if len(feasibleNodes) < 1 { - // this will happen if we're seeing a TG that shouldn't be placed; we only ever - // get feasible node counts for placements. These TGs get their DesiredTotal set - // in the reconciler and we don't touch it. + // this will happen if we're seeing a TG that shouldn't be placed. + // + // in case the deployment is in a successful state, this indicate a + // noop eval due to infeasible nodes. In this case we set the dstate + // for this task group to nil. + if s.deployment.Status == structs.DeploymentStatusSuccessful { + s.deployment.TaskGroups[tg.Name] = nil + } + continue } @@ -425,6 +431,20 @@ func (s *SystemScheduler) computeJobAllocs() error { // adjust the deployment updates and set the right deployment status nr.DeploymentUpdates = append(nr.DeploymentUpdates, s.setDeploymentStatusAndUpdates(deploymentComplete, s.job)...) + // Check if perhaps we're dealing with a nil deployment, i.e., a deployment + // which is in successful state and where all task groups have a nil dstate. + // In this case, set the deployment to nil. 
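	// (editor's note) clearing the deployment in the block below keeps a no-op
	// evaluation, for example one whose only remaining candidate placements sit on
	// infeasible nodes, from re-attaching an already-successful deployment to the
	// plan; the TestSystemSched_NoOpEvalWithInfeasibleNodes case added a few patches
	// earlier asserts exactly that with its nil plan.Deployment check.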
+ nilDstates := true + for _, tg := range s.deployment.TaskGroups { + if tg != nil { + nilDstates = false + } + } + if nilDstates { + s.deployment = nil + nr.DeploymentUpdates = nil + } + // Add the deployment changes to the plan s.plan.Deployment = s.deployment s.plan.DeploymentUpdates = nr.DeploymentUpdates From 6fe7a984d5dfcb95c86533b3228a5c1198629726 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Fri, 31 Oct 2025 14:52:13 +0100 Subject: [PATCH 15/24] comments from @jrasell --- scheduler/scheduler_system.go | 7 ++----- scheduler/util.go | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index e8da5858c8c..730a812ab21 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -385,8 +385,9 @@ func (s *SystemScheduler) computeJobAllocs() error { continue } + dstate, ok := s.deployment.TaskGroups[tg.Name] // no deployment for this TG - if _, ok := s.deployment.TaskGroups[tg.Name]; !ok { + if !ok { continue } @@ -394,10 +395,6 @@ func (s *SystemScheduler) computeJobAllocs() error { // feasible nodes s.deployment.TaskGroups[tg.Name].DesiredTotal = len(feasibleNodes) - dstate, ok := s.deployment.TaskGroups[tg.Name] - if !ok { - continue - } // a system job is canarying if: // - it has a non-empty update block (just a sanity check, all // submitted jobs should have a non-empty update block as part of diff --git a/scheduler/util.go b/scheduler/util.go index 2c691ed8d8f..e003891ece3 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -919,6 +919,7 @@ func genericAllocUpdateFn(ctx feasible.Context, stack feasible.Stack, evalID str } } +// mergeNodeFiltered merges allocation metrics for task group func mergeNodeFiltered(acc, curr *structs.AllocMetric) *structs.AllocMetric { if acc == nil { return curr.Copy() From d55ab6c1edb280d14fde99d0e9c948429d13c674 Mon Sep 17 00:00:00 2001 From: Chris Roberts Date: Fri, 31 Oct 2025 10:15:53 -0700 Subject: [PATCH 16/24] system scheduler: reset eligibility when selecting nodes --- scheduler/feasible/context.go | 7 +++++++ scheduler/feasible/stack.go | 5 +++++ scheduler/scheduler_system.go | 3 +++ scheduler/scheduler_system_test.go | 16 +++++++--------- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/scheduler/feasible/context.go b/scheduler/feasible/context.go index a03b62cfe9a..9d9817c31e3 100644 --- a/scheduler/feasible/context.go +++ b/scheduler/feasible/context.go @@ -252,6 +252,13 @@ func NewEvalEligibility() *EvalEligibility { } } +// Reset clears the contents of the eval eligibility +func (e *EvalEligibility) Reset() { + e.job = make(map[string]ComputedClassFeasibility) + e.taskGroups = make(map[string]map[string]ComputedClassFeasibility) + e.tgEscapedConstraints = make(map[string]bool) +} + // SetJob takes the job being evaluated and calculates the escaped constraints // at the job and task group level. 
func (e *EvalEligibility) SetJob(job *structs.Job) { diff --git a/scheduler/feasible/stack.go b/scheduler/feasible/stack.go index 9941cbe9706..52404d4113b 100644 --- a/scheduler/feasible/stack.go +++ b/scheduler/feasible/stack.go @@ -352,6 +352,11 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran // Reset the binpack selector and context s.scoreNorm.Reset() s.ctx.Reset() + + // Since the system stack is always evaluating a single + // node, previous eligibility information is not applicable + // so reset it + s.ctx.Eligibility().Reset() start := time.Now() // Get the task groups constraints. diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 730a812ab21..56aa39d9e6a 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -418,7 +418,10 @@ func (s *SystemScheduler) computeJobAllocs() error { // all eligible nodes. At this point we know which nodes are // feasible, so we evict unnedded canaries. placedCanaries := s.evictUnneededCanaries(requiredCanaries, tg.Name, reconciliationResult) + + // Update deployment and plan annotation with canaries that were placed s.deployment.TaskGroups[tg.Name].PlacedCanaries = placedCanaries + s.planAnnotations.DesiredTGUpdates[tg.Name].Canary = uint64(len(placedCanaries)) } groupComplete := s.isDeploymentComplete(tg.Name, reconciliationResult, isCanarying) diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index 1a6b798e9f9..07400acea47 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -4187,15 +4187,13 @@ func TestSystemSched_CanariesWithInfeasibleNodes(t *testing.T) { existingAllocIDs := []string{} allocs := []*structs.Allocation{} - for i := range 4 { - if i%2 != 0 { - alloc := mock.MinAllocForJob(job) - alloc.ClientStatus = structs.AllocClientStatusRunning - alloc.NodeID = nodes[i].ID - alloc.Name = structs.AllocName(job.Name, job.TaskGroups[0].Name, 0) - existingAllocIDs = append(existingAllocIDs, alloc.ID) - allocs = append(allocs, alloc) - } + for _, eligibleNode := range eligible { + alloc := mock.MinAllocForJob(job) + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.NodeID = eligibleNode + alloc.Name = structs.AllocName(job.Name, job.TaskGroups[0].Name, 0) + existingAllocIDs = append(existingAllocIDs, alloc.ID) + allocs = append(allocs, alloc) } must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) From c6fbb8af7d5204323e3a214edb3dbc8424a32282 Mon Sep 17 00:00:00 2001 From: pkazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Fri, 31 Oct 2025 18:41:34 +0100 Subject: [PATCH 17/24] system scheduler: calculate deployment completion based on deployment state Previously we copied the behavior found in the generic scheduler, where we rely on reconciler results to decide if there's enough placements made. In the system scheduler we always know exactly how many placements there should be based on the DesiredTotal field of the deployment state, so a better way to check completeness of the deployment is to simplify it and base on dstate alone. 
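A compact illustration of the rule this commit message describes, mirroring the isDeploymentComplete change in the hunk below (sketch only; assumes the usual github.com/hashicorp/nomad/nomad/structs import):

    // groupCompleteSketch restates the dstate-based completeness check: a task
    // group's deployment is complete only when it is not canarying and every
    // desired allocation is already healthy.
    func groupCompleteSketch(dstate *structs.DeploymentState, isCanarying bool) bool {
        if dstate == nil || isCanarying {
            return false
        }
        return dstate.HealthyAllocs >= dstate.DesiredTotal
    }

With DesiredTotal pinned to the number of feasible nodes earlier in computeJobAllocs, a group with 3 feasible nodes and only 2 healthy allocations keeps the deployment running; it completes once the third allocation reports healthy.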
--- scheduler/scheduler_system.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 56aa39d9e6a..3346b07ec5b 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -424,7 +424,7 @@ func (s *SystemScheduler) computeJobAllocs() error { s.planAnnotations.DesiredTGUpdates[tg.Name].Canary = uint64(len(placedCanaries)) } - groupComplete := s.isDeploymentComplete(tg.Name, reconciliationResult, isCanarying) + groupComplete := s.isDeploymentComplete(dstate, isCanarying) deploymentComplete = deploymentComplete && groupComplete } @@ -877,18 +877,16 @@ func (s *SystemScheduler) evictUnneededCanaries(requiredCanaries int, tgName str return desiredCanaries } -func (s *SystemScheduler) isDeploymentComplete(groupName string, buckets *reconciler.NodeReconcileResult, isCanarying bool) bool { - complete := len(buckets.Place)+len(buckets.Migrate)+len(buckets.Update) == 0 - - if !complete || s.deployment == nil || isCanarying { +func (s *SystemScheduler) isDeploymentComplete(dstate *structs.DeploymentState, isCanarying bool) bool { + if s.deployment == nil || isCanarying { return false } + complete := true + // ensure everything is healthy - if dstate, ok := s.deployment.TaskGroups[groupName]; ok { - if dstate.HealthyAllocs < dstate.DesiredTotal { // Make sure we have enough healthy allocs - complete = false - } + if dstate.HealthyAllocs < dstate.DesiredTotal { // Make sure we have enough healthy allocs + complete = false } return complete From 06ebff0861b88a18671bb3e6968495e3c43e9ec7 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:11:34 +0100 Subject: [PATCH 18/24] system scheduler: remove obsolete limitReached property --- scheduler/scheduler_system.go | 11 ++--------- scheduler/scheduler_system_test.go | 3 +-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 3346b07ec5b..73f32f264cc 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -51,8 +51,6 @@ type SystemScheduler struct { nodesForTG map[string]taskGroupNodes // used to track node feasibility information for each TG filteredNodeMetricsForTG map[string]*structs.AllocMetric // used to track filtered node metrics for each TG - limitReached bool - failedTGAllocs map[string]*structs.AllocMetric queuedAllocs map[string]int planAnnotations *structs.PlanAnnotations @@ -338,7 +336,7 @@ func (s *SystemScheduler) computeJobAllocs() error { // Treat non in-place updates as an eviction and new placement, which will // be limited by max_parallel - s.limitReached = s.evictAndPlace(reconciliationResult, sstructs.StatusAllocUpdating) + s.evictAndPlace(reconciliationResult, sstructs.StatusAllocUpdating) if !s.job.Stopped() { for _, tg := range s.job.TaskGroups { @@ -729,7 +727,7 @@ func (s *SystemScheduler) canHandle(trigger string) bool { // evictAndPlace is used to mark allocations for evicts and add them to the // placement queue. evictAndPlace modifies the reconciler result. It returns // true if the limit has been reached for any task group. 
-func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResult, desc string) bool { +func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResult, desc string) { limits := map[string]int{} // per task group limits if !s.job.Stopped() { @@ -762,15 +760,12 @@ func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResu s.plan.NodeUpdate[node] = allocations[:n] } - limited := false for _, a := range reconciled.Update { if limit := limits[a.Alloc.TaskGroup]; limit > 0 { s.ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "") reconciled.Place = append(reconciled.Place, a) limits[a.Alloc.TaskGroup]-- - } else { - limited = true } } @@ -780,8 +775,6 @@ func (s *SystemScheduler) evictAndPlace(reconciled *reconciler.NodeReconcileResu maps.DeleteFunc(s.plan.NodeUpdate, func(k string, v []*structs.Allocation) bool { return len(v) == 0 }) - - return limited } // evictAndPlaceCanaries checks how many canaries are needed against the amount diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index 07400acea47..6415f9e8cd0 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -3345,8 +3345,7 @@ func TestEvictAndPlace(t *testing.T) { NodePreemptions: make(map[string][]*structs.Allocation), }} - must.Eq(t, tc.expectLimited, s.evictAndPlace(diff, ""), - must.Sprintf("limited")) + s.evictAndPlace(diff, "") must.Len(t, tc.expectPlace, diff.Place, must.Sprintf( "evictAndReplace() didn't insert into diffResult properly: %v", diff.Place)) }) From 55ed5625796c70c897006af79d5c2a13b4120341 Mon Sep 17 00:00:00 2001 From: pkazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 3 Nov 2025 21:45:38 +0100 Subject: [PATCH 19/24] system scheduler: handle old deployments correctly in the node reconciler In contrast to the cluster reconciler, in the node reconciler we go node-by-node as opposed to alloc-by-alloc, and thus the state of the reconciler has to be managed differently. If we override old deployments on every run of `cancelUnneededDeployments`, we end up with unnecessarily created deployments for job version that already had them. --- scheduler/reconciler/deployments.go | 57 ----------------------- scheduler/reconciler/reconcile_cluster.go | 53 ++++++++++++++++++++- scheduler/reconciler/reconcile_node.go | 53 +++++++++++++++++++-- 3 files changed, 102 insertions(+), 61 deletions(-) delete mode 100644 scheduler/reconciler/deployments.go diff --git a/scheduler/reconciler/deployments.go b/scheduler/reconciler/deployments.go deleted file mode 100644 index a43fbbedb89..00000000000 --- a/scheduler/reconciler/deployments.go +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) HashiCorp, Inc. -// SPDX-License-Identifier: BUSL-1.1 - -package reconciler - -import "github.com/hashicorp/nomad/nomad/structs" - -// cancelUnneededDeployments cancels any deployment that is not needed. -// A deployment update will be staged for jobs that should stop or have the -// wrong version. Unneeded deployments include: -// 1. Jobs that are marked for stop, but there is a non-terminal deployment. -// 2. Deployments that are active, but referencing a different job version. -// 3. Deployments that are already successful. -// -// returns: old deployment, current deployment and a slice of deployment status -// updates. 
-func cancelUnneededDeployments(j *structs.Job, d *structs.Deployment) (*structs.Deployment, *structs.Deployment, []*structs.DeploymentStatusUpdate) { - var updates []*structs.DeploymentStatusUpdate - - // If the job is stopped and there is a non-terminal deployment, cancel it - if j.Stopped() { - if d != nil && d.Active() { - updates = append(updates, &structs.DeploymentStatusUpdate{ - DeploymentID: d.ID, - Status: structs.DeploymentStatusCancelled, - StatusDescription: structs.DeploymentStatusDescriptionStoppedJob, - }) - } - - // Nothing else to do - return d, nil, updates - } - - if d == nil { - return nil, nil, nil - } - - // Check if the deployment is active and referencing an older job and cancel it - if d.JobCreateIndex != j.CreateIndex || d.JobVersion != j.Version { - if d.Active() { - updates = append(updates, &structs.DeploymentStatusUpdate{ - DeploymentID: d.ID, - Status: structs.DeploymentStatusCancelled, - StatusDescription: structs.DeploymentStatusDescriptionNewerJob, - }) - } - - return d, nil, updates - } - - // Clear it as the current deployment if it is successful - if d.Status == structs.DeploymentStatusSuccessful { - return d, nil, updates - } - - return nil, d, updates -} diff --git a/scheduler/reconciler/reconcile_cluster.go b/scheduler/reconciler/reconcile_cluster.go index 5469fa9f890..8ecbc119657 100644 --- a/scheduler/reconciler/reconcile_cluster.go +++ b/scheduler/reconciler/reconcile_cluster.go @@ -277,7 +277,7 @@ func (a *AllocReconciler) Compute() *ReconcileResults { // Create the allocation matrix m := newAllocMatrix(a.jobState.Job, a.jobState.ExistingAllocs) - a.jobState.DeploymentOld, a.jobState.DeploymentCurrent, result.DeploymentUpdates = cancelUnneededDeployments(a.jobState.Job, a.jobState.DeploymentCurrent) + a.jobState.DeploymentOld, a.jobState.DeploymentCurrent, result.DeploymentUpdates = cancelUnneededServiceDeployments(a.jobState.Job, a.jobState.DeploymentCurrent) // If we are just stopping a job we do not need to do anything more than // stopping all running allocs @@ -569,6 +569,57 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe return result, deploymentComplete } +// cancelUnneededServiceDeployments cancels any deployment that is not needed. +// A deployment update will be staged for jobs that should stop or have the +// wrong version. Unneeded deployments include: +// 1. Jobs that are marked for stop, but there is a non-terminal deployment. +// 2. Deployments that are active, but referencing a different job version. +// 3. Deployments that are already successful. +// +// returns: old deployment, current deployment and a slice of deployment status +// updates. 
+func cancelUnneededServiceDeployments(j *structs.Job, d *structs.Deployment) (*structs.Deployment, *structs.Deployment, []*structs.DeploymentStatusUpdate) {
+	var updates []*structs.DeploymentStatusUpdate
+
+	// If the job is stopped and there is a non-terminal deployment, cancel it
+	if j.Stopped() {
+		if d != nil && d.Active() {
+			updates = append(updates, &structs.DeploymentStatusUpdate{
+				DeploymentID:      d.ID,
+				Status:            structs.DeploymentStatusCancelled,
+				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
+			})
+		}
+
+		// Nothing else to do
+		return d, nil, updates
+	}
+
+	if d == nil {
+		return nil, nil, nil
+	}
+
+	// Check if the deployment is active and referencing an older job and cancel it
+	if d.JobCreateIndex != j.CreateIndex || d.JobVersion != j.Version {
+		if d.Active() {
+			updates = append(updates, &structs.DeploymentStatusUpdate{
+				DeploymentID:      d.ID,
+				Status:            structs.DeploymentStatusCancelled,
+				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
+			})
+		}
+
+		return d, nil, updates
+	}
+
+	// Clear it as the current deployment if it is successful
+	if d.Status == structs.DeploymentStatusSuccessful {
+		return d, nil, updates
+	}
+
+	return nil, d, updates
+}
+
 // setDeploymentStatusAndUpdates sets status for a.deployment if necessary and
 // returns an array of DeploymentStatusUpdates.
 func (a *AllocReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, createdDeployment *structs.Deployment) []*structs.DeploymentStatusUpdate {
diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go
index 0366b1d849c..075d04f8936 100644
--- a/scheduler/reconciler/reconcile_node.go
+++ b/scheduler/reconciler/reconcile_node.go
@@ -109,9 +109,7 @@ func (nr *NodeReconciler) computeForNode(
 	result := new(NodeReconcileResult)
 
 	// cancel deployments that aren't needed anymore
-	var deploymentUpdates []*structs.DeploymentStatusUpdate
-	nr.DeploymentOld, nr.DeploymentCurrent, deploymentUpdates = cancelUnneededDeployments(job, nr.DeploymentCurrent)
-	nr.DeploymentUpdates = append(nr.DeploymentUpdates, deploymentUpdates...)
+	nr.cancelUnnededSystemDeployments(job)
 
 	// set deployment paused and failed, if we currently have a deployment
 	var deploymentPaused, deploymentFailed bool
@@ -416,6 +414,49 @@ func (nr *NodeReconciler) computeForNode(
 	return result
 }
 
+// cancelUnnededSystemDeployments cancels any deployment that is not needed.
+// A deployment update will be staged for jobs that should stop or have the
+// wrong version. Unneeded deployments include:
+// 1. Jobs that are marked for stop, but there is a non-terminal deployment.
+// 2. Deployments that are active, but referencing a different job version.
+// 3. Deployments that are already successful.
+func (nr *NodeReconciler) cancelUnnededSystemDeployments(j *structs.Job) { + // If the job is stopped and there is a non-terminal deployment, cancel it + if j.Stopped() { + if nr.DeploymentCurrent != nil && nr.DeploymentCurrent.Active() { + nr.DeploymentUpdates = append(nr.DeploymentUpdates, &structs.DeploymentStatusUpdate{ + DeploymentID: nr.DeploymentCurrent.ID, + Status: structs.DeploymentStatusCancelled, + StatusDescription: structs.DeploymentStatusDescriptionStoppedJob, + }) + } + + // Nothing else to do + return + } + + if nr.DeploymentCurrent == nil { + return + } + + // Check if the deployment is active and referencing an older job and cancel it + if nr.DeploymentCurrent.JobCreateIndex != j.CreateIndex || nr.DeploymentCurrent.JobVersion != j.Version { + if nr.DeploymentCurrent.Active() { + nr.DeploymentUpdates = append(nr.DeploymentUpdates, &structs.DeploymentStatusUpdate{ + DeploymentID: nr.DeploymentCurrent.ID, + Status: structs.DeploymentStatusCancelled, + StatusDescription: structs.DeploymentStatusDescriptionNewerJob, + }) + } + } + + // Clear it as the current deployment if it is successful + if nr.DeploymentCurrent.Status == structs.DeploymentStatusSuccessful { + nr.DeploymentOld = nr.DeploymentCurrent + nr.DeploymentCurrent = nil + } +} + func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGroup, dstate *structs.DeploymentState, updates int, allocs []*structs.Allocation, terminal map[string]*structs.Allocation) { @@ -424,6 +465,12 @@ func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGro return } + // if there's an old deployment for the same job version as we're + // processing, never create a new one + if nr.DeploymentOld != nil && nr.DeploymentOld.JobVersion == job.Version { + return + } + updatingSpec := updates != 0 hadRunning := false From f8fbb95a14e8ab5c118fdbc817ee881e80537bc0 Mon Sep 17 00:00:00 2001 From: Chris Roberts Date: Mon, 3 Nov 2025 18:40:56 -0800 Subject: [PATCH 20/24] system scheduler: unset current deployment when canceling --- scheduler/reconciler/reconcile_node.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go index 075d04f8936..612e4f0a5df 100644 --- a/scheduler/reconciler/reconcile_node.go +++ b/scheduler/reconciler/reconcile_node.go @@ -447,6 +447,10 @@ func (nr *NodeReconciler) cancelUnnededSystemDeployments(j *structs.Job) { Status: structs.DeploymentStatusCancelled, StatusDescription: structs.DeploymentStatusDescriptionNewerJob, }) + + nr.DeploymentOld = nr.DeploymentCurrent + nr.DeploymentCurrent = nil + return } } @@ -478,9 +482,7 @@ func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGro return a.Job.ID == job.ID && a.Job.Version == job.Version && a.Job.CreateIndex == job.CreateIndex } - if slices.ContainsFunc(allocs, func(alloc *structs.Allocation) bool { - return hadRunningCondition(alloc) - }) { + if slices.ContainsFunc(allocs, hadRunningCondition) { nr.compatHasSameVersionAllocs = true hadRunning = true } From c844d1378c7472be67d362ca948daff7c8046a9f Mon Sep 17 00:00:00 2001 From: Chris Roberts Date: Mon, 3 Nov 2025 18:44:18 -0800 Subject: [PATCH 21/24] system scheduler: stop ineligible allocs if job modified --- scheduler/reconciler/reconcile_node.go | 14 ++++++- scheduler/scheduler_system_test.go | 52 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/scheduler/reconciler/reconcile_node.go 
b/scheduler/reconciler/reconcile_node.go
index 612e4f0a5df..0dbdad92a3a 100644
--- a/scheduler/reconciler/reconcile_node.go
+++ b/scheduler/reconciler/reconcile_node.go
@@ -265,8 +265,20 @@ func (nr *NodeReconciler) computeForNode(
 			}
 
 			// For an existing allocation, if the nodeID is no longer
-			// eligible, the diff should be ignored
+			// eligible, the diff should be ignored unless the job
+			// definition has been updated. If the definition has been
+			// updated, stop the allocation.
 			if _, ineligible := notReadyNodes[nodeID]; ineligible {
+				if job.ModifyIndex != alloc.Job.JobModifyIndex {
+					result.Stop = append(result.Stop, AllocTuple{
+						Name:      name,
+						TaskGroup: tg,
+						Alloc:     alloc,
+					})
+
+					continue
+				}
+
 				goto IGNORE
 			}
 
diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go
index 6415f9e8cd0..c6baae9ec8e 100644
--- a/scheduler/scheduler_system_test.go
+++ b/scheduler/scheduler_system_test.go
@@ -3416,6 +3416,8 @@ func TestSystemSched_UpdateBlock(t *testing.T) {
 		expectAllocs map[string]int // plan NodeAllocations group -> count
 		expectStop   map[string]int // plan NodeUpdates group -> count
 		expectDState map[string]*structs.DeploymentState
+
+		ineligibleNodes int // number of nodes to mark as ineligible
 	}{
 		{
 			name: "legacy upgrade non-deployment",
@@ -3756,6 +3758,52 @@
 				tg2: {DesiredTotal: 10, PlacedAllocs: 10},
 			},
 		},
+
+		{
+			name: "deployment complete with ineligible nodes",
+			tg1UpdateBlock: &structs.UpdateStrategy{
+				MaxParallel: 10,
+				Canary:      30,
+				AutoPromote: true,
+			},
+			existingPrevious: map[string][]int{
+				tg1: {0, 1, 2, 3, 4, 5, 6},
+			},
+			existingRunning: map[string][]int{
+				tg1: {7, 8, 9},
+				tg2: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+			},
+			existingCanary: map[string][]int{
+				tg1: {7, 8, 9},
+			},
+			existingCurrentDState: map[string]*structs.DeploymentState{
+				tg1: {
+					Promoted:        true,
+					PlacedCanaries:  []string{"7", "8", "9"},
+					DesiredCanaries: 3,
+					DesiredTotal:    10,
+					PlacedAllocs:    3,
+					HealthyAllocs:   3,
+					UnhealthyAllocs: 0,
+				},
+				tg2: {DesiredTotal: 10, PlacedAllocs: 10, HealthyAllocs: 10},
+			},
+			expectAllocs: map[string]int{tg1: 5}, // 7 to replace minus 2 on ineligible nodes
+			expectStop:   map[string]int{tg1: 7}, // stop all previous versions
+			expectDState: map[string]*structs.DeploymentState{
+				tg1: {
+					DesiredTotal:    8, // 10 nodes minus 2 ineligible nodes
+					DesiredCanaries: 3,
+					PlacedCanaries:  []string{"7", "8", "9"},
+					PlacedAllocs:    8,
+				},
+				tg2: {
+					DesiredTotal: 8,  // 10 nodes minus 2 ineligible nodes
+					PlacedAllocs: 10, // New allocations were already placed
+				},
+			},
+			ineligibleNodes: 2,
+		},
 	}
 
 	for _, tc := range testCases {
@@ -3764,6 +3812,10 @@
 			h := tests.NewHarness(t)
 			nodes := createNodes(t, h, 10)
 
+			for i := range tc.ineligibleNodes {
+				nodes[i].SchedulingEligibility = structs.NodeSchedulingIneligible
+			}
+
 			oldJob := mock.SystemJob()
 			oldJob.TaskGroups[0].Update = tc.tg1UpdateBlock
 			oldJob.TaskGroups[0].Name = tg1

From bde8049a4a5bd71e2585b331f46a775a5de57245 Mon Sep 17 00:00:00 2001
From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com>
Date: Tue, 4 Nov 2025 09:52:55 +0100
Subject: [PATCH 22/24] system scheduler: unflake
 TestSystemSched_evictUnneededCanaries

---
 scheduler/scheduler_system_test.go | 62 +++++++-----------------
 1 file changed, 14 insertions(+), 48 deletions(-)

diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go
index c6baae9ec8e..8a93eeb3097 100644 ---
a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -4010,7 +4010,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { tgName string nodeAllocation map[string][]*structs.Allocation expectedDesiredCanaries int - expectedNodeAllocation map[string][]*structs.Allocation + expectedNodeAllocation []string }{ { name: "no required canaries", @@ -4031,48 +4031,27 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, TaskGroup: "tg1", }, - { - DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, - TaskGroup: "tg2", - }, - }, - "node2": { { ID: "tg1_alloc2", - DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, - TaskGroup: "tg1", - }, - { - DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, - TaskGroup: "tg2", - }, - }, - }, - expectedDesiredCanaries: 0, - expectedNodeAllocation: map[string][]*structs.Allocation{ - "node1": { - { - ID: "tg1_alloc1", - DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, - TaskGroup: "tg1", - }, - { DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "tg2", }, }, "node2": { { - ID: "tg1_alloc2", + ID: "tg2_alloc1", DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}, TaskGroup: "tg1", }, { + ID: "tg2_alloc2", DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "tg2", }, }, }, + expectedDesiredCanaries: 0, + expectedNodeAllocation: []string{"tg1_alloc1", "tg1_alloc2", "tg2_alloc1", "tg2_alloc2"}, }, { name: "existing allocs for 2 task groups with canaries", @@ -4105,27 +4084,7 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { }, }, expectedDesiredCanaries: 1, - expectedNodeAllocation: map[string][]*structs.Allocation{ - "node1": { - { - ID: "tg1_alloc1", - DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, - TaskGroup: "tg1", - }, - { - ID: "tg2_alloc1", - DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, - TaskGroup: "tg2", - }, - }, - "node2": { - { - ID: "tg2_alloc2", - DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, - TaskGroup: "tg2", - }, - }, - }, + expectedNodeAllocation: []string{"tg1_alloc1", "tg2_alloc1", "tg2_alloc2"}, }, } for _, tt := range tests { @@ -4136,7 +4095,14 @@ func TestSystemSched_evictUnneededCanaries(t *testing.T) { s.plan.NodeAllocation = tt.nodeAllocation must.SliceLen(t, tt.expectedDesiredCanaries, s.evictUnneededCanaries(tt.requiredCanaries, tt.tgName, &reconciler.NodeReconcileResult{}), must.Sprint("unexpected desired canaries")) - must.Eq(t, tt.expectedNodeAllocation, s.plan.NodeAllocation, must.Sprintf("unexpected node allocation")) + allocsOnNodes := []*structs.Allocation{} + for _, a := range s.plan.NodeAllocation { + allocsOnNodes = append(allocsOnNodes, a...) 
+ } + must.SliceContainsAllFunc(t, allocsOnNodes, tt.expectedNodeAllocation, + func(a *structs.Allocation, id string) bool { + return a.ID == id + }) }) } } From 690e26509125d9003dce6db26b8a6513a3f513dc Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:33:25 +0100 Subject: [PATCH 23/24] system scheduler: correct a typo in the node reconciler incorrect condition for detecting jobspec change on ineligible nodes --- scheduler/reconciler/reconcile_node.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go index 0dbdad92a3a..9cbdc0f29eb 100644 --- a/scheduler/reconciler/reconcile_node.go +++ b/scheduler/reconciler/reconcile_node.go @@ -269,7 +269,7 @@ func (nr *NodeReconciler) computeForNode( // definition has been updated. If the definition has been // updated, stop the allocation. if _, ineligible := notReadyNodes[nodeID]; ineligible { - if job.ModifyIndex != alloc.Job.JobModifyIndex { + if job.JobModifyIndex != alloc.Job.JobModifyIndex { result.Stop = append(result.Stop, AllocTuple{ Name: name, TaskGroup: tg, From f9b6c1f17dfb29dd2fd2a93f539b1004d3119613 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 4 Nov 2025 12:03:25 +0100 Subject: [PATCH 24/24] e2e: correction to TestSystemScheduler/testCanaryDeploymentToAllEligibleNodes --- e2e/scheduler_system/systemsched_test.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/e2e/scheduler_system/systemsched_test.go b/e2e/scheduler_system/systemsched_test.go index ee17ada6d91..dfa27a07b49 100644 --- a/e2e/scheduler_system/systemsched_test.go +++ b/e2e/scheduler_system/systemsched_test.go @@ -218,16 +218,6 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) { ) t.Cleanup(cleanup2) - // how many eligible nodes do we have? - nodesApi := job2.NodesApi() - nodesList, _, err := nodesApi.List(nil) - must.Nil(t, err) - must.SliceNotEmpty(t, nodesList) - - // Get updated allocations - allocs := job2.Allocs() - must.SliceNotEmpty(t, allocs) - deploymentsApi := job2.DeploymentsApi() deploymentsList, _, err := deploymentsApi.List(nil) must.NoError(t, err) @@ -253,6 +243,10 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) { return false }) + // Get updated allocations + allocs := job2.Allocs() + must.SliceNotEmpty(t, allocs) + // find allocations from v1 version of the job, they should all be canaries count := 0 for _, a := range allocs { @@ -263,7 +257,10 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) { } must.Eq(t, len(initialAllocs), count, must.Sprint("expected canaries to be placed on all eligible nodes")) + updatedDeployment, _, err := deploymentsApi.Info(deployment.ID, nil) + must.NoError(t, err) + // deployment must not be terminal and needs to have the right status // description set - must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, deployment.StatusDescription) + must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, updatedDeployment.StatusDescription) }
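The series changes how the node reconciler treats allocations on ineligible nodes (PATCH 21) and corrects the comparison it uses (PATCH 23). As an illustration only, not part of any patch above, here is a small standalone Go sketch of that decision; the types and the helper below (job, allocation, decideForIneligibleNode) are simplified hypothetical stand-ins, not Nomad's real structs:

package main

import "fmt"

// Simplified stand-in types; the actual scheduler compares the job's
// JobModifyIndex against the index recorded on the allocation's job copy.
type job struct {
	JobModifyIndex uint64
}

type allocation struct {
	ID  string
	Job *job
}

// decideForIneligibleNode sketches the ineligible-node branch: keep (ignore)
// the allocation while it still matches the current job definition, stop it
// once the definition has changed.
func decideForIneligibleNode(current *job, alloc *allocation) string {
	if current.JobModifyIndex != alloc.Job.JobModifyIndex {
		return "stop"
	}
	return "ignore"
}

func main() {
	current := &job{JobModifyIndex: 7}

	unchanged := &allocation{ID: "alloc-same-version", Job: &job{JobModifyIndex: 7}}
	outdated := &allocation{ID: "alloc-old-version", Job: &job{JobModifyIndex: 5}}

	fmt.Println(unchanged.ID, "->", decideForIneligibleNode(current, unchanged)) // ignore
	fmt.Println(outdated.ID, "->", decideForIneligibleNode(current, outdated))   // stop
}

Running the sketch prints "ignore" for the up-to-date allocation and "stop" for the outdated one, which mirrors the goto IGNORE versus result.Stop branches in computeForNode.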