Skip to content

Commit 00e69d9

Browse files
authored
Merge pull request #26953 from hashicorp/f-system-deployments-canaries-evict-refactor
scheduler: perform feasibility checks for system canaries before computing placements
2 parents 6235838 + f9b6c1f commit 00e69d9

File tree

12 files changed

+1540
-916
lines changed

12 files changed

+1540
-916
lines changed

e2e/scheduler_system/systemsched_test.go

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -218,16 +218,6 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
218218
)
219219
t.Cleanup(cleanup2)
220220

221-
// how many eligible nodes do we have?
222-
nodesApi := job2.NodesApi()
223-
nodesList, _, err := nodesApi.List(nil)
224-
must.Nil(t, err)
225-
must.SliceNotEmpty(t, nodesList)
226-
227-
// Get updated allocations
228-
allocs := job2.Allocs()
229-
must.SliceNotEmpty(t, allocs)
230-
231221
deploymentsApi := job2.DeploymentsApi()
232222
deploymentsList, _, err := deploymentsApi.List(nil)
233223
must.NoError(t, err)
@@ -253,6 +243,10 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
253243
return false
254244
})
255245

246+
// Get updated allocations
247+
allocs := job2.Allocs()
248+
must.SliceNotEmpty(t, allocs)
249+
256250
// find allocations from v1 version of the job, they should all be canaries
257251
count := 0
258252
for _, a := range allocs {
@@ -263,7 +257,10 @@ func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
263257
}
264258
must.Eq(t, len(initialAllocs), count, must.Sprint("expected canaries to be placed on all eligible nodes"))
265259

260+
updatedDeployment, _, err := deploymentsApi.Info(deployment.ID, nil)
261+
must.NoError(t, err)
262+
266263
// deployment must not be terminal and needs to have the right status
267264
// description set
268-
must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, deployment.StatusDescription)
265+
must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, updatedDeployment.StatusDescription)
269266
}

scheduler/feasible/context.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,13 @@ func NewEvalEligibility() *EvalEligibility {
252252
}
253253
}
254254

255+
// Reset clears the contents of the eval eligibility
256+
func (e *EvalEligibility) Reset() {
257+
e.job = make(map[string]ComputedClassFeasibility)
258+
e.taskGroups = make(map[string]map[string]ComputedClassFeasibility)
259+
e.tgEscapedConstraints = make(map[string]bool)
260+
}
261+
255262
// SetJob takes the job being evaluated and calculates the escaped constraints
256263
// at the job and task group level.
257264
func (e *EvalEligibility) SetJob(job *structs.Job) {

scheduler/feasible/stack.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,11 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran
352352
// Reset the binpack selector and context
353353
s.scoreNorm.Reset()
354354
s.ctx.Reset()
355+
356+
// Since the system stack is always evaluating a single
357+
// node, previous eligibility information is not applicable
358+
// so reset it
359+
s.ctx.Eligibility().Reset()
355360
start := time.Now()
356361

357362
// Get the task groups constraints.

scheduler/reconciler/deployments.go

Lines changed: 0 additions & 57 deletions
This file was deleted.

scheduler/reconciler/reconcile_cluster.go

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ func (a *AllocReconciler) Compute() *ReconcileResults {
277277
// Create the allocation matrix
278278
m := newAllocMatrix(a.jobState.Job, a.jobState.ExistingAllocs)
279279

280-
a.jobState.DeploymentOld, a.jobState.DeploymentCurrent, result.DeploymentUpdates = cancelUnneededDeployments(a.jobState.Job, a.jobState.DeploymentCurrent)
280+
a.jobState.DeploymentOld, a.jobState.DeploymentCurrent, result.DeploymentUpdates = cancelUnneededServiceDeployments(a.jobState.Job, a.jobState.DeploymentCurrent)
281281

282282
// If we are just stopping a job we do not need to do anything more than
283283
// stopping all running allocs
@@ -569,6 +569,57 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe
569569
return result, deploymentComplete
570570
}
571571

572+
// cancelUnneededServiceDeployments cancels any deployment that is not needed.
573+
// A deployment update will be staged for jobs that should stop or have the
574+
// wrong version. Unneeded deployments include:
575+
// 1. Jobs that are marked for stop, but there is a non-terminal deployment.
576+
// 2. Deployments that are active, but referencing a different job version.
577+
// 3. Deployments that are already successful.
578+
//
579+
// returns: old deployment, current deployment and a slice of deployment status
580+
// updates.
581+
func cancelUnneededServiceDeployments(j *structs.Job, d *structs.Deployment) (*structs.Deployment, *structs.Deployment, []*structs.DeploymentStatusUpdate) {
582+
var updates []*structs.DeploymentStatusUpdate
583+
584+
// If the job is stopped and there is a non-terminal deployment, cancel it
585+
if j.Stopped() {
586+
if d != nil && d.Active() {
587+
updates = append(updates, &structs.DeploymentStatusUpdate{
588+
DeploymentID: d.ID,
589+
Status: structs.DeploymentStatusCancelled,
590+
StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
591+
})
592+
}
593+
594+
// Nothing else to do
595+
return d, nil, updates
596+
}
597+
598+
if d == nil {
599+
return nil, nil, nil
600+
}
601+
602+
// Check if the deployment is active and referencing an older job and cancel it
603+
if d.JobCreateIndex != j.CreateIndex || d.JobVersion != j.Version {
604+
if d.Active() {
605+
updates = append(updates, &structs.DeploymentStatusUpdate{
606+
DeploymentID: d.ID,
607+
Status: structs.DeploymentStatusCancelled,
608+
StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
609+
})
610+
}
611+
612+
return d, nil, updates
613+
}
614+
615+
// Clear it as the current deployment if it is successful
616+
if d.Status == structs.DeploymentStatusSuccessful {
617+
return d, nil, updates
618+
}
619+
620+
return nil, d, updates
621+
}
622+
572623
// setDeploymentStatusAndUpdates sets status for a.deployment if necessary and
573624
// returns an array of DeploymentStatusUpdates.
574625
func (a *AllocReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, createdDeployment *structs.Deployment) []*structs.DeploymentStatusUpdate {

0 commit comments

Comments
 (0)