Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 24 additions & 31 deletions internal/controller/nodereadinessrule_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,15 +160,15 @@ func (r *RuleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
}
}

// Update rule status
if err := r.Controller.updateRuleStatus(ctx, rule); err != nil {
log.Error(err, "Failed to update rule status", "rule", rule.Name)
// Clean up status for deleted nodes before persisting to avoid writing stale entries.
if err := r.Controller.cleanupDeletedNodes(ctx, rule, nodeList); err != nil {
log.Error(err, "Failed to clean up deleted nodes", "rule", rule.Name)
return ctrl.Result{RequeueAfter: time.Minute}, err
}

// Clean up status for deleted nodes
if err := r.Controller.cleanupDeletedNodes(ctx, rule, nodeList); err != nil {
log.Error(err, "Failed to clean up deleted nodes", "rule", rule.Name)
// Update rule status
if err := r.Controller.updateRuleStatus(ctx, rule); err != nil {
log.Error(err, "Failed to update rule status", "rule", rule.Name)
return ctrl.Result{RequeueAfter: time.Minute}, err
}

Expand Down Expand Up @@ -214,46 +214,39 @@ func (r *RuleReadinessController) cleanupDeletedNodes(ctx context.Context, rule
existingNodes[node.Name] = true
}

// Filter out deleted nodes
// Filter out deleted nodes from NodeEvaluations
var newNodeEvaluations []readinessv1alpha1.NodeEvaluation
for _, evaluation := range rule.Status.NodeEvaluations {
if existingNodes[evaluation.NodeName] {
newNodeEvaluations = append(newNodeEvaluations, evaluation)
}
}

if len(newNodeEvaluations) == len(rule.Status.NodeEvaluations) {
// Filter out deleted nodes from FailedNodes
var newFailedNodes []readinessv1alpha1.NodeFailure
for _, failure := range rule.Status.FailedNodes {
if existingNodes[failure.NodeName] {
newFailedNodes = append(newFailedNodes, failure)
}
}

if len(newNodeEvaluations) == len(rule.Status.NodeEvaluations) && len(newFailedNodes) == len(rule.Status.FailedNodes) {
log.V(4).Info("No deleted nodes to clean up", "rule", rule.Name)
return nil
}

log.V(4).Info("Cleaning up deleted nodes from rule status",
"rule", rule.Name,
"before", len(rule.Status.NodeEvaluations),
"after", len(newNodeEvaluations))
"evaluationsBefore", len(rule.Status.NodeEvaluations),
"evaluationsAfter", len(newNodeEvaluations),
"failedBefore", len(rule.Status.FailedNodes),
"failedAfter", len(newFailedNodes))

// Use retry on conflict to update status to avoid race conditions from node updates
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
fresh := &readinessv1alpha1.NodeReadinessRule{}
if err := r.Get(ctx, client.ObjectKey{Name: rule.Name}, fresh); err != nil {
return err
}
// Update in-memory state so the subsequent status patch uses the cleaned evaluations.
rule.Status.NodeEvaluations = newNodeEvaluations
rule.Status.FailedNodes = newFailedNodes

var freshNodeEvaluations []readinessv1alpha1.NodeEvaluation
for _, evaluation := range fresh.Status.NodeEvaluations {
if existingNodes[evaluation.NodeName] {
freshNodeEvaluations = append(freshNodeEvaluations, evaluation)
}
}

if len(freshNodeEvaluations) == len(fresh.Status.NodeEvaluations) {
return nil
}

patch := client.MergeFrom(fresh.DeepCopy())
fresh.Status.NodeEvaluations = freshNodeEvaluations
return r.Status().Patch(ctx, fresh, patch)
})
return nil
}

// processAllNodesForRule processes all nodes when a rule changes.
Expand Down
44 changes: 44 additions & 0 deletions internal/controller/nodereadinessrule_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1412,6 +1412,50 @@ var _ = Describe("NodeReadinessRule Controller", func() {
return false
}, time.Second*5).Should(BeTrue())
})

It("should not write stale NodeEvaluations to status before cleanup runs", func() {
// This is a regression test for the ordering fix: cleanup must happen
// before updateRuleStatus so that the single status patch never contains
// an entry for a deleted node. Prior to the fix, cleanupDeletedNodes ran
// after updateRuleStatus, creating a dirty stale-write window.

// Initial reconcile to populate status with both nodes.
_, err := ruleReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: "delete-node-rule"}})
Expect(err).NotTo(HaveOccurred())

Eventually(func() int {
updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{}
_ = k8sClient.Get(ctx, types.NamespacedName{Name: "delete-node-rule"}, updatedRule)
return len(updatedRule.Status.NodeEvaluations)
}, time.Second*5).Should(Equal(2))

// Delete node1.
Expect(k8sClient.Delete(ctx, node1)).To(Succeed())

// Reconcile once. With the fix, the single status patch must already
// contain only node2 — no intermediate state with node1 is written.
_, err = ruleReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: "delete-node-rule"}})
Expect(err).NotTo(HaveOccurred())

// The status fetched immediately after reconcile must not contain node1.
// Without the ordering fix this would be flaky or fail because an
// intermediate patch wrote the stale entry first.
updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "delete-node-rule"}, updatedRule)).To(Succeed())

nodeNames := make([]string, 0, len(updatedRule.Status.NodeEvaluations))
for _, eval := range updatedRule.Status.NodeEvaluations {
nodeNames = append(nodeNames, eval.NodeName)
}
Expect(nodeNames).NotTo(ContainElement("node1"), "deleted node must not appear in the NodeEvaluations status patch written during reconcile")
Expect(nodeNames).To(ContainElement("node2"), "surviving node must remain in NodeEvaluations status")

failedNodeNames := make([]string, 0, len(updatedRule.Status.FailedNodes))
for _, eval := range updatedRule.Status.FailedNodes {
failedNodeNames = append(failedNodeNames, eval.NodeName)
}
Expect(failedNodeNames).NotTo(ContainElement("node1"), "deleted node must not appear in the FailedNodes status patch written during reconcile")
})
})

Context("when a rule's nodeSelector is modified", func() {
Expand Down