diff --git a/internal/controller/nodereadinessrule_controller.go b/internal/controller/nodereadinessrule_controller.go index 09ab6e1..f9d2324 100644 --- a/internal/controller/nodereadinessrule_controller.go +++ b/internal/controller/nodereadinessrule_controller.go @@ -160,15 +160,15 @@ func (r *RuleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. } } - // Update rule status - if err := r.Controller.updateRuleStatus(ctx, rule); err != nil { - log.Error(err, "Failed to update rule status", "rule", rule.Name) + // Clean up status for deleted nodes before persisting to avoid writing stale entries. + if err := r.Controller.cleanupDeletedNodes(ctx, rule, nodeList); err != nil { + log.Error(err, "Failed to clean up deleted nodes", "rule", rule.Name) return ctrl.Result{RequeueAfter: time.Minute}, err } - // Clean up status for deleted nodes - if err := r.Controller.cleanupDeletedNodes(ctx, rule, nodeList); err != nil { - log.Error(err, "Failed to clean up deleted nodes", "rule", rule.Name) + // Update rule status + if err := r.Controller.updateRuleStatus(ctx, rule); err != nil { + log.Error(err, "Failed to update rule status", "rule", rule.Name) return ctrl.Result{RequeueAfter: time.Minute}, err } @@ -214,7 +214,7 @@ func (r *RuleReadinessController) cleanupDeletedNodes(ctx context.Context, rule existingNodes[node.Name] = true } - // Filter out deleted nodes + // Filter out deleted nodes from NodeEvaluations var newNodeEvaluations []readinessv1alpha1.NodeEvaluation for _, evaluation := range rule.Status.NodeEvaluations { if existingNodes[evaluation.NodeName] { @@ -222,38 +222,31 @@ func (r *RuleReadinessController) cleanupDeletedNodes(ctx context.Context, rule } } - if len(newNodeEvaluations) == len(rule.Status.NodeEvaluations) { + // Filter out deleted nodes from FailedNodes + var newFailedNodes []readinessv1alpha1.NodeFailure + for _, failure := range rule.Status.FailedNodes { + if existingNodes[failure.NodeName] { + newFailedNodes = append(newFailedNodes, failure) + } + } + + if len(newNodeEvaluations) == len(rule.Status.NodeEvaluations) && len(newFailedNodes) == len(rule.Status.FailedNodes) { log.V(4).Info("No deleted nodes to clean up", "rule", rule.Name) return nil } log.V(4).Info("Cleaning up deleted nodes from rule status", "rule", rule.Name, - "before", len(rule.Status.NodeEvaluations), - "after", len(newNodeEvaluations)) + "evaluationsBefore", len(rule.Status.NodeEvaluations), + "evaluationsAfter", len(newNodeEvaluations), + "failedBefore", len(rule.Status.FailedNodes), + "failedAfter", len(newFailedNodes)) - // Use retry on conflict to update status to avoid race conditions from node updates - return retry.RetryOnConflict(retry.DefaultRetry, func() error { - fresh := &readinessv1alpha1.NodeReadinessRule{} - if err := r.Get(ctx, client.ObjectKey{Name: rule.Name}, fresh); err != nil { - return err - } + // Update in-memory state so the subsequent status patch uses the cleaned evaluations. + rule.Status.NodeEvaluations = newNodeEvaluations + rule.Status.FailedNodes = newFailedNodes - var freshNodeEvaluations []readinessv1alpha1.NodeEvaluation - for _, evaluation := range fresh.Status.NodeEvaluations { - if existingNodes[evaluation.NodeName] { - freshNodeEvaluations = append(freshNodeEvaluations, evaluation) - } - } - - if len(freshNodeEvaluations) == len(fresh.Status.NodeEvaluations) { - return nil - } - - patch := client.MergeFrom(fresh.DeepCopy()) - fresh.Status.NodeEvaluations = freshNodeEvaluations - return r.Status().Patch(ctx, fresh, patch) - }) + return nil } // processAllNodesForRule processes all nodes when a rule changes. diff --git a/internal/controller/nodereadinessrule_controller_test.go b/internal/controller/nodereadinessrule_controller_test.go index a1e02a6..80ce239 100644 --- a/internal/controller/nodereadinessrule_controller_test.go +++ b/internal/controller/nodereadinessrule_controller_test.go @@ -1412,6 +1412,50 @@ var _ = Describe("NodeReadinessRule Controller", func() { return false }, time.Second*5).Should(BeTrue()) }) + + It("should not write stale NodeEvaluations to status before cleanup runs", func() { + // This is a regression test for the ordering fix: cleanup must happen + // before updateRuleStatus so that the single status patch never contains + // an entry for a deleted node. Prior to the fix, cleanupDeletedNodes ran + // after updateRuleStatus, creating a dirty stale-write window. + + // Initial reconcile to populate status with both nodes. + _, err := ruleReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: "delete-node-rule"}}) + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() int { + updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} + _ = k8sClient.Get(ctx, types.NamespacedName{Name: "delete-node-rule"}, updatedRule) + return len(updatedRule.Status.NodeEvaluations) + }, time.Second*5).Should(Equal(2)) + + // Delete node1. + Expect(k8sClient.Delete(ctx, node1)).To(Succeed()) + + // Reconcile once. With the fix, the single status patch must already + // contain only node2 — no intermediate state with node1 is written. + _, err = ruleReconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: "delete-node-rule"}}) + Expect(err).NotTo(HaveOccurred()) + + // The status fetched immediately after reconcile must not contain node1. + // Without the ordering fix this would be flaky or fail because an + // intermediate patch wrote the stale entry first. + updatedRule := &nodereadinessiov1alpha1.NodeReadinessRule{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "delete-node-rule"}, updatedRule)).To(Succeed()) + + nodeNames := make([]string, 0, len(updatedRule.Status.NodeEvaluations)) + for _, eval := range updatedRule.Status.NodeEvaluations { + nodeNames = append(nodeNames, eval.NodeName) + } + Expect(nodeNames).NotTo(ContainElement("node1"), "deleted node must not appear in the NodeEvaluations status patch written during reconcile") + Expect(nodeNames).To(ContainElement("node2"), "surviving node must remain in NodeEvaluations status") + + failedNodeNames := make([]string, 0, len(updatedRule.Status.FailedNodes)) + for _, eval := range updatedRule.Status.FailedNodes { + failedNodeNames = append(failedNodeNames, eval.NodeName) + } + Expect(failedNodeNames).NotTo(ContainElement("node1"), "deleted node must not appear in the FailedNodes status patch written during reconcile") + }) }) Context("when a rule's nodeSelector is modified", func() {