Skip to content

Commit e01faf0

Browse files
fix: fix unexpected alert race condition (#921)
--------- Co-authored-by: Zhiying Lin <[email protected]> Co-authored-by: Britania Rodriguez Reyes <[email protected]>
1 parent a8c5a2e commit e01faf0

File tree

4 files changed

+156
-111
lines changed

4 files changed

+156
-111
lines changed

pkg/controllers/workgenerator/controller.go

+43-13
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,21 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
118118
return controllerruntime.Result{}, err
119119
}
120120

121+
// When the binding is in the unscheduled state, rollout controller won't update the condition anymore.
122+
// We treat the unscheduled binding as bound until the rollout controller deletes the binding and here controller still
123+
// updates the status for troubleshooting purpose.
124+
// Requeue until the rollout controller finishes processing the binding.
125+
if resourceBinding.Spec.State == fleetv1beta1.BindingStateBound {
126+
rolloutStartedCondition := resourceBinding.GetCondition(string(fleetv1beta1.ResourceBindingRolloutStarted))
127+
// Though the bounded binding is not taking the latest resourceSnapshot, we still needs to reconcile the works.
128+
if !condition.IsConditionStatusFalse(rolloutStartedCondition, resourceBinding.Generation) &&
129+
!condition.IsConditionStatusTrue(rolloutStartedCondition, resourceBinding.Generation) {
130+
// The rollout controller is still in the processing of updating the condition
131+
klog.V(2).InfoS("Requeue the resource binding until the rollout controller finishes updating the status", "resourceBinding", bindingRef, "generation", resourceBinding.Generation, "rolloutStartedCondition", rolloutStartedCondition)
132+
return controllerruntime.Result{Requeue: true}, nil
133+
}
134+
}
135+
121136
workUpdated := false
122137
overrideSucceeded := false
123138
// Reset the conditions and failed placements.
@@ -224,39 +239,39 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
224239
// updateBindingStatusWithRetry sends the update request to API server with retry.
225240
func (r *Reconciler) updateBindingStatusWithRetry(ctx context.Context, resourceBinding *fleetv1beta1.ClusterResourceBinding) error {
226241
// Retry only for specific errors or conditions
242+
bindingRef := klog.KObj(resourceBinding)
227243
err := r.Client.Status().Update(ctx, resourceBinding)
228244
if err != nil {
229-
klog.ErrorS(err, "Failed to update the resourceBinding status, will retry", "resourceBinding", klog.KObj(resourceBinding), "resourceBindingStatus", resourceBinding.Status)
245+
klog.ErrorS(err, "Failed to update the resourceBinding status, will retry", "resourceBinding", bindingRef, "resourceBindingStatus", resourceBinding.Status)
230246
errAfterRetries := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
231247
var latestBinding fleetv1beta1.ClusterResourceBinding
232248
if err := r.Client.Get(ctx, client.ObjectKeyFromObject(resourceBinding), &latestBinding); err != nil {
233249
return err
234250
}
235251
// Work generator is the only controller that updates conditions excluding rollout started which is updated by rollout controller.
236252
if rolloutCond := latestBinding.GetCondition(string(fleetv1beta1.ResourceBindingRolloutStarted)); rolloutCond != nil {
237-
found := false
238253
for i := range resourceBinding.Status.Conditions {
239254
if resourceBinding.Status.Conditions[i].Type == rolloutCond.Type {
240255
// Replace the existing condition
241256
resourceBinding.Status.Conditions[i] = *rolloutCond
242-
found = true
243257
break
244258
}
245259
}
246-
if !found {
247-
return controller.NewUnexpectedBehaviorError(fmt.Errorf("found a resourceBinding %v without RolloutStarted condition", klog.KObj(resourceBinding)))
248-
}
260+
} else {
261+
// At least the RolloutStarted condition for the old generation should be set.
262+
// RolloutStarted condition won't be removed by the rollout controller.
263+
klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("found an invalid resourceBinding")), "RolloutStarted condition is not set", "resourceBinding", bindingRef)
249264
}
250-
251-
if err := r.Client.Status().Update(ctx, resourceBinding); err != nil {
252-
klog.ErrorS(err, "Failed to update the resourceBinding status", "resourceBinding", klog.KObj(resourceBinding), "resourceBindingStatus", resourceBinding.Status)
265+
latestBinding.Status = resourceBinding.Status
266+
if err := r.Client.Status().Update(ctx, &latestBinding); err != nil {
267+
klog.ErrorS(err, "Failed to update the resourceBinding status", "resourceBinding", bindingRef, "resourceBindingStatus", latestBinding.Status)
253268
return err
254269
}
255-
klog.V(2).InfoS("Successfully updated the resourceBinding status", "resourceBinding", klog.KObj(resourceBinding), "resourceBindingStatus", resourceBinding.Status)
270+
klog.V(2).InfoS("Successfully updated the resourceBinding status", "resourceBinding", bindingRef, "resourceBindingStatus", latestBinding.Status)
256271
return nil
257272
})
258273
if errAfterRetries != nil {
259-
klog.ErrorS(errAfterRetries, "Failed to update resourceBinding status after retries", "resourceBinding", klog.KObj(resourceBinding))
274+
klog.ErrorS(errAfterRetries, "Failed to update resourceBinding status after retries", "resourceBinding", bindingRef)
260275
return errAfterRetries
261276
}
262277
return nil
@@ -935,8 +950,23 @@ func (r *Reconciler) SetupWithManager(mgr controllerruntime.Manager) error {
935950
return
936951
}
937952
} else {
938-
klog.V(2).InfoS("The work applied or available condition stayed as true, no need to reconcile", "oldWork", klog.KObj(oldWork), "newWork", klog.KObj(newWork))
939-
return
953+
oldResourceSnapshot := oldWork.Labels[fleetv1beta1.ParentResourceSnapshotIndexLabel]
954+
newResourceSnapshot := newWork.Labels[fleetv1beta1.ParentResourceSnapshotIndexLabel]
955+
if oldResourceSnapshot == "" || newResourceSnapshot == "" {
956+
klog.ErrorS(controller.NewUnexpectedBehaviorError(errors.New("found an invalid work without parent-resource-snapshot-index")),
957+
"Could not find the parent resource snapshot index label", "oldWork", klog.KObj(oldWork), "oldResourceSnapshotLabelValue", oldResourceSnapshot,
958+
"newWork", klog.KObj(newWork), "newResourceSnapshotLabelValue", newResourceSnapshot)
959+
return
960+
}
961+
// There is an edge case that, the work spec is the same but from different resourceSnapshots.
962+
// WorkGenerator will update the work because of the label changes, but the generation is the same.
963+
// When the normal update happens, the controller will set the applied condition as false and wait
964+
// until the work condition has been changed.
965+
// In this edge case, we need to requeue the binding to update the binding status.
966+
if oldResourceSnapshot == newResourceSnapshot {
967+
klog.V(2).InfoS("The work applied or available condition stayed as true, no need to reconcile", "oldWork", klog.KObj(oldWork), "newWork", klog.KObj(newWork))
968+
return
969+
}
940970
}
941971
}
942972
// We need to update the binding status in this case

0 commit comments

Comments
 (0)