From 1465e40d4add55b1a1d6154bb18b20a7c36474b1 Mon Sep 17 00:00:00 2001 From: karthick udayakumar Date: Fri, 26 Sep 2025 14:25:00 -0400 Subject: [PATCH 1/2] not to unclaim a claimed lrp --- db/sqldb/actual_lrp_db.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/sqldb/actual_lrp_db.go b/db/sqldb/actual_lrp_db.go index f46e77da..8cd092a3 100644 --- a/db/sqldb/actual_lrp_db.go +++ b/db/sqldb/actual_lrp_db.go @@ -212,8 +212,8 @@ func (db *SQLDB) UnclaimActualLRP(ctx context.Context, logger lager.Logger, key } beforeActualLRP = *actualLRP - if actualLRP.State == models.ActualLRPStateUnclaimed { - logger.Debug("already-unclaimed") + if actualLRP.State == models.ActualLRPStateUnclaimed || actualLRP.State == models.ActualLRPStateClaimed { + logger.Debug("already-" + actualLRP.State) return models.ErrActualLRPCannotBeUnclaimed } From 9c3ec5a565e74d5385191987abd8ebe2398d5f84 Mon Sep 17 00:00:00 2001 From: karthick udayakumar Date: Mon, 29 Sep 2025 11:50:16 -0400 Subject: [PATCH 2/2] stale lrps claimed state is retained --- controllers/evacuation_controller.go | 4 ++-- controllers/lrp_convergence_controller.go | 4 ++-- db/actual_lrp_db.go | 2 +- db/sqldb/actual_lrp_db.go | 8 ++++++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/controllers/evacuation_controller.go b/controllers/evacuation_controller.go index 81f43e0b..b1863d17 100644 --- a/controllers/evacuation_controller.go +++ b/controllers/evacuation_controller.go @@ -167,7 +167,7 @@ func (h *EvacuationController) EvacuateClaimedActualLRP(ctx context.Context, log } // this is an ordinary LRP - before, after, err := h.actualLRPDB.UnclaimActualLRP(ctx, logger, actualLRPKey) + before, after, err := h.actualLRPDB.UnclaimActualLRP(ctx, logger, false, actualLRPKey) bbsErr := models.ConvertError(err) if bbsErr != nil { if bbsErr.Type == models.Error_ResourceNotFound { @@ -430,7 +430,7 @@ func (h *EvacuationController) evacuateInstance(ctx context.Context, logger lage return nil } - _, after, err := h.actualLRPDB.UnclaimActualLRP(ctx, logger, &actualLRP.ActualLRPKey) + _, after, err := h.actualLRPDB.UnclaimActualLRP(ctx, logger, false, &actualLRP.ActualLRPKey) if err != nil { return err } diff --git a/controllers/lrp_convergence_controller.go b/controllers/lrp_convergence_controller.go index bb1c7878..835fafd3 100644 --- a/controllers/lrp_convergence_controller.go +++ b/controllers/lrp_convergence_controller.go @@ -179,7 +179,7 @@ func (h *LRPConvergenceController) ConvergeLRPs(ctx context.Context) { for _, lrpKey := range convergenceResult.UnstartedLRPKeys { dereferencedKey := *lrpKey works = append(works, func() { - before, after, err := h.lrpDB.UnclaimActualLRP(ctx, logger, dereferencedKey.Key) + before, after, err := h.lrpDB.UnclaimActualLRP(ctx, logger, true, dereferencedKey.Key) if err != nil && err != models.ErrActualLRPCannotBeUnclaimed { logger.Error("cannot-unclaim-lrp", err, lager.Data{"key": dereferencedKey}) return @@ -218,7 +218,7 @@ func (h *LRPConvergenceController) ConvergeLRPs(ctx context.Context) { // there is a Suspect LRP already, unclaim this previously created // replacement and reauction it logger.Debug("found-suspect-lrp-unclaiming", lager.Data{"key": dereferencedKey.Key}) - before, after, err := h.lrpDB.UnclaimActualLRP(ctx, logger, dereferencedKey.Key) + before, after, err := h.lrpDB.UnclaimActualLRP(ctx, logger, false, dereferencedKey.Key) if err != nil { logger.Error("failed-unclaiming-lrp", err) return diff --git a/db/actual_lrp_db.go b/db/actual_lrp_db.go index f2ef593c..a40c6f4c 100644 --- a/db/actual_lrp_db.go +++ b/db/actual_lrp_db.go @@ -13,7 +13,7 @@ type ActualLRPDB interface { ActualLRPs(ctx context.Context, logger lager.Logger, filter models.ActualLRPFilter) ([]*models.ActualLRP, error) ActualLRPsByProcessGuids(ctx context.Context, logger lager.Logger, filter models.ActualLRPsByProcessGuidsFilter) ([]*models.ActualLRP, error) CreateUnclaimedActualLRP(ctx context.Context, logger lager.Logger, key *models.ActualLRPKey) (after *models.ActualLRP, err error) - UnclaimActualLRP(ctx context.Context, logger lager.Logger, key *models.ActualLRPKey) (before *models.ActualLRP, after *models.ActualLRP, err error) + UnclaimActualLRP(ctx context.Context, logger lager.Logger, isStaleUnclaimed bool, key *models.ActualLRPKey) (before *models.ActualLRP, after *models.ActualLRP, err error) ClaimActualLRP(ctx context.Context, logger lager.Logger, processGuid string, index int32, instanceKey *models.ActualLRPInstanceKey) (before *models.ActualLRP, after *models.ActualLRP, err error) StartActualLRP(ctx context.Context, logger lager.Logger, diff --git a/db/sqldb/actual_lrp_db.go b/db/sqldb/actual_lrp_db.go index 8cd092a3..ba7cba37 100644 --- a/db/sqldb/actual_lrp_db.go +++ b/db/sqldb/actual_lrp_db.go @@ -193,7 +193,7 @@ func (db *SQLDB) CreateUnclaimedActualLRP(ctx context.Context, logger lager.Logg return lrp, nil } -func (db *SQLDB) UnclaimActualLRP(ctx context.Context, logger lager.Logger, key *models.ActualLRPKey) (*models.ActualLRP, *models.ActualLRP, error) { +func (db *SQLDB) UnclaimActualLRP(ctx context.Context, logger lager.Logger, isStaleUnclaimed bool, key *models.ActualLRPKey) (*models.ActualLRP, *models.ActualLRP, error) { logger = logger.Session("db-unclaim-actual-lrp", lager.Data{"key": key}) logger.Info("starting") defer logger.Info("complete") @@ -212,10 +212,14 @@ func (db *SQLDB) UnclaimActualLRP(ctx context.Context, logger lager.Logger, key } beforeActualLRP = *actualLRP - if actualLRP.State == models.ActualLRPStateUnclaimed || actualLRP.State == models.ActualLRPStateClaimed { + if actualLRP.State == models.ActualLRPStateUnclaimed { logger.Debug("already-" + actualLRP.State) return models.ErrActualLRPCannotBeUnclaimed } + if isStaleUnclaimed && actualLRP.State == models.ActualLRPStateClaimed { + logger.Debug("a stale unstarted claim already-" + actualLRP.State + " by another cell.") + return models.ErrActualLRPCannotBeUnclaimed + } now := db.clock.Now().UnixNano() actualLRP.ModificationTag.Increment()