@@ -132,7 +132,8 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust
132132 removedProcessGroups := r .removeProcessGroups (ctx , logger , cluster , zoneRemovals , zonedRemovals [removals .TerminatingZone ])
133133 err = includeProcessGroup (ctx , logger , r , cluster , removedProcessGroups , status , adminClient )
134134 if err != nil {
135- return & requeue {curError : err , delayedRequeue : true }
135+ // If the inclusion is blocked or another issue happened, we will retry in 60 seconds.
136+ return & requeue {curError : err , delayedRequeue : true , delay : 60 * time .Second }
136137 }
137138
138139 return nil
@@ -214,7 +215,7 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus
214215 return false , false , nil
215216 }
216217
217- // Pod is in terminating state so we don't want to block but we also don't want to include it
218+ // Pod is in terminating state so we don't want to block, but we also don't want to include it
218219 canBeIncluded = false
219220 }
220221
@@ -231,7 +232,7 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus
231232 return false , false , nil
232233 }
233234
234- // PVC is in terminating state so we don't want to block but we also don't want to include it
235+ // PVC is in terminating state so we don't want to block, but we also don't want to include it
235236 canBeIncluded = false
236237 } else if len (pvcs .Items ) > 1 {
237238 return false , false , fmt .Errorf ("multiple PVCs found for cluster %s, processGroupID %s" , cluster .Name , processGroup .ProcessGroupID )
@@ -251,20 +252,27 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus
251252 return false , false , nil
252253 }
253254
254- // Service is in terminating state so we don't want to block but we also don't want to include it
255+ // Service is in terminating state so we don't want to block, but we also don't want to include it
255256 canBeIncluded = false
256257 }
257258
258259 return true , canBeIncluded , nil
259260}
260261
261262func includeProcessGroup (ctx context.Context , logger logr.Logger , r * FoundationDBClusterReconciler , cluster * fdbv1beta2.FoundationDBCluster , removedProcessGroups map [fdbv1beta2.ProcessGroupID ]bool , status * fdbv1beta2.FoundationDBStatus , adminClient fdbadminclient.AdminClient ) error {
262- fdbProcessesToInclude , err := getProcessesToInclude (logger , cluster , removedProcessGroups , status )
263+ fdbProcessesToInclude , newProcessGroups , err := getProcessesToInclude (logger , cluster , removedProcessGroups , status )
263264 if err != nil {
264265 return err
265266 }
266267
267268 if len (fdbProcessesToInclude ) == 0 {
269+ // This can happen when the operator removed a process group without exclusion.
270+ // We can update the process groups at this stage, as no other processes must be included.
271+ if len (cluster .Status .ProcessGroups ) != len (newProcessGroups ) {
272+ cluster .Status .ProcessGroups = newProcessGroups
273+ return r .updateOrApply (ctx , cluster )
274+ }
275+
268276 return nil
269277 }
270278
@@ -293,59 +301,63 @@ func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationD
293301 return err
294302 }
295303
296- // Reset the SecondsSinceLastRecovered sine the operator just included some processes, which will cause a recovery.
304+ // Reset the SecondsSinceLastRecovered since the operator just included some processes, which will cause a recovery.
297305 status .Cluster .RecoveryState .SecondsSinceLastRecovered = 0.0
306+ // Update the process group list and remove all removed and included process groups.
307+ cluster .Status .ProcessGroups = newProcessGroups
298308
299309 return r .updateOrApply (ctx , cluster )
300310}
301311
302- func getProcessesToInclude (logger logr.Logger , cluster * fdbv1beta2.FoundationDBCluster , removedProcessGroups map [fdbv1beta2.ProcessGroupID ]bool , status * fdbv1beta2.FoundationDBStatus ) ([]fdbv1beta2.ProcessAddress , error ) {
312+ func getProcessesToInclude (logger logr.Logger , cluster * fdbv1beta2.FoundationDBCluster , removedProcessGroups map [fdbv1beta2.ProcessGroupID ]bool , status * fdbv1beta2.FoundationDBStatus ) ([]fdbv1beta2.ProcessAddress , [] * fdbv1beta2. ProcessGroupStatus , error ) {
303313 fdbProcessesToInclude := make ([]fdbv1beta2.ProcessAddress , 0 )
304314
305315 if len (removedProcessGroups ) == 0 {
306- return fdbProcessesToInclude , nil
316+ return fdbProcessesToInclude , nil , nil
307317 }
308318
309319 excludedServers , err := fdbstatus .GetExclusions (status )
310320 if err != nil {
311- return fdbProcessesToInclude , fmt .Errorf ("unable to get excluded servers from status, %w" , err )
321+ return fdbProcessesToInclude , nil , fmt .Errorf ("unable to get excluded servers from status, %w" , err )
312322 }
313323 excludedServersMap := make (map [string ]fdbv1beta2.None , len (excludedServers ))
314324 for _ , excludedServer := range excludedServers {
315325 excludedServersMap [excludedServer .String ()] = fdbv1beta2.None {}
316326 }
317327
328+ processGroups := cluster .Status .DeepCopy ().ProcessGroups
318329 idx := 0
319- for _ , processGroup := range cluster . Status . ProcessGroups {
330+ for _ , processGroup := range processGroups {
320331 if processGroup .IsMarkedForRemoval () && removedProcessGroups [processGroup .ProcessGroupID ] {
321332 foundInExcludedServerList := false
322333 exclusionString := processGroup .GetExclusionString ()
323334 if _ , ok := excludedServersMap [exclusionString ]; ok {
324335 fdbProcessesToInclude = append (fdbProcessesToInclude , fdbv1beta2.ProcessAddress {StringAddress : exclusionString })
325336 foundInExcludedServerList = true
326337 }
338+
327339 for _ , pAddr := range processGroup .Addresses {
328340 if _ , ok := excludedServersMap [pAddr ]; ok {
329341 fdbProcessesToInclude = append (fdbProcessesToInclude , fdbv1beta2.ProcessAddress {IPAddress : net .ParseIP (pAddr )})
330342 foundInExcludedServerList = true
331343 }
332344 }
333- if ! foundInExcludedServerList {
345+
346+ if ! foundInExcludedServerList && ! processGroup .ExclusionSkipped {
334347 // This means that the process is marked for exclusion and is also removed in the previous step but is missing
335348 // its entry in the excluded servers in the status. This should not throw an error as this will block the
336349 // inclusion for other processes, but we should have a record of this event happening in the logs.
337- logger .Info ("processGroup is included but is missing from excluded server list" , "processGroup" , processGroup )
350+ logger .Info ("processGroup should be included but is missing from excluded server list" , "processGroup" , processGroup )
338351 }
352+
339353 continue
340354 }
341- cluster .Status .ProcessGroups [idx ] = processGroup
355+
356+ processGroups [idx ] = processGroup
342357 idx ++
343358 }
344359
345- // Remove the trailing duplicates.
346- cluster .Status .ProcessGroups = cluster .Status .ProcessGroups [:idx ]
347-
348- return fdbProcessesToInclude , nil
360+ return fdbProcessesToInclude , processGroups [:idx ], nil
349361}
350362
351363func (r * FoundationDBClusterReconciler ) getProcessGroupsToRemove (logger logr.Logger , cluster * fdbv1beta2.FoundationDBCluster , remainingMap map [string ]bool , cordSet map [string ]fdbv1beta2.None ) (bool , bool , []* fdbv1beta2.ProcessGroupStatus ) {
0 commit comments