Skip to content

Commit e5fd090

Browse files
authored
Fix the case where the process group gets removed without the addresses being included (#2147)
* Fix the case where the process group gets removed without the addresses being included
1 parent 232da61 commit e5fd090

File tree

3 files changed

+58
-44
lines changed

3 files changed

+58
-44
lines changed

controllers/remove_process_groups.go

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,8 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust
132132
removedProcessGroups := r.removeProcessGroups(ctx, logger, cluster, zoneRemovals, zonedRemovals[removals.TerminatingZone])
133133
err = includeProcessGroup(ctx, logger, r, cluster, removedProcessGroups, status, adminClient)
134134
if err != nil {
135-
return &requeue{curError: err, delayedRequeue: true}
135+
// If the inclusion is blocked or another issue happened, we will retry in 60 seconds.
136+
return &requeue{curError: err, delayedRequeue: true, delay: 60 * time.Second}
136137
}
137138

138139
return nil
@@ -214,7 +215,7 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus
214215
return false, false, nil
215216
}
216217

217-
// Pod is in terminating state so we don't want to block but we also don't want to include it
218+
// Pod is in terminating state so we don't want to block, but we also don't want to include it
218219
canBeIncluded = false
219220
}
220221

@@ -231,7 +232,7 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus
231232
return false, false, nil
232233
}
233234

234-
// PVC is in terminating state so we don't want to block but we also don't want to include it
235+
// PVC is in terminating state so we don't want to block, but we also don't want to include it
235236
canBeIncluded = false
236237
} else if len(pvcs.Items) > 1 {
237238
return false, false, fmt.Errorf("multiple PVCs found for cluster %s, processGroupID %s", cluster.Name, processGroup.ProcessGroupID)
@@ -251,20 +252,27 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus
251252
return false, false, nil
252253
}
253254

254-
// Service is in terminating state so we don't want to block but we also don't want to include it
255+
// Service is in terminating state so we don't want to block, but we also don't want to include it
255256
canBeIncluded = false
256257
}
257258

258259
return true, canBeIncluded, nil
259260
}
260261

261262
func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, removedProcessGroups map[fdbv1beta2.ProcessGroupID]bool, status *fdbv1beta2.FoundationDBStatus, adminClient fdbadminclient.AdminClient) error {
262-
fdbProcessesToInclude, err := getProcessesToInclude(logger, cluster, removedProcessGroups, status)
263+
fdbProcessesToInclude, newProcessGroups, err := getProcessesToInclude(logger, cluster, removedProcessGroups, status)
263264
if err != nil {
264265
return err
265266
}
266267

267268
if len(fdbProcessesToInclude) == 0 {
269+
// In case the operator was removing a process group without exclusion,
270+
// we can update the process groups at this stage, as no other processes must be included.
271+
if len(cluster.Status.ProcessGroups) != len(newProcessGroups) {
272+
cluster.Status.ProcessGroups = newProcessGroups
273+
return r.updateOrApply(ctx, cluster)
274+
}
275+
268276
return nil
269277
}
270278

@@ -293,59 +301,63 @@ func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationD
293301
return err
294302
}
295303

296-
// Reset the SecondsSinceLastRecovered sine the operator just included some processes, which will cause a recovery.
304+
// Reset the SecondsSinceLastRecovered since the operator just included some processes, which will cause a recovery.
297305
status.Cluster.RecoveryState.SecondsSinceLastRecovered = 0.0
306+
// Update the process group list and remove all removed and included process groups.
307+
cluster.Status.ProcessGroups = newProcessGroups
298308

299309
return r.updateOrApply(ctx, cluster)
300310
}
301311

302-
func getProcessesToInclude(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, removedProcessGroups map[fdbv1beta2.ProcessGroupID]bool, status *fdbv1beta2.FoundationDBStatus) ([]fdbv1beta2.ProcessAddress, error) {
312+
func getProcessesToInclude(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, removedProcessGroups map[fdbv1beta2.ProcessGroupID]bool, status *fdbv1beta2.FoundationDBStatus) ([]fdbv1beta2.ProcessAddress, []*fdbv1beta2.ProcessGroupStatus, error) {
303313
fdbProcessesToInclude := make([]fdbv1beta2.ProcessAddress, 0)
304314

305315
if len(removedProcessGroups) == 0 {
306-
return fdbProcessesToInclude, nil
316+
return fdbProcessesToInclude, nil, nil
307317
}
308318

309319
excludedServers, err := fdbstatus.GetExclusions(status)
310320
if err != nil {
311-
return fdbProcessesToInclude, fmt.Errorf("unable to get excluded servers from status, %w", err)
321+
return fdbProcessesToInclude, nil, fmt.Errorf("unable to get excluded servers from status, %w", err)
312322
}
313323
excludedServersMap := make(map[string]fdbv1beta2.None, len(excludedServers))
314324
for _, excludedServer := range excludedServers {
315325
excludedServersMap[excludedServer.String()] = fdbv1beta2.None{}
316326
}
317327

328+
processGroups := cluster.Status.DeepCopy().ProcessGroups
318329
idx := 0
319-
for _, processGroup := range cluster.Status.ProcessGroups {
330+
for _, processGroup := range processGroups {
320331
if processGroup.IsMarkedForRemoval() && removedProcessGroups[processGroup.ProcessGroupID] {
321332
foundInExcludedServerList := false
322333
exclusionString := processGroup.GetExclusionString()
323334
if _, ok := excludedServersMap[exclusionString]; ok {
324335
fdbProcessesToInclude = append(fdbProcessesToInclude, fdbv1beta2.ProcessAddress{StringAddress: exclusionString})
325336
foundInExcludedServerList = true
326337
}
338+
327339
for _, pAddr := range processGroup.Addresses {
328340
if _, ok := excludedServersMap[pAddr]; ok {
329341
fdbProcessesToInclude = append(fdbProcessesToInclude, fdbv1beta2.ProcessAddress{IPAddress: net.ParseIP(pAddr)})
330342
foundInExcludedServerList = true
331343
}
332344
}
333-
if !foundInExcludedServerList {
345+
346+
if !foundInExcludedServerList && !processGroup.ExclusionSkipped {
334347
// This means that the process is marked for exclusion and is also removed in the previous step but is missing
335348
// its entry in the excluded servers in the status. This should not throw an error as this will block the
336349
// inclusion for other processes, but we should have a record of this event happening in the logs.
337-
logger.Info("processGroup is included but is missing from excluded server list", "processGroup", processGroup)
350+
logger.Info("processGroup should be included but is missing from excluded server list", "processGroup", processGroup)
338351
}
352+
339353
continue
340354
}
341-
cluster.Status.ProcessGroups[idx] = processGroup
355+
356+
processGroups[idx] = processGroup
342357
idx++
343358
}
344359

345-
// Remove the trailing duplicates.
346-
cluster.Status.ProcessGroups = cluster.Status.ProcessGroups[:idx]
347-
348-
return fdbProcessesToInclude, nil
360+
return fdbProcessesToInclude, processGroups[:idx], nil
349361
}
350362

351363
func (r *FoundationDBClusterReconciler) getProcessGroupsToRemove(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, remainingMap map[string]bool, cordSet map[string]fdbv1beta2.None) (bool, bool, []*fdbv1beta2.ProcessGroupStatus) {

controllers/remove_process_groups_test.go

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -561,10 +561,11 @@ var _ = Describe("remove_process_groups", func() {
561561

562562
When("including no process", func() {
563563
It("should not include any process", func() {
564-
processesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
564+
processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
565565
Expect(err).NotTo(HaveOccurred())
566-
Expect(len(processesToInclude)).To(Equal(0))
567-
Expect(len(cluster.Status.ProcessGroups)).To(Equal(16))
566+
Expect(processesToInclude).To(BeEmpty())
567+
Expect(newProcessGroups).To(BeEmpty())
568+
Expect(cluster.Status.ProcessGroups).To(HaveLen(16))
568569
})
569570
})
570571

@@ -580,11 +581,12 @@ var _ = Describe("remove_process_groups", func() {
580581
})
581582

582583
It("should include one process", func() {
583-
fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
584+
processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
584585
Expect(err).NotTo(HaveOccurred())
585-
Expect(len(fdbProcessesToInclude)).To(Equal(1))
586-
Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal("1.1.1.1"))
587-
Expect(len(cluster.Status.ProcessGroups)).To(Equal(15))
586+
Expect(processesToInclude).To(HaveLen(1))
587+
Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal("1.1.1.1"))
588+
Expect(newProcessGroups).To(HaveLen(15))
589+
Expect(cluster.Status.ProcessGroups).To(HaveLen(16))
588590
})
589591
})
590592
})
@@ -596,10 +598,11 @@ var _ = Describe("remove_process_groups", func() {
596598

597599
When("including no process", func() {
598600
It("should not include any process", func() {
599-
fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
601+
processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
600602
Expect(err).NotTo(HaveOccurred())
601-
Expect(len(fdbProcessesToInclude)).To(Equal(0))
602-
Expect(len(cluster.Status.ProcessGroups)).To(Equal(16))
603+
Expect(processesToInclude).To(BeEmpty())
604+
Expect(newProcessGroups).To(BeEmpty())
605+
Expect(cluster.Status.ProcessGroups).To(HaveLen(16))
603606
})
604607
})
605608

@@ -615,11 +618,12 @@ var _ = Describe("remove_process_groups", func() {
615618
})
616619

617620
It("should include one process", func() {
618-
fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
621+
processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
619622
Expect(err).NotTo(HaveOccurred())
620-
Expect(len(fdbProcessesToInclude)).To(Equal(1))
621-
Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal(removedProcessGroup.GetExclusionString()))
622-
Expect(len(cluster.Status.ProcessGroups)).To(Equal(15))
623+
Expect(processesToInclude).To(HaveLen(1))
624+
Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal(removedProcessGroup.GetExclusionString()))
625+
Expect(newProcessGroups).To(HaveLen(15))
626+
Expect(cluster.Status.ProcessGroups).To(HaveLen(16))
623627
})
624628
})
625629

@@ -636,12 +640,13 @@ var _ = Describe("remove_process_groups", func() {
636640
removedProcessGroups[removedProcessGroup.ProcessGroupID] = true
637641
})
638642

639-
It("should include one process", func() {
640-
fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
643+
It("should include two process addresses", func() {
644+
processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
641645
Expect(err).NotTo(HaveOccurred())
642-
Expect(len(fdbProcessesToInclude)).To(Equal(2))
643-
Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal(fmt.Sprintf("%s %s", removedProcessGroup.GetExclusionString(), removedProcessGroup.Addresses[0])))
644-
Expect(len(cluster.Status.ProcessGroups)).To(Equal(15))
646+
Expect(processesToInclude).To(HaveLen(2))
647+
Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal(fmt.Sprintf("%s %s", removedProcessGroup.GetExclusionString(), removedProcessGroup.Addresses[0])))
648+
Expect(newProcessGroups).To(HaveLen(15))
649+
Expect(cluster.Status.ProcessGroups).To(HaveLen(16))
645650
})
646651
})
647652

@@ -663,11 +668,12 @@ var _ = Describe("remove_process_groups", func() {
663668
})
664669

665670
It("should include one process", func() {
666-
fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
671+
processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status)
667672
Expect(err).NotTo(HaveOccurred())
668-
Expect(len(fdbProcessesToInclude)).To(Equal(1))
669-
Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal(removedProcessGroup2.GetExclusionString()))
670-
Expect(len(cluster.Status.ProcessGroups)).To(Equal(14))
673+
Expect(processesToInclude).To(HaveLen(1))
674+
Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal(removedProcessGroup2.GetExclusionString()))
675+
Expect(newProcessGroups).To(HaveLen(14))
676+
Expect(cluster.Status.ProcessGroups).To(HaveLen(16))
671677
})
672678
})
673679
})

setup/setup.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -268,10 +268,6 @@ func StartManager(
268268
clusterReconciler.MaintenanceListWaitDuration = operatorOpts.MaintenanceListWaitDuration
269269
clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion
270270
clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion
271-
clusterReconciler.MaintenanceListStaleDuration = operatorOpts.MaintenanceListStaleDuration
272-
clusterReconciler.MaintenanceListWaitDuration = operatorOpts.MaintenanceListWaitDuration
273-
clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion
274-
clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion
275271
clusterReconciler.ClusterLabelKeyForNodeTrigger = strings.Trim(operatorOpts.ClusterLabelKeyForNodeTrigger, "\"")
276272
clusterReconciler.Namespace = operatorOpts.WatchNamespace
277273

0 commit comments

Comments
 (0)