Skip to content

Commit cb69e09

Browse files
authored
Allow to replace process groups with a high run loop busy value (#2367)
1 parent 5bd976a commit cb69e09

File tree

11 files changed

+295
-53
lines changed

11 files changed

+295
-53
lines changed

api/v1beta2/foundationdb_status.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,9 @@ type FoundationDBStatusProcessInfo struct {
198198
// The time that the process has been up for.
199199
UptimeSeconds float64 `json:"uptime_seconds,omitempty"`
200200

201+
// RunLoopBusy represents the busyness of the run loop of the fdbserver.
202+
RunLoopBusy float64 `json:"run_loop_busy,omitempty"`
203+
201204
// Roles contains a slice of all roles of the process
202205
Roles []FoundationDBStatusProcessRoleInfo `json:"roles,omitempty"`
203206

api/v1beta2/foundationdb_status_test.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ var _ = Describe("FoundationDBStatus", func() {
119119
},
120120
},
121121
},
122-
Messages: []FoundationDBStatusProcessMessage{},
122+
Messages: []FoundationDBStatusProcessMessage{},
123+
RunLoopBusy: 0.024864200000000003,
123124
},
124125
"eab0db1aa7aae81a50ca97e9814a1b7d": {
125126
Address: ProcessAddress{
@@ -161,7 +162,8 @@ var _ = Describe("FoundationDBStatus", func() {
161162
ID: "dfd679875a386d06",
162163
},
163164
},
164-
Messages: []FoundationDBStatusProcessMessage{},
165+
Messages: []FoundationDBStatusProcessMessage{},
166+
RunLoopBusy: 0.0080509299999999978,
165167
},
166168
"f483247d4d5f279ef02c549680cbde64": {
167169
Address: ProcessAddress{
@@ -208,7 +210,8 @@ var _ = Describe("FoundationDBStatus", func() {
208210
},
209211
},
210212
},
211-
Messages: []FoundationDBStatusProcessMessage{},
213+
Messages: []FoundationDBStatusProcessMessage{},
214+
RunLoopBusy: 0.0101502,
212215
},
213216
"f6e0f7fd80da429d20329ad95d793ca3": {
214217
Address: ProcessAddress{
@@ -240,7 +243,8 @@ var _ = Describe("FoundationDBStatus", func() {
240243
ID: "cbeb915c6cceb4a9",
241244
},
242245
},
243-
Messages: []FoundationDBStatusProcessMessage{},
246+
Messages: []FoundationDBStatusProcessMessage{},
247+
RunLoopBusy: 0.027578200000000001,
244248
},
245249
"f75644abdf1b06c803b5c3c124fdd0a0": {
246250
Address: ProcessAddress{
@@ -264,7 +268,8 @@ var _ = Describe("FoundationDBStatus", func() {
264268
ID: "1f953018ad2e746f",
265269
},
266270
},
267-
Messages: []FoundationDBStatusProcessMessage{},
271+
Messages: []FoundationDBStatusProcessMessage{},
272+
RunLoopBusy: 0.0075524900000000002,
268273
},
269274
"105bf6c041f8ec315d03e889c2746ecf": {
270275
Address: ProcessAddress{
@@ -292,7 +297,8 @@ var _ = Describe("FoundationDBStatus", func() {
292297
KVStoreAvailableBytes: ptr.To[int64](84178214912),
293298
},
294299
},
295-
Messages: []FoundationDBStatusProcessMessage{},
300+
Messages: []FoundationDBStatusProcessMessage{},
301+
RunLoopBusy: 0.0081051000000000005,
296302
},
297303
"78c1c84af4481f0df628d40358f0930a": {
298304
Address: ProcessAddress{
@@ -320,7 +326,8 @@ var _ = Describe("FoundationDBStatus", func() {
320326
KVStoreAvailableBytes: ptr.To[int64](84178214912),
321327
},
322328
},
323-
Messages: []FoundationDBStatusProcessMessage{},
329+
Messages: []FoundationDBStatusProcessMessage{},
330+
RunLoopBusy: 0.0088742700000000001,
324331
},
325332
"83084479b50c9c3a09b0286297be3796": {
326333
Address: ProcessAddress{
@@ -348,7 +355,8 @@ var _ = Describe("FoundationDBStatus", func() {
348355
KVStoreAvailableBytes: ptr.To[int64](84178202624),
349356
},
350357
},
351-
Messages: []FoundationDBStatusProcessMessage{},
358+
Messages: []FoundationDBStatusProcessMessage{},
359+
RunLoopBusy: 0.0089497099999999979,
352360
},
353361
},
354362
Data: FoundationDBStatusDataStatistics{

api/v1beta2/foundationdbcluster_types.go

Lines changed: 86 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -68,16 +68,20 @@ type FoundationDBClusterList struct {
6868
Items []FoundationDBCluster `json:"items"`
6969
}
7070

71-
var conditionsThatNeedReplacement = []ProcessGroupConditionType{
72-
MissingProcesses,
73-
PodFailing,
74-
MissingPod,
75-
MissingPVC,
76-
MissingService,
77-
PodPending,
78-
NodeTaintReplacing,
79-
ProcessIsMarkedAsExcluded,
80-
ProcessHasIOError,
71+
// TODO (johscheuer): I think it would make sense to expose this as a setting in the FoundationDBCluster automation options
72+
// that way users can define what conditions should be used for the replacement logic.
73+
var defaultConditionsThatNeedReplacement = map[ProcessGroupConditionType]None{
74+
MissingProcesses: {},
75+
PodFailing: {},
76+
MissingPod: {},
77+
MissingPVC: {},
78+
MissingService: {},
79+
PodPending: {},
80+
NodeTaintReplacing: {},
81+
ProcessIsMarkedAsExcluded: {},
82+
ProcessHasIOError: {},
83+
SidecarUnreachable: {},
84+
ProcessHasHighRunLoopBusy: {},
8185
}
8286

8387
const (
@@ -547,12 +551,13 @@ func (processGroupStatus *ProcessGroupStatus) GetPvcName(cluster *FoundationDBCl
547551
return fmt.Sprintf("%s-data", processGroupStatus.GetPodName(cluster))
548552
}
549553

550-
// NeedsReplacement checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
554+
// NeedsReplacementWithConditions checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
551555
// The method will return the failure condition and the timestamp. If no failure is detected an empty condition and a 0
552-
// will be returned.
553-
func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
556+
// will be returned. The conditions that should trigger a replacement can be passed to this method.
557+
func (processGroupStatus *ProcessGroupStatus) NeedsReplacementWithConditions(
554558
failureTime int,
555559
taintReplacementTime int,
560+
conditionsThatNeedReplacement map[ProcessGroupConditionType]None,
556561
) (ProcessGroupConditionType, int64) {
557562
var earliestFailureTime int64 = math.MaxInt64
558563
var earliestTaintReplacementTime int64 = math.MaxInt64
@@ -563,30 +568,38 @@ func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
563568
}
564569

565570
var failureCondition ProcessGroupConditionType
566-
for _, conditionType := range conditionsThatNeedReplacement {
567-
conditionTimePtr := processGroupStatus.GetConditionTime(conditionType)
568-
if conditionTimePtr == nil {
571+
572+
// Iterate over all the conditions that the process group has, under normal circumstances the process group
573+
// should have no oder a minimal set of conditions. If any of the condition is part of the conditionsThatNeedReplacement
574+
// check how long the condition is present and check if the process group should be replaced.
575+
var hasConditionThatRequiresReplacement bool
576+
for _, condition := range processGroupStatus.ProcessGroupConditions {
577+
_, ok := conditionsThatNeedReplacement[condition.ProcessGroupConditionType]
578+
if !ok {
569579
continue
570580
}
571581

572-
conditionTime := *conditionTimePtr
573-
if conditionType == NodeTaintReplacing {
574-
if earliestTaintReplacementTime > conditionTime {
575-
earliestTaintReplacementTime = conditionTime
582+
hasConditionThatRequiresReplacement = true
583+
if condition.ProcessGroupConditionType == NodeTaintReplacing {
584+
if earliestTaintReplacementTime > condition.Timestamp {
585+
earliestTaintReplacementTime = condition.Timestamp
576586
}
577587

578-
failureCondition = conditionType
588+
failureCondition = condition.ProcessGroupConditionType
579589
continue
580590
}
581591

582-
if earliestFailureTime > conditionTime {
583-
earliestFailureTime = conditionTime
584-
failureCondition = conditionType
592+
if earliestFailureTime > condition.Timestamp {
593+
earliestFailureTime = condition.Timestamp
594+
failureCondition = condition.ProcessGroupConditionType
585595
}
586596
}
587597

588-
failureWindowStart := time.Now().Add(-1 * time.Duration(failureTime) * time.Second).Unix()
589-
if earliestFailureTime < failureWindowStart {
598+
if !hasConditionThatRequiresReplacement {
599+
return "", 0
600+
}
601+
602+
if earliestFailureTime < time.Now().Add(-1*time.Duration(failureTime)*time.Second).Unix() {
590603
return failureCondition, earliestFailureTime
591604
}
592605

@@ -601,6 +614,21 @@ func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
601614
return "", 0
602615
}
603616

617+
// NeedsReplacement checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
618+
// The method will return the failure condition and the timestamp. If no failure is detected an empty condition and a 0
619+
// will be returned.
620+
// Deprecated: Use NeedsReplacementWithConditions.
621+
func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
622+
failureTime int,
623+
taintReplacementTime int,
624+
) (ProcessGroupConditionType, int64) {
625+
return processGroupStatus.NeedsReplacementWithConditions(
626+
failureTime,
627+
taintReplacementTime,
628+
defaultConditionsThatNeedReplacement,
629+
)
630+
}
631+
604632
// AddAddresses adds the new address to the ProcessGroupStatus and removes duplicates and old addresses
605633
// if the process group is not marked as removal.
606634
func (processGroupStatus *ProcessGroupStatus) AddAddresses(
@@ -1071,6 +1099,9 @@ const (
10711099
// This condition can occur during the migration of the image type, the change of the image configuration
10721100
// for the sidecar or during version incompatible upgrades until the sidecar is updated to the new desired version.
10731101
IncorrectSidecarImage ProcessGroupConditionType = "IncorrectSidecarImage"
1102+
// ProcessHasHighRunLoopBusy represents a process group that has a high run loop busy value. A high run loop busy
1103+
// value can be caused by infrastructure issues or by overloaded processes.
1104+
ProcessHasHighRunLoopBusy ProcessGroupConditionType = "ProcessHasHighRunLoopBusy"
10741105
)
10751106

10761107
// AllProcessGroupConditionTypes returns all ProcessGroupConditionType
@@ -1093,6 +1124,7 @@ func AllProcessGroupConditionTypes() []ProcessGroupConditionType {
10931124
ProcessIsMarkedAsExcluded,
10941125
ProcessHasIOError,
10951126
IncorrectSidecarImage,
1127+
ProcessHasHighRunLoopBusy,
10961128
}
10971129
}
10981130

@@ -1137,6 +1169,8 @@ func GetProcessGroupConditionType(
11371169
return ProcessHasIOError, nil
11381170
case "IncorrectSidecarImage":
11391171
return IncorrectSidecarImage, nil
1172+
case "ProcessHasHighRunLoopBusy":
1173+
return ProcessHasHighRunLoopBusy, nil
11401174
}
11411175

11421176
return "", fmt.Errorf("unknown process group condition type: %s", processGroupConditionType)
@@ -1759,7 +1793,14 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
17591793
0,
17601794
len(processGroup.ProcessGroupConditions),
17611795
)
1796+
17621797
for _, condition := range processGroup.ProcessGroupConditions {
1798+
// The ProcessHasHighRunLoopBusy is currently only informational and shouldn't block the reconciliation.
1799+
if condition.ProcessGroupConditionType == ProcessHasHighRunLoopBusy {
1800+
logger.V(1).
1801+
Info("Detected process with high run loop busy value", "processGroupID", processGroup.ProcessGroupID)
1802+
}
1803+
17631804
// If there is at least one process with an incorrect command line, that means the operator has to restart
17641805
// processes.
17651806
if condition.ProcessGroupConditionType == IncorrectCommandLine &&
@@ -1780,18 +1821,20 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
17801821
conditions = append(conditions, condition.ProcessGroupConditionType)
17811822
}
17821823

1783-
logger.Info(
1784-
"Has unhealthy process group",
1785-
"processGroupID",
1786-
processGroup.ProcessGroupID,
1787-
"state",
1788-
"HasUnhealthyProcess",
1789-
"conditions",
1790-
conditions,
1791-
)
1792-
cluster.Status.Generations.HasUnhealthyProcess = cluster.Generation
1793-
reconciled = false
1794-
continue
1824+
if len(conditions) > 0 {
1825+
logger.Info(
1826+
"Has unhealthy process group",
1827+
"processGroupID",
1828+
processGroup.ProcessGroupID,
1829+
"state",
1830+
"HasUnhealthyProcess",
1831+
"conditions",
1832+
conditions,
1833+
)
1834+
cluster.Status.Generations.HasUnhealthyProcess = cluster.Generation
1835+
reconciled = false
1836+
continue
1837+
}
17951838
}
17961839

17971840
cluster.Status.ReconciledProcessGroups++
@@ -3609,3 +3652,8 @@ func (cluster *FoundationDBCluster) GetDatabaseInteractionMode() DatabaseInterac
36093652

36103653
return *cluster.Spec.AutomationOptions.DatabaseInteractionMode
36113654
}
3655+
3656+
// GetConditionsThatNeedReplacement returns the conditions that should trigger a replacement.
3657+
func (cluster *FoundationDBCluster) GetConditionsThatNeedReplacement() map[ProcessGroupConditionType]None {
3658+
return defaultConditionsThatNeedReplacement
3659+
}

controllers/cluster_controller.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ type FoundationDBClusterReconciler struct {
141141
ClusterLabelKeyForNodeTrigger string
142142
decodingSerializer runtime.Serializer
143143
SimulationOptions SimulationOptions
144+
// Defines the threshold for the high run loop busy condition, the default is 1.0.
145+
HighRunLoopBusyThreshold float64
144146
}
145147

146148
// NewFoundationDBClusterReconciler creates a new FoundationDBClusterReconciler with defaults.

controllers/replace_failed_process_groups_test.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,56 @@ var _ = Describe("replace_failed_process_groups", func() {
795795
})
796796
},
797797
)
798+
799+
When("a process has the ProcessHasHighRunLoopBusy condition",
800+
func() {
801+
BeforeEach(func() {
802+
processGroup.ProcessGroupConditions = append(
803+
processGroup.ProcessGroupConditions,
804+
&fdbv1beta2.ProcessGroupCondition{
805+
ProcessGroupConditionType: fdbv1beta2.ProcessHasHighRunLoopBusy,
806+
Timestamp: time.Now().Add(-1 * time.Hour).Unix(),
807+
},
808+
)
809+
})
810+
811+
It("should requeue", func() {
812+
Expect(result).NotTo(BeNil())
813+
})
814+
815+
It("should mark the process group to be removed", func() {
816+
removedIDs := getRemovedProcessGroupIDs(cluster)
817+
Expect(removedIDs).To(HaveLen(1))
818+
Expect(
819+
removedIDs,
820+
).To(ConsistOf([]fdbv1beta2.ProcessGroupID{processGroup.ProcessGroupID}))
821+
})
822+
})
823+
824+
When("a process has the SidecarUnreachable condition",
825+
func() {
826+
BeforeEach(func() {
827+
processGroup.ProcessGroupConditions = append(
828+
processGroup.ProcessGroupConditions,
829+
&fdbv1beta2.ProcessGroupCondition{
830+
ProcessGroupConditionType: fdbv1beta2.SidecarUnreachable,
831+
Timestamp: time.Now().Add(-1 * time.Hour).Unix(),
832+
},
833+
)
834+
})
835+
836+
It("should requeue", func() {
837+
Expect(result).NotTo(BeNil())
838+
})
839+
840+
It("should mark the process group to be removed", func() {
841+
removedIDs := getRemovedProcessGroupIDs(cluster)
842+
Expect(removedIDs).To(HaveLen(1))
843+
Expect(
844+
removedIDs,
845+
).To(ConsistOf([]fdbv1beta2.ProcessGroupID{processGroup.ProcessGroupID}))
846+
})
847+
})
798848
})
799849

800850
When("fault domain replacements are enabled", func() {

controllers/suite_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,5 +214,6 @@ func createTestClusterReconciler() *FoundationDBClusterReconciler {
214214
DatabaseClientProvider: mock.DatabaseClientProvider{},
215215
MaintenanceListStaleDuration: 4 * time.Hour,
216216
MaintenanceListWaitDuration: 5 * time.Minute,
217+
HighRunLoopBusyThreshold: 1.0,
217218
}
218219
}

controllers/update_status.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ func checkAndSetProcessStatus(
372372
return nil
373373
}
374374

375-
var excluded, hasIncorrectCommandLine, hasMissingProcesses, sidecarUnreachable, hasIOError bool
375+
var excluded, hasIncorrectCommandLine, hasMissingProcesses, sidecarUnreachable, hasIOError, hasHighRunLoopBusy bool
376376
var substitutions map[string]string
377377
var err error
378378

@@ -438,6 +438,13 @@ func checkAndSetProcessStatus(
438438
excluded = true
439439
}
440440

441+
// Check if the process has a high run loop busy value. If multiple processes are running inside this pod
442+
// a one process with a high busy value will be enough to flag this process groups with the fdbv1beta2.ProcessHasHighRunLoopBusy
443+
// condition.
444+
if process.RunLoopBusy >= r.HighRunLoopBusyThreshold && !hasHighRunLoopBusy {
445+
hasHighRunLoopBusy = true
446+
}
447+
441448
if len(substitutions) == 0 {
442449
continue
443450
}
@@ -519,6 +526,7 @@ func checkAndSetProcessStatus(
519526

520527
processGroupStatus.UpdateCondition(fdbv1beta2.ProcessIsMarkedAsExcluded, excluded)
521528
processGroupStatus.UpdateCondition(fdbv1beta2.ProcessHasIOError, hasIOError)
529+
processGroupStatus.UpdateCondition(fdbv1beta2.ProcessHasHighRunLoopBusy, hasHighRunLoopBusy)
522530
// If the sidecar is unreachable we are not able to compute the desired commandline.
523531
if sidecarUnreachable {
524532
return nil

0 commit comments

Comments
 (0)