@@ -68,16 +68,20 @@ type FoundationDBClusterList struct {
68
68
Items []FoundationDBCluster `json:"items"`
69
69
}
70
70
71
- var conditionsThatNeedReplacement = []ProcessGroupConditionType {
72
- MissingProcesses ,
73
- PodFailing ,
74
- MissingPod ,
75
- MissingPVC ,
76
- MissingService ,
77
- PodPending ,
78
- NodeTaintReplacing ,
79
- ProcessIsMarkedAsExcluded ,
80
- ProcessHasIOError ,
71
+ // TODO (johscheuer): I think it would make sense to expose this as a setting in the FoundationDBCluster automation options
72
+ // that way users can define what conditions should be used for the replacement logic.
73
+ var defaultConditionsThatNeedReplacement = map [ProcessGroupConditionType ]None {
74
+ MissingProcesses : {},
75
+ PodFailing : {},
76
+ MissingPod : {},
77
+ MissingPVC : {},
78
+ MissingService : {},
79
+ PodPending : {},
80
+ NodeTaintReplacing : {},
81
+ ProcessIsMarkedAsExcluded : {},
82
+ ProcessHasIOError : {},
83
+ SidecarUnreachable : {},
84
+ ProcessHasHighRunLoopBusy : {},
81
85
}
82
86
83
87
const (
@@ -547,12 +551,13 @@ func (processGroupStatus *ProcessGroupStatus) GetPvcName(cluster *FoundationDBCl
547
551
return fmt .Sprintf ("%s-data" , processGroupStatus .GetPodName (cluster ))
548
552
}
549
553
550
- // NeedsReplacement checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
554
+ // NeedsReplacementWithConditions checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
551
555
// The method will return the failure condition and the timestamp. If no failure is detected an empty condition and a 0
552
- // will be returned.
553
- func (processGroupStatus * ProcessGroupStatus ) NeedsReplacement (
556
+ // will be returned. The conditions that should trigger a replacement can be passed to this method.
557
+ func (processGroupStatus * ProcessGroupStatus ) NeedsReplacementWithConditions (
554
558
failureTime int ,
555
559
taintReplacementTime int ,
560
+ conditionsThatNeedReplacement map [ProcessGroupConditionType ]None ,
556
561
) (ProcessGroupConditionType , int64 ) {
557
562
var earliestFailureTime int64 = math .MaxInt64
558
563
var earliestTaintReplacementTime int64 = math .MaxInt64
@@ -563,30 +568,38 @@ func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
563
568
}
564
569
565
570
var failureCondition ProcessGroupConditionType
566
- for _ , conditionType := range conditionsThatNeedReplacement {
567
- conditionTimePtr := processGroupStatus .GetConditionTime (conditionType )
568
- if conditionTimePtr == nil {
571
+
572
+ // Iterate over all the conditions that the process group has, under normal circumstances the process group
573
+ // should have no oder a minimal set of conditions. If any of the condition is part of the conditionsThatNeedReplacement
574
+ // check how long the condition is present and check if the process group should be replaced.
575
+ var hasConditionThatRequiresReplacement bool
576
+ for _ , condition := range processGroupStatus .ProcessGroupConditions {
577
+ _ , ok := conditionsThatNeedReplacement [condition .ProcessGroupConditionType ]
578
+ if ! ok {
569
579
continue
570
580
}
571
581
572
- conditionTime := * conditionTimePtr
573
- if conditionType == NodeTaintReplacing {
574
- if earliestTaintReplacementTime > conditionTime {
575
- earliestTaintReplacementTime = conditionTime
582
+ hasConditionThatRequiresReplacement = true
583
+ if condition . ProcessGroupConditionType == NodeTaintReplacing {
584
+ if earliestTaintReplacementTime > condition . Timestamp {
585
+ earliestTaintReplacementTime = condition . Timestamp
576
586
}
577
587
578
- failureCondition = conditionType
588
+ failureCondition = condition . ProcessGroupConditionType
579
589
continue
580
590
}
581
591
582
- if earliestFailureTime > conditionTime {
583
- earliestFailureTime = conditionTime
584
- failureCondition = conditionType
592
+ if earliestFailureTime > condition . Timestamp {
593
+ earliestFailureTime = condition . Timestamp
594
+ failureCondition = condition . ProcessGroupConditionType
585
595
}
586
596
}
587
597
588
- failureWindowStart := time .Now ().Add (- 1 * time .Duration (failureTime ) * time .Second ).Unix ()
589
- if earliestFailureTime < failureWindowStart {
598
+ if ! hasConditionThatRequiresReplacement {
599
+ return "" , 0
600
+ }
601
+
602
+ if earliestFailureTime < time .Now ().Add (- 1 * time .Duration (failureTime )* time .Second ).Unix () {
590
603
return failureCondition , earliestFailureTime
591
604
}
592
605
@@ -601,6 +614,21 @@ func (processGroupStatus *ProcessGroupStatus) NeedsReplacement(
601
614
return "" , 0
602
615
}
603
616
617
+ // NeedsReplacement checks if the ProcessGroupStatus has conditions that require a replacement of the failed Process Group.
618
+ // The method will return the failure condition and the timestamp. If no failure is detected an empty condition and a 0
619
+ // will be returned.
620
+ // Deprecated: Use NeedsReplacementWithConditions.
621
+ func (processGroupStatus * ProcessGroupStatus ) NeedsReplacement (
622
+ failureTime int ,
623
+ taintReplacementTime int ,
624
+ ) (ProcessGroupConditionType , int64 ) {
625
+ return processGroupStatus .NeedsReplacementWithConditions (
626
+ failureTime ,
627
+ taintReplacementTime ,
628
+ defaultConditionsThatNeedReplacement ,
629
+ )
630
+ }
631
+
604
632
// AddAddresses adds the new address to the ProcessGroupStatus and removes duplicates and old addresses
605
633
// if the process group is not marked as removal.
606
634
func (processGroupStatus * ProcessGroupStatus ) AddAddresses (
@@ -1071,6 +1099,9 @@ const (
1071
1099
// This condition can occur during the migration of the image type, the change of the image configuration
1072
1100
// for the sidecar or during version incompatible upgrades until the sidecar is updated to the new desired version.
1073
1101
IncorrectSidecarImage ProcessGroupConditionType = "IncorrectSidecarImage"
1102
+ // ProcessHasHighRunLoopBusy represents a process group that has a high run loop busy value. A high run loop busy
1103
+ // value can be caused by infrastructure issues or by overloaded processes.
1104
+ ProcessHasHighRunLoopBusy ProcessGroupConditionType = "ProcessHasHighRunLoopBusy"
1074
1105
)
1075
1106
1076
1107
// AllProcessGroupConditionTypes returns all ProcessGroupConditionType
@@ -1093,6 +1124,7 @@ func AllProcessGroupConditionTypes() []ProcessGroupConditionType {
1093
1124
ProcessIsMarkedAsExcluded ,
1094
1125
ProcessHasIOError ,
1095
1126
IncorrectSidecarImage ,
1127
+ ProcessHasHighRunLoopBusy ,
1096
1128
}
1097
1129
}
1098
1130
@@ -1137,6 +1169,8 @@ func GetProcessGroupConditionType(
1137
1169
return ProcessHasIOError , nil
1138
1170
case "IncorrectSidecarImage" :
1139
1171
return IncorrectSidecarImage , nil
1172
+ case "ProcessHasHighRunLoopBusy" :
1173
+ return ProcessHasHighRunLoopBusy , nil
1140
1174
}
1141
1175
1142
1176
return "" , fmt .Errorf ("unknown process group condition type: %s" , processGroupConditionType )
@@ -1759,7 +1793,14 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
1759
1793
0 ,
1760
1794
len (processGroup .ProcessGroupConditions ),
1761
1795
)
1796
+
1762
1797
for _ , condition := range processGroup .ProcessGroupConditions {
1798
+ // The ProcessHasHighRunLoopBusy is currently only informational and shouldn't block the reconciliation.
1799
+ if condition .ProcessGroupConditionType == ProcessHasHighRunLoopBusy {
1800
+ logger .V (1 ).
1801
+ Info ("Detected process with high run loop busy value" , "processGroupID" , processGroup .ProcessGroupID )
1802
+ }
1803
+
1763
1804
// If there is at least one process with an incorrect command line, that means the operator has to restart
1764
1805
// processes.
1765
1806
if condition .ProcessGroupConditionType == IncorrectCommandLine &&
@@ -1780,18 +1821,20 @@ func (cluster *FoundationDBCluster) CheckReconciliation(log logr.Logger) (bool,
1780
1821
conditions = append (conditions , condition .ProcessGroupConditionType )
1781
1822
}
1782
1823
1783
- logger .Info (
1784
- "Has unhealthy process group" ,
1785
- "processGroupID" ,
1786
- processGroup .ProcessGroupID ,
1787
- "state" ,
1788
- "HasUnhealthyProcess" ,
1789
- "conditions" ,
1790
- conditions ,
1791
- )
1792
- cluster .Status .Generations .HasUnhealthyProcess = cluster .Generation
1793
- reconciled = false
1794
- continue
1824
+ if len (conditions ) > 0 {
1825
+ logger .Info (
1826
+ "Has unhealthy process group" ,
1827
+ "processGroupID" ,
1828
+ processGroup .ProcessGroupID ,
1829
+ "state" ,
1830
+ "HasUnhealthyProcess" ,
1831
+ "conditions" ,
1832
+ conditions ,
1833
+ )
1834
+ cluster .Status .Generations .HasUnhealthyProcess = cluster .Generation
1835
+ reconciled = false
1836
+ continue
1837
+ }
1795
1838
}
1796
1839
1797
1840
cluster .Status .ReconciledProcessGroups ++
@@ -3609,3 +3652,8 @@ func (cluster *FoundationDBCluster) GetDatabaseInteractionMode() DatabaseInterac
3609
3652
3610
3653
return * cluster .Spec .AutomationOptions .DatabaseInteractionMode
3611
3654
}
3655
+
3656
+ // GetConditionsThatNeedReplacement returns the conditions that should trigger a replacement.
3657
+ func (cluster * FoundationDBCluster ) GetConditionsThatNeedReplacement () map [ProcessGroupConditionType ]None {
3658
+ return defaultConditionsThatNeedReplacement
3659
+ }
0 commit comments