@@ -22,12 +22,13 @@ package controllers
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"time"
 
-	"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbstatus"
-
 	"github.com/FoundationDB/fdb-kubernetes-operator/v2/internal/buggify"
+	"github.com/FoundationDB/fdb-kubernetes-operator/v2/internal/coordination"
+	"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbstatus"
 
 	"github.com/FoundationDB/fdb-kubernetes-operator/v2/internal/restarts"
 
@@ -38,8 +39,7 @@ import (
 	"k8s.io/utils/pointer"
 )
 
-// bounceProcesses provides a reconciliation step for bouncing fdbserver
-// processes.
+// bounceProcesses provides a reconciliation step for bouncing fdbserver processes.
 type bounceProcesses struct{}
 
 // reconcile runs the reconciler's work.
@@ -64,16 +64,40 @@ func (c bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReco
 		}
 	}
 
+	// Fetch the processes that are ready for restart for the current cluster; we use that information to determine which
+	// processes must be added to the set.
+	var processesReadyForRestart map[fdbv1beta2.ProcessGroupID]time.Time
+	var processesPendingForRestart map[fdbv1beta2.ProcessGroupID]time.Time
+	globalSynchronizationMode := cluster.GetSynchronizationMode() == fdbv1beta2.SynchronizationModeGlobal
+	if globalSynchronizationMode {
+		processesReadyForRestart, err = adminClient.GetReadyForRestart(cluster.Spec.ProcessGroupIDPrefix)
+		if err != nil {
+			return &requeue{curError: err}
+		}
+
+		processesPendingForRestart, err = adminClient.GetPendingForRestart(cluster.Spec.ProcessGroupIDPrefix)
+		if err != nil {
+			return &requeue{curError: err}
+		}
+	}
+
 	currentMinimumUptime, addressMap, err := fdbstatus.GetMinimumUptimeAndAddressMap(logger, cluster, status, r.EnableRecoveryState)
 	if err != nil {
 		return &requeue{curError: err}
 	}
 
-	addresses, req := getProcessesReadyForRestart(logger, cluster, addressMap)
+	addresses, updatesReadyForRestart, updatesPendingForRestart, req := getProcessesReadyForRestart(logger, cluster, addressMap, processesReadyForRestart, processesPendingForRestart)
 	if req != nil {
 		return req
 	}
 
+	if globalSynchronizationMode && len(updatesPendingForRestart) > 0 {
+		err = adminClient.UpdatePendingForRestart(updatesPendingForRestart)
+		if err != nil {
+			return &requeue{curError: err}
+		}
+	}
+
 	// Only perform the check whether the cluster controller must be restarted if the cluster was up long enough. This is an
 	// additional safety guard to reduce the risk of successive restarts in cases where unidirectional partitions occur.
 	if currentMinimumUptime > r.MinimumRequiredUptimeCCBounce.Seconds() {
@@ -118,6 +142,7 @@ func (c bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReco
 			return &requeue{curError: err}
 		}
 	}
+
 	version, err := fdbv1beta2.ParseFdbVersion(cluster.Spec.Version)
 	if err != nil {
 		return &requeue{curError: err}
@@ -137,9 +162,18 @@ func (c bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReco
 		}
 	}
 
-	err = r.takeLock(logger, cluster, fmt.Sprintf("bouncing processes: %v", addresses))
-	if err != nil {
-		return &requeue{curError: err}
+	if globalSynchronizationMode && len(updatesReadyForRestart) > 0 {
+		err = adminClient.UpdateReadyForRestart(updatesReadyForRestart)
+		if err != nil {
+			return &requeue{curError: err}
+		}
+	}
+
+	if useLocks {
+		lockErr := lockClient.TakeLock()
+		if lockErr != nil {
+			return &requeue{curError: lockErr, delayedRequeue: true}
+		}
 	}
 
 	if useLocks && upgrading {
@@ -153,6 +187,32 @@ func (c bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReco
 		}
 	}
 
+	// When the cluster is being upgraded, we use the same synchronization mode as before.
+	if globalSynchronizationMode && !upgrading {
+		pendingForRestart, err := adminClient.GetPendingForRestart("")
+		if err != nil {
+			return &requeue{curError: err, delayedRequeue: true}
+		}
+
+		readyForRestart, err := adminClient.GetReadyForRestart("")
+		if err != nil {
+			return &requeue{curError: err, delayedRequeue: true}
+		}
+
+		err = coordination.AllProcessesReady(logger, pendingForRestart, readyForRestart, r.GlobalSynchronizationWaitDuration)
+		if err != nil {
+			waitTimeError := &coordination.WaitTimeError{}
+			if errors.As(err, waitTimeError) {
+				return &requeue{curError: err, delayedRequeue: true, delay: waitTimeError.GetWaitTime()}
+			}
+
+			return &requeue{curError: err, delayedRequeue: true}
+		}
+
+		addresses = coordination.GetAddressesFromStatus(logger, status, readyForRestart, false)
+		logger.Info("Addresses from status", "addresses", addresses)
+	}
+
 	filteredAddresses, removedAddresses := buggify.FilterIgnoredProcessGroups(cluster, addresses, status)
 	if removedAddresses {
 		addresses = filteredAddresses
@@ -169,8 +229,13 @@ func (c bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReco
 		// processes in the cluster.
 		err = adminClient.KillProcessesForUpgrade(addresses)
 	} else {
+		clearErr := adminClient.ClearReadyForRestart()
+		if clearErr != nil {
+			logger.Info("Could not remove ready entries for restart, will continue with restart", "error", clearErr.Error())
+		}
 		err = adminClient.KillProcesses(addresses)
 	}
+
 	if err != nil {
 		return &requeue{curError: err}
 	}
@@ -191,11 +256,13 @@ func (c bounceProcesses) reconcile(_ context.Context, r *FoundationDBClusterReco
 
 // getProcessesReadyForRestart returns a slice of process addresses that can be restarted. If addresses are missing or not all processes
 // have the latest configuration this method will return a requeue struct with more details.
-func getProcessesReadyForRestart(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, addressMap map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.ProcessAddress) ([]fdbv1beta2.ProcessAddress, *requeue) {
+func getProcessesReadyForRestart(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, addressMap map[fdbv1beta2.ProcessGroupID][]fdbv1beta2.ProcessAddress, processesReadyForRestart map[fdbv1beta2.ProcessGroupID]time.Time, processesPendingForRestart map[fdbv1beta2.ProcessGroupID]time.Time) ([]fdbv1beta2.ProcessAddress, map[fdbv1beta2.ProcessGroupID]fdbv1beta2.UpdateAction, map[fdbv1beta2.ProcessGroupID]fdbv1beta2.UpdateAction, *requeue) {
 	addresses := make([]fdbv1beta2.ProcessAddress, 0, len(cluster.Status.ProcessGroups))
 	allSynced := true
 	versionIncompatibleUpgrade := cluster.IsBeingUpgradedWithVersionIncompatibleVersion()
 	var missingAddress []fdbv1beta2.ProcessGroupID
+	updatesReadyForRestart := map[fdbv1beta2.ProcessGroupID]fdbv1beta2.UpdateAction{}
+	updatesPendingForRestart := map[fdbv1beta2.ProcessGroupID]fdbv1beta2.UpdateAction{}
 
 	filterConditions := restarts.GetFilterConditions(cluster)
 	missingProcesses := map[fdbv1beta2.ProcessClass]int{}
@@ -224,24 +291,19 @@ func getProcessesReadyForRestart(logger logr.Logger, cluster *fdbv1beta2.Foundat
 			if versionIncompatibleUpgrade && processGroup.IsExcluded() {
 				logger.Info("adding process group that is marked for exclusion to list of restarted processes", "processGroupID", processGroup.ProcessGroupID)
 				addresses = append(addresses, addressMap[processGroup.ProcessGroupID]...)
-				continue
-			}
-		}
+				if _, ok := processesReadyForRestart[processGroup.ProcessGroupID]; !ok {
+					updatesReadyForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
+				}
+				if _, ok := processesPendingForRestart[processGroup.ProcessGroupID]; !ok {
+					updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
+				}
 
-		// Ignore processes that are missing for more than 30 seconds, e.g. if the process is network partitioned.
-		// This is required since the update status will not update the SidecarUnreachable setting if a process is
-		// missing in the status.
-		if missingTime := processGroup.GetConditionTime(fdbv1beta2.MissingProcesses); missingTime != nil {
-			if time.Unix(*missingTime, 0).Add(cluster.GetIgnoreMissingProcessesSeconds()).Before(time.Now()) {
-				logger.Info("ignore process group with missing process", "processGroupID", processGroup.ProcessGroupID)
-				missingProcesses[processGroup.ProcessClass]++
 				continue
 			}
 		}
 
-		// If a Pod is stuck in pending we have to ignore it, as the processes hosted by this Pod will not be running.
-		if cluster.SkipProcessGroup(processGroup) {
-			logger.Info("ignore process group with Pod stuck in pending", "processGroupID", processGroup.ProcessGroupID)
+		// Check if the processes should be ignored.
+		if restarts.ShouldBeIgnoredBecauseMissing(logger, cluster, processGroup) {
 			missingProcesses[processGroup.ProcessClass]++
 			continue
 		}
@@ -258,20 +320,27 @@ func getProcessesReadyForRestart(logger logr.Logger, cluster *fdbv1beta2.Foundat
 			continue
 		}
 
+		if _, ok := processesPendingForRestart[processGroup.ProcessGroupID]; !ok {
+			updatesPendingForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
+		}
+
 		if addressMap[processGroup.ProcessGroupID] == nil {
 			missingAddress = append(missingAddress, processGroup.ProcessGroupID)
 			continue
 		}
 
 		addresses = append(addresses, addressMap[processGroup.ProcessGroupID]...)
+		if _, ok := processesReadyForRestart[processGroup.ProcessGroupID]; !ok {
+			updatesReadyForRestart[processGroup.ProcessGroupID] = fdbv1beta2.UpdateActionAdd
+		}
 	}
 
 	if len(missingAddress) > 0 {
-		return nil, &requeue{message: fmt.Sprintf("could not find address for processes: %s", missingAddress), delayedRequeue: true}
+		return nil, nil, updatesPendingForRestart, &requeue{message: fmt.Sprintf("could not find address for processes: %s", missingAddress), delayedRequeue: true}
 	}
 
 	if !allSynced {
-		return nil, &requeue{message: "Waiting for config map to sync to all pods", delayedRequeue: true}
+		return nil, nil, updatesPendingForRestart, &requeue{message: "Waiting for config map to sync to all pods", delayedRequeue: true}
 	}
 
 	// Only if the cluster is upgraded with an incompatible version we have to make sure that all processes are ready to be restarted.
@@ -280,15 +349,15 @@ func getProcessesReadyForRestart(logger logr.Logger, cluster *fdbv1beta2.Foundat
 		err := checkIfEnoughProcessesAreRunning(logger, cluster, len(addresses), missingProcesses, markedForRemoval)
 		// If not all processes are ready to restart we will block the upgrade and delay it.
 		if err != nil {
-			return nil, &requeue{
+			return nil, nil, nil, &requeue{
 				message:        err.Error(),
 				delay:          5 * time.Second,
 				delayedRequeue: true,
 			}
 		}
 	}
 
-	return addresses, nil
+	return addresses, updatesReadyForRestart, updatesPendingForRestart, nil
 }
 
 // getUpgradeAddressesFromStatus will return the processes that can be upgraded and all the processes that are not ready to be upgraded.
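
Note on the coordination flow introduced above: in the global synchronization mode, every operator first registers its process groups as pending for restart, marks them ready once they carry the new configuration, and the bounce is only issued once the ready set covers the pending set. The following is a minimal, self-contained sketch of that handshake; processGroupID, restartStore, and allProcessesReady are hypothetical stand-ins for illustration, not the operator's real AdminClient or internal/coordination API.

// Sketch of the pending/ready restart handshake, with simplified stand-in types.
package main

import (
	"fmt"
	"time"
)

// processGroupID is a stand-in for fdbv1beta2.ProcessGroupID.
type processGroupID string

// restartStore mimics the pending and ready sets that the operators maintain:
// process groups are added to pending when they need a restart and to ready
// once the new configuration is in place on their Pods.
type restartStore struct {
	pending map[processGroupID]time.Time
	ready   map[processGroupID]time.Time
}

// allProcessesReady reports whether every pending process group has also been
// marked ready, which is the condition the bounce step waits for before it
// issues a single cluster-wide restart.
func (s *restartStore) allProcessesReady() bool {
	for id := range s.pending {
		if _, ok := s.ready[id]; !ok {
			return false
		}
	}
	return true
}

func main() {
	now := time.Now()
	store := &restartStore{
		pending: map[processGroupID]time.Time{"storage-1": now, "storage-2": now},
		ready:   map[processGroupID]time.Time{"storage-1": now},
	}

	if !store.allProcessesReady() {
		// In the controller this corresponds to returning a delayed requeue.
		fmt.Println("not all process groups are ready, requeue and retry later")
		return
	}
	fmt.Println("all process groups ready, restarting in one operation")
}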