1
1
package redisfailover
2
2
3
3
import (
4
+ "context"
4
5
"errors"
6
+ "github.com/spotahome/redis-operator/service/k8s"
7
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
5
8
"strconv"
6
9
"time"
7
10
@@ -85,6 +88,15 @@ func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailov
85
88
// CheckAndHeal runs verification checks to ensure the RedisFailover is in an expected and healthy state.
86
89
// If the checks do not match up to expectations, an attempt will be made to "heal" the RedisFailover into a healthy state.
87
90
func (r * RedisFailoverHandler ) CheckAndHeal (rf * redisfailoverv1.RedisFailover ) error {
91
+
92
+ oldState := rf .Status .State
93
+
94
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
95
+ State : redisfailoverv1 .HealthyState ,
96
+ }
97
+
98
+ defer updateStatus (r .k8sservice , rf , oldState )
99
+
88
100
if rf .Bootstrapping () {
89
101
return r .checkAndHealBootstrapMode (rf )
90
102
}
@@ -99,19 +111,33 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
99
111
// Sentinel knows the correct slave number
100
112
101
113
if ! r .rfChecker .IsRedisRunning (rf ) {
102
- setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
114
+ errorMsg := "not all replicas running"
115
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
116
+ State : redisfailoverv1 .NotHealthyState ,
117
+ Message : errorMsg ,
118
+ }
119
+ setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
103
120
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of redis mismatch, waiting for redis statefulset reconcile" )
104
121
return nil
105
122
}
106
123
107
124
if ! r .rfChecker .IsSentinelRunning (rf ) {
108
- setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
125
+ errorMsg := "not all replicas running"
126
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
127
+ State : redisfailoverv1 .NotHealthyState ,
128
+ Message : errorMsg ,
129
+ }
130
+ setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
109
131
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of sentinel mismatch, waiting for sentinel deployment reconcile" )
110
132
return nil
111
133
}
112
134
113
135
nMasters , err := r .rfChecker .GetNumberMasters (rf )
114
136
if err != nil {
137
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
138
+ State : redisfailoverv1 .NotHealthyState ,
139
+ Message : "unable to get number of masters" ,
140
+ }
115
141
return err
116
142
}
117
143
@@ -125,7 +151,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
125
151
err = r .rfHealer .SetOldestAsMaster (rf )
126
152
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NO_MASTER , metrics .NOT_APPLICABLE , err )
127
153
if err != nil {
128
- r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("Error in Setting oldest Pod as master" )
154
+ errorMsg := "Error in Setting oldest Pod as master"
155
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
156
+ State : redisfailoverv1 .NotHealthyState ,
157
+ Message : errorMsg ,
158
+ }
159
+ r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf (errorMsg )
129
160
return err
130
161
}
131
162
return nil
@@ -138,6 +169,10 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
138
169
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Number of Masters running is 0" )
139
170
maxUptime , err := r .rfChecker .GetMaxRedisPodTime (rf )
140
171
if err != nil {
172
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
173
+ State : redisfailoverv1 .NotHealthyState ,
174
+ Message : "unable to get Redis POD time" ,
175
+ }
141
176
return err
142
177
}
143
178
@@ -150,13 +185,22 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
150
185
err2 := r .rfHealer .SetOldestAsMaster (rf )
151
186
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NO_MASTER , metrics .NOT_APPLICABLE , err2 )
152
187
if err2 != nil {
153
- r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("Error in Setting oldest Pod as master" )
188
+ errorMsg := "Error in Setting oldest Pod as master"
189
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
190
+ State : redisfailoverv1 .NotHealthyState ,
191
+ Message : errorMsg ,
192
+ }
193
+ r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf (errorMsg )
154
194
return err2
155
195
}
156
196
} else {
157
197
//sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master
158
198
status , err2 := r .rfChecker .CheckIfMasterLocalhost (rf )
159
199
if err2 != nil {
200
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
201
+ State : redisfailoverv1 .NotHealthyState ,
202
+ Message : "unable to check if master localhost" ,
203
+ }
160
204
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("CheckIfMasterLocalhost failed retry later" )
161
205
return err2
162
206
} else if status {
@@ -165,7 +209,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
165
209
err3 := r .rfHealer .SetOldestAsMaster (rf )
166
210
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NO_MASTER , metrics .NOT_APPLICABLE , err3 )
167
211
if err3 != nil {
168
- r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("Error in Setting oldest Pod as master" )
212
+ errorMsg := "Error in Setting oldest Pod as master"
213
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
214
+ State : redisfailoverv1 .NotHealthyState ,
215
+ Message : errorMsg ,
216
+ }
217
+ r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf (errorMsg )
169
218
return err3
170
219
}
171
220
@@ -183,11 +232,20 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
183
232
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NUMBER_OF_MASTERS , metrics .NOT_APPLICABLE , nil )
184
233
default :
185
234
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NUMBER_OF_MASTERS , metrics .NOT_APPLICABLE , errors .New ("multiple masters detected" ))
186
- return errors .New ("more than one master, fix manually" )
235
+ errorMsg := "more than one master, fix manually"
236
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
237
+ State : redisfailoverv1 .NotHealthyState ,
238
+ Message : errorMsg ,
239
+ }
240
+ return errors .New (errorMsg )
187
241
}
188
242
189
243
master , err := r .rfChecker .GetMasterIP (rf )
190
244
if err != nil {
245
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
246
+ State : redisfailoverv1 .NotHealthyState ,
247
+ Message : "unable to get master IP" ,
248
+ }
191
249
return err
192
250
}
193
251
@@ -196,23 +254,38 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
196
254
if err != nil {
197
255
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Slave not associated to master: %s" , err .Error ())
198
256
if err = r .rfHealer .SetMasterOnAll (master , rf ); err != nil {
257
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
258
+ State : redisfailoverv1 .NotHealthyState ,
259
+ }
199
260
return err
200
261
}
201
262
}
202
263
203
264
err = r .applyRedisCustomConfig (rf )
204
265
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .APPLY_REDIS_CONFIG , metrics .NOT_APPLICABLE , err )
205
266
if err != nil {
267
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
268
+ State : redisfailoverv1 .NotHealthyState ,
269
+ Message : "unable to apply custom config" ,
270
+ }
206
271
return err
207
272
}
208
273
209
274
err = r .UpdateRedisesPods (rf )
210
275
if err != nil {
276
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
277
+ State : redisfailoverv1 .NotHealthyState ,
278
+ Message : "unable to update redis PODs" ,
279
+ }
211
280
return err
212
281
}
213
282
214
283
sentinels , err := r .rfChecker .GetSentinelsIPs (rf )
215
284
if err != nil {
285
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
286
+ State : redisfailoverv1 .NotHealthyState ,
287
+ Message : "unable to get sentinels IPs" ,
288
+ }
216
289
return err
217
290
}
218
291
@@ -223,6 +296,9 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
223
296
if err != nil {
224
297
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Fixing sentinel not monitoring expected master: %s" , err .Error ())
225
298
if err := r .rfHealer .NewSentinelMonitor (sip , master , rf ); err != nil {
299
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
300
+ State : redisfailoverv1 .NotHealthyState ,
301
+ }
226
302
return err
227
303
}
228
304
}
@@ -233,37 +309,62 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
233
309
func (r * RedisFailoverHandler ) checkAndHealBootstrapMode (rf * redisfailoverv1.RedisFailover ) error {
234
310
235
311
if ! r .rfChecker .IsRedisRunning (rf ) {
236
- setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
312
+ errorMsg := "not all replicas running"
313
+ r .k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.UpdateOptions {})
314
+ setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
237
315
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of redis mismatch, waiting for redis statefulset reconcile" )
238
316
return nil
239
317
}
240
318
241
319
err := r .UpdateRedisesPods (rf )
242
320
if err != nil {
243
- return err
321
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
322
+ State : redisfailoverv1 .NotHealthyState ,
323
+ Message : "unable to update Redis PODs" ,
324
+ }
244
325
}
245
326
err = r .applyRedisCustomConfig (rf )
246
327
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .APPLY_REDIS_CONFIG , metrics .NOT_APPLICABLE , err )
247
328
if err != nil {
329
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
330
+ State : redisfailoverv1 .NotHealthyState ,
331
+ Message : "unable to set Redis custom config" ,
332
+ }
248
333
return err
249
334
}
250
335
251
336
bootstrapSettings := rf .Spec .BootstrapNode
252
337
err = r .rfHealer .SetExternalMasterOnAll (bootstrapSettings .Host , bootstrapSettings .Port , rf )
253
338
setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .APPLY_EXTERNAL_MASTER , metrics .NOT_APPLICABLE , err )
254
339
if err != nil {
340
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
341
+ State : redisfailoverv1 .NotHealthyState ,
342
+ Message : "unable to set external master to all" ,
343
+ }
255
344
return err
256
345
}
257
346
258
347
if rf .SentinelsAllowed () {
259
348
if ! r .rfChecker .IsSentinelRunning (rf ) {
260
- setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
349
+ errorMsg := "not all replicas running"
350
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
351
+ State : redisfailoverv1 .NotHealthyState ,
352
+ Message : errorMsg ,
353
+ }
354
+ r .k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.UpdateOptions {})
355
+ setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
261
356
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of sentinel mismatch, waiting for sentinel deployment reconcile" )
262
357
return nil
358
+ } else {
359
+ r .k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.UpdateOptions {})
263
360
}
264
361
265
362
sentinels , err := r .rfChecker .GetSentinelsIPs (rf )
266
363
if err != nil {
364
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
365
+ State : redisfailoverv1 .NotHealthyState ,
366
+ Message : "unable to get sentinels IPs" ,
367
+ }
267
368
return err
268
369
}
269
370
for _ , sip := range sentinels {
@@ -272,6 +373,10 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red
272
373
if err != nil {
273
374
r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Fixing sentinel not monitoring expected master: %s" , err .Error ())
274
375
if err := r .rfHealer .NewSentinelMonitorWithPort (sip , bootstrapSettings .Host , bootstrapSettings .Port , rf ); err != nil {
376
+ rf .Status = redisfailoverv1.RedisFailoverStatus {
377
+ State : redisfailoverv1 .NotHealthyState ,
378
+ Message : "unable to check sentinel monitor" ,
379
+ }
275
380
return err
276
381
}
277
382
}
@@ -346,3 +451,10 @@ func setRedisCheckerMetrics(metricsClient metrics.Recorder, mode /* redis or sen
346
451
}
347
452
}
348
453
}
454
+
455
+ func updateStatus (k8sservice k8s.Services , rf * redisfailoverv1.RedisFailover , oldState string ) {
456
+ if oldState != rf .Status .State {
457
+ rf .Status .LastChanged = time .Now ().Format (time .RFC3339 )
458
+ }
459
+ k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.UpdateOptions {})
460
+ }
0 commit comments