@@ -43,7 +43,8 @@ import (
43
43
44
44
const (
45
45
CheckAndRecoverGenericProblemRecoveryName string = "CheckAndRecoverGenericProblem"
46
- RestartDirectReplicasRecoveryName string = "RestartDirectReplicas"
46
+ RestartArbitraryDirectReplicaRecoveryName string = "RestartArbitraryDirectReplica"
47
+ RestartAllDirectReplicasRecoveryName string = "RestartAllDirectReplicas"
47
48
RecoverDeadPrimaryRecoveryName string = "RecoverDeadPrimary"
48
49
RecoverPrimaryTabletDeletedRecoveryName string = "RecoverPrimaryTabletDeleted"
49
50
RecoverPrimaryHasPrimaryRecoveryName string = "RecoverPrimaryHasPrimary"
@@ -104,7 +105,8 @@ type recoveryFunction int
104
105
const (
105
106
noRecoveryFunc recoveryFunction = iota
106
107
recoverGenericProblemFunc
107
- restartDirectReplicasFunc
108
+ restartArbitraryDirectReplicaFunc
109
+ restartAllDirectReplicasFunc
108
110
recoverDeadPrimaryFunc
109
111
recoverPrimaryTabletDeletedFunc
110
112
recoverPrimaryHasPrimaryFunc
@@ -351,8 +353,16 @@ func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry *inst.Repl
351
353
return false , nil , nil
352
354
}
353
355
356
// restartArbitraryDirectReplica is the recovery wired to inst.UnreachablePrimary.
// It delegates to restartDirectReplicas with maxReplicas set to 1, so the
// shuffled tablet loop there stops after its first iteration.
// It returns whatever restartDirectReplicas returns, unchanged.
// NOTE(review): the maxReplicas cap is checked against the shuffled loop index
// at the top of the loop, before the tablet is even looked up. If the loop body
// later skips tablets that are not direct replicas of the primary (not visible
// here), the single pick may restart nothing — confirm against the loop body.
+ func restartArbitraryDirectReplica (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
357
+ return restartDirectReplicas (ctx , analysisEntry , 1 , logger )
358
+ }
359
+
360
// restartAllDirectReplicas is the recovery wired to
// inst.UnreachablePrimaryWithBrokenReplicas. It delegates to
// restartDirectReplicas with maxReplicas set to 0; the limit there only applies
// when maxReplicas > 0, so 0 means the shuffled tablet loop is never cut short.
// It returns whatever restartDirectReplicas returns, unchanged.
+ func restartAllDirectReplicas (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
361
+ return restartDirectReplicas (ctx , analysisEntry , 0 , logger )
362
+ }
363
+
354
364
// restartDirectReplicas restarts replication on direct replicas of an unreachable primary
355
- func restartDirectReplicas (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
365
+ func restartDirectReplicas (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , maxReplicas int , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
356
366
topologyRecovery , err := AttemptRecoveryRegistration (analysisEntry )
357
367
if topologyRecovery == nil {
358
368
message := fmt .Sprintf ("found an active or recent recovery on %+v. Will not issue another restartDirectReplicas." , analysisEntry .AnalyzedInstanceAlias )
@@ -399,8 +409,14 @@ func restartDirectReplicas(ctx context.Context, analysisEntry *inst.ReplicationA
399
409
eg , _ := errgroup .WithContext (ctx )
400
410
var restartExpected int
401
411
var restartPerformed atomic.Int64
402
- // Iterate through all tablets and find direct replicas of the primary
403
- for _ , tabletInfo := range tablets {
412
+ // Iterate through all tablets and find direct replicas of the primary.
413
+ // We intentionally shuffle tablet order. When maxReplicas is non-zero, we want to
414
+ // randomly pick which replicas to restart, to avoid always favoring the same replicas.
415
+ for i , tabletIndex := range rand .Perm (len (tablets )) {
416
+ if maxReplicas > 0 && i >= maxReplicas {
417
+ break
418
+ }
419
+ tabletInfo := tablets [tabletIndex ]
404
420
tablet := tabletInfo .Tablet
405
421
tabletAlias := topoproto .TabletAliasString (tablet .Alias )
406
422
@@ -524,7 +540,9 @@ func getCheckAndRecoverFunctionCode(analysisEntry *inst.ReplicationAnalysis) (re
524
540
case inst .DeadPrimaryAndReplicas :
525
541
recoveryFunc = recoverGenericProblemFunc
526
542
case inst .UnreachablePrimary :
527
- recoveryFunc = restartDirectReplicasFunc
543
+ recoveryFunc = restartArbitraryDirectReplicaFunc
544
+ case inst .UnreachablePrimaryWithBrokenReplicas :
545
+ recoveryFunc = restartAllDirectReplicasFunc
528
546
case inst .UnreachablePrimaryWithLaggingReplicas :
529
547
recoveryFunc = recoverGenericProblemFunc
530
548
case inst .AllPrimaryReplicasNotReplicating :
@@ -549,7 +567,9 @@ func hasActionableRecovery(recoveryFunctionCode recoveryFunction) bool {
549
567
return false
550
568
case recoverGenericProblemFunc :
551
569
return false
552
- case restartDirectReplicasFunc :
570
+ case restartArbitraryDirectReplicaFunc :
571
+ return true
572
+ case restartAllDirectReplicasFunc :
553
573
return true
554
574
case recoverDeadPrimaryFunc :
555
575
return true
@@ -581,8 +601,10 @@ func getCheckAndRecoverFunction(recoveryFunctionCode recoveryFunction) (
581
601
return nil
582
602
case recoverGenericProblemFunc :
583
603
return checkAndRecoverGenericProblem
584
- case restartDirectReplicasFunc :
585
- return restartDirectReplicas
604
+ case restartArbitraryDirectReplicaFunc :
605
+ return restartArbitraryDirectReplica
606
+ case restartAllDirectReplicasFunc :
607
+ return restartAllDirectReplicas
586
608
case recoverDeadPrimaryFunc :
587
609
return recoverDeadPrimary
588
610
case recoverPrimaryTabletDeletedFunc :
@@ -612,8 +634,10 @@ func getRecoverFunctionName(recoveryFunctionCode recoveryFunction) string {
612
634
return ""
613
635
case recoverGenericProblemFunc :
614
636
return CheckAndRecoverGenericProblemRecoveryName
615
- case restartDirectReplicasFunc :
616
- return RestartDirectReplicasRecoveryName
637
+ case restartArbitraryDirectReplicaFunc :
638
+ return RestartArbitraryDirectReplicaRecoveryName
639
+ case restartAllDirectReplicasFunc :
640
+ return RestartAllDirectReplicasRecoveryName
617
641
case recoverDeadPrimaryFunc :
618
642
return RecoverDeadPrimaryRecoveryName
619
643
case recoverPrimaryTabletDeletedFunc :
0 commit comments