Skip to content

Commit 7a5c115

Browse files
committed
UnreachablePrimaryWithBrokenReplicas, restartArbitraryDirectReplicaFunc, restartAllDirectReplicasFunc
Signed-off-by: Shlomi Noach <[email protected]>
1 parent 8417e59 commit 7a5c115

File tree

3 files changed

+43
-13
lines changed

3 files changed

+43
-13
lines changed

go/vt/vtorc/inst/analysis.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ const (
5050
ReplicaMisconfigured AnalysisCode = "ReplicaMisconfigured"
5151
UnreachablePrimaryWithLaggingReplicas AnalysisCode = "UnreachablePrimaryWithLaggingReplicas"
5252
UnreachablePrimary AnalysisCode = "UnreachablePrimary"
53+
UnreachablePrimaryWithBrokenReplicas AnalysisCode = "UnreachablePrimaryWithBrokenReplicas"
5354
PrimarySingleReplicaNotReplicating AnalysisCode = "PrimarySingleReplicaNotReplicating"
5455
PrimarySingleReplicaDead AnalysisCode = "PrimarySingleReplicaDead"
5556
AllPrimaryReplicasNotReplicating AnalysisCode = "AllPrimaryReplicasNotReplicating"

go/vt/vtorc/inst/analysis_dao.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,15 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
511511
a.Analysis = UnreachablePrimaryWithLaggingReplicas
512512
a.Description = "Primary cannot be reached by vtorc and all of its replicas are lagging"
513513
//
514-
} else if a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 {
514+
} else if a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == a.CountValidReplicas {
515515
// partial success is here to reduce noise
516516
a.Analysis = UnreachablePrimary
517-
a.Description = "Primary cannot be reached by vtorc but it has replicating replicas; possibly a network/host issue"
517+
a.Description = "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue"
518+
//
519+
} else if a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 && a.CountValidReplicatingReplicas < a.CountValidReplicas {
520+
// partial success is here to reduce noise
521+
a.Analysis = UnreachablePrimaryWithBrokenReplicas
522+
a.Description = "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue"
518523
//
519524
} else if a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount {
520525
if isStaleBinlogCoordinates {

go/vt/vtorc/logic/topology_recovery.go

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ import (
4343

4444
const (
4545
CheckAndRecoverGenericProblemRecoveryName string = "CheckAndRecoverGenericProblem"
46-
RestartDirectReplicasRecoveryName string = "RestartDirectReplicas"
46+
RestartArbitraryDirectReplicaRecoveryName string = "RestartArbitraryDirectReplica"
47+
RestartAllDirectReplicasRecoveryName string = "RestartAllDirectReplicas"
4748
RecoverDeadPrimaryRecoveryName string = "RecoverDeadPrimary"
4849
RecoverPrimaryTabletDeletedRecoveryName string = "RecoverPrimaryTabletDeleted"
4950
RecoverPrimaryHasPrimaryRecoveryName string = "RecoverPrimaryHasPrimary"
@@ -104,7 +105,8 @@ type recoveryFunction int
104105
const (
105106
noRecoveryFunc recoveryFunction = iota
106107
recoverGenericProblemFunc
107-
restartDirectReplicasFunc
108+
restartArbitraryDirectReplicaFunc
109+
restartAllDirectReplicasFunc
108110
recoverDeadPrimaryFunc
109111
recoverPrimaryTabletDeletedFunc
110112
recoverPrimaryHasPrimaryFunc
@@ -351,8 +353,16 @@ func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry *inst.Repl
351353
return false, nil, nil
352354
}
353355

356+
func restartArbitraryDirectReplica(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (bool, *TopologyRecovery, error) {
357+
return restartDirectReplicas(ctx, analysisEntry, 1, logger)
358+
}
359+
360+
func restartAllDirectReplicas(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (bool, *TopologyRecovery, error) {
361+
return restartDirectReplicas(ctx, analysisEntry, 0, logger)
362+
}
363+
354364
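// restartDirectReplicas restarts replication on direct replicas of an unreachable primary; a positive maxReplicas caps how many of the shuffled tablets are considered, and zero means no cap.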
355-
func restartDirectReplicas(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, logger *log.PrefixedLogger) (bool, *TopologyRecovery, error) {
365+
func restartDirectReplicas(ctx context.Context, analysisEntry *inst.ReplicationAnalysis, maxReplicas int, logger *log.PrefixedLogger) (bool, *TopologyRecovery, error) {
356366
topologyRecovery, err := AttemptRecoveryRegistration(analysisEntry)
357367
if topologyRecovery == nil {
358368
message := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another restartDirectReplicas.", analysisEntry.AnalyzedInstanceAlias)
@@ -399,8 +409,14 @@ func restartDirectReplicas(ctx context.Context, analysisEntry *inst.ReplicationA
399409
eg, _ := errgroup.WithContext(ctx)
400410
var restartExpected int
401411
var restartPerformed atomic.Int64
402-
// Iterate through all tablets and find direct replicas of the primary
403-
for _, tabletInfo := range tablets {
412+
// Iterate through all tablets and find direct replicas of the primary.
413+
// We intentionally shuffle tablet order. When maxReplicas is non-zero, we want to
414+
// randomly pick which replicas to restart, to avoid always favoring the same replicas (those listed first).
415+
for i, tabletIndex := range rand.Perm(len(tablets)) {
416+
if maxReplicas > 0 && i >= maxReplicas {
417+
break
418+
}
419+
tabletInfo := tablets[tabletIndex]
404420
tablet := tabletInfo.Tablet
405421
tabletAlias := topoproto.TabletAliasString(tablet.Alias)
406422

@@ -524,7 +540,9 @@ func getCheckAndRecoverFunctionCode(analysisEntry *inst.ReplicationAnalysis) (re
524540
case inst.DeadPrimaryAndReplicas:
525541
recoveryFunc = recoverGenericProblemFunc
526542
case inst.UnreachablePrimary:
527-
recoveryFunc = restartDirectReplicasFunc
543+
recoveryFunc = restartArbitraryDirectReplicaFunc
544+
case inst.UnreachablePrimaryWithBrokenReplicas:
545+
recoveryFunc = restartAllDirectReplicasFunc
528546
case inst.UnreachablePrimaryWithLaggingReplicas:
529547
recoveryFunc = recoverGenericProblemFunc
530548
case inst.AllPrimaryReplicasNotReplicating:
@@ -549,7 +567,9 @@ func hasActionableRecovery(recoveryFunctionCode recoveryFunction) bool {
549567
return false
550568
case recoverGenericProblemFunc:
551569
return false
552-
case restartDirectReplicasFunc:
570+
case restartArbitraryDirectReplicaFunc:
571+
return true
572+
case restartAllDirectReplicasFunc:
553573
return true
554574
case recoverDeadPrimaryFunc:
555575
return true
@@ -581,8 +601,10 @@ func getCheckAndRecoverFunction(recoveryFunctionCode recoveryFunction) (
581601
return nil
582602
case recoverGenericProblemFunc:
583603
return checkAndRecoverGenericProblem
584-
case restartDirectReplicasFunc:
585-
return restartDirectReplicas
604+
case restartArbitraryDirectReplicaFunc:
605+
return restartArbitraryDirectReplica
606+
case restartAllDirectReplicasFunc:
607+
return restartAllDirectReplicas
586608
case recoverDeadPrimaryFunc:
587609
return recoverDeadPrimary
588610
case recoverPrimaryTabletDeletedFunc:
@@ -612,8 +634,10 @@ func getRecoverFunctionName(recoveryFunctionCode recoveryFunction) string {
612634
return ""
613635
case recoverGenericProblemFunc:
614636
return CheckAndRecoverGenericProblemRecoveryName
615-
case restartDirectReplicasFunc:
616-
return RestartDirectReplicasRecoveryName
637+
case restartArbitraryDirectReplicaFunc:
638+
return RestartArbitraryDirectReplicaRecoveryName
639+
case restartAllDirectReplicasFunc:
640+
return RestartAllDirectReplicasRecoveryName
617641
case recoverDeadPrimaryFunc:
618642
return RecoverDeadPrimaryRecoveryName
619643
case recoverPrimaryTabletDeletedFunc:

0 commit comments

Comments
 (0)