-
Notifications
You must be signed in to change notification settings - Fork 2.3k
EmergencyReparentShard: support reachable replica tablets w/mysqld down
#18896
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
df57f95
29d03c0
2abbc8a
97e8683
ee68e21
032fd6e
c27ec8b
f7f87fe
5231f76
14883c7
62a26fe
8bf456b
85d31ce
358daac
ceb2cb9
19996f8
34d9f24
9c3350c
08835b6
9746e97
7dbc44b
034d040
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,7 +30,7 @@ import ( | |
| "vitess.io/vitess/go/vt/logutil" | ||
| replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" | ||
| topodatapb "vitess.io/vitess/go/vt/proto/topodata" | ||
| "vitess.io/vitess/go/vt/proto/vtrpc" | ||
| vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" | ||
| "vitess.io/vitess/go/vt/topo" | ||
| "vitess.io/vitess/go/vt/topo/topoproto" | ||
| "vitess.io/vitess/go/vt/topotools" | ||
|
|
@@ -117,7 +117,7 @@ func FindPositionsOfAllCandidates( | |
| // Potentially bail. If any other tablet is detected to have | ||
| // GTID-based relay log positions, we will return the error recorded | ||
| // here. | ||
| emptyRelayPosErrorRecorder.RecordError(vterrors.Errorf(vtrpc.Code_UNAVAILABLE, "encountered tablet %v with no relay log position, when at least one other tablet in the status map has GTID based relay log positions", alias)) | ||
| emptyRelayPosErrorRecorder.RecordError(vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "encountered tablet %v with no relay log position, when at least one other tablet in the status map has GTID based relay log positions", alias)) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -126,7 +126,7 @@ func FindPositionsOfAllCandidates( | |
| } | ||
|
|
||
| if isGTIDBased && isNonGTIDBased { | ||
| return nil, false, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "encountered mix of GTID-based and non GTID-based relay logs") | ||
| return nil, false, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "encountered mix of GTID-based and non GTID-based relay logs") | ||
| } | ||
|
|
||
| // Store the final positions in the map. | ||
|
|
@@ -159,7 +159,7 @@ func FindPositionsOfAllCandidates( | |
| // error if the Before state of replication is nil. | ||
| func ReplicaWasRunning(stopStatus *replicationdatapb.StopReplicationStatus) (bool, error) { | ||
| if stopStatus == nil || stopStatus.Before == nil { | ||
| return false, vterrors.Errorf(vtrpc.Code_INVALID_ARGUMENT, "could not determine Before state of StopReplicationStatus %v", stopStatus) | ||
| return false, vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "could not determine Before state of StopReplicationStatus %v", stopStatus) | ||
| } | ||
|
|
||
| replStatus := replication.ProtoToReplicationStatus(stopStatus.Before) | ||
|
|
@@ -251,6 +251,21 @@ func stopReplicationAndBuildStatusMaps( | |
|
|
||
| stopReplicationStatus, err := tmc.StopReplicationAndGetStatus(groupCtx, tabletInfo.Tablet, replicationdatapb.StopReplicationMode_IOTHREADONLY) | ||
| if err != nil { | ||
| // If we receive a vtrpcpb.Code_UNAVAILABLE error code from the StopReplicationAndGetStatus RPC, | ||
| // this means the call was received by vttablet but the backend mysqld is down/unreachable. We log | ||
| // and skip tablets in this state because we are reasonably sure they cannot be the most advanced | ||
| // because mysqld is (likely) down. In some cases this may not be true and mysqld IS running + most | ||
| // advanced but somehow vttablet sees it as down, but this should be a very rare exception, meaning | ||
| // we prioritize completing the reparent (availability) for the common case. If this edge case were | ||
| // to occur, errant GTID(s) will be produced; if this happens often we should return UNAVAILABLE | ||
| // from vttablet using more detailed criteria (check the pidfile + running PID, etc). | ||
|
Comment on lines
+259
to
+261
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wonder if it's not worth improving this case today? The lack of detail may have been in place simply because it did not impact any operations. But now we're building logic around the meaning we infer from the response. I'd say we should do this now, provided you have an idea how to do it. |
||
| if topo.IsReplicaType(tabletInfo.Tablet.Type) && vterrors.Code(err) == vtrpcpb.Code_UNAVAILABLE { | ||
| logger.Warningf("replica %v is reachable but mysql is unavailable: %v", alias, err) | ||
| mustWaitForTablet = false // used in defer | ||
| err = nil // used in defer | ||
| return | ||
| } | ||
|
|
||
| sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError) | ||
| if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica { | ||
| var primaryStatus *replicationdatapb.PrimaryStatus | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This resolves a warning caused by underscore deprecation.