Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ee90f1c
`vtorc`: use `*topoprotopb.TabletAlias` for tablet alias
timvaillancourt Jun 22, 2025
45041e0
rename func
timvaillancourt Jun 22, 2025
032b6d3
fix e2e test
timvaillancourt Jun 22, 2025
d490762
reorder
timvaillancourt Jun 23, 2025
96e1456
more test cleanup
timvaillancourt Jun 23, 2025
8fbfdd3
cleanup
timvaillancourt Jun 27, 2025
c0eb707
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Jun 28, 2025
ffb7b97
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Jun 30, 2025
f9328c3
use `switch` statement
timvaillancourt Jun 30, 2025
b1ac40c
use `switch` statement - cleanup
timvaillancourt Jun 30, 2025
3bf9c45
gofmt
timvaillancourt Jun 30, 2025
7bd2534
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Aug 30, 2025
e992b44
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Sep 5, 2025
cedfab0
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Sep 23, 2025
2b6bce0
return existing format
timvaillancourt Sep 23, 2025
df1686d
standardize on `github.com/go-viper/mapstructure/v2`
timvaillancourt Sep 23, 2025
dc1b03a
use string for queue key
timvaillancourt Sep 23, 2025
c12da3d
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Sep 25, 2025
ec4157a
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Sep 30, 2025
98b86f7
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Sep 30, 2025
7b8a3ca
rm mistaken add
timvaillancourt Sep 30, 2025
616cdbf
Merge remote-tracking branch 'origin/main' into vtorc-topodatapb-tabl…
timvaillancourt Dec 23, 2025
4aeb37a
fixes
timvaillancourt Dec 25, 2025
2a2c5eb
fixes
timvaillancourt Dec 25, 2025
c3073b8
add unmarshal
timvaillancourt Dec 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions go/test/endtoend/vtorc/api/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func TestAPIEndpoints(t *testing.T) {
})

// Before we disable recoveries, let us wait until VTOrc has fixed all the issues (if any).
_, _ = utils.MakeAPICallRetry(t, vtorc, "/api/replication-analysis", func(_ int, response string) bool {
_, _ = utils.MakeAPICallRetry(t, vtorc, "/api/detection-analysis", func(i int, response string) bool {
return response != "null"
})

Expand Down Expand Up @@ -164,41 +164,41 @@ func TestAPIEndpoints(t *testing.T) {
assert.Equal(t, "Global recoveries disabled\n", resp)
})

t.Run("Replication Analysis API", func(t *testing.T) {
t.Run("Detection Analysis API", func(t *testing.T) {
// use vtctldclient to stop replication
_, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias)
require.NoError(t, err)

// We know VTOrc won't fix this since we disabled global recoveries!
// Wait until VTOrc picks up on this issue and verify
// that we see a not null result on the api/replication-analysis page
status, resp := utils.MakeAPICallRetry(t, vtorc, "/api/replication-analysis", func(_ int, response string) bool {
// that we see a not null result on the api/detection-analysis page
status, resp := utils.MakeAPICallRetry(t, vtorc, "/api/detection-analysis", func(_ int, response string) bool {
return response == "null"
})
assert.Equal(t, 200, status, resp)
assert.Contains(t, resp, fmt.Sprintf(`"AnalyzedInstanceAlias": "%s"`, replica.Alias))
assert.Contains(t, resp, `"Analysis": "ReplicationStopped"`)

// Verify that filtering also works in the API as intended
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks&shard=0")
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/detection-analysis?keyspace=ks&shard=0")
require.NoError(t, err)
assert.Equal(t, 200, status, resp)
assert.Contains(t, resp, fmt.Sprintf(`"AnalyzedInstanceAlias": "%s"`, replica.Alias))

// Verify that filtering by keyspace also works in the API as intended
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks")
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/detection-analysis?keyspace=ks")
require.NoError(t, err)
assert.Equal(t, 200, status, resp)
assert.Contains(t, resp, fmt.Sprintf(`"AnalyzedInstanceAlias": "%s"`, replica.Alias))

// Check that filtering using keyspace and shard works
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks&shard=80-")
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/detection-analysis?keyspace=ks&shard=80-")
require.NoError(t, err)
assert.Equal(t, 200, status, resp)
assert.Equal(t, "null", resp)

// Check that filtering using just the shard fails
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?shard=0")
status, resp, err = utils.MakeAPICall(t, vtorc, "/api/detection-analysis?shard=0")
require.NoError(t, err)
assert.Equal(t, 400, status, resp)
assert.Equal(t, "Filtering by shard without keyspace isn't supported\n", resp)
Expand Down
8 changes: 4 additions & 4 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,7 @@ func TestFullStatusConnectionPooling(t *testing.T) {
_ = curPrimary.VttabletProcess.Kill()

// Wait until VTOrc notices some problems
status, resp := utils.MakeAPICallRetry(t, vtorc, "/api/replication-analysis", func(_ int, response string) bool {
status, resp := utils.MakeAPICallRetry(t, vtorc, "/api/detection-analysis", func(_ int, response string) bool {
return response == "null"
})
assert.Equal(t, 200, status)
Expand All @@ -697,7 +697,7 @@ func TestFullStatusConnectionPooling(t *testing.T) {

// See that VTOrc eventually reports no errors.
// Wait until there are no problems and the api endpoint returns null
status, resp = utils.MakeAPICallRetry(t, vtorc, "/api/replication-analysis", func(_ int, response string) bool {
status, resp = utils.MakeAPICallRetry(t, vtorc, "/api/detection-analysis", func(_ int, response string) bool {
return response != "null"
})
assert.Equal(t, 200, status)
Expand All @@ -708,7 +708,7 @@ func TestFullStatusConnectionPooling(t *testing.T) {
_ = curPrimary.VttabletProcess.Kill()

// Wait until VTOrc notices some problems
status, resp = utils.MakeAPICallRetry(t, vtorc, "/api/replication-analysis", func(_ int, response string) bool {
status, resp = utils.MakeAPICallRetry(t, vtorc, "/api/detection-analysis", func(_ int, response string) bool {
return response == "null"
})
assert.Equal(t, 200, status)
Expand All @@ -724,7 +724,7 @@ func TestFullStatusConnectionPooling(t *testing.T) {

// See that VTOrc eventually reports no errors.
// Wait until there are no problems and the api endpoint returns null
status, resp = utils.MakeAPICallRetry(t, vtorc, "/api/replication-analysis", func(_ int, response string) bool {
status, resp = utils.MakeAPICallRetry(t, vtorc, "/api/detection-analysis", func(_ int, response string) bool {
return response != "null"
})
assert.Equal(t, 200, status)
Expand Down
12 changes: 9 additions & 3 deletions go/test/endtoend/vtorc/readtopologyinstance/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/servenv"

"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/vtorc/config"
"vitess.io/vitess/go/vt/vtorc/inst"
"vitess.io/vitess/go/vt/vtorc/logic"
Expand Down Expand Up @@ -77,7 +77,13 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
}
}

primaryInstance, err := inst.ReadTopologyInstanceBufferable(primary.Alias, nil)
primaryAlias, err := topoproto.ParseTabletAlias(primary.Alias)
require.NoError(t, err)

replicaAlias, err := topoproto.ParseTabletAlias(replica.Alias)
require.NoError(t, err)

primaryInstance, err := inst.ReadTopologyInstanceBufferable(primaryAlias, nil)
require.NoError(t, err)
require.NotNil(t, primaryInstance)
assert.Equal(t, utils.Hostname, primaryInstance.Hostname)
Expand Down Expand Up @@ -129,7 +135,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
err = logic.EnableRecovery()
require.NoError(t, err)

replicaInstance, err := inst.ReadTopologyInstanceBufferable(replica.Alias, nil)
replicaInstance, err := inst.ReadTopologyInstanceBufferable(replicaAlias, nil)
require.NoError(t, err)
require.NotNil(t, replicaInstance)
assert.Equal(t, utils.Hostname, replicaInstance.Hostname)
Expand Down
4 changes: 2 additions & 2 deletions go/vt/vtorc/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ type DetectionAnalysisHints struct {

// DetectionAnalysis represents an analysis of a detected problem.
type DetectionAnalysis struct {
AnalyzedInstanceAlias string
AnalyzedInstancePrimaryAlias string
AnalyzedInstanceAlias *topodatapb.TabletAlias
AnalyzedInstancePrimaryAlias *topodatapb.TabletAlias
TabletType topodatapb.TabletType
CurrentTabletType topodatapb.TabletType
PrimaryTimeStamp time.Time
Expand Down
31 changes: 18 additions & 13 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func initializeAnalysisDaoPostConfiguration() {
type clusterAnalysis struct {
hasShardWideAction bool
totalTablets int
primaryAlias string
primaryAlias *topodatapb.TabletAlias
durability policy.Durabler
}

Expand Down Expand Up @@ -319,8 +319,8 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi

a.ShardPrimaryTermTimestamp = m.GetTime("shard_primary_term_timestamp")
a.IsPrimary = m.GetBool("is_primary")
a.AnalyzedInstanceAlias = topoproto.TabletAliasString(tablet.Alias)
a.AnalyzedInstancePrimaryAlias = topoproto.TabletAliasString(primaryTablet.Alias)
a.AnalyzedInstanceAlias = tablet.Alias
a.AnalyzedInstancePrimaryAlias = primaryTablet.Alias
a.AnalyzedInstanceBinlogCoordinates = BinlogCoordinates{
LogFile: m.GetString("binary_log_file"),
LogPos: m.GetUint64("binary_log_pos"),
Expand Down Expand Up @@ -461,12 +461,12 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
case topo.IsReplicaType(a.TabletType) && a.ErrantGTID != "":
a.Analysis = ErrantGTIDDetected
a.Description = "Tablet has errant GTIDs"
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp.IsZero():
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == nil && a.ShardPrimaryTermTimestamp.IsZero():
// ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
a.Analysis = ClusterHasNoPrimary
a.Description = "Cluster has no primary"
ca.hasShardWideAction = true
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && !a.ShardPrimaryTermTimestamp.IsZero():
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == nil && !a.ShardPrimaryTermTimestamp.IsZero():
// If there are no primary tablets, but the shard primary start time isn't empty, then we know
// the primary tablet was deleted.
a.Analysis = PrimaryTabletDeleted
Expand All @@ -491,7 +491,7 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
a.Analysis = ReplicaMisconfigured
a.Description = "Replica has been misconfigured"
//
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias:
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != nil && !topoproto.TabletAliasEqual(a.AnalyzedInstancePrimaryAlias, ca.primaryAlias):
a.Analysis = ConnectedToWrongPrimary
a.Description = "Connected to wrong primary"
//
Expand Down Expand Up @@ -659,12 +659,13 @@ func postProcessAnalyses(result []*DetectionAnalysis, clusters map[string]*clust
// auditInstanceAnalysisInChangelog will write down an instance's analysis in the database_instance_analysis_changelog table.
// To not repeat recurring analysis code, the database_instance_last_analysis table is used, so that only changes to
// analysis codes are written.
func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisCode) error {
if lastWrittenAnalysis, found := recentInstantAnalysis.Get(tabletAlias); found {
func auditInstanceAnalysisInChangelog(tabletAlias *topodatapb.TabletAlias, analysisCode AnalysisCode) error {
tabletAliasString := topoproto.TabletAliasString(tabletAlias)
if lastWrittenAnalysis, found := recentInstantAnalysis.Get(tabletAliasString); found {
if lastWrittenAnalysis == analysisCode {
// Surely nothing new.
// And let's expand the timeout
recentInstantAnalysis.Set(tabletAlias, analysisCode, cache.DefaultExpiration)
recentInstantAnalysis.Set(tabletAliasString, analysisCode, cache.DefaultExpiration)
return nil
}
}
Expand All @@ -680,7 +681,9 @@ func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisC
alias = ?
AND analysis != ?
`,
string(analysisCode), tabletAlias, string(analysisCode),
string(analysisCode),
tabletAliasString,
string(analysisCode),
)
if err != nil {
log.Error(err)
Expand Down Expand Up @@ -709,7 +712,8 @@ func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisC
DATETIME('now'),
?
)`,
tabletAlias, string(analysisCode),
tabletAliasString,
string(analysisCode),
)
if err != nil {
log.Error(err)
Expand All @@ -722,7 +726,7 @@ func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisC
}
firstInsertion = rows > 0
}
recentInstantAnalysis.Set(tabletAlias, analysisCode, cache.DefaultExpiration)
recentInstantAnalysis.Set(tabletAliasString, analysisCode, cache.DefaultExpiration)
// If the analysis has changed or if it is the first insertion, we need to make sure we write this change to the database.
if !lastAnalysisChanged && !firstInsertion {
return nil
Expand All @@ -738,7 +742,8 @@ func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisC
DATETIME('now'),
?
)`,
tabletAlias, string(analysisCode),
tabletAliasString,
string(analysisCode),
)
if err == nil {
analysisChangeWriteCounter.Add(1)
Expand Down
36 changes: 18 additions & 18 deletions go/vt/vtorc/inst/analysis_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1106,24 +1106,24 @@ func TestAuditInstanceAnalysisInChangelog(t *testing.T) {
}()

updates := []struct {
tabletAlias string
tabletAlias *topodatapb.TabletAlias
analysisCode AnalysisCode
writeCounterExpectation int64
wantErr string
}{
{
// Store a new analysis for the zone1-100 tablet.
tabletAlias: "zone1-100",
tabletAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
analysisCode: ReplicationStopped,
writeCounterExpectation: 1,
}, {
// Write the same analysis, no new write should happen.
tabletAlias: "zone1-100",
tabletAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
analysisCode: ReplicationStopped,
writeCounterExpectation: 1,
}, {
// Change the analysis. This should trigger an update.
tabletAlias: "zone1-100",
tabletAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
analysisCode: ReplicaSemiSyncMustBeSet,
writeCounterExpectation: 2,
},
Expand Down Expand Up @@ -1192,43 +1192,43 @@ func TestPostProcessAnalyses(t *testing.T) {
analyses: []*DetectionAnalysis{
{
Analysis: InvalidPrimary,
AnalyzedInstanceAlias: "zone1-100",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_PRIMARY,
}, {
Analysis: NoProblem,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-202",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 202},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard80,
TabletType: topodatapb.TabletType_RDONLY,
}, {
Analysis: ConnectedToWrongPrimary,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-101",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_REPLICA,
ReplicationStopped: true,
}, {
Analysis: ReplicationStopped,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-102",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 102},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_RDONLY,
ReplicationStopped: true,
}, {
Analysis: InvalidReplica,
AnalyzedInstanceAlias: "zone1-108",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 108},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_REPLICA,
LastCheckValid: false,
}, {
Analysis: NoProblem,
AnalyzedInstanceAlias: "zone1-302",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 302},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard80,
LastCheckValid: true,
Expand All @@ -1238,21 +1238,21 @@ func TestPostProcessAnalyses(t *testing.T) {
want: []*DetectionAnalysis{
{
Analysis: DeadPrimary,
AnalyzedInstanceAlias: "zone1-100",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_PRIMARY,
}, {
Analysis: NoProblem,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-202",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 202},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard80,
TabletType: topodatapb.TabletType_RDONLY,
}, {
Analysis: NoProblem,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-302",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 302},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard80,
TabletType: topodatapb.TabletType_REPLICA,
Expand All @@ -1264,36 +1264,36 @@ func TestPostProcessAnalyses(t *testing.T) {
analyses: []*DetectionAnalysis{
{
Analysis: InvalidPrimary,
AnalyzedInstanceAlias: "zone1-100",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_PRIMARY,
}, {
Analysis: NoProblem,
AnalyzedInstanceAlias: "zone1-202",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 202},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard80,
LastCheckValid: true,
TabletType: topodatapb.TabletType_RDONLY,
}, {
Analysis: NoProblem,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-101",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 101},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_REPLICA,
}, {
Analysis: ReplicationStopped,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-102",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 102},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard0,
TabletType: topodatapb.TabletType_RDONLY,
ReplicationStopped: true,
}, {
Analysis: NoProblem,
LastCheckValid: true,
AnalyzedInstanceAlias: "zone1-302",
AnalyzedInstanceAlias: &topodatapb.TabletAlias{Cell: "zone1", Uid: 302},
AnalyzedKeyspace: keyspace,
AnalyzedShard: shard80,
TabletType: topodatapb.TabletType_REPLICA,
Expand Down
Loading
Loading