4 changes: 2 additions & 2 deletions .github/CODEOWNERS
@@ -31,7 +31,7 @@ go.sum @harshit-gangal @mattlord @rohit-nayak-ps @systay @frouioui
/go/test/endtoend/transaction @harshit-gangal @systay @frouioui
/go/test/endtoend/*throttler* @shlomi-noach @mattlord @timvaillancourt
/go/test/endtoend/vtgate @harshit-gangal @systay @frouioui
/go/test/endtoend/vtorc @shlomi-noach @timvaillancourt
/go/test/endtoend/vtorc @mattlord @shlomi-noach @timvaillancourt
/go/tools/ @frouioui @systay
/go/vt/dbconnpool @harshit-gangal @mattlord
/go/vt/discovery @frouioui
@@ -63,7 +63,7 @@ go.sum @harshit-gangal @mattlord @rohit-nayak-ps @systay @frouioui
/go/vt/vtgate/planbuilder @harshit-gangal @systay @frouioui @arthurschreiber
/go/vt/vtgate/*vstream* @rohit-nayak-ps @mattlord @shlomi-noach @beingnoble03
/go/vt/vtgate/evalengine @dbussink @systay
/go/vt/vtorc @shlomi-noach @timvaillancourt
/go/vt/vtorc @mattlord @shlomi-noach @timvaillancourt
/go/vt/vttablet/*conn* @harshit-gangal @systay
/go/vt/vttablet/endtoend @harshit-gangal @mattlord @rohit-nayak-ps @systay
/go/vt/vttablet/grpc* @rohit-nayak-ps @shlomi-noach @harshit-gangal
62 changes: 62 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
@@ -502,6 +502,68 @@ func TestSemiSync(t *testing.T) {
}
}

func TestSemiSync_NoAckers(t *testing.T) {
// stop any vtorc instance running due to a previous test.
utils.StopVTOrcs(t, clusterInfo)
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{
PreventCrossCellFailover: true,
}, 1, "")
defer func() {
utils.StopVTOrcs(t, clusterInfo)
clusterInfo.ClusterInstance.Teardown()
}()
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]

// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, keyspace.Name, shard0.Name, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica tablets
replicas := make([]*cluster.Vttablet, 0)
for _, tablet := range shard0.Vttablets {
if tablet.Alias != curPrimary.Alias {
replicas = append(replicas, tablet)
}
}
assert.NotEmpty(t, replicas, "did not find replica tablets")

// check that replication is set up correctly before we take the replicas down
utils.CheckReplication(t, clusterInfo, curPrimary, shard0.Vttablets, 10*time.Second)

// Make the replica vttablets unavailable and delete them from the topo so the PRIMARY has no semi-sync ackers
for _, replica := range replicas {
replica.VttabletProcess.Kill()
out, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("DeleteTablets", replica.Alias)
require.NoError(t, err, out)
}

// Enable the semi_sync durability policy for the keyspace
out, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspace.Name, "--durability-policy=semi_sync")
require.NoError(t, err, out)

// Wait and confirm that no PrimarySemiSyncMustBeSet problem is detected and no FixPrimary recovery runs, because there are no ackers
time.Sleep(time.Second * 10)
utils.WaitForDetectedProblems(t, vtOrcProcess,
string(inst.PrimarySemiSyncMustBeSet),
curPrimary.Alias,
keyspace.Name,
shard0.Name,
0,
)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 0)

// Start up the replica vttablets again and wait for a FixPrimary recovery
for _, replica := range replicas {
require.NoError(t, replica.VttabletProcess.Setup())
}
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, keyspace.Name, shard0.Name, 1)
}
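For anyone wanting to exercise this scenario locally, the new test should be runnable on its own with `go test -run TestSemiSync_NoAckers ./go/test/endtoend/vtorc/general/`, assuming the usual Vitess end-to-end prerequisites (built Vitess binaries on the PATH and a local MySQL installation).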

// TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld
func TestVTOrcWithPrs(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
12 changes: 12 additions & 0 deletions go/vt/vtorc/inst/analysis.go
@@ -21,6 +21,7 @@ import (
"time"

topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/vtctl/reparentutil/policy"
"vitess.io/vitess/go/vt/vtorc/config"
)

@@ -122,6 +123,7 @@ type DetectionAnalysis struct {
SemiSyncReplicaEnabled bool
SemiSyncBlocked bool
CountSemiSyncReplicasEnabled uint
CountValidSemiSyncReplicatingReplicas uint
CountLoggingReplicas uint
CountStatementBasedLoggingReplicas uint
CountMixedBasedLoggingReplicas uint
@@ -148,6 +150,16 @@ func (detectionAnalysis *DetectionAnalysis) MarshalJSON() ([]byte, error) {
return json.Marshal(i)
}

// hasMinSemiSyncAckers returns true if at least the minimum required number of semi-sync ackers are enabled and replicating.
// It always returns true if the durability policy does not require any semi-sync ackers (e.g. "none"). This gives
// a useful signal about whether it is safe to enable semi-sync without risking a stall of ongoing PRIMARY writes.
func hasMinSemiSyncAckers(durabler policy.Durabler, primary *topodatapb.Tablet, analysis *DetectionAnalysis) bool {
if durabler == nil || analysis == nil {
return false
}
return int(analysis.CountValidSemiSyncReplicatingReplicas) >= policy.SemiSyncAckers(durabler, primary)
}
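
As an aside, the gate above can be illustrated with a small self-contained sketch. The durabler interface and the two policy types below are stand-ins invented for the example (the real interface is policy.Durabler from vitess.io/vitess/go/vt/vtctl/reparentutil/policy), and the replica count stands in for CountValidSemiSyncReplicatingReplicas:

```go
package main

import "fmt"

// durabler is a stand-in for policy.Durabler, reduced to the one method this sketch needs.
type durabler interface {
	semiSyncAckers() int
}

// noneDurability mimics the "none" policy: zero semi-sync ackers required.
type noneDurability struct{}

func (noneDurability) semiSyncAckers() int { return 0 }

// semiSyncDurability mimics the "semi_sync" policy: one semi-sync acker required.
type semiSyncDurability struct{}

func (semiSyncDurability) semiSyncAckers() int { return 1 }

// hasMinSemiSyncAckers mirrors the production gate: the number of valid, replicating,
// semi-sync-enabled replicas must meet the policy's acker requirement.
func hasMinSemiSyncAckers(d durabler, validReplicatingReplicas int) bool {
	if d == nil {
		return false
	}
	return validReplicatingReplicas >= d.semiSyncAckers()
}

func main() {
	fmt.Println(hasMinSemiSyncAckers(noneDurability{}, 0))     // true: no ackers required
	fmt.Println(hasMinSemiSyncAckers(semiSyncDurability{}, 0)) // false: one acker required, none replicating
	fmt.Println(hasMinSemiSyncAckers(semiSyncDurability{}, 1)) // true: requirement met
}
```

The key property is the final comparison: a policy that requires zero ackers always passes, so shards using the "none" durability policy are unaffected by the new check.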

// ValidSecondsFromSeenToLastAttemptedCheck returns the maximum allowed elapsed time
// between last_attempted_check to last_checked before we consider the instance as invalid.
func ValidSecondsFromSeenToLastAttemptedCheck() uint {
12 changes: 11 additions & 1 deletion go/vt/vtorc/inst/analysis_dao.go
@@ -191,6 +191,15 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
),
0
) AS count_valid_semi_sync_replicas,
IFNULL(
SUM(
replica_instance.last_checked <= replica_instance.last_seen
AND replica_instance.replica_io_running != 0
AND replica_instance.replica_sql_running != 0
AND replica_instance.semi_sync_replica_enabled != 0
),
0
) AS count_valid_semi_sync_replicating_replicas,
IFNULL(
SUM(
replica_instance.log_bin
@@ -345,6 +354,7 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
a.SemiSyncBlocked = m.GetBool("semi_sync_blocked")
a.SemiSyncReplicaEnabled = m.GetBool("semi_sync_replica_enabled")
a.CountSemiSyncReplicasEnabled = m.GetUint("count_semi_sync_replicas")
a.CountValidSemiSyncReplicatingReplicas = m.GetUint("count_valid_semi_sync_replicating_replicas")
// countValidSemiSyncReplicasEnabled := m.GetUint("count_valid_semi_sync_replicas")
a.SemiSyncPrimaryWaitForReplicaCount = m.GetUint("semi_sync_primary_wait_for_replica_count")
a.SemiSyncPrimaryClients = m.GetUint("semi_sync_primary_clients")
@@ -447,7 +457,7 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
a.Analysis = PrimaryIsReadOnly
a.Description = "Primary is read-only"
//
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled:
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) > 0 && hasMinSemiSyncAckers(ca.durability, tablet, a) && !a.SemiSyncPrimaryEnabled:
a.Analysis = PrimarySemiSyncMustBeSet
a.Description = "Primary semi-sync must be set"
//