From 5458bcd0de452a0d94a9d855a89a2c2b89a8d4b9 Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Fri, 19 Dec 2025 13:39:07 -0800 Subject: [PATCH 01/33] mysqlctl: add MySQL CLONE support infrastructure Add core infrastructure for MySQL CLONE REMOTE operations: - CloneExecutor in mysqlctl/clone.go for executing CLONE INSTANCE FROM - Clone user configuration in dbconfigs (--db-clone-user, --db-clone-password, --db-clone-use-ssl) - vt_clone user in init_clone.sql with BACKUP_ADMIN privilege (MySQL 8.0.17+ only) - Clone plugin loading in mycnf configs (plugin-load-add = mysql_clone.so) - MySQLClonePluginFlavorCapability for version checking MySQL CLONE copies data at the physical level over the network, providing a faster alternative to logical backup/restore for replica provisioning. Requires MySQL 8.0.17+ and InnoDB-only tables. Signed-off-by: Nick Van Wiggeren --- config/embed.go | 6 + config/init_clone.sql | 4 + config/mycnf/clone.cnf | 3 + go/mysql/capabilities/capability.go | 4 +- go/vt/dbconfigs/dbconfigs.go | 15 +- go/vt/mysqlctl/clone.go | 217 ++++++++++++++++++++++++++++ go/vt/mysqlctl/clone_test.go | 150 +++++++++++++++++++ go/vt/mysqlctl/mysqld.go | 25 ++++ 8 files changed, 422 insertions(+), 2 deletions(-) create mode 100644 config/init_clone.sql create mode 100644 config/mycnf/clone.cnf create mode 100644 go/vt/mysqlctl/clone.go create mode 100644 go/vt/mysqlctl/clone_test.go diff --git a/config/embed.go b/config/embed.go index b138184c5a3..68ca8fe90ce 100644 --- a/config/embed.go +++ b/config/embed.go @@ -25,3 +25,9 @@ var MycnfMySQL84 string //go:embed mycnf/mysql90.cnf var MycnfMySQL90 string + +//go:embed mycnf/clone.cnf +var MycnfClone string + +//go:embed init_clone.sql +var InitClone string diff --git a/config/init_clone.sql b/config/init_clone.sql new file mode 100644 index 00000000000..5857414b640 --- /dev/null +++ b/config/init_clone.sql @@ -0,0 +1,4 @@ +# User for MySQL CLONE operations. 
+# BACKUP_ADMIN is required on the donor for clone operations. +/*!80017 CREATE USER IF NOT EXISTS 'vt_clone'@'%' */; +/*!80017 GRANT BACKUP_ADMIN ON *.* TO 'vt_clone'@'%' */; diff --git a/config/mycnf/clone.cnf b/config/mycnf/clone.cnf new file mode 100644 index 00000000000..10fbc3e4eac --- /dev/null +++ b/config/mycnf/clone.cnf @@ -0,0 +1,3 @@ +# Clone plugin for MySQL backup and replica provisioning. +# Included when --mysql-clone-enabled=true (requires MySQL 8.0.17+). +plugin-load-add = mysql_clone.so diff --git a/go/mysql/capabilities/capability.go b/go/mysql/capabilities/capability.go index 1877caf11d6..80e20209f1c 100644 --- a/go/mysql/capabilities/capability.go +++ b/go/mysql/capabilities/capability.go @@ -52,6 +52,7 @@ const ( ReplicaTerminologyCapability // Supported in 8.0.26 and above, using SHOW REPLICA STATUS and all variations. BinaryLogStatus // Supported in 8.2.0 and above, uses SHOW BINARY LOG STATUS RestrictFKOnNonStandardKey // Supported in 8.4.0 and above, restricts usage of non-standard indexes for foreign keys. + MySQLClonePluginFlavorCapability // Supported in 8.0.17 and above, MySQL CLONE plugin for physical snapshot. 
) type CapableOf func(capability FlavorCapability) (bool, error) @@ -107,7 +108,8 @@ func MySQLVersionHasCapability(serverVersion string, capability FlavorCapability return atLeast(8, 0, 16) case CheckConstraintsCapability: return atLeast(8, 0, 16) - case TransactionalGtidExecutedFlavorCapability: + case TransactionalGtidExecutedFlavorCapability, + MySQLClonePluginFlavorCapability: return atLeast(8, 0, 17) case DisableRedoLogFlavorCapability: return atLeast(8, 0, 21) diff --git a/go/vt/dbconfigs/dbconfigs.go b/go/vt/dbconfigs/dbconfigs.go index c6c8e41b839..0ec3cd661fc 100644 --- a/go/vt/dbconfigs/dbconfigs.go +++ b/go/vt/dbconfigs/dbconfigs.go @@ -49,6 +49,7 @@ const ( Filtered = "filtered" Repl = "repl" ExternalRepl = "erepl" + Clone = "clone" ) var ( @@ -56,7 +57,7 @@ var ( GlobalDBConfigs DBConfigs // All can be used to register all flags: RegisterFlags(All...) - All = []string{App, AppDebug, AllPrivs, Dba, Filtered, Repl, ExternalRepl} + All = []string{App, AppDebug, AllPrivs, Dba, Filtered, Repl, ExternalRepl, Clone} ) // DBConfigs stores all the data needed to build various connection @@ -97,6 +98,7 @@ type DBConfigs struct { Repl UserConfig `json:"repl,omitempty"` Appdebug UserConfig `json:"appdebug,omitempty"` Allprivs UserConfig `json:"allprivs,omitempty"` + CloneUser UserConfig `json:"clone,omitempty"` externalRepl UserConfig appParams mysql.ConnParams @@ -105,6 +107,7 @@ type DBConfigs struct { replParams mysql.ConnParams appdebugParams mysql.ConnParams allprivsParams mysql.ConnParams + cloneParams mysql.ConnParams externalReplParams mysql.ConnParams } @@ -261,6 +264,11 @@ func (dbcfgs *DBConfigs) ExternalReplWithDB() Connector { return params } +// CloneConnector returns connection parameters for clone with no dbname set. +func (dbcfgs *DBConfigs) CloneConnector() Connector { + return dbcfgs.makeParams(&dbcfgs.cloneParams, false) +} + // AppWithDB returns connection parameters for app with dbname set. 
func (dbcfgs *DBConfigs) makeParams(cp *mysql.ConnParams, withDB bool) Connector { result := *cp @@ -306,6 +314,7 @@ func (dbcfgs *DBConfigs) Redacted() *DBConfigs { dbcfgs.Repl.Password = "****" dbcfgs.Appdebug.Password = "****" dbcfgs.Allprivs.Password = "****" + dbcfgs.CloneUser.Password = "****" return dbcfgs } @@ -401,6 +410,9 @@ func (dbcfgs *DBConfigs) getParams(userKey string) (*UserConfig, *mysql.ConnPara case ExternalRepl: uc = &dbcfgs.externalRepl cp = &dbcfgs.externalReplParams + case Clone: + uc = &dbcfgs.CloneUser + cp = &dbcfgs.cloneParams default: log.Exitf("Invalid db user key requested: %s", userKey) } @@ -423,6 +435,7 @@ func NewTestDBConfigs(genParams, appDebugParams mysql.ConnParams, dbname string) dbaParams: genParams, filteredParams: genParams, replParams: genParams, + cloneParams: genParams, externalReplParams: genParams, DBName: dbname, Charset: "", diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go new file mode 100644 index 00000000000..e807d24dfaa --- /dev/null +++ b/go/vt/mysqlctl/clone.go @@ -0,0 +1,217 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package mysqlctl + +import ( + "context" + "fmt" + "strings" + + "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/mysql/capabilities" + "vitess.io/vitess/go/vt/log" +) + +// CloneExecutor handles MySQL CLONE REMOTE operations for backup and replica provisioning. +// It executes CLONE INSTANCE FROM on the recipient to clone data from a donor. 
+type CloneExecutor struct { + // DonorHost is the hostname or IP of the donor MySQL instance. + DonorHost string + // DonorPort is the MySQL port of the donor instance. + DonorPort int + // DonorUser is the MySQL user for clone operations (needs BACKUP_ADMIN on donor). + DonorUser string + // DonorPassword is the password for the clone user. + DonorPassword string + // UseSSL indicates whether to use SSL for the clone connection. + UseSSL bool +} + +// ValidateDonor checks that the donor MySQL instance meets all prerequisites for cloning. +// It verifies: +// - MySQL version >= 8.0.17 +// - Clone plugin is installed +// - No non-InnoDB tables exist (clone only supports InnoDB) +func (c *CloneExecutor) ValidateDonor(ctx context.Context, mysqld MysqlDaemon) error { + // Check MySQL version using capabilities system + if err := c.checkCloneCapability(ctx, mysqld); err != nil { + return err + } + + // Check for non-InnoDB tables + if err := c.checkNoNonInnoDBTables(ctx, mysqld); err != nil { + return err + } + + // Check clone plugin is installed + if err := c.checkClonePluginInstalled(ctx, mysqld); err != nil { + return err + } + + return nil +} + +// ValidateRecipient checks that the recipient MySQL instance meets all prerequisites for cloning. +// It verifies: +// - MySQL version >= 8.0.17 +// - Clone plugin is installed +func (c *CloneExecutor) ValidateRecipient(ctx context.Context, mysqld MysqlDaemon) error { + // Check MySQL version using capabilities system + if err := c.checkCloneCapability(ctx, mysqld); err != nil { + return err + } + + // Check clone plugin is installed + if err := c.checkClonePluginInstalled(ctx, mysqld); err != nil { + return err + } + + return nil +} + +// checkCloneCapability verifies that the MySQL version supports the CLONE plugin. 
+func (c *CloneExecutor) checkCloneCapability(ctx context.Context, mysqld MysqlDaemon) error { + result, err := mysqld.FetchSuperQuery(ctx, "SELECT @@version") + if err != nil { + return fmt.Errorf("failed to query MySQL version: %w", err) + } + + if len(result.Rows) == 0 || len(result.Rows[0]) == 0 { + return fmt.Errorf("empty version result") + } + + versionStr := result.Rows[0][0].ToString() + capableOf := mysql.ServerVersionCapableOf(versionStr) + if capableOf == nil { + return fmt.Errorf("unable to determine MySQL capabilities for version %q", versionStr) + } + + ok, err := capableOf(capabilities.MySQLClonePluginFlavorCapability) + if err != nil { + return fmt.Errorf("failed to check clone capability: %w", err) + } + if !ok { + return fmt.Errorf("MySQL CLONE requires version 8.0.17 or higher, got %s", versionStr) + } + + return nil +} + +// ExecuteClone performs CLONE REMOTE from the donor to the recipient. +// This will: +// 1. Set clone_valid_donor_list on the recipient +// 2. Execute CLONE INSTANCE FROM on the recipient +// 3. The recipient MySQL will restart automatically after clone completes +// +// Note: This operation will DESTROY all existing data on the recipient. 
+func (c *CloneExecutor) ExecuteClone(ctx context.Context, mysqld MysqlDaemon) error { + if !MySQLCloneEnabled() { + return fmt.Errorf("MySQL CLONE not enabled; set --mysql-clone-enabled=true on both donor and recipient") + } + + log.Infof("Starting CLONE REMOTE from %s:%d", c.DonorHost, c.DonorPort) + + // Set the valid donor list + donorAddr := fmt.Sprintf("%s:%d", c.DonorHost, c.DonorPort) + setDonorListQuery := fmt.Sprintf("SET GLOBAL clone_valid_donor_list = '%s'", donorAddr) + + if err := mysqld.ExecuteSuperQuery(ctx, setDonorListQuery); err != nil { + return fmt.Errorf("failed to set clone_valid_donor_list: %w", err) + } + + // Build the CLONE INSTANCE command + cloneCmd := c.buildCloneCommand() + + log.Infof("Executing CLONE INSTANCE FROM %s:%d (this may take a while)", c.DonorHost, c.DonorPort) + + // Execute the clone command + // Note: After this command completes, MySQL will restart automatically + if err := mysqld.ExecuteSuperQuery(ctx, cloneCmd); err != nil { + return fmt.Errorf("CLONE INSTANCE failed: %w", err) + } + + log.Infof("CLONE REMOTE completed successfully from %s:%d", c.DonorHost, c.DonorPort) + return nil +} + +// buildCloneCommand constructs the CLONE INSTANCE SQL command. +func (c *CloneExecutor) buildCloneCommand() string { + var sb strings.Builder + sb.WriteString(fmt.Sprintf("CLONE INSTANCE FROM '%s'@'%s':%d", + c.DonorUser, c.DonorHost, c.DonorPort)) + sb.WriteString(fmt.Sprintf(" IDENTIFIED BY '%s'", c.DonorPassword)) + + if c.UseSSL { + sb.WriteString(" REQUIRE SSL") + } else { + sb.WriteString(" REQUIRE NO SSL") + } + + return sb.String() +} + +// checkNoNonInnoDBTables verifies that no user tables use non-InnoDB storage engines. +// MySQL CLONE only copies InnoDB data; other engines would result in empty tables. 
+func (c *CloneExecutor) checkNoNonInnoDBTables(ctx context.Context, mysqld MysqlDaemon) error { + query := ` + SELECT TABLE_SCHEMA, TABLE_NAME, ENGINE + FROM information_schema.TABLES + WHERE ENGINE != 'InnoDB' + AND ENGINE IS NOT NULL + AND TABLE_TYPE = 'BASE TABLE' + AND TABLE_SCHEMA NOT IN ('mysql', 'information_schema', 'performance_schema', 'sys') + ` + + result, err := mysqld.FetchSuperQuery(ctx, query) + if err != nil { + return fmt.Errorf("failed to check for non-InnoDB tables: %w", err) + } + + if len(result.Rows) > 0 { + var tables []string + for _, row := range result.Rows { + schema := row[0].ToString() + table := row[1].ToString() + engine := row[2].ToString() + tables = append(tables, fmt.Sprintf("%s.%s (%s)", schema, table, engine)) + } + return fmt.Errorf("non-InnoDB tables found (CLONE only supports InnoDB): %s", strings.Join(tables, ", ")) + } + + return nil +} + +// checkClonePluginInstalled verifies that the clone plugin is loaded. +func (c *CloneExecutor) checkClonePluginInstalled(ctx context.Context, mysqld MysqlDaemon) error { + query := "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'" + + result, err := mysqld.FetchSuperQuery(ctx, query) + if err != nil { + return fmt.Errorf("failed to check clone plugin status: %w", err) + } + + if len(result.Rows) == 0 { + return fmt.Errorf("clone plugin is not installed (add 'plugin-load-add=mysql_clone.so' to my.cnf)") + } + + status := result.Rows[0][0].ToString() + if status != "ACTIVE" { + return fmt.Errorf("clone plugin is not active (status: %s)", status) + } + + return nil +} diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go new file mode 100644 index 00000000000..e84dd74a379 --- /dev/null +++ b/go/vt/mysqlctl/clone_test.go @@ -0,0 +1,150 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package mysqlctl + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/sqltypes" +) + +func TestBuildCloneCommand(t *testing.T) { + tests := []struct { + name string + executor *CloneExecutor + expected string + }{ + { + name: "with SSL", + executor: &CloneExecutor{ + DonorHost: "192.168.1.100", + DonorPort: 3306, + DonorUser: "vt_clone", + DonorPassword: "secret123", + UseSSL: true, + }, + expected: "CLONE INSTANCE FROM 'vt_clone'@'192.168.1.100':3306 IDENTIFIED BY 'secret123' REQUIRE SSL", + }, + { + name: "without SSL", + executor: &CloneExecutor{ + DonorHost: "10.0.0.50", + DonorPort: 3307, + DonorUser: "clone_user", + DonorPassword: "password", + UseSSL: false, + }, + expected: "CLONE INSTANCE FROM 'clone_user'@'10.0.0.50':3307 IDENTIFIED BY 'password' REQUIRE NO SSL", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.executor.buildCloneCommand() + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestValidateRecipient(t *testing.T) { + tests := []struct { + name string + versionQuery *sqltypes.Result + pluginQuery *sqltypes.Result + expectError bool + errorContain string + }{ + { + name: "valid MySQL 8.0.32 with clone plugin", + versionQuery: sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ), + pluginQuery: sqltypes.MakeTestResult( + sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), + "ACTIVE", + ), + expectError: false, + }, + { + name: "MySQL version too old", + versionQuery: 
sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.16", + ), + expectError: true, + errorContain: "requires version 8.0.17", + }, + { + name: "clone plugin not installed", + versionQuery: sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ), + pluginQuery: sqltypes.MakeTestResult(sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar")), + expectError: true, + errorContain: "clone plugin is not installed", + }, + { + name: "clone plugin not active", + versionQuery: sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ), + pluginQuery: sqltypes.MakeTestResult( + sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), + "DISABLED", + ), + expectError: true, + errorContain: "clone plugin is not active", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fmd := NewFakeMysqlDaemon(nil) + defer fmd.Close() + + fmd.FetchSuperQueryMap = map[string]*sqltypes.Result{ + "SELECT @@version": tt.versionQuery, + } + if tt.pluginQuery != nil { + fmd.FetchSuperQueryMap["SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'"] = tt.pluginQuery + } + + executor := &CloneExecutor{ + DonorHost: "192.168.1.100", + DonorPort: 3306, + DonorUser: "vt_clone", + DonorPassword: "secret", + UseSSL: false, + } + + err := executor.ValidateRecipient(context.Background(), fmd) + if tt.expectError { + require.Error(t, err) + assert.Contains(t, err.Error(), tt.errorContain) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index 3bbad31f1c2..706eebd45ee 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -95,6 +95,7 @@ var ( mycnfTemplateFile string socketFile string + mysqlCloneEnabled bool replicationConnectRetry = 10 * time.Second @@ -144,6 +145,12 @@ func registerMySQLDFlags(fs *pflag.FlagSet) { utils.SetFlagStringVar(fs, &mycnfTemplateFile, 
"mysqlctl-mycnf-template", mycnfTemplateFile, "template file to use for generating the my.cnf file during server init") utils.SetFlagStringVar(fs, &socketFile, "mysqlctl-socket", socketFile, "socket file to use for remote mysqlctl actions (empty for local actions)") utils.SetFlagDurationVar(fs, &replicationConnectRetry, "replication-connect-retry", replicationConnectRetry, "how long to wait in between replica reconnect attempts. Only precise to the second.") + utils.SetFlagBoolVar(fs, &mysqlCloneEnabled, "mysql-clone-enabled", mysqlCloneEnabled, "Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+)") +} + +// MySQLCloneEnabled returns whether MySQL CLONE support is enabled. +func MySQLCloneEnabled() bool { + return mysqlCloneEnabled } func registerReparentFlags(fs *pflag.FlagSet) { @@ -796,6 +803,12 @@ func (mysqld *Mysqld) Init(ctx context.Context, cnf *Mycnf, initDBSQLFile string if err := mysqld.executeMysqlScript(ctx, params, config.DefaultInitDB); err != nil { return fmt.Errorf("failed to initialize mysqld: %v", err) } + // Execute clone-specific init SQL if enabled + if mysqlCloneEnabled { + if err := mysqld.executeMysqlScript(ctx, params, config.InitClone); err != nil { + return fmt.Errorf("failed to initialize clone support: %v", err) + } + } return nil } @@ -812,6 +825,12 @@ func (mysqld *Mysqld) Init(ctx context.Context, cnf *Mycnf, initDBSQLFile string if err := mysqld.executeMysqlScript(ctx, params, string(script)); err != nil { return fmt.Errorf("can't run init-db-sql-file (%v): %v", initDBSQLFile, err) } + // Execute clone-specific init SQL if enabled + if mysqlCloneEnabled { + if err := mysqld.executeMysqlScript(ctx, params, config.InitClone); err != nil { + return fmt.Errorf("failed to initialize clone support: %v", err) + } + } return nil } @@ -970,6 +989,12 @@ func (mysqld *Mysqld) getMycnfTemplate() string { myTemplateSource.WriteString(versionConfig) + // Conditionally include clone plugin config + if 
mysqlCloneEnabled && mysqld.capabilities.isMySQLLike() { + myTemplateSource.WriteString("\n## Clone plugin (--mysql-clone-enabled)\n") + myTemplateSource.WriteString(config.MycnfClone) + } + if extraCnf := os.Getenv("EXTRA_MY_CNF"); extraCnf != "" { parts := strings.Split(extraCnf, ":") for _, path := range parts { From 0fe8efd2342e7941b4705ddf45f3e300a2aa93de Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Fri, 19 Dec 2025 15:09:21 -0800 Subject: [PATCH 02/33] add flags to tests + fix errors from linters Signed-off-by: Nick Van Wiggeren --- go/flags/endtoend/mysqlctl.txt | 1 + go/flags/endtoend/mysqlctld.txt | 1 + go/flags/endtoend/vtcombo.txt | 4 ++++ go/flags/endtoend/vttablet.txt | 4 ++++ go/flags/endtoend/vttestserver.txt | 1 + go/vt/mysqlctl/clone.go | 7 ++++--- go/vt/vttablet/tabletserver/tabletenv/config_test.go | 2 ++ 7 files changed, 17 insertions(+), 3 deletions(-) diff --git a/go/flags/endtoend/mysqlctl.txt b/go/flags/endtoend/mysqlctl.txt index 36c87e9d6ad..e252218fd95 100644 --- a/go/flags/endtoend/mysqlctl.txt +++ b/go/flags/endtoend/mysqlctl.txt @@ -70,6 +70,7 @@ Flags: --log_dir string If non-empty, write log files in this directory --logtostderr log to standard error instead of files --max-stack-size int configure the maximum stack size in bytes (default 67108864) + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port. (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-socket string Path to the mysqld socket file. 
diff --git a/go/flags/endtoend/mysqlctld.txt b/go/flags/endtoend/mysqlctld.txt index e728e1561d2..d5c977ce411 100644 --- a/go/flags/endtoend/mysqlctld.txt +++ b/go/flags/endtoend/mysqlctld.txt @@ -99,6 +99,7 @@ Flags: --log_dir string If non-empty, write log files in this directory --logtostderr log to standard error instead of files --max-stack-size int configure the maximum stack size in bytes (default 67108864) + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-socket string Path to the mysqld socket file diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index f3342596517..edfb1058bf4 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -61,6 +61,9 @@ Flags: --db-appdebug-use-ssl Set this flag to false to make the appdebug connection to not use ssl (default true) --db-appdebug-user string db appdebug user userKey (default "vt_appdebug") --db-charset string Character set/collation used for this tablet. Make sure to configure this to a charset/collation supported by the lowest MySQL version in your environment. (default "utf8mb4") + --db-clone-password string db clone password + --db-clone-use-ssl Set this flag to false to make the clone connection to not use ssl (default true) + --db-clone-user string db clone user userKey (default "vt_clone") --db-conn-query-info enable parsing and processing of QUERY_OK info fields --db-connect-timeout-ms int connection timeout to mysqld in milliseconds (0 for no timeout) --db-credentials-file string db credentials file; send SIGHUP to reload this file @@ -220,6 +223,7 @@ Flags: --mycnf-tmp-dir string mysql tmp directory --mysql-allow-clear-text-without-tls If set, the server will allow the use of a clear text password over non-SSL connections. 
--mysql-auth-server-impl string Which auth server implementation to use. Options: none, ldap, clientcert, static, vault. (default "static") + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-default-workload string Default session workload (OLTP, OLAP, DBA) (default "OLTP") --mysql-port int mysql port (default 3306) --mysql-server-bind-address string Binds on this address when listening to MySQL binary protocol. Useful to restrict listening to 'localhost' only for instance. diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 2a0cd50e65a..d9a27b62ea6 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -95,6 +95,9 @@ Flags: --db-appdebug-use-ssl Set this flag to false to make the appdebug connection to not use ssl (default true) --db-appdebug-user string db appdebug user userKey (default "vt_appdebug") --db-charset string Character set/collation used for this tablet. Make sure to configure this to a charset/collation supported by the lowest MySQL version in your environment. (default "utf8mb4") + --db-clone-password string db clone password + --db-clone-use-ssl Set this flag to false to make the clone connection to not use ssl (default true) + --db-clone-user string db clone user userKey (default "vt_clone") --db-conn-query-info enable parsing and processing of QUERY_OK info fields --db-connect-timeout-ms int connection timeout to mysqld in milliseconds (0 for no timeout) --db-credentials-file string db credentials file; send SIGHUP to reload this file @@ -240,6 +243,7 @@ Flags: --mycnf-slow-log-path string mysql slow query log path --mycnf-socket-file string mysql socket file --mycnf-tmp-dir string mysql tmp directory + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-server-version string MySQL server version to advertise. 
(default "8.4.6-Vitess") --mysql-shell-backup-location string location where the backup will be stored --mysql-shell-dump-flags string flags to pass to mysql shell dump utility. This should be a JSON string and will be saved in the MANIFEST (default "{\"threads\": 4}") diff --git a/go/flags/endtoend/vttestserver.txt b/go/flags/endtoend/vttestserver.txt index ad05efc4b19..d41a6b2c2bc 100644 --- a/go/flags/endtoend/vttestserver.txt +++ b/go/flags/endtoend/vttestserver.txt @@ -91,6 +91,7 @@ Flags: --max-table-shard-size int The maximum number of initial rows in a table shard. Ignored if--initialize-with-random-data is false. The actual number is chosen randomly (default 10000) --min-table-shard-size int The minimum number of initial rows in a table shard. Ignored if--initialize-with-random-data is false. The actual number is chosen randomly. (default 1000) --mysql-bind-host string which host to bind vtgate mysql listener to (default "localhost") + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-only If this flag is set only mysql is initialized. The rest of the vitess components are not started. Also, the output specifies the mysql unix socket instead of the vtgate port. --mysql-server-version string MySQL server version to advertise. 
(default "8.4.6-Vitess") --mysql-shell-backup-location string location where the backup will be stored diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index e807d24dfaa..577a213ccbc 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -18,6 +18,7 @@ package mysqlctl import ( "context" + "errors" "fmt" "strings" @@ -91,7 +92,7 @@ func (c *CloneExecutor) checkCloneCapability(ctx context.Context, mysqld MysqlDa } if len(result.Rows) == 0 || len(result.Rows[0]) == 0 { - return fmt.Errorf("empty version result") + return errors.New("empty version result") } versionStr := result.Rows[0][0].ToString() @@ -120,7 +121,7 @@ func (c *CloneExecutor) checkCloneCapability(ctx context.Context, mysqld MysqlDa // Note: This operation will DESTROY all existing data on the recipient. func (c *CloneExecutor) ExecuteClone(ctx context.Context, mysqld MysqlDaemon) error { if !MySQLCloneEnabled() { - return fmt.Errorf("MySQL CLONE not enabled; set --mysql-clone-enabled=true on both donor and recipient") + return errors.New("MySQL CLONE not enabled; set --mysql-clone-enabled=true on both donor and recipient") } log.Infof("Starting CLONE REMOTE from %s:%d", c.DonorHost, c.DonorPort) @@ -205,7 +206,7 @@ func (c *CloneExecutor) checkClonePluginInstalled(ctx context.Context, mysqld My } if len(result.Rows) == 0 { - return fmt.Errorf("clone plugin is not installed (add 'plugin-load-add=mysql_clone.so' to my.cnf)") + return errors.New("clone plugin is not installed (add 'plugin-load-add=mysql_clone.so' to my.cnf)") } status := result.Rows[0][0].ToString() diff --git a/go/vt/vttablet/tabletserver/tabletenv/config_test.go b/go/vt/vttablet/tabletserver/tabletenv/config_test.go index 943ec9f020e..783dfaa6c36 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config_test.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config_test.go @@ -73,6 +73,8 @@ func TestConfigParse(t *testing.T) { user: b appdebug: password: '****' + clone: + password: '****' dba: password: '****' 
user: c From 894145931c1b23520b6b88b2329e72d1c4ab44bd Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Fri, 19 Dec 2025 15:20:48 -0800 Subject: [PATCH 03/33] fix up more tests Signed-off-by: Nick Van Wiggeren --- go/flags/endtoend/vtbackup.txt | 3 +++ go/vt/dbconfigs/dbconfigs_test.go | 2 ++ 2 files changed, 5 insertions(+) diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 62729b31c4e..87bb94fd43c 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -88,6 +88,9 @@ Flags: --db-appdebug-use-ssl Set this flag to false to make the appdebug connection to not use ssl (default true) --db-appdebug-user string db appdebug user userKey (default "vt_appdebug") --db-charset string Character set/collation used for this tablet. Make sure to configure this to a charset/collation supported by the lowest MySQL version in your environment. (default "utf8mb4") + --db-clone-password string db clone password + --db-clone-use-ssl Set this flag to false to make the clone connection to not use ssl (default true) + --db-clone-user string db clone user userKey (default "vt_clone") --db-conn-query-info enable parsing and processing of QUERY_OK info fields --db-connect-timeout-ms int connection timeout to mysqld in milliseconds (0 for no timeout) --db-credentials-file string db credentials file; send SIGHUP to reload this file diff --git a/go/vt/dbconfigs/dbconfigs_test.go b/go/vt/dbconfigs/dbconfigs_test.go index 029682d13b7..046ffe3327e 100644 --- a/go/vt/dbconfigs/dbconfigs_test.go +++ b/go/vt/dbconfigs/dbconfigs_test.go @@ -326,6 +326,8 @@ app: user: vt_app appdebug: password: '****' +clone: + password: '****' dba: password: '****' user: vt_dba From 94624dcb6032c43fd3e425bcbec1efe496a6c06d Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 Dec 2025 19:41:29 -0500 Subject: [PATCH 04/33] go/vt/mysqlctl: add CloneFromDonor Signed-off-by: Max Englander --- go/vt/mysqlctl/clone.go | 130 
++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 577a213ccbc..c92a69a8cd8 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -21,12 +21,38 @@ import ( "errors" "fmt" "strings" + "time" + "github.com/spf13/pflag" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/mysql/capabilities" + "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/log" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/servenv" + "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/utils" ) +var ( + cloneFromPrimary = false + cloneFromTablet = "" +) + +func init() { + // TODO: enable these flags for vttablet and vtbackup. + for _, cmd := range []string{ /*"vttablet", "vtbackup"*/ } { + servenv.OnParseFor(cmd, registerCloneFlags) + } +} + +func registerCloneFlags(fs *pflag.FlagSet) { + utils.SetFlagBoolVar(fs, &cloneFromPrimary, "clone-from-primary", cloneFromPrimary, "Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet.") + utils.SetFlagStringVar(fs, &cloneFromTablet, "clone-from-tablet", cloneFromTablet, "Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary.") +} + // CloneExecutor handles MySQL CLONE REMOTE operations for backup and replica provisioning. // It executes CLONE INSTANCE FROM on the recipient to clone data from a donor. type CloneExecutor struct { @@ -216,3 +242,107 @@ func (c *CloneExecutor) checkClonePluginInstalled(ctx context.Context, mysqld My return nil } + +// CloneFromDonor clones data from the specified donor tablet using MySQL CLONE REMOTE. 
+// It returns the GTID position of the cloned data. +func CloneFromDonor(ctx context.Context, topoServer *topo.Server, mysqld MysqlDaemon, keyspace, shard string) (replication.Position, error) { + var donorAlias *topodatapb.TabletAlias + var err error + + if cloneFromPrimary { + // Look up the primary tablet from topology. + log.Infof("Looking up primary tablet for shard %s/%s", keyspace, shard) + si, err := topoServer.GetShard(ctx, keyspace, shard) + if err != nil { + return replication.Position{}, fmt.Errorf("failed to get shard %s/%s: %v", keyspace, shard, err) + } + if topoproto.TabletAliasIsZero(si.PrimaryAlias) { + return replication.Position{}, fmt.Errorf("shard %s/%s has no primary", keyspace, shard) + } + donorAlias = si.PrimaryAlias + log.Infof("Found primary tablet: %s", topoproto.TabletAliasString(donorAlias)) + } else { + // Parse the explicit donor tablet alias. + log.Infof("Starting clone-based backup from tablet %s", cloneFromTablet) + donorAlias, err = topoproto.ParseTabletAlias(cloneFromTablet) + if err != nil { + return replication.Position{}, fmt.Errorf("invalid tablet alias %q: %v", cloneFromTablet, err) + } + } + + // Get donor tablet info from topology. + donorTablet, err := topoServer.GetTablet(ctx, donorAlias) + if err != nil { + return replication.Position{}, fmt.Errorf("failed to get tablet %s from topology: %v", topoproto.TabletAliasString(donorAlias), err) + } + + // Get clone credentials. + cloneConfig := dbconfigs.GlobalDBConfigs.CloneUser + if cloneConfig.User == "" { + return replication.Position{}, fmt.Errorf("clone user not configured; set --db-clone-user flag") + } + + // Create the clone executor. 
+ executor := &CloneExecutor{ + DonorHost: donorTablet.MysqlHostname, + DonorPort: int(donorTablet.MysqlPort), + DonorUser: cloneConfig.User, + DonorPassword: cloneConfig.Password, + UseSSL: cloneConfig.UseSSL, + } + + log.Infof("Clone executor configured for donor %s:%d", executor.DonorHost, executor.DonorPort) + + // Validate that the recipient (local) MySQL meets prerequisites. + if err := executor.ValidateRecipient(ctx, mysqld); err != nil { + return replication.Position{}, fmt.Errorf("recipient validation failed: %v", err) + } + + // Execute the clone operation. + // Note: MySQL will restart automatically after clone completes. + if err := executor.ExecuteClone(ctx, mysqld); err != nil { + return replication.Position{}, fmt.Errorf("clone execution failed: %v", err) + } + + // After clone, MySQL restarts automatically. We need to wait for it to come back up. + log.Info("Clone completed, waiting for MySQL to restart...") + + // The connection to MySQL will be lost after clone. Wait for it to come back. + if err := waitForMySQLRestart(ctx, mysqld); err != nil { + return replication.Position{}, fmt.Errorf("failed waiting for MySQL restart after clone: %v", err) + } + + // Get the GTID position from the cloned data. + pos, err := mysqld.PrimaryPosition(ctx) + if err != nil { + return replication.Position{}, fmt.Errorf("failed to get position after clone: %v", err) + } + + log.Infof("Clone completed successfully at position %v", pos) + return pos, nil +} + +// waitForMySQLRestart waits for MySQL to restart after a clone operation. +func waitForMySQLRestart(ctx context.Context, mysqld MysqlDaemon) error { + // MySQL automatically restarts after clone. We need to wait for it. + // Use a reasonable timeout for restart. 
+ restartTimeout := 5 * time.Minute + restartCtx, cancel := context.WithTimeout(ctx, restartTimeout) + defer cancel() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-restartCtx.Done(): + return errors.New("timeout waiting for MySQL to restart after clone") + case <-ticker.C: + // Try to connect to MySQL. + if _, err := mysqld.FetchSuperQuery(restartCtx, "SELECT 1"); err == nil { + log.Info("MySQL is back online after clone") + return nil + } + } + } +} From 0342d746190c94e485ea7a6b2d5d2874449b282a Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Fri, 19 Dec 2025 17:06:23 -0800 Subject: [PATCH 05/33] actually validate the source and destination, and poll for the copy to be marked as completed after the CLONE is done Signed-off-by: Nick Van Wiggeren --- go/vt/mysqlctl/clone.go | 220 ++++++++++++++++++++++-------- go/vt/mysqlctl/clone_test.go | 146 ++++++++++++++++++++ go/vt/mysqlctl/fakemysqldaemon.go | 9 ++ 3 files changed, 321 insertions(+), 54 deletions(-) diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 577a213ccbc..c4ebbe4d94c 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "strings" + "time" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/mysql/capabilities" @@ -42,22 +43,16 @@ type CloneExecutor struct { UseSSL bool } -// ValidateDonor checks that the donor MySQL instance meets all prerequisites for cloning. +// ValidateRecipient checks that the recipient MySQL instance meets all prerequisites for cloning. 
// It verifies: // - MySQL version >= 8.0.17 // - Clone plugin is installed -// - No non-InnoDB tables exist (clone only supports InnoDB) -func (c *CloneExecutor) ValidateDonor(ctx context.Context, mysqld MysqlDaemon) error { +func (c *CloneExecutor) ValidateRecipient(ctx context.Context, mysqld MysqlDaemon) error { // Check MySQL version using capabilities system if err := c.checkCloneCapability(ctx, mysqld); err != nil { return err } - // Check for non-InnoDB tables - if err := c.checkNoNonInnoDBTables(ctx, mysqld); err != nil { - return err - } - // Check clone plugin is installed if err := c.checkClonePluginInstalled(ctx, mysqld); err != nil { return err @@ -66,21 +61,81 @@ func (c *CloneExecutor) ValidateDonor(ctx context.Context, mysqld MysqlDaemon) e return nil } -// ValidateRecipient checks that the recipient MySQL instance meets all prerequisites for cloning. -// It verifies: -// - MySQL version >= 8.0.17 -// - Clone plugin is installed -func (c *CloneExecutor) ValidateRecipient(ctx context.Context, mysqld MysqlDaemon) error { - // Check MySQL version using capabilities system - if err := c.checkCloneCapability(ctx, mysqld); err != nil { - return err +// validateDonorRemote connects to the donor MySQL instance and validates it meets +// all prerequisites for cloning. This is called from ExecuteClone to verify the +// donor before attempting the clone operation. 
+func (c *CloneExecutor) validateDonorRemote(ctx context.Context) error { + params := &mysql.ConnParams{ + Host: c.DonorHost, + Port: c.DonorPort, + Uname: c.DonorUser, + Pass: c.DonorPassword, + } + + conn, err := mysql.Connect(ctx, params) + if err != nil { + return fmt.Errorf("failed to connect to donor %s:%d: %w", c.DonorHost, c.DonorPort, err) + } + defer conn.Close() + + // Check MySQL version + qr, err := conn.ExecuteFetch("SELECT @@version", 1, false) + if err != nil { + return fmt.Errorf("failed to query donor MySQL version: %w", err) + } + if len(qr.Rows) == 0 || len(qr.Rows[0]) == 0 { + return errors.New("empty version result from donor") + } + versionStr := qr.Rows[0][0].ToString() + capableOf := mysql.ServerVersionCapableOf(versionStr) + if capableOf == nil { + return fmt.Errorf("unable to determine MySQL capabilities for donor version %q", versionStr) + } + ok, err := capableOf(capabilities.MySQLClonePluginFlavorCapability) + if err != nil { + return fmt.Errorf("failed to check donor clone capability: %w", err) + } + if !ok { + return fmt.Errorf("donor MySQL CLONE requires version 8.0.17 or higher, got %s", versionStr) } // Check clone plugin is installed - if err := c.checkClonePluginInstalled(ctx, mysqld); err != nil { - return err + qr, err = conn.ExecuteFetch("SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'", 1, false) + if err != nil { + return fmt.Errorf("failed to check donor clone plugin status: %w", err) + } + if len(qr.Rows) == 0 { + return errors.New("clone plugin is not installed on donor (add 'plugin-load-add=mysql_clone.so' to my.cnf)") + } + status := qr.Rows[0][0].ToString() + if status != "ACTIVE" { + return fmt.Errorf("clone plugin is not active on donor (status: %s)", status) + } + + // Check for non-InnoDB tables + qr, err = conn.ExecuteFetch(` + SELECT TABLE_SCHEMA, TABLE_NAME, ENGINE + FROM information_schema.TABLES + WHERE ENGINE != 'InnoDB' + AND ENGINE IS NOT NULL + AND TABLE_TYPE = 'BASE 
TABLE' + AND TABLE_SCHEMA NOT IN ('mysql', 'information_schema', 'performance_schema', 'sys') + `, 1000, false) + if err != nil { + return fmt.Errorf("failed to check donor for non-InnoDB tables: %w", err) + } + if len(qr.Rows) > 0 { + var tables []string + for _, row := range qr.Rows { + schema := row[0].ToString() + table := row[1].ToString() + engine := row[2].ToString() + tables = append(tables, fmt.Sprintf("%s.%s (%s)", schema, table, engine)) + } + return fmt.Errorf("non-InnoDB tables found on donor (CLONE only supports InnoDB): %s", strings.Join(tables, ", ")) } + log.Infof("Donor %s:%d validated successfully (MySQL %s)", c.DonorHost, c.DonorPort, versionStr) return nil } @@ -116,14 +171,27 @@ func (c *CloneExecutor) checkCloneCapability(ctx context.Context, mysqld MysqlDa // This will: // 1. Set clone_valid_donor_list on the recipient // 2. Execute CLONE INSTANCE FROM on the recipient -// 3. The recipient MySQL will restart automatically after clone completes +// 3. Wait for MySQL to restart and verify clone completed successfully +// +// The restartTimeout specifies how long to wait for MySQL to restart and +// report clone completion after the CLONE command finishes. // // Note: This operation will DESTROY all existing data on the recipient. 
-func (c *CloneExecutor) ExecuteClone(ctx context.Context, mysqld MysqlDaemon) error { +func (c *CloneExecutor) ExecuteClone(ctx context.Context, mysqld MysqlDaemon, restartTimeout time.Duration) error { if !MySQLCloneEnabled() { return errors.New("MySQL CLONE not enabled; set --mysql-clone-enabled=true on both donor and recipient") } + // Validate recipient prerequisites + if err := c.ValidateRecipient(ctx, mysqld); err != nil { + return fmt.Errorf("recipient validation failed: %w", err) + } + + // Validate donor prerequisites by connecting remotely + if err := c.validateDonorRemote(ctx); err != nil { + return fmt.Errorf("donor validation failed: %w", err) + } + log.Infof("Starting CLONE REMOTE from %s:%d", c.DonorHost, c.DonorPort) // Set the valid donor list @@ -139,10 +207,16 @@ func (c *CloneExecutor) ExecuteClone(ctx context.Context, mysqld MysqlDaemon) er log.Infof("Executing CLONE INSTANCE FROM %s:%d (this may take a while)", c.DonorHost, c.DonorPort) - // Execute the clone command - // Note: After this command completes, MySQL will restart automatically + // Execute the clone command. When clone completes, MySQL restarts automatically + // which will cause the connection to drop. We ignore this error and verify + // success by checking clone_status after MySQL comes back up. if err := mysqld.ExecuteSuperQuery(ctx, cloneCmd); err != nil { - return fmt.Errorf("CLONE INSTANCE failed: %w", err) + log.Infof("CLONE command returned (connection likely lost due to MySQL restart): %v", err) + } + + // Wait for MySQL to restart and verify clone completed successfully + if err := c.waitForCloneComplete(ctx, mysqld, restartTimeout); err != nil { + return fmt.Errorf("clone verification failed: %w", err) } log.Infof("CLONE REMOTE completed successfully from %s:%d", c.DonorHost, c.DonorPort) @@ -165,37 +239,6 @@ func (c *CloneExecutor) buildCloneCommand() string { return sb.String() } -// checkNoNonInnoDBTables verifies that no user tables use non-InnoDB storage engines. 
-// MySQL CLONE only copies InnoDB data; other engines would result in empty tables. -func (c *CloneExecutor) checkNoNonInnoDBTables(ctx context.Context, mysqld MysqlDaemon) error { - query := ` - SELECT TABLE_SCHEMA, TABLE_NAME, ENGINE - FROM information_schema.TABLES - WHERE ENGINE != 'InnoDB' - AND ENGINE IS NOT NULL - AND TABLE_TYPE = 'BASE TABLE' - AND TABLE_SCHEMA NOT IN ('mysql', 'information_schema', 'performance_schema', 'sys') - ` - - result, err := mysqld.FetchSuperQuery(ctx, query) - if err != nil { - return fmt.Errorf("failed to check for non-InnoDB tables: %w", err) - } - - if len(result.Rows) > 0 { - var tables []string - for _, row := range result.Rows { - schema := row[0].ToString() - table := row[1].ToString() - engine := row[2].ToString() - tables = append(tables, fmt.Sprintf("%s.%s (%s)", schema, table, engine)) - } - return fmt.Errorf("non-InnoDB tables found (CLONE only supports InnoDB): %s", strings.Join(tables, ", ")) - } - - return nil -} - // checkClonePluginInstalled verifies that the clone plugin is loaded. func (c *CloneExecutor) checkClonePluginInstalled(ctx context.Context, mysqld MysqlDaemon) error { query := "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'" @@ -216,3 +259,72 @@ func (c *CloneExecutor) checkClonePluginInstalled(ctx context.Context, mysqld My return nil } + +// waitForCloneComplete waits for a clone operation to complete by polling +// performance_schema.clone_status. This handles the MySQL restart that occurs +// after clone - connections will fail during restart and this function will +// retry until MySQL is back and clone_status shows completion. 
+func (c *CloneExecutor) waitForCloneComplete(ctx context.Context, mysqld MysqlDaemon, timeout time.Duration) error { + const pollInterval = time.Second + + deadline := time.Now().Add(timeout) + query := "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status ORDER BY ID DESC LIMIT 1" + + log.Infof("Waiting for clone to complete (timeout: %v)", timeout) + + for { + // Check context cancellation + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + // Check timeout + if time.Now().After(deadline) { + return fmt.Errorf("timeout waiting for clone to complete after %v", timeout) + } + + // Try to query clone status - connection may fail if MySQL is restarting + result, err := mysqld.FetchSuperQuery(ctx, query) + if err != nil { + // Connection failures are expected during MySQL restart + log.Infof("Clone status query failed (MySQL may be restarting): %v", err) + time.Sleep(pollInterval) + continue + } + + if len(result.Rows) == 0 { + // No clone status yet - MySQL may have just started + log.Infof("No clone status found, waiting...") + time.Sleep(pollInterval) + continue + } + + state := result.Rows[0][0].ToString() + errorNo := result.Rows[0][1].ToString() + errorMsg := result.Rows[0][2].ToString() + + log.Infof("Clone status: STATE=%s, ERROR_NO=%s", state, errorNo) + + switch state { + case "Completed": + if errorNo != "0" { + return fmt.Errorf("clone completed with error %s: %s", errorNo, errorMsg) + } + log.Infof("Clone completed successfully") + return nil + case "Failed": + return fmt.Errorf("clone failed with error %s: %s", errorNo, errorMsg) + case "In Progress", "Not Started": + // Still running, keep waiting + time.Sleep(pollInterval) + continue + default: + // Unknown state, keep waiting but log it + log.Warningf("Unknown clone state: %s", state) + time.Sleep(pollInterval) + continue + } + } +} diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index e84dd74a379..10e65cc2fa8 100644 --- 
a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -19,6 +19,7 @@ package mysqlctl import ( "context" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -64,6 +65,151 @@ func TestBuildCloneCommand(t *testing.T) { } } +func Test_waitForCloneComplete(t *testing.T) { + tests := []struct { + name string + queryResults []*sqltypes.Result + queryErrors []error + expectError bool + errorContain string + }{ + { + name: "clone completed successfully", + queryResults: []*sqltypes.Result{ + sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Completed|0|", + ), + }, + expectError: false, + }, + { + name: "clone completed with error", + queryResults: []*sqltypes.Result{ + sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Completed|3862|Clone Donor Error: 3862 : Clone requires redo log archiving to be started by BACKUP.", + ), + }, + expectError: true, + errorContain: "clone completed with error", + }, + { + name: "clone failed", + queryResults: []*sqltypes.Result{ + sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Failed|3862|Clone Donor Error", + ), + }, + expectError: true, + errorContain: "clone failed", + }, + { + name: "clone in progress then completed", + queryResults: []*sqltypes.Result{ + sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "In Progress|0|", + ), + sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Completed|0|", + ), + }, + expectError: false, + }, + { + name: "connection error then completed", + queryResults: []*sqltypes.Result{ + nil, + sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Completed|0|", + ), + }, + 
queryErrors: []error{ + assert.AnError, + nil, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fmd := NewFakeMysqlDaemon(nil) + defer fmd.Close() + + // Set up the sequence of results + callCount := 0 + fmd.FetchSuperQueryCallback = func(query string) (*sqltypes.Result, error) { + if callCount < len(tt.queryResults) { + idx := callCount + callCount++ + var err error + if tt.queryErrors != nil && idx < len(tt.queryErrors) { + err = tt.queryErrors[idx] + } + return tt.queryResults[idx], err + } + return nil, assert.AnError + } + + executor := &CloneExecutor{} + + err := executor.waitForCloneComplete(context.Background(), fmd, 5*time.Second) + if tt.expectError { + require.Error(t, err) + assert.Contains(t, err.Error(), tt.errorContain) + } else { + require.NoError(t, err) + } + }) + } +} + +func Test_waitForCloneComplete_Timeout(t *testing.T) { + fmd := NewFakeMysqlDaemon(nil) + defer fmd.Close() + + // Always return "In Progress" + fmd.FetchSuperQueryCallback = func(query string) (*sqltypes.Result, error) { + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "In Progress|0|", + ), nil + } + + executor := &CloneExecutor{} + + err := executor.waitForCloneComplete(context.Background(), fmd, 100*time.Millisecond) + require.Error(t, err) + assert.Contains(t, err.Error(), "timeout") +} + +func Test_waitForCloneComplete_ContextCanceled(t *testing.T) { + fmd := NewFakeMysqlDaemon(nil) + defer fmd.Close() + + // Always return "In Progress" + fmd.FetchSuperQueryCallback = func(query string) (*sqltypes.Result, error) { + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "In Progress|0|", + ), nil + } + + executor := &CloneExecutor{} + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately + + err := executor.waitForCloneComplete(ctx, fmd, 
5*time.Second) + require.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} + func TestValidateRecipient(t *testing.T) { tests := []struct { name string diff --git a/go/vt/mysqlctl/fakemysqldaemon.go b/go/vt/mysqlctl/fakemysqldaemon.go index fe94486dacd..8829c3ae578 100644 --- a/go/vt/mysqlctl/fakemysqldaemon.go +++ b/go/vt/mysqlctl/fakemysqldaemon.go @@ -184,6 +184,10 @@ type FakeMysqlDaemon struct { // FetchSuperQueryResults is used by FetchSuperQuery. FetchSuperQueryMap map[string]*sqltypes.Result + // FetchSuperQueryCallback is an optional callback for dynamic query handling. + // If set, it takes precedence over FetchSuperQueryMap. + FetchSuperQueryCallback func(query string) (*sqltypes.Result, error) + // SemiSyncPrimaryEnabled represents the state of rpl_semi_sync_source_enabled. SemiSyncPrimaryEnabled bool // SemiSyncReplicaEnabled represents the state of rpl_semi_sync_replica_enabled. @@ -617,6 +621,11 @@ func (fmd *FakeMysqlDaemon) ExecuteSuperQueryList(ctx context.Context, queryList // FetchSuperQuery returns the results from the map, if any. 
func (fmd *FakeMysqlDaemon) FetchSuperQuery(ctx context.Context, query string) (*sqltypes.Result, error) { + // If a callback is set, use it for dynamic handling + if fmd.FetchSuperQueryCallback != nil { + return fmd.FetchSuperQueryCallback(query) + } + if fmd.FetchSuperQueryMap == nil { return nil, fmt.Errorf("unexpected query: %v", query) } From 95dccd1a39b7aa6474e79fde2f0a51b41cf443cf Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 Dec 2025 20:13:38 -0500 Subject: [PATCH 06/33] TestCloneFromDonor: add a few cases Signed-off-by: Max Englander --- go/vt/mysqlctl/clone.go | 7 +- go/vt/mysqlctl/clone_test.go | 137 +++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 2 deletions(-) diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index c92a69a8cd8..29031fbfeda 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -249,7 +249,8 @@ func CloneFromDonor(ctx context.Context, topoServer *topo.Server, mysqld MysqlDa var donorAlias *topodatapb.TabletAlias var err error - if cloneFromPrimary { + switch { + case cloneFromPrimary: // Look up the primary tablet from topology. log.Infof("Looking up primary tablet for shard %s/%s", keyspace, shard) si, err := topoServer.GetShard(ctx, keyspace, shard) @@ -261,13 +262,15 @@ func CloneFromDonor(ctx context.Context, topoServer *topo.Server, mysqld MysqlDa } donorAlias = si.PrimaryAlias log.Infof("Found primary tablet: %s", topoproto.TabletAliasString(donorAlias)) - } else { + case cloneFromTablet != "": // Parse the explicit donor tablet alias. log.Infof("Starting clone-based backup from tablet %s", cloneFromTablet) donorAlias, err = topoproto.ParseTabletAlias(cloneFromTablet) if err != nil { return replication.Position{}, fmt.Errorf("invalid tablet alias %q: %v", cloneFromTablet, err) } + default: + return replication.Position{}, fmt.Errorf("no donor specified") } // Get donor tablet info from topology. 
diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index e84dd74a379..a203ce43ba3 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -23,7 +23,14 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "vitess.io/vitess/go/mysql/fakesqldb" "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/test/utils" + "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/memorytopo" + + topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) func TestBuildCloneCommand(t *testing.T) { @@ -148,3 +155,133 @@ func TestValidateRecipient(t *testing.T) { }) } } + +type cloneFromDonorTestEnv struct { + ctx context.Context + logger *logutil.MemoryLogger + ts *topo.Server + mysqld *FakeMysqlDaemon + keyspace string + shard string +} + +func createCloneFromDonorTestEnv(t *testing.T) *cloneFromDonorTestEnv { + ctx := context.Background() + logger := logutil.NewMemoryLogger() + + // Create in-memory topo server with a test cell + ts := memorytopo.NewServer(ctx, "cell1") + + keyspace := "test" + shard := "-" + + // Create keyspace in topology + require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodatapb.Keyspace{})) + + // Create fake MySQL daemon + sqldb := fakesqldb.New(t) + sqldb.SetNeverFail(true) + mysqld := NewFakeMysqlDaemon(sqldb) + + t.Cleanup(func() { + mysqld.Close() + sqldb.Close() + utils.EnsureNoLeaks(t) + }) + + return &cloneFromDonorTestEnv{ + ctx: ctx, + logger: logger, + ts: ts, + mysqld: mysqld, + keyspace: keyspace, + shard: shard, + } +} + +func TestCloneFromDonor(t *testing.T) { + testCases := []struct { + name string + cloneFromPrimary bool + cloneFromTablet string + setup func(*testing.T, *cloneFromDonorTestEnv) + wantErr bool + wantErrContains string + }{ + { + name: "clone from primary, GetShard fails", + cloneFromPrimary: true, + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // Don't create the shard, so GetShard will fail + }, 
+ wantErr: true, + wantErrContains: "failed to get shard", + }, + { + name: "clone from primary, shard has no primary", + cloneFromPrimary: true, + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // Create shard without a primary + require.NoError(t, env.ts.CreateShard(env.ctx, env.keyspace, env.shard)) + }, + wantErr: true, + wantErrContains: "has no primary", + }, + { + name: "clone from tablet, invalid tablet alias", + cloneFromTablet: "invalid-alias-format", + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // No setup needed, invalid alias will fail parsing + }, + wantErr: true, + wantErrContains: "invalid tablet alias", + }, + { + name: "neither cloneFromPrimary nor cloneFromTablet specified", + cloneFromPrimary: false, + cloneFromTablet: "", + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // No setup needed, will fail when no donor is specified + }, + wantErr: true, + wantErrContains: "no donor specified", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + env := createCloneFromDonorTestEnv(t) + + // Save and restore global flags + oldCloneFromPrimary := cloneFromPrimary + oldCloneFromTablet := cloneFromTablet + defer func() { + cloneFromPrimary = oldCloneFromPrimary + cloneFromTablet = oldCloneFromTablet + }() + + // Set test flag values + cloneFromPrimary = tc.cloneFromPrimary + cloneFromTablet = tc.cloneFromTablet + + // Run setup if provided + if tc.setup != nil { + tc.setup(t, env) + } + + // Execute CloneFromDonor + pos, err := CloneFromDonor(env.ctx, env.ts, env.mysqld, env.keyspace, env.shard) + + // Verify results + if tc.wantErr { + require.Error(t, err) + if tc.wantErrContains != "" { + assert.ErrorContains(t, err, tc.wantErrContains) + } + } else { + require.NoError(t, err) + assert.NotEmpty(t, pos) + } + }) + } +} From e66d1ee3e1c2bffb9227c4621b517ab062ec885b Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 Dec 2025 21:18:02 -0500 Subject: [PATCH 07/33] more tests 
Signed-off-by: Max Englander --- go/vt/mysqlctl/clone_test.go | 276 +++++++++++++++++++++++++++++- go/vt/mysqlctl/fakemysqldaemon.go | 6 + 2 files changed, 280 insertions(+), 2 deletions(-) diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index 597db136dff..77d0d0da255 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -18,19 +18,27 @@ package mysqlctl import ( "context" + "fmt" + "net" + "strings" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/mysql/fakesqldb" + "vitess.io/vitess/go/mysql/replication" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/test/utils" + "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/memorytopo" + "vitess.io/vitess/go/vt/vtenv" + querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) @@ -302,6 +310,69 @@ func TestValidateRecipient(t *testing.T) { } } +type mockDonorHandler struct { + mysql.UnimplementedHandler + t *testing.T +} + +func (h *mockDonorHandler) ComQuery(c *mysql.Conn, query string, callback func(*sqltypes.Result) error) error { + // Respond to donor validation queries + switch { + case strings.Contains(query, "SELECT @@version"): + result := sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ) + return callback(result) + case strings.Contains(query, "SELECT PLUGIN_STATUS"): + result := sqltypes.MakeTestResult( + sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), + "ACTIVE", + ) + return callback(result) + case strings.Contains(query, "SELECT TABLE_SCHEMA"): + // Return empty result (no non-InnoDB tables) + result := sqltypes.MakeTestResult( + sqltypes.MakeTestFields("TABLE_SCHEMA|TABLE_NAME|ENGINE", "varchar|varchar|varchar"), + ) + return callback(result) + default: + return fmt.Errorf("unexpected query: %s", 
query) + } +} + +func (h *mockDonorHandler) ComQueryMulti(c *mysql.Conn, sql string, callback func(qr sqltypes.QueryResponse, more bool, firstPacket bool) error) error { + return fmt.Errorf("ComQueryMulti not implemented") +} + +func (h *mockDonorHandler) ComPrepare(c *mysql.Conn, query string) ([]*querypb.Field, uint16, error) { + return nil, 0, fmt.Errorf("ComPrepare not implemented") +} + +func (h *mockDonorHandler) ComStmtExecute(c *mysql.Conn, prepare *mysql.PrepareData, callback func(*sqltypes.Result) error) error { + return fmt.Errorf("ComStmtExecute not implemented") +} + +func (h *mockDonorHandler) ComRegisterReplica(c *mysql.Conn, replicaHost string, replicaPort uint16, replicaUser string, replicaPassword string) error { + return fmt.Errorf("ComRegisterReplica not implemented") +} + +func (h *mockDonorHandler) ComBinlogDump(c *mysql.Conn, logFile string, binlogPos uint32) error { + return fmt.Errorf("ComBinlogDump not implemented") +} + +func (h *mockDonorHandler) ComBinlogDumpGTID(c *mysql.Conn, logFile string, logPos uint64, gtidSet replication.GTIDSet) error { + return fmt.Errorf("ComBinlogDumpGTID not implemented") +} + +func (h *mockDonorHandler) WarningCount(c *mysql.Conn) uint16 { + return 0 +} + +func (h *mockDonorHandler) Env() *vtenv.Environment { + return vtenv.NewTestEnv() +} + type cloneFromDonorTestEnv struct { ctx context.Context logger *logutil.MemoryLogger @@ -355,7 +426,7 @@ func TestCloneFromDonor(t *testing.T) { wantErrContains string }{ { - name: "clone from primary, GetShard fails", + name: "clone from primary, get shard fails", cloneFromPrimary: true, setup: func(t *testing.T, env *cloneFromDonorTestEnv) { // Don't create the shard, so GetShard will fail @@ -392,23 +463,224 @@ func TestCloneFromDonor(t *testing.T) { wantErr: true, wantErrContains: "no donor specified", }, + { + name: "GetTablet fails", + cloneFromTablet: "cell1-100", + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // Don't create the tablet, so GetTablet 
will fail + }, + wantErr: true, + wantErrContains: "failed to get tablet", + }, + { + name: "clone user not configured", + cloneFromTablet: "cell1-100", + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // Create a valid tablet + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, + MysqlHostname: "donor-host", + MysqlPort: 3306, + Keyspace: env.keyspace, + Shard: env.shard, + } + require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) + + // Don't set GlobalDBConfigs.CloneUser, so it will be empty + }, + wantErr: true, + wantErrContains: "clone user not configured", + }, + { + name: "recipient validation fails", + cloneFromTablet: "cell1-100", + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // Create a valid tablet + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, + MysqlHostname: "donor-host", + MysqlPort: 3306, + Keyspace: env.keyspace, + Shard: env.shard, + } + require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) + + // Set up clone credentials + dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{ + User: "clone_user", + Password: "password", + } + + // Configure mysqld to return an old MySQL version + env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ + "SELECT @@version": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.16", + ), + } + }, + wantErr: true, + wantErrContains: "recipient validation failed", + }, + { + name: "get position after clone fails", + cloneFromTablet: "cell1-100", + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // Create mock donor server with auth for clone_user + jsonConfig := `{"clone_user": [{"Password": "password"}]}` + authServer := mysql.NewAuthServerStatic("", jsonConfig, 0) + handler := &mockDonorHandler{t: t} + + listener, err := mysql.NewListener("tcp", "127.0.0.1:", authServer, handler, 0, 0, false, false, 0, 0, false) + require.NoError(t, err) + + // Start 
accepting connections + go listener.Accept() + + // Clean up when test ends + t.Cleanup(func() { + listener.Close() + }) + + // Get the assigned host/port + host := listener.Addr().(*net.TCPAddr).IP.String() + port := listener.Addr().(*net.TCPAddr).Port + + // Create donor tablet with the mock server's address + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, + MysqlHostname: host, + MysqlPort: int32(port), + Keyspace: env.keyspace, + Shard: env.shard, + } + require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) + + // Set up clone credentials + dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{ + User: "clone_user", + Password: "password", + } + + // Configure recipient mysqld for successful validation and clone + env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ + "SELECT @@version": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ), + "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), + "ACTIVE", + ), + "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Completed|0|", + ), + } + + // List all expected queries that ExecuteClone will run + env.mysqld.ExpectedExecuteSuperQueryList = []string{ + fmt.Sprintf("SET GLOBAL clone_valid_donor_list = '%s:%d'", host, port), + fmt.Sprintf("CLONE INSTANCE FROM 'clone_user'@'%s':%d IDENTIFIED BY 'password' REQUIRE NO SSL", host, port), + } + + // Make PrimaryPosition return an error + env.mysqld.PrimaryPositionError = assert.AnError + }, + wantErr: true, + wantErrContains: "failed to get position after clone", + }, + { + name: "success", + cloneFromTablet: "cell1-100", + setup: func(t *testing.T, env *cloneFromDonorTestEnv) { + // Create mock donor server with auth for 
clone_user + jsonConfig := `{"clone_user": [{"Password": "password"}]}` + authServer := mysql.NewAuthServerStatic("", jsonConfig, 0) + handler := &mockDonorHandler{t: t} + + listener, err := mysql.NewListener("tcp", "127.0.0.1:", authServer, handler, 0, 0, false, false, 0, 0, false) + require.NoError(t, err) + + // Start accepting connections + go listener.Accept() + + // Clean up when test ends + t.Cleanup(func() { + listener.Close() + }) + + // Get the assigned host/port + host := listener.Addr().(*net.TCPAddr).IP.String() + port := listener.Addr().(*net.TCPAddr).Port + + // Create donor tablet with the mock server's address + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, + MysqlHostname: host, + MysqlPort: int32(port), + Keyspace: env.keyspace, + Shard: env.shard, + } + require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) + + // Set up clone credentials + dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{ + User: "clone_user", + Password: "password", + } + + // Configure recipient mysqld for successful validation and clone + env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ + "SELECT @@version": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ), + "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), + "ACTIVE", + ), + "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Completed|0|", + ), + } + + // Set a valid GTID position + env.mysqld.CurrentPrimaryPosition = replication.Position{ + GTIDSet: replication.Mysql56GTIDSet{}, + } + + // List all expected queries that ExecuteClone will run + env.mysqld.ExpectedExecuteSuperQueryList = []string{ + fmt.Sprintf("SET GLOBAL clone_valid_donor_list = '%s:%d'", host, 
port), + fmt.Sprintf("CLONE INSTANCE FROM 'clone_user'@'%s':%d IDENTIFIED BY 'password' REQUIRE NO SSL", host, port), + } + }, + wantErr: false, + }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { env := createCloneFromDonorTestEnv(t) - // Save and restore global flags + // Save and restore global flags and config oldCloneFromPrimary := cloneFromPrimary oldCloneFromTablet := cloneFromTablet + oldCloneUser := dbconfigs.GlobalDBConfigs.CloneUser + oldMysqlCloneEnabled := mysqlCloneEnabled defer func() { cloneFromPrimary = oldCloneFromPrimary cloneFromTablet = oldCloneFromTablet + dbconfigs.GlobalDBConfigs.CloneUser = oldCloneUser + mysqlCloneEnabled = oldMysqlCloneEnabled }() // Set test flag values cloneFromPrimary = tc.cloneFromPrimary cloneFromTablet = tc.cloneFromTablet + mysqlCloneEnabled = true // Run setup if provided if tc.setup != nil { diff --git a/go/vt/mysqlctl/fakemysqldaemon.go b/go/vt/mysqlctl/fakemysqldaemon.go index 8829c3ae578..f279933f78a 100644 --- a/go/vt/mysqlctl/fakemysqldaemon.go +++ b/go/vt/mysqlctl/fakemysqldaemon.go @@ -85,6 +85,9 @@ type FakeMysqlDaemon struct { // and ReplicationStatus. CurrentPrimaryPosition replication.Position + // PrimaryPositionError is used by PrimaryPosition. + PrimaryPositionError error + // CurrentRelayLogPosition is returned by ReplicationStatus. CurrentRelayLogPosition replication.Position @@ -415,6 +418,9 @@ func (fmd *FakeMysqlDaemon) GetPreviousGTIDs(ctx context.Context, binlog string) // PrimaryPosition is part of the MysqlDaemon interface. 
func (fmd *FakeMysqlDaemon) PrimaryPosition(ctx context.Context) (replication.Position, error) { + if fmd.PrimaryPositionError != nil { + return replication.Position{}, fmd.PrimaryPositionError + } return fmd.GetPrimaryPositionLocked(), nil } From 0d57e26f9dd0803d9ad2af66bde741cc812f79f2 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 Dec 2025 21:29:51 -0500 Subject: [PATCH 08/33] make it nicer Signed-off-by: Max Englander --- go/vt/mysqlctl/clone_test.go | 266 +++++++++++++---------------------- 1 file changed, 94 insertions(+), 172 deletions(-) diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index 77d0d0da255..52ed9d525a9 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -374,15 +374,18 @@ func (h *mockDonorHandler) Env() *vtenv.Environment { } type cloneFromDonorTestEnv struct { - ctx context.Context - logger *logutil.MemoryLogger - ts *topo.Server - mysqld *FakeMysqlDaemon - keyspace string - shard string + ctx context.Context + logger *logutil.MemoryLogger + ts *topo.Server + mysqld *FakeMysqlDaemon + keyspace string + shard string + donorHost string + donorPort int + donorAlias *topodatapb.TabletAlias } -func createCloneFromDonorTestEnv(t *testing.T) *cloneFromDonorTestEnv { +func createCloneFromDonorTestEnv(t *testing.T, donorHost string, donorPort int) *cloneFromDonorTestEnv { ctx := context.Background() logger := logutil.NewMemoryLogger() @@ -395,28 +398,95 @@ func createCloneFromDonorTestEnv(t *testing.T) *cloneFromDonorTestEnv { // Create keyspace in topology require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodatapb.Keyspace{})) + // Create donor tablet in topology with the mock server's address + donorAlias := &topodatapb.TabletAlias{Cell: "cell1", Uid: 100} + tablet := &topodatapb.Tablet{ + Alias: donorAlias, + MysqlHostname: donorHost, + MysqlPort: int32(donorPort), + Keyspace: keyspace, + Shard: shard, + } + require.NoError(t, ts.CreateTablet(ctx, tablet)) + // Create fake MySQL 
daemon sqldb := fakesqldb.New(t) sqldb.SetNeverFail(true) mysqld := NewFakeMysqlDaemon(sqldb) + // Set up default clone credentials (success path) + dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{ + User: "clone_user", + Password: "password", + } + + // Configure recipient mysqld for successful validation and clone by default + mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ + "SELECT @@version": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ), + "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), + "ACTIVE", + ), + "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status": sqltypes.MakeTestResult( + sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), + "Completed|0|", + ), + } + + // Set a valid GTID position by default + mysqld.CurrentPrimaryPosition = replication.Position{ + GTIDSet: replication.Mysql56GTIDSet{}, + } + + // List all expected queries that ExecuteClone will run + mysqld.ExpectedExecuteSuperQueryList = []string{ + fmt.Sprintf("SET GLOBAL clone_valid_donor_list = '%s:%d'", donorHost, donorPort), + fmt.Sprintf("CLONE INSTANCE FROM 'clone_user'@'%s':%d IDENTIFIED BY 'password' REQUIRE NO SSL", donorHost, donorPort), + } + t.Cleanup(func() { mysqld.Close() sqldb.Close() - utils.EnsureNoLeaks(t) }) return &cloneFromDonorTestEnv{ - ctx: ctx, - logger: logger, - ts: ts, - mysqld: mysqld, - keyspace: keyspace, - shard: shard, + ctx: ctx, + logger: logger, + ts: ts, + mysqld: mysqld, + keyspace: keyspace, + shard: shard, + donorHost: donorHost, + donorPort: donorPort, + donorAlias: donorAlias, } } func TestCloneFromDonor(t *testing.T) { + // Create mock donor MySQL server once for all test cases + jsonConfig := `{"clone_user": [{"Password": "password"}]}` + authServer := mysql.NewAuthServerStatic("", jsonConfig, 0) + handler := 
&mockDonorHandler{t: t} + + listener, err := mysql.NewListener("tcp", "127.0.0.1:", authServer, handler, 0, 0, false, false, 0, 0, false) + require.NoError(t, err) + + // Start accepting connections + go listener.Accept() + + // Clean up when all tests complete + t.Cleanup(func() { + listener.Close() + utils.EnsureNoLeaks(t) + }) + + // Get the assigned host/port + donorHost := listener.Addr().(*net.TCPAddr).IP.String() + donorPort := listener.Addr().(*net.TCPAddr).Port + testCases := []struct { name string cloneFromPrimary bool @@ -467,7 +537,8 @@ func TestCloneFromDonor(t *testing.T) { name: "GetTablet fails", cloneFromTablet: "cell1-100", setup: func(t *testing.T, env *cloneFromDonorTestEnv) { - // Don't create the tablet, so GetTablet will fail + // Delete the tablet that was created in env setup + require.NoError(t, env.ts.DeleteTablet(env.ctx, env.donorAlias)) }, wantErr: true, wantErrContains: "failed to get tablet", @@ -476,17 +547,8 @@ func TestCloneFromDonor(t *testing.T) { name: "clone user not configured", cloneFromTablet: "cell1-100", setup: func(t *testing.T, env *cloneFromDonorTestEnv) { - // Create a valid tablet - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, - MysqlHostname: "donor-host", - MysqlPort: 3306, - Keyspace: env.keyspace, - Shard: env.shard, - } - require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) - - // Don't set GlobalDBConfigs.CloneUser, so it will be empty + // Clear clone user config + dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{} }, wantErr: true, wantErrContains: "clone user not configured", @@ -495,29 +557,11 @@ func TestCloneFromDonor(t *testing.T) { name: "recipient validation fails", cloneFromTablet: "cell1-100", setup: func(t *testing.T, env *cloneFromDonorTestEnv) { - // Create a valid tablet - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, - MysqlHostname: "donor-host", - MysqlPort: 3306, - Keyspace: env.keyspace, - 
Shard: env.shard, - } - require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) - - // Set up clone credentials - dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{ - User: "clone_user", - Password: "password", - } - // Configure mysqld to return an old MySQL version - env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ - "SELECT @@version": sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.16", - ), - } + env.mysqld.FetchSuperQueryMap["SELECT @@version"] = sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.16", + ) }, wantErr: true, wantErrContains: "recipient validation failed", @@ -526,64 +570,6 @@ func TestCloneFromDonor(t *testing.T) { name: "get position after clone fails", cloneFromTablet: "cell1-100", setup: func(t *testing.T, env *cloneFromDonorTestEnv) { - // Create mock donor server with auth for clone_user - jsonConfig := `{"clone_user": [{"Password": "password"}]}` - authServer := mysql.NewAuthServerStatic("", jsonConfig, 0) - handler := &mockDonorHandler{t: t} - - listener, err := mysql.NewListener("tcp", "127.0.0.1:", authServer, handler, 0, 0, false, false, 0, 0, false) - require.NoError(t, err) - - // Start accepting connections - go listener.Accept() - - // Clean up when test ends - t.Cleanup(func() { - listener.Close() - }) - - // Get the assigned host/port - host := listener.Addr().(*net.TCPAddr).IP.String() - port := listener.Addr().(*net.TCPAddr).Port - - // Create donor tablet with the mock server's address - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, - MysqlHostname: host, - MysqlPort: int32(port), - Keyspace: env.keyspace, - Shard: env.shard, - } - require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) - - // Set up clone credentials - dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{ - User: "clone_user", - Password: "password", - } - - // Configure recipient mysqld for successful validation and 
clone - env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ - "SELECT @@version": sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.32", - ), - "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'": sqltypes.MakeTestResult( - sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), - "ACTIVE", - ), - "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status": sqltypes.MakeTestResult( - sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), - "Completed|0|", - ), - } - - // List all expected queries that ExecuteClone will run - env.mysqld.ExpectedExecuteSuperQueryList = []string{ - fmt.Sprintf("SET GLOBAL clone_valid_donor_list = '%s:%d'", host, port), - fmt.Sprintf("CLONE INSTANCE FROM 'clone_user'@'%s':%d IDENTIFIED BY 'password' REQUIRE NO SSL", host, port), - } - // Make PrimaryPosition return an error env.mysqld.PrimaryPositionError = assert.AnError }, @@ -593,77 +579,13 @@ func TestCloneFromDonor(t *testing.T) { { name: "success", cloneFromTablet: "cell1-100", - setup: func(t *testing.T, env *cloneFromDonorTestEnv) { - // Create mock donor server with auth for clone_user - jsonConfig := `{"clone_user": [{"Password": "password"}]}` - authServer := mysql.NewAuthServerStatic("", jsonConfig, 0) - handler := &mockDonorHandler{t: t} - - listener, err := mysql.NewListener("tcp", "127.0.0.1:", authServer, handler, 0, 0, false, false, 0, 0, false) - require.NoError(t, err) - - // Start accepting connections - go listener.Accept() - - // Clean up when test ends - t.Cleanup(func() { - listener.Close() - }) - - // Get the assigned host/port - host := listener.Addr().(*net.TCPAddr).IP.String() - port := listener.Addr().(*net.TCPAddr).Port - - // Create donor tablet with the mock server's address - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, - MysqlHostname: host, - MysqlPort: int32(port), - Keyspace: env.keyspace, - 
Shard: env.shard, - } - require.NoError(t, env.ts.CreateTablet(env.ctx, tablet)) - - // Set up clone credentials - dbconfigs.GlobalDBConfigs.CloneUser = dbconfigs.UserConfig{ - User: "clone_user", - Password: "password", - } - - // Configure recipient mysqld for successful validation and clone - env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ - "SELECT @@version": sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.32", - ), - "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'": sqltypes.MakeTestResult( - sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), - "ACTIVE", - ), - "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status": sqltypes.MakeTestResult( - sqltypes.MakeTestFields("STATE|ERROR_NO|ERROR_MESSAGE", "varchar|varchar|varchar"), - "Completed|0|", - ), - } - - // Set a valid GTID position - env.mysqld.CurrentPrimaryPosition = replication.Position{ - GTIDSet: replication.Mysql56GTIDSet{}, - } - - // List all expected queries that ExecuteClone will run - env.mysqld.ExpectedExecuteSuperQueryList = []string{ - fmt.Sprintf("SET GLOBAL clone_valid_donor_list = '%s:%d'", host, port), - fmt.Sprintf("CLONE INSTANCE FROM 'clone_user'@'%s':%d IDENTIFIED BY 'password' REQUIRE NO SSL", host, port), - } - }, - wantErr: false, + wantErr: false, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - env := createCloneFromDonorTestEnv(t) + env := createCloneFromDonorTestEnv(t, donorHost, donorPort) // Save and restore global flags and config oldCloneFromPrimary := cloneFromPrimary From 06a370b39189f48de6956873e637588aeac2e141 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 Dec 2025 21:37:19 -0500 Subject: [PATCH 09/33] make tests nicer Signed-off-by: Max Englander --- go/vt/mysqlctl/clone_test.go | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go 
index 52ed9d525a9..9eb871a7cba 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -398,8 +398,18 @@ func createCloneFromDonorTestEnv(t *testing.T, donorHost string, donorPort int) // Create keyspace in topology require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodatapb.Keyspace{})) - // Create donor tablet in topology with the mock server's address + // Create donor tablet alias donorAlias := &topodatapb.TabletAlias{Cell: "cell1", Uid: 100} + + // Create shard in topology with donor as primary + require.NoError(t, ts.CreateShard(ctx, keyspace, shard)) + _, err := ts.UpdateShardFields(ctx, keyspace, shard, func(si *topo.ShardInfo) error { + si.PrimaryAlias = donorAlias + return nil + }) + require.NoError(t, err) + + // Create donor tablet in topology with the mock server's address tablet := &topodatapb.Tablet{ Alias: donorAlias, MysqlHostname: donorHost, @@ -499,7 +509,8 @@ func TestCloneFromDonor(t *testing.T) { name: "clone from primary, get shard fails", cloneFromPrimary: true, setup: func(t *testing.T, env *cloneFromDonorTestEnv) { - // Don't create the shard, so GetShard will fail + // Delete the shard that was created in env setup + require.NoError(t, env.ts.DeleteShard(env.ctx, env.keyspace, env.shard)) }, wantErr: true, wantErrContains: "failed to get shard", @@ -508,8 +519,12 @@ func TestCloneFromDonor(t *testing.T) { name: "clone from primary, shard has no primary", cloneFromPrimary: true, setup: func(t *testing.T, env *cloneFromDonorTestEnv) { - // Create shard without a primary - require.NoError(t, env.ts.CreateShard(env.ctx, env.keyspace, env.shard)) + // Clear the primary alias from the shard + _, err := env.ts.UpdateShardFields(env.ctx, env.keyspace, env.shard, func(si *topo.ShardInfo) error { + si.PrimaryAlias = nil + return nil + }) + require.NoError(t, err) }, wantErr: true, wantErrContains: "has no primary", From 88340e56dc0f1ff849af8c0168e99facf1febb8a Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 
Dec 2025 21:39:52 -0500 Subject: [PATCH 10/33] slight reorg add comments Signed-off-by: Max Englander --- go/vt/mysqlctl/clone_test.go | 128 ++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index 9eb871a7cba..c454d13ad49 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -310,69 +310,6 @@ func TestValidateRecipient(t *testing.T) { } } -type mockDonorHandler struct { - mysql.UnimplementedHandler - t *testing.T -} - -func (h *mockDonorHandler) ComQuery(c *mysql.Conn, query string, callback func(*sqltypes.Result) error) error { - // Respond to donor validation queries - switch { - case strings.Contains(query, "SELECT @@version"): - result := sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.32", - ) - return callback(result) - case strings.Contains(query, "SELECT PLUGIN_STATUS"): - result := sqltypes.MakeTestResult( - sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), - "ACTIVE", - ) - return callback(result) - case strings.Contains(query, "SELECT TABLE_SCHEMA"): - // Return empty result (no non-InnoDB tables) - result := sqltypes.MakeTestResult( - sqltypes.MakeTestFields("TABLE_SCHEMA|TABLE_NAME|ENGINE", "varchar|varchar|varchar"), - ) - return callback(result) - default: - return fmt.Errorf("unexpected query: %s", query) - } -} - -func (h *mockDonorHandler) ComQueryMulti(c *mysql.Conn, sql string, callback func(qr sqltypes.QueryResponse, more bool, firstPacket bool) error) error { - return fmt.Errorf("ComQueryMulti not implemented") -} - -func (h *mockDonorHandler) ComPrepare(c *mysql.Conn, query string) ([]*querypb.Field, uint16, error) { - return nil, 0, fmt.Errorf("ComPrepare not implemented") -} - -func (h *mockDonorHandler) ComStmtExecute(c *mysql.Conn, prepare *mysql.PrepareData, callback func(*sqltypes.Result) error) error { - return fmt.Errorf("ComStmtExecute not implemented") -} - -func 
(h *mockDonorHandler) ComRegisterReplica(c *mysql.Conn, replicaHost string, replicaPort uint16, replicaUser string, replicaPassword string) error { - return fmt.Errorf("ComRegisterReplica not implemented") -} - -func (h *mockDonorHandler) ComBinlogDump(c *mysql.Conn, logFile string, binlogPos uint32) error { - return fmt.Errorf("ComBinlogDump not implemented") -} - -func (h *mockDonorHandler) ComBinlogDumpGTID(c *mysql.Conn, logFile string, logPos uint64, gtidSet replication.GTIDSet) error { - return fmt.Errorf("ComBinlogDumpGTID not implemented") -} - -func (h *mockDonorHandler) WarningCount(c *mysql.Conn) uint16 { - return 0 -} - -func (h *mockDonorHandler) Env() *vtenv.Environment { - return vtenv.NewTestEnv() -} - type cloneFromDonorTestEnv struct { ctx context.Context logger *logutil.MemoryLogger @@ -475,6 +412,71 @@ func createCloneFromDonorTestEnv(t *testing.T, donorHost string, donorPort int) } } +// mockDonorHandler is used to create a minimal mysqld server that the recipient +// can connect to to verify it's safe to CLONE from. 
+type mockDonorHandler struct { + mysql.UnimplementedHandler + t *testing.T +} + +func (h *mockDonorHandler) ComQuery(c *mysql.Conn, query string, callback func(*sqltypes.Result) error) error { + // Respond to donor validation queries + switch { + case strings.Contains(query, "SELECT @@version"): + result := sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@version", "varchar"), + "8.0.32", + ) + return callback(result) + case strings.Contains(query, "SELECT PLUGIN_STATUS"): + result := sqltypes.MakeTestResult( + sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), + "ACTIVE", + ) + return callback(result) + case strings.Contains(query, "SELECT TABLE_SCHEMA"): + // Return empty result (no non-InnoDB tables) + result := sqltypes.MakeTestResult( + sqltypes.MakeTestFields("TABLE_SCHEMA|TABLE_NAME|ENGINE", "varchar|varchar|varchar"), + ) + return callback(result) + default: + return fmt.Errorf("unexpected query: %s", query) + } +} + +func (h *mockDonorHandler) ComQueryMulti(c *mysql.Conn, sql string, callback func(qr sqltypes.QueryResponse, more bool, firstPacket bool) error) error { + return fmt.Errorf("ComQueryMulti not implemented") +} + +func (h *mockDonorHandler) ComPrepare(c *mysql.Conn, query string) ([]*querypb.Field, uint16, error) { + return nil, 0, fmt.Errorf("ComPrepare not implemented") +} + +func (h *mockDonorHandler) ComStmtExecute(c *mysql.Conn, prepare *mysql.PrepareData, callback func(*sqltypes.Result) error) error { + return fmt.Errorf("ComStmtExecute not implemented") +} + +func (h *mockDonorHandler) ComRegisterReplica(c *mysql.Conn, replicaHost string, replicaPort uint16, replicaUser string, replicaPassword string) error { + return fmt.Errorf("ComRegisterReplica not implemented") +} + +func (h *mockDonorHandler) ComBinlogDump(c *mysql.Conn, logFile string, binlogPos uint32) error { + return fmt.Errorf("ComBinlogDump not implemented") +} + +func (h *mockDonorHandler) ComBinlogDumpGTID(c *mysql.Conn, logFile string, logPos uint64, gtidSet 
replication.GTIDSet) error { + return fmt.Errorf("ComBinlogDumpGTID not implemented") +} + +func (h *mockDonorHandler) WarningCount(c *mysql.Conn) uint16 { + return 0 +} + +func (h *mockDonorHandler) Env() *vtenv.Environment { + return vtenv.NewTestEnv() +} + func TestCloneFromDonor(t *testing.T) { // Create mock donor MySQL server once for all test cases jsonConfig := `{"clone_user": [{"Password": "password"}]}` From 01fabc0030c218556ab6e5fa0cd84c7bdc8c3572 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 Dec 2025 21:44:41 -0500 Subject: [PATCH 11/33] fix imports Signed-off-by: Max Englander --- go/vt/mysqlctl/clone.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 578fb3b3279..a30461ba5ad 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -24,6 +24,7 @@ import ( "time" "github.com/spf13/pflag" + "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/mysql/capabilities" "vitess.io/vitess/go/mysql/replication" From ca83a7a2454cebd22e9cd8b721d2f806f90bd5bc Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 19 Dec 2025 21:57:26 -0500 Subject: [PATCH 12/33] fix lint Signed-off-by: Max Englander --- go/vt/mysqlctl/clone.go | 2 +- go/vt/mysqlctl/clone_test.go | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index a30461ba5ad..6861709b07e 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -313,7 +313,7 @@ func CloneFromDonor(ctx context.Context, topoServer *topo.Server, mysqld MysqlDa return replication.Position{}, fmt.Errorf("invalid tablet alias %q: %v", cloneFromTablet, err) } default: - return replication.Position{}, fmt.Errorf("no donor specified") + return replication.Position{}, errors.New("no donor specified") } // Get donor tablet info from topology. 
diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index c454d13ad49..2a66d34822f 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -24,6 +24,7 @@ import ( "testing" "time" + "github.com/olekukonko/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -446,27 +447,27 @@ func (h *mockDonorHandler) ComQuery(c *mysql.Conn, query string, callback func(* } func (h *mockDonorHandler) ComQueryMulti(c *mysql.Conn, sql string, callback func(qr sqltypes.QueryResponse, more bool, firstPacket bool) error) error { - return fmt.Errorf("ComQueryMulti not implemented") + return errors.New("ComQueryMulti not implemented") } func (h *mockDonorHandler) ComPrepare(c *mysql.Conn, query string) ([]*querypb.Field, uint16, error) { - return nil, 0, fmt.Errorf("ComPrepare not implemented") + return nil, 0, errors.New("ComPrepare not implemented") } func (h *mockDonorHandler) ComStmtExecute(c *mysql.Conn, prepare *mysql.PrepareData, callback func(*sqltypes.Result) error) error { - return fmt.Errorf("ComStmtExecute not implemented") + return errors.New("ComStmtExecute not implemented") } func (h *mockDonorHandler) ComRegisterReplica(c *mysql.Conn, replicaHost string, replicaPort uint16, replicaUser string, replicaPassword string) error { - return fmt.Errorf("ComRegisterReplica not implemented") + return errors.New("ComRegisterReplica not implemented") } func (h *mockDonorHandler) ComBinlogDump(c *mysql.Conn, logFile string, binlogPos uint32) error { - return fmt.Errorf("ComBinlogDump not implemented") + return errors.New("ComBinlogDump not implemented") } func (h *mockDonorHandler) ComBinlogDumpGTID(c *mysql.Conn, logFile string, logPos uint64, gtidSet replication.GTIDSet) error { - return fmt.Errorf("ComBinlogDumpGTID not implemented") + return errors.New("ComBinlogDumpGTID not implemented") } func (h *mockDonorHandler) WarningCount(c *mysql.Conn) uint16 { From 2cdb9c45a98b2fb17ecc28aee1a1d7a0816291c3 Mon 
Sep 17 00:00:00 2001 From: Max Englander Date: Tue, 23 Dec 2025 13:04:33 -0500 Subject: [PATCH 13/33] one more lint Signed-off-by: Max Englander --- go/vt/mysqlctl/clone.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 6861709b07e..03df53de41f 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -325,7 +325,7 @@ func CloneFromDonor(ctx context.Context, topoServer *topo.Server, mysqld MysqlDa // Get clone credentials. cloneConfig := dbconfigs.GlobalDBConfigs.CloneUser if cloneConfig.User == "" { - return replication.Position{}, fmt.Errorf("clone user not configured; set --db-clone-user flag") + return replication.Position{}, errors.New("clone user not configured; set --db-clone-user flag") } // Create the clone executor. From 66e4998cd8a595e82628aa26137c808e87dd263b Mon Sep 17 00:00:00 2001 From: Max Englander Date: Tue, 23 Dec 2025 13:27:40 -0500 Subject: [PATCH 14/33] tidy Signed-off-by: Max Englander --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index c438516adfe..99986270793 100644 --- a/go.mod +++ b/go.mod @@ -102,6 +102,7 @@ require ( github.com/kr/pretty v0.3.1 github.com/kr/text v0.2.0 github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 + github.com/olekukonko/errors v1.1.0 github.com/shirou/gopsutil/v4 v4.25.8 github.com/spf13/afero v1.15.0 github.com/spf13/jwalterweatherman v1.1.0 @@ -137,7 +138,6 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6 // indirect - github.com/olekukonko/errors v1.1.0 // indirect github.com/olekukonko/ll v0.1.1 // indirect github.com/opencontainers/runtime-spec v1.2.1 // indirect github.com/puzpuzpuz/xsync/v3 v3.5.1 // indirect From 8633eaf98e9d7d9d16e03cddd456d822bcf83c02 Mon Sep 17 00:00:00 2001 From: 
Nick Van Wiggeren Date: Tue, 23 Dec 2025 21:25:36 +0000 Subject: [PATCH 15/33] add an end to end test and do some cleanup Signed-off-by: Nick Van Wiggeren --- go/test/endtoend/clone/clone_test.go | 384 +++++++++++++++++++++++++++ go/vt/mysqlctl/clone.go | 11 +- go/vt/mysqlctl/clone_test.go | 38 +-- go/vt/mysqlctl/mysqld.go | 5 + 4 files changed, 409 insertions(+), 29 deletions(-) create mode 100644 go/test/endtoend/clone/clone_test.go diff --git a/go/test/endtoend/clone/clone_test.go b/go/test/endtoend/clone/clone_test.go new file mode 100644 index 00000000000..04e08c2ced4 --- /dev/null +++ b/go/test/endtoend/clone/clone_test.go @@ -0,0 +1,384 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package clone + +import ( + "context" + "flag" + "fmt" + "os" + "os/exec" + "path" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/mysql/capabilities" + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/utils" + "vitess.io/vitess/go/vt/dbconfigs" + "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/mysqlctl" +) + +var ( + clusterInstance *cluster.LocalProcessCluster + donorTablet *cluster.Vttablet + recipientTablet *cluster.Vttablet + hostname = "localhost" + cell = "zone1" +) + +func TestMain(m *testing.M) { + flag.Parse() + + exitCode := func() int { + // Check MySQL version first - skip entire test suite if not supported + versionStr, err := mysqlctl.GetVersionString() + if err != nil { + log.Infof("Skipping clone tests: unable to get MySQL version: %v", err) + return 0 + } + log.Infof("Detected MySQL version: %s", versionStr) + + flavor, version, err := mysqlctl.ParseVersionString(versionStr) + if err != nil { + log.Infof("Skipping clone tests: unable to parse MySQL version: %v", err) + return 0 + } + log.Infof("Parsed flavor: %v, version: %d.%d.%d", flavor, version.Major, version.Minor, version.Patch) + + // Clone is only supported on MySQL 8.0.17+ + if flavor != mysqlctl.FlavorMySQL && flavor != mysqlctl.FlavorPercona { + log.Infof("Skipping clone tests: MySQL CLONE requires MySQL or Percona, got flavor: %v", flavor) + return 0 + } + if version.Major < 8 || (version.Major == 8 && version.Minor == 0 && version.Patch < 17) { + log.Infof("Skipping clone tests: MySQL CLONE requires version 8.0.17+, got: %d.%d.%d", version.Major, version.Minor, version.Patch) + return 0 + } + + // Verify clone capability using the clean version string + cleanVersion := fmt.Sprintf("%d.%d.%d", version.Major, version.Minor, version.Patch) + capableOf := mysql.ServerVersionCapableOf(cleanVersion) + if capableOf == nil { + 
log.Infof("Skipping clone tests: unable to get capability checker for version %s", cleanVersion) + return 0 + } + hasClone, err := capableOf(capabilities.MySQLClonePluginFlavorCapability) + if err != nil || !hasClone { + log.Infof("Skipping clone tests: MySQL version %s does not support CLONE plugin", cleanVersion) + return 0 + } + log.Infof("MySQL version %s supports CLONE plugin, proceeding with tests", cleanVersion) + + // Setup EXTRA_MY_CNF for clone plugin + if err := setupExtraMyCnf(); err != nil { + log.Errorf("Failed to setup extra MySQL config: %v", err) + return 1 + } + + clusterInstance = cluster.NewCluster(cell, hostname) + defer clusterInstance.Teardown() + + // Start topo server + if err := clusterInstance.StartTopo(); err != nil { + log.Errorf("Failed to start topo: %v", err) + return 1 + } + + // Initialize cluster with 2 tablets for clone testing + if err := initClusterForClone(); err != nil { + log.Errorf("Failed to init cluster: %v", err) + return 1 + } + + // Clean up MySQL processes explicitly since we don't register them with the cluster + defer func() { + for _, tablet := range []*cluster.Vttablet{donorTablet, recipientTablet} { + if tablet != nil { + if err := tablet.MysqlctlProcess.Stop(); err != nil { + log.Errorf("Failed to stop MySQL for tablet %d: %v", tablet.TabletUID, err) + } + } + } + }() + + return m.Run() + }() + os.Exit(exitCode) +} + +// setupExtraMyCnf sets EXTRA_MY_CNF to include clone plugin configuration +func setupExtraMyCnf() error { + cloneCnfPath := path.Join(os.Getenv("VTROOT"), "config", "mycnf", "clone.cnf") + if _, err := os.Stat(cloneCnfPath); os.IsNotExist(err) { + return fmt.Errorf("clone.cnf not found at %s", cloneCnfPath) + } + + // Check if EXTRA_MY_CNF is already set + existing := os.Getenv("EXTRA_MY_CNF") + if existing != "" { + // Append clone.cnf to existing + if err := os.Setenv("EXTRA_MY_CNF", existing+":"+cloneCnfPath); err != nil { + return fmt.Errorf("failed to set EXTRA_MY_CNF: %v", err) + } + } else 
{ + if err := os.Setenv("EXTRA_MY_CNF", cloneCnfPath); err != nil { + return fmt.Errorf("failed to set EXTRA_MY_CNF: %v", err) + } + } + + log.Infof("Set EXTRA_MY_CNF to include clone plugin: %s", os.Getenv("EXTRA_MY_CNF")) + return nil +} + +// initClusterForClone sets up two MySQL instances for clone testing +func initClusterForClone() error { + // Create a combined init file that includes clone user + initDBWithClone, err := createInitDBWithCloneUser() + if err != nil { + return fmt.Errorf("failed to create init DB file: %v", err) + } + log.Infof("Created combined init file at: %s", initDBWithClone) + + var mysqlCtlProcessList []*exec.Cmd + + // Create donor tablet (will be the clone source) + donorTablet = &cluster.Vttablet{ + TabletUID: clusterInstance.GetAndReserveTabletUID(), + HTTPPort: clusterInstance.GetAndReservePort(), + GrpcPort: clusterInstance.GetAndReservePort(), + MySQLPort: clusterInstance.GetAndReservePort(), + Type: "primary", + } + donorTablet.Alias = fmt.Sprintf("%s-%010d", clusterInstance.Cell, donorTablet.TabletUID) + + // Create recipient tablet (will receive cloned data) + recipientTablet = &cluster.Vttablet{ + TabletUID: clusterInstance.GetAndReserveTabletUID(), + HTTPPort: clusterInstance.GetAndReservePort(), + GrpcPort: clusterInstance.GetAndReservePort(), + MySQLPort: clusterInstance.GetAndReservePort(), + Type: "replica", + } + recipientTablet.Alias = fmt.Sprintf("%s-%010d", clusterInstance.Cell, recipientTablet.TabletUID) + + // Start MySQL for both tablets with custom init file that includes clone user + for _, tablet := range []*cluster.Vttablet{donorTablet, recipientTablet} { + mysqlctlProcess, err := cluster.MysqlCtlProcessInstance( + tablet.TabletUID, + tablet.MySQLPort, + clusterInstance.TmpDirectory, + ) + if err != nil { + return fmt.Errorf("failed to create mysqlctl for tablet %d: %v", tablet.TabletUID, err) + } + // Use our custom init file with clone user + mysqlctlProcess.InitDBFile = initDBWithClone + 
tablet.MysqlctlProcess = *mysqlctlProcess + + proc, err := tablet.MysqlctlProcess.StartProcess() + if err != nil { + return fmt.Errorf("failed to start MySQL for tablet %d: %v", tablet.TabletUID, err) + } + mysqlCtlProcessList = append(mysqlCtlProcessList, proc) + } + + // Wait for MySQL processes to be ready + for _, proc := range mysqlCtlProcessList { + if err := proc.Wait(); err != nil { + return fmt.Errorf("MySQL process failed to start: %v", err) + } + } + log.Infof("MySQL processes started successfully") + + // Note: We intentionally do NOT register tablets with shards/keyspaces + // because we only start MySQL processes (not vttablets). The standard + // Teardown would try to stop VttabletProcess which we never started. + // Instead, we clean up MySQL explicitly in TestMain. + + return nil +} + +// createInitDBWithCloneUser creates a combined init_db.sql that includes clone user setup. +// It uses the official {{custom_sql}} marker in init_db.sql to inject the clone user SQL. +func createInitDBWithCloneUser() (string, error) { + initDBPath := path.Join(os.Getenv("VTROOT"), "config", "init_db.sql") + initClonePath := path.Join(os.Getenv("VTROOT"), "config", "init_clone.sql") + + initDB, err := os.ReadFile(initDBPath) + if err != nil { + return "", fmt.Errorf("failed to read init_db.sql: %v", err) + } + + initClone, err := os.ReadFile(initClonePath) + if err != nil { + return "", fmt.Errorf("failed to read init_clone.sql: %v", err) + } + + // Use the official {{custom_sql}} marker pattern to inject clone user SQL + combined, err := utils.GetInitDBSQL(string(initDB), string(initClone), "") + if err != nil { + return "", fmt.Errorf("failed to inject clone SQL: %v", err) + } + + // Write to temp file + combinedPath := path.Join(clusterInstance.TmpDirectory, "init_db_with_clone.sql") + if err := os.WriteFile(combinedPath, []byte(combined), 0666); err != nil { + return "", fmt.Errorf("failed to write combined init file: %v", err) + } + + return combinedPath, nil +} 
+ +// connectToTablet creates a MySQL connection to the given tablet +func connectToTablet(ctx context.Context, tablet *cluster.Vttablet) (*mysql.Conn, error) { + socketPath := path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("vt_%010d", tablet.TabletUID), "mysql.sock") + params := mysql.ConnParams{ + Uname: "vt_dba", + UnixSocket: socketPath, + } + return mysql.Connect(ctx, &params) +} + +// createMysqldForTablet creates a Mysqld instance for CloneExecutor +func createMysqldForTablet(tablet *cluster.Vttablet) *mysqlctl.Mysqld { + socketPath := path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("vt_%010d", tablet.TabletUID), "mysql.sock") + + dbcfgs := dbconfigs.NewTestDBConfigs(mysql.ConnParams{ + UnixSocket: socketPath, + Uname: "vt_dba", + }, mysql.ConnParams{ + UnixSocket: socketPath, + Uname: "vt_app", + }, "") + + return mysqlctl.NewMysqld(dbcfgs) +} + +// TestCloneRemote tests MySQL CLONE INSTANCE functionality +func TestCloneRemote(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) + defer cancel() + + // Connect to donor and insert test data + donorConn, err := connectToTablet(ctx, donorTablet) + require.NoError(t, err, "Failed to connect to donor") + defer donorConn.Close() + + // Disable super_read_only so we can create test data + _, err = donorConn.ExecuteFetch("SET GLOBAL super_read_only = OFF", 0, false) + require.NoError(t, err, "Failed to disable super_read_only") + _, err = donorConn.ExecuteFetch("SET GLOBAL read_only = OFF", 0, false) + require.NoError(t, err, "Failed to disable read_only") + + // Create test database and table on donor + _, err = donorConn.ExecuteFetch("CREATE DATABASE IF NOT EXISTS test_clone", 0, false) + require.NoError(t, err, "Failed to create test database") + + _, err = donorConn.ExecuteFetch(` + CREATE TABLE IF NOT EXISTS test_clone.clone_test ( + id INT AUTO_INCREMENT PRIMARY KEY, + msg VARCHAR(255) + ) ENGINE=InnoDB + `, 0, false) + require.NoError(t, err, "Failed to create test table") + +
// Insert test data + for i := 1; i <= 10; i++ { + _, err = donorConn.ExecuteFetch(fmt.Sprintf( + "INSERT INTO test_clone.clone_test (msg) VALUES ('test message %d')", i), 0, false) + require.NoError(t, err, "Failed to insert test data row %d", i) + } + + // Verify donor has the data + qr, err := donorConn.ExecuteFetch("SELECT COUNT(*) FROM test_clone.clone_test", 1, false) + require.NoError(t, err, "Failed to count rows on donor") + require.Len(t, qr.Rows, 1) + require.Equal(t, "10", qr.Rows[0][0].ToString(), "Donor should have 10 rows") + + // Pre-clone verification: ensure recipient does NOT have the test database + // This proves the clone actually transfers data, not that it was already there + recipientConnPreClone, err := connectToTablet(ctx, recipientTablet) + require.NoError(t, err, "Failed to connect to recipient for pre-clone check") + qr, err = recipientConnPreClone.ExecuteFetch("SHOW DATABASES LIKE 'test_clone'", 1, false) + require.NoError(t, err, "Failed to check for test_clone database on recipient") + require.Len(t, qr.Rows, 0, "Recipient should NOT have test_clone database before clone") + recipientConnPreClone.Close() + + // Create Mysqld instance for recipient (needed by CloneExecutor) + recipientMysqld := createMysqldForTablet(recipientTablet) + defer recipientMysqld.Close() + + // Enable MySQL CLONE for the test + mysqlctl.SetMySQLCloneEnabled(true) + defer mysqlctl.SetMySQLCloneEnabled(false) + + // Execute clone + executor := &mysqlctl.CloneExecutor{ + DonorHost: "127.0.0.1", + DonorPort: donorTablet.MySQLPort, + DonorUser: "vt_clone", + DonorPassword: "", + UseSSL: false, + } + + err = executor.ExecuteClone(ctx, recipientMysqld, 5*time.Minute) + require.NoError(t, err, "Clone operation failed") + + // Connect to recipient and verify data + recipientConn, err := connectToTablet(ctx, recipientTablet) + require.NoError(t, err, "Failed to connect to recipient after clone") + defer recipientConn.Close() + + // Verify clone succeeded at MySQL 
level via performance_schema.clone_status + qr, err = recipientConn.ExecuteFetch( + "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status ORDER BY ID DESC LIMIT 1", 1, false) + require.NoError(t, err, "Failed to query clone_status") + require.Len(t, qr.Rows, 1, "Expected one clone_status row") + cloneState := qr.Rows[0][0].ToString() + cloneErrorNo := qr.Rows[0][1].ToString() + cloneErrorMsg := qr.Rows[0][2].ToString() + require.Equal(t, "Completed", cloneState, "Clone state should be Completed") + require.Equal(t, "0", cloneErrorNo, "Clone should have no error, got: %s", cloneErrorMsg) + + // Verify recipient has the cloned data + qr, err = recipientConn.ExecuteFetch("SELECT COUNT(*) FROM test_clone.clone_test", 1, false) + require.NoError(t, err, "Failed to count rows on recipient") + require.Len(t, qr.Rows, 1) + require.Equal(t, "10", qr.Rows[0][0].ToString(), "Recipient should have 10 rows after clone") + + // Verify actual data content matches + donorData, err := donorConn.ExecuteFetch("SELECT id, msg FROM test_clone.clone_test ORDER BY id", 100, false) + require.NoError(t, err) + + recipientData, err := recipientConn.ExecuteFetch("SELECT id, msg FROM test_clone.clone_test ORDER BY id", 100, false) + require.NoError(t, err) + + require.Equal(t, len(donorData.Rows), len(recipientData.Rows), "Row counts should match") + for i := range donorData.Rows { + assert.Equal(t, donorData.Rows[i][0].ToString(), recipientData.Rows[i][0].ToString(), "IDs should match at row %d", i) + assert.Equal(t, donorData.Rows[i][1].ToString(), recipientData.Rows[i][1].ToString(), "Messages should match at row %d", i) + } + + t.Logf("Clone test passed: successfully cloned 10 rows from donor (tablet %d) to recipient (tablet %d)", + donorTablet.TabletUID, recipientTablet.TabletUID) +} diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index c4ebbe4d94c..011c8704726 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -141,16 +141,19 @@ 
func (c *CloneExecutor) validateDonorRemote(ctx context.Context) error { // checkCloneCapability verifies that the MySQL version supports the CLONE plugin. func (c *CloneExecutor) checkCloneCapability(ctx context.Context, mysqld MysqlDaemon) error { - result, err := mysqld.FetchSuperQuery(ctx, "SELECT @@version") + versionStr, err := mysqld.GetVersionString(ctx) if err != nil { return fmt.Errorf("failed to query MySQL version: %w", err) } - if len(result.Rows) == 0 || len(result.Rows[0]) == 0 { - return errors.New("empty version result") + // GetVersionString may return either SQL query result (e.g., "8.0.44") or CLI output + // (e.g., "/usr/sbin/mysqld Ver 8.0.44..."). Try parsing as CLI output first to + // extract a clean version string. + _, version, parseErr := ParseVersionString(versionStr) + if parseErr == nil { + versionStr = fmt.Sprintf("%d.%d.%d", version.Major, version.Minor, version.Patch) } - versionStr := result.Rows[0][0].ToString() capableOf := mysql.ServerVersionCapableOf(versionStr) if capableOf == nil { return fmt.Errorf("unable to determine MySQL capabilities for version %q", versionStr) diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index 10e65cc2fa8..86f3c199414 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -213,17 +213,14 @@ func Test_waitForCloneComplete_ContextCanceled(t *testing.T) { func TestValidateRecipient(t *testing.T) { tests := []struct { name string - versionQuery *sqltypes.Result + version string pluginQuery *sqltypes.Result expectError bool errorContain string }{ { - name: "valid MySQL 8.0.32 with clone plugin", - versionQuery: sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.32", - ), + name: "valid MySQL 8.0.32 with clone plugin", + version: "8.0.32", pluginQuery: sqltypes.MakeTestResult( sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), "ACTIVE", @@ -231,30 +228,21 @@ func TestValidateRecipient(t *testing.T) { expectError: false, 
}, { - name: "MySQL version too old", - versionQuery: sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.16", - ), + name: "MySQL version too old", + version: "8.0.16", expectError: true, errorContain: "requires version 8.0.17", }, { - name: "clone plugin not installed", - versionQuery: sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.32", - ), + name: "clone plugin not installed", + version: "8.0.32", pluginQuery: sqltypes.MakeTestResult(sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar")), expectError: true, errorContain: "clone plugin is not installed", }, { - name: "clone plugin not active", - versionQuery: sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.32", - ), + name: "clone plugin not active", + version: "8.0.32", pluginQuery: sqltypes.MakeTestResult( sqltypes.MakeTestFields("PLUGIN_STATUS", "varchar"), "DISABLED", @@ -269,11 +257,11 @@ func TestValidateRecipient(t *testing.T) { fmd := NewFakeMysqlDaemon(nil) defer fmd.Close() - fmd.FetchSuperQueryMap = map[string]*sqltypes.Result{ - "SELECT @@version": tt.versionQuery, - } + fmd.Version = tt.version if tt.pluginQuery != nil { - fmd.FetchSuperQueryMap["SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'"] = tt.pluginQuery + fmd.FetchSuperQueryMap = map[string]*sqltypes.Result{ + "SELECT PLUGIN_STATUS FROM information_schema.PLUGINS WHERE PLUGIN_NAME = 'clone'": tt.pluginQuery, + } } executor := &CloneExecutor{ diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index 706eebd45ee..a54e7e0013e 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -153,6 +153,11 @@ func MySQLCloneEnabled() bool { return mysqlCloneEnabled } +// SetMySQLCloneEnabled sets the MySQL CLONE enabled flag. This is intended for testing. 
+func SetMySQLCloneEnabled(enabled bool) { + mysqlCloneEnabled = enabled +} + func registerReparentFlags(fs *pflag.FlagSet) { utils.SetFlagBoolVar(fs, &DisableActiveReparents, "disable-active-reparents", DisableActiveReparents, "if set, do not allow active reparents. Use this to protect a cluster using external reparents.") } From 13ace338c1e9654b1e35e1e9cfb8188be111a3a5 Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Tue, 23 Dec 2025 21:35:07 +0000 Subject: [PATCH 16/33] use time tickers in the for loop Signed-off-by: Nick Van Wiggeren --- go/vt/mysqlctl/clone.go | 88 +++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 011c8704726..1ee5d2b8754 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -269,65 +269,59 @@ func (c *CloneExecutor) checkClonePluginInstalled(ctx context.Context, mysqld My // retry until MySQL is back and clone_status shows completion. 
func (c *CloneExecutor) waitForCloneComplete(ctx context.Context, mysqld MysqlDaemon, timeout time.Duration) error { const pollInterval = time.Second - - deadline := time.Now().Add(timeout) query := "SELECT STATE, ERROR_NO, ERROR_MESSAGE FROM performance_schema.clone_status ORDER BY ID DESC LIMIT 1" log.Infof("Waiting for clone to complete (timeout: %v)", timeout) + timer := time.NewTimer(timeout) + defer timer.Stop() + ticker := time.NewTicker(pollInterval) + defer ticker.Stop() + for { - // Check context cancellation select { case <-ctx.Done(): return ctx.Err() - default: - } - - // Check timeout - if time.Now().After(deadline) { + case <-timer.C: return fmt.Errorf("timeout waiting for clone to complete after %v", timeout) - } - - // Try to query clone status - connection may fail if MySQL is restarting - result, err := mysqld.FetchSuperQuery(ctx, query) - if err != nil { - // Connection failures are expected during MySQL restart - log.Infof("Clone status query failed (MySQL may be restarting): %v", err) - time.Sleep(pollInterval) - continue - } - - if len(result.Rows) == 0 { - // No clone status yet - MySQL may have just started - log.Infof("No clone status found, waiting...") - time.Sleep(pollInterval) - continue - } - - state := result.Rows[0][0].ToString() - errorNo := result.Rows[0][1].ToString() - errorMsg := result.Rows[0][2].ToString() + case <-ticker.C: + // Try to query clone status - connection may fail if MySQL is restarting + result, err := mysqld.FetchSuperQuery(ctx, query) + if err != nil { + // Connection failures are expected during MySQL restart + log.Infof("Clone status query failed (MySQL may be restarting): %v", err) + continue + } - log.Infof("Clone status: STATE=%s, ERROR_NO=%s", state, errorNo) + if len(result.Rows) == 0 { + // No clone status yet - MySQL may have just started + log.Infof("No clone status found, waiting...") + continue + } - switch state { - case "Completed": - if errorNo != "0" { - return fmt.Errorf("clone completed with 
error %s: %s", errorNo, errorMsg) + state := result.Rows[0][0].ToString() + errorNo := result.Rows[0][1].ToString() + errorMsg := result.Rows[0][2].ToString() + + log.Infof("Clone status: STATE=%s, ERROR_NO=%s", state, errorNo) + + switch state { + case "Completed": + if errorNo != "0" { + return fmt.Errorf("clone completed with error %s: %s", errorNo, errorMsg) + } + log.Infof("Clone completed successfully") + return nil + case "Failed": + return fmt.Errorf("clone failed with error %s: %s", errorNo, errorMsg) + case "In Progress", "Not Started": + // Still running, keep waiting + continue + default: + // Unknown state, keep waiting but log it + log.Warningf("Unknown clone state: %s", state) + continue } - log.Infof("Clone completed successfully") - return nil - case "Failed": - return fmt.Errorf("clone failed with error %s: %s", errorNo, errorMsg) - case "In Progress", "Not Started": - // Still running, keep waiting - time.Sleep(pollInterval) - continue - default: - // Unknown state, keep waiting but log it - log.Warningf("Unknown clone state: %s", state) - time.Sleep(pollInterval) - continue } } } From 43a534f63f94fffd6a8e332a1cb31390d65e704f Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Tue, 23 Dec 2025 13:56:10 -0800 Subject: [PATCH 17/33] tmrpctest: fix flaky TestGRPCTMServer timeout test Signed-off-by: Nick Van Wiggeren --- go/vt/vttablet/tmrpctest/test_tm_rpc.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/vt/vttablet/tmrpctest/test_tm_rpc.go b/go/vt/vttablet/tmrpctest/test_tm_rpc.go index 46f42245607..c267d624028 100644 --- a/go/vt/vttablet/tmrpctest/test_tm_rpc.go +++ b/go/vt/vttablet/tmrpctest/test_tm_rpc.go @@ -315,8 +315,8 @@ func tmRPCTestRPCTimeout(ctx context.Context, t *testing.T, client tmclient.Tabl if err := shortCtx.Err(); err != context.DeadlineExceeded { t.Errorf("tmRPCTestRPCTimeout: got %v want context.DeadlineExceeded", err) } - default: - t.Errorf("tmRPCTestRPCTimeout: context.Done() not closed") + case 
<-time.After(time.Second): + t.Errorf("tmRPCTestRPCTimeout: context.Done() not closed within timeout") } } From dbe89f7e5a1842092f3e491c6d37eaf98121743f Mon Sep 17 00:00:00 2001 From: Max Englander Date: Wed, 24 Dec 2025 19:01:53 -0500 Subject: [PATCH 18/33] fix test Signed-off-by: Max Englander --- go/vt/mysqlctl/clone_test.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/go/vt/mysqlctl/clone_test.go b/go/vt/mysqlctl/clone_test.go index 3dcd4874e96..55f970c8223 100644 --- a/go/vt/mysqlctl/clone_test.go +++ b/go/vt/mysqlctl/clone_test.go @@ -357,6 +357,7 @@ func createCloneFromDonorTestEnv(t *testing.T, donorHost string, donorPort int) } // Configure recipient mysqld for successful validation and clone by default + mysqld.Version = "8.0.32" mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ "SELECT @@version": sqltypes.MakeTestResult( sqltypes.MakeTestFields("@@version", "varchar"), @@ -564,10 +565,7 @@ func TestCloneFromDonor(t *testing.T) { cloneFromTablet: "cell1-100", setup: func(t *testing.T, env *cloneFromDonorTestEnv) { // Configure mysqld to return an old MySQL version - env.mysqld.FetchSuperQueryMap["SELECT @@version"] = sqltypes.MakeTestResult( - sqltypes.MakeTestFields("@@version", "varchar"), - "8.0.16", - ) + env.mysqld.Version = "8.0.16" }, wantErr: true, wantErrContains: "recipient validation failed", From e7e255916c56520b50c8c1e91d6a51e9f05cbee8 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Thu, 25 Dec 2025 18:28:37 -0500 Subject: [PATCH 19/33] go/vt/{mysqlctl,vttablet}: support restore tablet with CLONE Signed-off-by: Max Englander --- go/cmd/vttablet/cli/cli_test.go | 2 +- go/flags/endtoend/vtcombo.txt | 1 + go/flags/endtoend/vttablet.txt | 3 + go/test/endtoend/backup/clone/main_test.go | 299 +++++++++++++ go/test/endtoend/backup/clone/restore_test.go | 209 +++++++++ go/vt/mysqlctl/backup.go | 15 +- go/vt/mysqlctl/backup_test.go | 12 +- go/vt/mysqlctl/clone.go | 4 +- go/vt/vttablet/tabletmanager/restore.go | 
405 ++++++++++++++---- go/vt/vttablet/tabletmanager/rpc_backup.go | 2 +- go/vt/vttablet/tabletmanager/tm_init.go | 45 +- go/vt/wrangler/testlib/backup_test.go | 12 +- test/config.json | 9 + 13 files changed, 889 insertions(+), 129 deletions(-) create mode 100644 go/test/endtoend/backup/clone/main_test.go create mode 100644 go/test/endtoend/backup/clone/restore_test.go diff --git a/go/cmd/vttablet/cli/cli_test.go b/go/cmd/vttablet/cli/cli_test.go index 305e736f4c4..da8cc9aa0bf 100644 --- a/go/cmd/vttablet/cli/cli_test.go +++ b/go/cmd/vttablet/cli/cli_test.go @@ -69,5 +69,5 @@ func TestRunFailsToStartTabletManager(t *testing.T) { defer cancel() err := Main.ExecuteContext(ctx) - require.ErrorContains(t, err, "you cannot enable --restore-from-backup without a my.cnf file") + require.ErrorContains(t, err, "you cannot enable --restore-from-backup or --restore-with-clone without a my.cnf file") } diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index edfb1058bf4..e2e6d2d0900 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -323,6 +323,7 @@ Flags: --restore-from-backup-ts string (init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050' --restore-to-pos string (init incremental restore parameter) if set, run a point in time recovery that ends with the given position. This will attempt to use one full backup followed by zero or more incremental backups --restore-to-timestamp string (init incremental restore parameter) if set, run a point in time recovery that restores up to the given timestamp, if possible. Given timestamp in RFC3339 format. 
Example: '2006-01-02T15:04:05Z07:00' + --restore-with-clone (init restore parameter) will restore from a clone, requires either --clone-from-primary or --clone-from-tablet, mutually exclusive with --restore-from-backup --retain-online-ddl-tables duration How long should vttablet keep an old migrated table before purging it (default 24h0m0s) --sanitize-log-messages Remove potentially sensitive information in tablet INFO, WARNING, and ERROR log messages such as query parameters. --schema-change-reload-timeout duration query server schema change reload timeout, this is how long to wait for the signaled schema reload operation to complete before giving up (default 30s) diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index d9a27b62ea6..068f1e029c9 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -70,6 +70,8 @@ Flags: --builtinbackup-progress duration how often to send progress updates when backing up large files. (default 5s) --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --ceph-backup-storage-config string Path to JSON config file for ceph backup storage. (default "ceph_backup_config.json") + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. 
@@ -314,6 +316,7 @@ Flags: --restore-from-backup-ts string (init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050' --restore-to-pos string (init incremental restore parameter) if set, run a point in time recovery that ends with the given position. This will attempt to use one full backup followed by zero or more incremental backups --restore-to-timestamp string (init incremental restore parameter) if set, run a point in time recovery that restores up to the given timestamp, if possible. Given timestamp in RFC3339 format. Example: '2006-01-02T15:04:05Z07:00' + --restore-with-clone (init restore parameter) will restore from a clone, requires either --clone-from-primary or --clone-from-tablet, mutually exclusive with --restore-from-backup --retain-online-ddl-tables duration How long should vttablet keep an old migrated table before purging it (default 24h0m0s) --s3-backup-aws-endpoint string endpoint of the S3 backend (region must be provided). --s3-backup-aws-min-partsize int Minimum part size to use, defaults to 5MiB but can be increased due to the dataset size. (default 5242880) diff --git a/go/test/endtoend/backup/clone/main_test.go b/go/test/endtoend/backup/clone/main_test.go new file mode 100644 index 00000000000..02ec9bc0b96 --- /dev/null +++ b/go/test/endtoend/backup/clone/main_test.go @@ -0,0 +1,299 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package clone + +import ( + "errors" + "flag" + "fmt" + "os" + "os/exec" + "path" + "strconv" + "strings" + "testing" + + "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/mysql/capabilities" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/utils" + "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/mysqlctl" + vtutils "vitess.io/vitess/go/vt/utils" +) + +var ( + primary *cluster.Vttablet + replica1 *cluster.Vttablet + replica2 *cluster.Vttablet + localCluster *cluster.LocalProcessCluster + newInitDBFile string + cell = cluster.DefaultCell + hostname = "localhost" + keyspaceName = "ks" + shardName = "0" + dbPassword = "VtDbaPass" + shardKsName = fmt.Sprintf("%s/%s", keyspaceName, shardName) + dbCredentialFile string + commonTabletArg = []string{ + vtutils.GetFlagVariantForTests("--vreplication-retry-delay"), "1s", + vtutils.GetFlagVariantForTests("--degraded-threshold"), "5s", + vtutils.GetFlagVariantForTests("--lock-tables-timeout"), "5s", + vtutils.GetFlagVariantForTests("--watch-replication-stream"), + vtutils.GetFlagVariantForTests("--enable-replication-reporter"), + vtutils.GetFlagVariantForTests("--serving-state-grace-period"), "1s", + } + vtInsertTest = ` + create table if not exists vt_insert_test ( + id bigint auto_increment, + msg varchar(64), + primary key (id) + ) Engine=InnoDB;` +) + +func TestMain(m *testing.M) { + flag.Parse() + + exitCode, err := func() (int, error) { + localCluster = cluster.NewCluster(cell, hostname) + defer localCluster.Teardown() + + // Setup EXTRA_MY_CNF for clone plugin + if err := setupExtraMyCnf(); err != nil { + log.Errorf("Failed to setup extra MySQL config: %v", err) + return 1, err + } + + // Start topo server + err := localCluster.StartTopo() + if err != nil { + return 1, err + } + + // Start keyspace + localCluster.Keyspaces = []cluster.Keyspace{ + { + Name: keyspaceName, + Shards: []cluster.Shard{ + { + Name: shardName, + }, + }, + }, + } + 
shard := &localCluster.Keyspaces[0].Shards[0] + vtctldClientProcess := cluster.VtctldClientProcessInstance(localCluster.VtctldProcess.GrpcPort, localCluster.TopoPort, "localhost", localCluster.TmpDirectory) + _, err = vtctldClientProcess.ExecuteCommandWithOutput("CreateKeyspace", keyspaceName, "--durability-policy=semi_sync") + if err != nil { + return 1, err + } + + // Create a new init_db.sql file that sets up passwords for all users and clone user + dbCredentialFile = cluster.WriteDbCredentialToTmp(localCluster.TmpDirectory) + initDb, _ := os.ReadFile(path.Join(os.Getenv("VTROOT"), "/config/init_db.sql")) + initClone, err := os.ReadFile(path.Join(os.Getenv("VTROOT"), "/config/init_clone.sql")) + if err != nil { + log.Warningf("init_clone.sql not found, clone tests may fail: %v", err) + initClone = []byte("") + } + + sql := string(initDb) + // The original init_db.sql does not have any passwords. Here we update the init file with passwords + sql, err = utils.GetInitDBSQL(sql, cluster.GetPasswordUpdateSQL(localCluster), string(initClone)) + if err != nil { + return 1, err + } + newInitDBFile = path.Join(localCluster.TmpDirectory, "init_db_with_passwords_and_clone.sql") + err = os.WriteFile(newInitDBFile, []byte(sql), 0666) + if err != nil { + return 1, err + } + + extraArgs := []string{"--db-credentials-file", dbCredentialFile} + commonTabletArg = append(commonTabletArg, "--db-credentials-file", dbCredentialFile) + + primary = localCluster.NewVttabletInstance("replica", 0, "") + replica1 = localCluster.NewVttabletInstance("replica", 0, "") + replica2 = localCluster.NewVttabletInstance("replica", 0, "") + shard.Vttablets = []*cluster.Vttablet{primary, replica1, replica2} + + // Start MySql processes + var mysqlProcs []*exec.Cmd + for _, tablet := range shard.Vttablets { + tablet.VttabletProcess = localCluster.VtprocessInstanceFromVttablet(tablet, shard.Name, keyspaceName) + tablet.VttabletProcess.DbPassword = dbPassword + tablet.VttabletProcess.ExtraArgs = 
commonTabletArg + tablet.VttabletProcess.SupportsBackup = true + + mysqlctlProcess, err := cluster.MysqlCtlProcessInstance(tablet.TabletUID, tablet.MySQLPort, localCluster.TmpDirectory) + if err != nil { + return 1, err + } + tablet.MysqlctlProcess = *mysqlctlProcess + tablet.MysqlctlProcess.InitDBFile = newInitDBFile + tablet.MysqlctlProcess.ExtraArgs = extraArgs + proc, err := tablet.MysqlctlProcess.StartProcess() + if err != nil { + return 1, err + } + mysqlProcs = append(mysqlProcs, proc) + } + for _, proc := range mysqlProcs { + if err := proc.Wait(); err != nil { + return 1, err + } + } + + if localCluster.VtTabletMajorVersion >= 16 { + // If vttablets are any lower than version 16, then they are running the replication manager. + // Running VTOrc and replication manager sometimes creates the situation where VTOrc has set up semi-sync on the primary, + // but the replication manager starts replication on the replica without setting semi-sync. This hangs the primary. + // Even if VTOrc fixes it, since there is no ongoing traffic, the state remains blocked. 
+ if err := localCluster.StartVTOrc(keyspaceName); err != nil { + return 1, err + } + } + + return m.Run(), nil + }() + + if err != nil { + log.Error(err.Error()) + os.Exit(1) + } else { + os.Exit(exitCode) + } +} + +// setupExtraMyCnf sets EXTRA_MY_CNF to include clone plugin configuration +func setupExtraMyCnf() error { + cloneCnfPath := path.Join(os.Getenv("VTROOT"), "config", "mycnf", "clone.cnf") + if _, err := os.Stat(cloneCnfPath); os.IsNotExist(err) { + return fmt.Errorf("clone.cnf not found at %s", cloneCnfPath) + } + + // Check if EXTRA_MY_CNF is already set + existing := os.Getenv("EXTRA_MY_CNF") + if existing != "" { + // Append clone.cnf to existing + if err := os.Setenv("EXTRA_MY_CNF", existing+":"+cloneCnfPath); err != nil { + return fmt.Errorf("failed to set EXTRA_MY_CNF: %v", err) + } + } else { + if err := os.Setenv("EXTRA_MY_CNF", cloneCnfPath); err != nil { + return fmt.Errorf("failed to set EXTRA_MY_CNF: %v", err) + } + } + + log.Infof("Set EXTRA_MY_CNF to include clone plugin: %s", os.Getenv("EXTRA_MY_CNF")) + return nil +} + +// getMySQLVersion retrieves the MySQL version from a running tablet +func getMySQLVersion(t *testing.T, tablet *cluster.Vttablet) string { + qr, err := tablet.VttabletProcess.QueryTablet("SELECT VERSION()", keyspaceName, true) + if err != nil { + t.Logf("Failed to get MySQL version: %v", err) + return "" + } + if len(qr.Rows) == 0 { + return "" + } + return qr.Rows[0][0].ToString() +} + +// mysqlVersionSupportsClone checks if the MySQL version supports CLONE plugin +func mysqlVersionSupportsClone(versionStr string) bool { + // Parse version string to extract numeric version + // Format might be: "8.0.35-27" or "8.0.35" + parts := strings.Split(versionStr, "-") + versionPart := parts[0] + + // Parse the version + flavor, version, err := mysqlctl.ParseVersionString(versionPart) + if err != nil { + return false + } + + // Clone is only supported on MySQL 8.0.17+ + if flavor != mysqlctl.FlavorMySQL && flavor != 
mysqlctl.FlavorPercona { + return false + } + if version.Major < 8 || (version.Major == 8 && version.Minor == 0 && version.Patch < 17) { + return false + } + + // Verify clone capability + cleanVersion := fmt.Sprintf("%d.%d.%d", version.Major, version.Minor, version.Patch) + capableOf := mysql.ServerVersionCapableOf(cleanVersion) + if capableOf == nil { + return false + } + hasClone, err := capableOf(capabilities.MySQLClonePluginFlavorCapability) + return err == nil && hasClone +} + +// clonePluginAvailable checks if the clone plugin is installed and active +func clonePluginAvailable(t *testing.T, tablet *cluster.Vttablet) bool { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT PLUGIN_STATUS FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME = 'clone'", + keyspaceName, true) + if err != nil { + t.Logf("Failed to check clone plugin: %v", err) + return false + } + if len(qr.Rows) == 0 { + return false + } + status := qr.Rows[0][0].ToString() + return status == "ACTIVE" +} + +// parseVersionFromRow parses MySQL version from a result row +func parseVersionFromRow(row []sqltypes.Value) (int, int, int, error) { + if len(row) == 0 { + return 0, 0, 0, errors.New("empty row") + } + + versionStr := row[0].ToString() + // Version format: "8.0.35" or "8.0.35-27" + parts := strings.Split(versionStr, "-") + versionPart := parts[0] + + versionNums := strings.Split(versionPart, ".") + if len(versionNums) < 3 { + return 0, 0, 0, fmt.Errorf("invalid version format: %s", versionStr) + } + + major, err := strconv.Atoi(versionNums[0]) + if err != nil { + return 0, 0, 0, fmt.Errorf("invalid major version: %s", versionNums[0]) + } + + minor, err := strconv.Atoi(versionNums[1]) + if err != nil { + return 0, 0, 0, fmt.Errorf("invalid minor version: %s", versionNums[1]) + } + + patch, err := strconv.Atoi(versionNums[2]) + if err != nil { + return 0, 0, 0, fmt.Errorf("invalid patch version: %s", versionNums[2]) + } + + return major, minor, patch, nil +} diff --git 
a/go/test/endtoend/backup/clone/restore_test.go b/go/test/endtoend/backup/clone/restore_test.go new file mode 100644 index 00000000000..62362b2508a --- /dev/null +++ b/go/test/endtoend/backup/clone/restore_test.go @@ -0,0 +1,209 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clone + +import ( + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" +) + +// TestCloneRestore tests clone-based replica provisioning via vttablet's +// --restore-with-clone flag. This simulates the workflow where a new replica +// is provisioned by cloning data from the primary instead of restoring from backup. +func TestCloneRestore(t *testing.T) { + // Initialize primary and replica1 first (need replica for semi-sync durability). + for _, tablet := range []*cluster.Vttablet{primary, replica1} { + err := localCluster.InitTablet(tablet, keyspaceName, shardName) + require.NoError(t, err) + err = tablet.VttabletProcess.Setup() + require.NoError(t, err) + } + + // Initialize shard primary. + err := localCluster.VtctldClientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) + require.NoError(t, err) + + // Wait for replica1 to catch up. + time.Sleep(2 * time.Second) + + // Now check if MySQL version supports clone (need vttablet running to query). 
+ mysqlVersion := getMySQLVersion(t, primary) + if !mysqlVersionSupportsClone(mysqlVersion) { + t.Skipf("Skipping clone test: MySQL version %s does not support CLONE (requires 8.0.17+)", mysqlVersion) + } + + // Check if clone plugin is available. + if !clonePluginAvailable(t, primary) { + t.Skip("Skipping clone test: clone plugin not available") + } + + // Set up clean test data (table may have data from previous tests). + _, err = primary.VttabletProcess.QueryTablet(vtInsertTest, keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("TRUNCATE TABLE vt_insert_test", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_restore_1')", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_restore_2')", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_restore_3')", keyspaceName, true) + require.NoError(t, err) + + // Verify data exists on primary. + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) + + // Bring up replica2 using clone from primary. + err = localCluster.InitTablet(replica2, keyspaceName, shardName) + require.NoError(t, err) + restoreWithClone(t, replica2, "replica", "SERVING", true) + + // Wait for MySQL to restart after clone. + time.Sleep(10 * time.Second) + + // Verify clone worked: data exists, clone_status confirms, replication is set up. + cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 3) + verifyClonedData(t, replica2) + verifyCloneWasUsed(t, replica2) + verifyReplicationTopology(t, replica2) + + // Insert rows on primary and verify they replicate to the cloned replica. 
+ for i := 1; i <= 5; i++ { + _, err = primary.VttabletProcess.QueryTablet( + fmt.Sprintf("insert into vt_insert_test (msg) values ('after_clone_%d')", i), + keyspaceName, true) + require.NoError(t, err) + } + time.Sleep(5 * time.Second) + + cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 8) + verifyPostCloneReplication(t, replica2) + + // Cleanup. + tearDownRestoreTest() +} + +// restoreWithClone starts a tablet that will use MySQL CLONE to get its data. +func restoreWithClone(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string, cloneFromPrimary bool) { + tablet.VttabletProcess.ExtraArgs = []string{ + "--db-credentials-file", dbCredentialFile, + // Enable restore with clone - this triggers the clone logic. + "--restore-with-clone", + // Clone configuration - tells vttablet to clone instead of restoring from backup. + "--clone-from-primary", + "--db-clone-user", "vt_clone", + "--db-clone-password", "", + "--db-clone-use-ssl=false", + } + tablet.VttabletProcess.TabletType = tabletType + tablet.VttabletProcess.ServingStatus = waitForState + tablet.VttabletProcess.SupportsBackup = true + + err := tablet.VttabletProcess.Setup() + require.NoError(t, err) +} + +// verifyClonedData checks that the specific test data we inserted on primary +// exists on the cloned replica. This proves data was actually transferred. +func verifyClonedData(t *testing.T, tablet *cluster.Vttablet) { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT msg FROM vt_insert_test ORDER BY id", + keyspaceName, + true, + ) + require.NoError(t, err) + require.Len(t, qr.Rows, 3, "Expected 3 rows from clone") + + expectedValues := []string{"clone_restore_1", "clone_restore_2", "clone_restore_3"} + for i, row := range qr.Rows { + assert.Equal(t, expectedValues[i], row[0].ToString()) + } +} + +// verifyReplicationTopology checks that the cloned replica has properly joined +// the replication topology and is replicating from the primary. 
+func verifyReplicationTopology(t *testing.T, tablet *cluster.Vttablet) { + qr, err := tablet.VttabletProcess.QueryTablet("SHOW REPLICA STATUS", keyspaceName, true) + require.NoError(t, err) + require.NotEmpty(t, qr.Rows, "Replica status is empty - not replicating") + + // Find column indices. + var ioRunningIdx, sqlRunningIdx = -1, -1 + for i, field := range qr.Fields { + switch field.Name { + case "Replica_IO_Running": + ioRunningIdx = i + case "Replica_SQL_Running": + sqlRunningIdx = i + } + } + + row := qr.Rows[0] + assert.Equal(t, "Yes", row[ioRunningIdx].ToString(), "Replica IO thread not running") + assert.Equal(t, "Yes", row[sqlRunningIdx].ToString(), "Replica SQL thread not running") +} + +// verifyPostCloneReplication checks that data inserted after the clone +// was properly replicated to the cloned replica. +func verifyPostCloneReplication(t *testing.T, tablet *cluster.Vttablet) { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT msg FROM vt_insert_test WHERE msg LIKE 'after_clone_%' ORDER BY id", + keyspaceName, + true, + ) + require.NoError(t, err) + require.Len(t, qr.Rows, 5, "Expected 5 post-clone rows via replication") + + for i, row := range qr.Rows { + expected := fmt.Sprintf("after_clone_%d", i+1) + assert.Equal(t, expected, row[0].ToString()) + } +} + +// verifyCloneWasUsed checks performance_schema.clone_status to verify that +// MySQL CLONE was actually used to restore the tablet. 
+func verifyCloneWasUsed(t *testing.T, tablet *cluster.Vttablet) { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT STATE, SOURCE, ERROR_NO FROM performance_schema.clone_status", + keyspaceName, + true, + ) + require.NoError(t, err) + require.NotEmpty(t, qr.Rows, "clone_status is empty - CLONE was not used") + + row := qr.Rows[0] + assert.Equal(t, "Completed", row[0].ToString(), "Clone did not complete") + assert.NotEmpty(t, row[1].ToString(), "Clone source is empty") + assert.Equal(t, "0", row[2].ToString(), "Clone had an error") +} + +// tearDownRestoreTest cleans up tablets created during the restore test. +func tearDownRestoreTest() { + for _, tablet := range []*cluster.Vttablet{primary, replica1, replica2} { + if tablet != nil && tablet.VttabletProcess != nil { + _ = tablet.VttabletProcess.TearDown() + } + if tablet != nil { + _ = localCluster.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", tablet.Alias) + } + } +} diff --git a/go/vt/mysqlctl/backup.go b/go/vt/mysqlctl/backup.go index 97af3224894..0c4abdb1a39 100644 --- a/go/vt/mysqlctl/backup.go +++ b/go/vt/mysqlctl/backup.go @@ -315,21 +315,22 @@ func removeExistingFiles(cnf *Mycnf) error { // ShouldRestore checks whether a database with tables already exists // and returns whether a restore action should be performed -func ShouldRestore(ctx context.Context, params RestoreParams) (bool, error) { - if params.DeleteBeforeRestore || RestoreWasInterrupted(params.Cnf) { +func ShouldRestore(ctx context.Context, logger logutil.Logger, cnf *Mycnf, mysqld MysqlDaemon, + dbName string, deleteBeforeRestore bool) (bool, error) { + if deleteBeforeRestore || RestoreWasInterrupted(cnf) { return true, nil } - params.Logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState) + logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState) // Wait for mysqld to be ready, in case it was launched in parallel with us. 
// If this doesn't succeed, we should not attempt a restore - if err := params.Mysqld.Wait(ctx, params.Cnf); err != nil { + if err := mysqld.Wait(ctx, cnf); err != nil { return false, err } - if err := params.Mysqld.WaitForDBAGrants(ctx, DbaGrantWaitTime); err != nil { - params.Logger.Errorf("error waiting for the grants: %v", err) + if err := mysqld.WaitForDBAGrants(ctx, DbaGrantWaitTime); err != nil { + logger.Errorf("error waiting for the grants: %v", err) return false, err } - return checkNoDB(ctx, params.Mysqld, params.DbName) + return checkNoDB(ctx, mysqld, dbName) } // ensureRestoredGTIDPurgedMatchesManifest sees the following: when you restore a full backup, you want the MySQL server to have diff --git a/go/vt/mysqlctl/backup_test.go b/go/vt/mysqlctl/backup_test.go index 380ee762d24..cbdaf047ddf 100644 --- a/go/vt/mysqlctl/backup_test.go +++ b/go/vt/mysqlctl/backup_test.go @@ -693,12 +693,14 @@ func TestParseBackupName(t *testing.T) { func TestShouldRestore(t *testing.T) { env := createFakeBackupRestoreEnv(t) - b, err := ShouldRestore(env.ctx, env.restoreParams) + b, err := ShouldRestore(env.ctx, env.restoreParams.Logger, env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.False(t, b) assert.Error(t, err) env.restoreParams.DeleteBeforeRestore = true - b, err = ShouldRestore(env.ctx, env.restoreParams) + b, err = ShouldRestore(env.ctx, env.restoreParams.Logger, env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.True(t, b) assert.NoError(t, err) env.restoreParams.DeleteBeforeRestore = false @@ -706,14 +708,16 @@ func TestShouldRestore(t *testing.T) { env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ "SHOW DATABASES": {Rows: [][]sqltypes.Value{{sqltypes.NewVarBinary("any_db")}}}, } - b, err = ShouldRestore(env.ctx, env.restoreParams) + b, err = ShouldRestore(env.ctx, env.restoreParams.Logger, 
env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.NoError(t, err) assert.True(t, b) env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ "SHOW DATABASES": {Rows: [][]sqltypes.Value{{sqltypes.NewVarBinary("test")}}}, } - b, err = ShouldRestore(env.ctx, env.restoreParams) + b, err = ShouldRestore(env.ctx, env.restoreParams.Logger, env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.False(t, b) assert.NoError(t, err) } diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 4121758f926..295af64898d 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -43,8 +43,8 @@ var ( ) func init() { - // TODO: enable these flags for vttablet and vtbackup. - for _, cmd := range []string{ /*"vttablet", "vtbackup"*/ } { + // TODO: enable these flags for vtbackup. + for _, cmd := range []string{"vttablet" /*, "vtbackup"*/} { servenv.OnParseFor(cmd, registerCloneFlags) } } diff --git a/go/vt/vttablet/tabletmanager/restore.go b/go/vt/vttablet/tabletmanager/restore.go index a648f01efe4..cb5137f2439 100644 --- a/go/vt/vttablet/tabletmanager/restore.go +++ b/go/vt/vttablet/tabletmanager/restore.go @@ -18,7 +18,6 @@ package tabletmanager import ( "context" - "errors" "fmt" "time" @@ -52,9 +51,10 @@ var ( restoreFromBackupTsStr string restoreConcurrency = 4 waitForBackupInterval time.Duration + restoreWithClone bool - statsRestoreBackupTime *stats.String - statsRestoreBackupPosition *stats.String + statsRestoreBackupTime *stats.String + statsRestoreBackup *stats.String ) func registerRestoreFlags(fs *pflag.FlagSet) { @@ -63,6 +63,7 @@ func registerRestoreFlags(fs *pflag.FlagSet) { utils.SetFlagStringVar(fs, &restoreFromBackupTsStr, "restore-from-backup-ts", restoreFromBackupTsStr, "(init restore parameter) if set, restore the latest backup taken at or before this timestamp. 
Example: '2021-04-29.133050'") utils.SetFlagIntVar(fs, &restoreConcurrency, "restore-concurrency", restoreConcurrency, "(init restore parameter) how many concurrent files to restore at once") utils.SetFlagDurationVar(fs, &waitForBackupInterval, "wait-for-backup-interval", waitForBackupInterval, "(init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear") + utils.SetFlagBoolVar(fs, &restoreWithClone, "restore-with-clone", restoreWithClone, "(init restore parameter) will restore from a clone, requires either --clone-from-primary or --clone-from-tablet, mutually exclusive with --restore-from-backup") } var ( @@ -84,14 +85,13 @@ func init() { servenv.OnParseFor("vttablet", registerIncrementalRestoreFlags) statsRestoreBackupTime = stats.NewString("RestoredBackupTime") - statsRestoreBackupPosition = stats.NewString("RestorePosition") + statsRestoreBackup = stats.NewString("RestorePosition") } -// RestoreData is the main entry point for backup restore. -// It will either work, fail gracefully, or return -// an error in case of a non-recoverable error. +// RestoreBackup is the main entry point for backup restore. It will either +// work, fail gracefully, or return an error in case of a non-recoverable error. // It takes the action lock so no RPC interferes. 
-func (tm *TabletManager) RestoreData( +func (tm *TabletManager) RestoreBackup( ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, @@ -105,9 +105,6 @@ func (tm *TabletManager) RestoreData( return err } defer tm.unlock() - if tm.Cnf == nil { - return errors.New("cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified") - } var ( err error @@ -115,31 +112,7 @@ func (tm *TabletManager) RestoreData( ) defer func() { - stopTime := time.Now() - - h := hook.NewSimpleHook("vttablet_restore_done") - h.ExtraEnv = tm.hookExtraEnv() - h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339) - h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339) - h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String() - - if err != nil { - h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error() - } - - // vttablet_restore_done is best-effort (for now?). - go func() { - // Package vthook already logs the stdout/stderr of hooks when they - // are run, so we don't duplicate that here. 
- hr := h.Execute() - switch hr.ExitStatus { - case hook.HOOK_SUCCESS: - case hook.HOOK_DOES_NOT_EXIST: - log.Info("No vttablet_restore_done hook.") - default: - log.Warning("vttablet_restore_done hook failed") - } - }() + tm.invokeRestoreDoneHook(startTime, err) }() startTime = time.Now() @@ -150,16 +123,16 @@ func (tm *TabletManager) RestoreData( RestoreToTimestamp: protoutil.TimeToProto(restoreToTimetamp), AllowedBackupEngines: allowedBackupEngines, } - err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req, mysqlShutdownTimeout) + err = tm.restoreBackupLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req, mysqlShutdownTimeout) if err != nil { return err } + return nil } -func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest, mysqlShutdownTimeout time.Duration) error { +func (tm *TabletManager) restoreBackupLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest, mysqlShutdownTimeout time.Duration) error { tablet := tm.Tablet() - originalType := tablet.Type // Try to restore. Depending on the reason for failure, we may be ok. // If we're not ok, return an error and the tm will log.Fatalf, // causing the process to be restarted and the restore retried. 
@@ -177,7 +150,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("snapshot keyspace %v has no base_keyspace set", tablet.Keyspace)) } keyspace = keyspaceInfo.BaseKeyspace - log.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, protoutil.TimeFromProto(request.BackupTime).UTC()) + logger.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, protoutil.TimeFromProto(request.BackupTime).UTC()) } startTime := protoutil.TimeFromProto(request.BackupTime).UTC() @@ -216,35 +189,27 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L // Restore to given timestamp params.RestoreToTimestamp = restoreToTimestamp } - params.Logger.Infof("Restore: original tablet type=%v", originalType) - // Check whether we're going to restore before changing to RESTORE type, - // so we keep our PrimaryTermStartTime (if any) if we aren't actually restoring. - ok, err := mysqlctl.ShouldRestore(ctx, params) - if err != nil { - return err - } - if !ok { - params.Logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.") + rsm := tm.newRestoreStateManager(logger, deleteBeforeRestore) + + if ok, err := rsm.start(ctx); !ok || err != nil { + if err != nil { + return vterrors.Wrap(err, "failed to start restore") + } + // Restore cannot be started for a benign reason, e.g. mysqld already + // has data. return nil } - // We should not become primary after restore, because that would incorrectly - // start a new primary term, and it's likely our data dir will be out of date. 
- if originalType == topodatapb.TabletType_PRIMARY { - originalType = tm.baseTabletType - } - if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE, DBActionNone); err != nil { - return err - } + // Loop until a backup exists, unless we were told to give up immediately. var backupManifest *mysqlctl.BackupManifest for { backupManifest, err = mysqlctl.Restore(ctx, params) if backupManifest != nil { - statsRestoreBackupPosition.Set(replication.EncodePosition(backupManifest.Position)) + statsRestoreBackup.Set(replication.EncodePosition(backupManifest.Position)) statsRestoreBackupTime.Set(backupManifest.BackupTime) } - params.Logger.Infof("Restore: got a restore manifest: %v, err=%v, waitForBackupInterval=%v", backupManifest, err, waitForBackupInterval) + logger.Infof("Restore: got a restore manifest: %v, err=%v, waitForBackupInterval=%v", backupManifest, err, waitForBackupInterval) if waitForBackupInterval == 0 { break } @@ -264,8 +229,10 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L var pos replication.Position if backupManifest != nil { pos = backupManifest.Position - params.Logger.Infof("Restore: pos=%v", replication.EncodePosition(pos)) + logger.Infof("Restore: pos=%v", replication.EncodePosition(pos)) } + + var replCmd replicationCommand switch { case err == nil && backupManifest != nil: // Starting from here we won't be able to recover if we get stopped by a cancelled @@ -274,53 +241,96 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L // The whole point of point-in-time recovery is that we want to restore up to a given position, // and to NOT proceed from that position. We want to disable replication and NOT let the replica catch // up with the primary. 
- params.Logger.Infof("Restore: disabling replication") - if err := tm.disableReplication(context.Background()); err != nil { - return err - } + replCmd.action = replicationActionDisable } else if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL { // Reconnect to primary only for "NORMAL" keyspaces - params.Logger.Infof("Restore: starting replication at position %v", pos) - if err := tm.startReplication(ctx, pos, originalType); err != nil { - return err - } + replCmd.action = replicationActionStart + replCmd.position = &pos } case err == mysqlctl.ErrNoBackup: // Starting with empty database. // We just need to initialize replication - _, err := tm.initializeReplication(ctx, originalType) - if err != nil { - return err - } + replCmd.action = replicationActionInitialize case err == nil && params.DryRun: // Do nothing here, let the rest of code run - params.Logger.Infof("Dry run. No changes made") + logger.Infof("Dry run. No changes made") default: - bgCtx := context.Background() - // If anything failed, we should reset the original tablet type - if err := tm.tmState.ChangeTabletType(bgCtx, originalType, DBActionNone); err != nil { - log.Errorf("Could not change back to original tablet type %v: %v", originalType, err) + if err := rsm.abort(); err != nil { + logger.Errorf("Failed to abort restore: %v", err) } - return vterrors.Wrap(err, "Can't restore backup") + return vterrors.Wrap(err, "can't restore backup") } - // If we had type BACKUP or RESTORE it's better to set our type to the init-tablet-type to make result of the restore - // similar to completely clean start from scratch. 
- if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && initTabletType != "" { - initType, err := topoproto.ParseTabletType(initTabletType) - if err == nil { - originalType = initType - } - } if params.IsIncrementalRecovery() && !params.DryRun { // override - params.Logger.Infof("Restore: will set tablet type to DRAINED as this is a point in time recovery") - originalType = topodatapb.TabletType_DRAINED + logger.Infof("Restore: will set tablet type to DRAINED as this is a point in time recovery") + rsm.setNextTabletType(topodatapb.TabletType_DRAINED) + } + + if err := rsm.finish(ctx, replCmd); err != nil { + return vterrors.Wrap(err, "failed to finish restore") } - params.Logger.Infof("Restore: changing tablet type to %v for %s", originalType, tm.tabletAlias.String()) - // Change type back to original type if we're ok to serve. - bgCtx := context.Background() - return tm.tmState.ChangeTabletType(bgCtx, originalType, DBActionNone) + + return nil +} + +func (tm *TabletManager) restoreFromClone(ctx context.Context, logger logutil.Logger, deleteBeforeRestore bool) error { + if err := tm.lock(ctx); err != nil { + return err + } + defer tm.unlock() + + var ( + err error + startTime time.Time + ) + + defer func() { + tm.invokeRestoreDoneHook(startTime, err) + }() + + startTime = time.Now() + + err = tm.restoreFromCloneLocked(ctx, logger, deleteBeforeRestore) + if err != nil { + return err + } + + return nil +} + +func (tm *TabletManager) restoreFromCloneLocked( + ctx context.Context, + logger logutil.Logger, + deleteBeforeRestore bool) error { + rsm := tm.newRestoreStateManager(logger, deleteBeforeRestore) + + if ok, err := rsm.start(ctx); !ok || err != nil { + if err != nil { + return vterrors.Wrap(err, "failed to start restore") + } + // Restore cannot be started for a benign reason, e.g. mysqld already + // has data. 
+ return nil + } + + tablet := tm.Tablet() + pos, err := mysqlctl.CloneFromDonor(ctx, tm.TopoServer, tm.MysqlDaemon, tablet.Keyspace, tablet.Shard) + if err != nil { + err = vterrors.Wrap(err, "failed to clone from donor") + if err := rsm.abort(); err != nil { + logger.Errorf("Failed to abort restore: %v", err) + } + return err + } + + statsRestoreBackup.Set(replication.EncodePosition(pos)) + + if err := rsm.finish(ctx, replicationCommand{action: replicationActionStart, position: &pos}); err != nil { + return vterrors.Wrap(err, "failed to finish restore") + } + + return nil } // disableReplication stops and resets replication on the mysql server. It moreover sets impossible replication @@ -387,3 +397,214 @@ func (tm *TabletManager) startReplication(ctx context.Context, pos replication.P return nil } + +func (tm *TabletManager) invokeRestoreDoneHook(startTime time.Time, err error) { + stopTime := time.Now() + + h := hook.NewSimpleHook("vttablet_restore_done") + h.ExtraEnv = tm.hookExtraEnv() + h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339) + h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339) + h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String() + + if err != nil { + h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error() + } + + // vttablet_restore_done is best-effort (for now?). + go func() { + // Package vthook already logs the stdout/stderr of hooks when they + // are run, so we don't duplicate that here. 
+ hr := h.Execute() + switch hr.ExitStatus { + case hook.HOOK_SUCCESS: + case hook.HOOK_DOES_NOT_EXIST: + log.Info("No vttablet_restore_done hook.") + default: + log.Warning("vttablet_restore_done hook failed") + } + }() +} + +type replicationAction int + +const ( + replicationActionNone replicationAction = iota + replicationActionDisable + replicationActionInitialize + replicationActionStart +) + +// newRestoreStateManager returns a new restoreStateManager, used to perform +// restore functionality that is common across restore methods. +func (tm *TabletManager) newRestoreStateManager(logger logutil.Logger, deleteBeforeRestore bool) *restoreStateManager { + return &restoreStateManager{ + deleteBeforeRestore: deleteBeforeRestore, + logger: logger, + tm: tm, + } +} + +// replicationCommand contains instructions for initializing, starting, or +// disabling replication. +type replicationCommand struct { + // action is the replication action to take. + action replicationAction + // position is used by the replicationActionStart action. + position *replication.Position +} + +// restoreState represents the state of a restoreStateManager. +type restoreState int + +const ( + // restoreNotStarted is the initial state of a restore. + restoreNotStarted restoreState = iota + // restoreStarted is used to indicate a restore has started. + restoreStarted + // restoreDone is used to indicate a restore is either + // finished or aborted. + restoreDone +) + +// restoreStateManager is used by restore methods (RestoreBackup, restoreFromClone) +// to perform common routines, such as transitioning the tablet type to and from +// RESTORE, and setting up replication. +type restoreStateManager struct { + deleteBeforeRestore bool + logger logutil.Logger + tm *TabletManager + + state restoreState + + prevTabletType topodatapb.TabletType + nextTabletType topodatapb.TabletType +} + +// abort reverts the tablet type to its previous state. 
+func (rt *restoreStateManager) abort() error { + if rt.state != restoreStarted { + return vterrors.New(vtrpcpb.Code_INTERNAL, "restore cannot be aborted in current state") + } + + // Transition to previous tablet type. + if err := rt.tm.tmState.ChangeTabletType(context.Background(), rt.prevTabletType, DBActionNone); err != nil { + return vterrors.Wrapf(err, "failed to change tablet type to %q", topoproto.TabletTypeLString(rt.prevTabletType)) + } + + // Mark the state as done. + rt.state = restoreDone + + return nil +} + +// finish completes the restore by reverting the tablet type to its next state +// (either previous state or a different state requested by setNextTabletType), +// and performs the supplied replication command. +func (rt *restoreStateManager) finish(ctx context.Context, replCmd replicationCommand) error { + if rt.state != restoreStarted { + return vterrors.New(vtrpcpb.Code_INTERNAL, "restore cannot be finished in current state") + } + + // Perform replication command. + switch replCmd.action { + case replicationActionDisable: + rt.logger.Infof("Restore: disabling replication") + if err := rt.tm.disableReplication(context.Background()); err != nil { + return vterrors.Wrap(err, "failed to disable replication") + } + case replicationActionInitialize: + if _, err := rt.tm.initializeReplication(ctx, rt.prevTabletType); err != nil { + return vterrors.Wrap(err, "failed to initialize replication") + } + case replicationActionStart: + if replCmd.position == nil { + return vterrors.New(vtrpcpb.Code_INTERNAL, "cannot start replication with nil position") + } + rt.logger.Infof("Restore: starting replication at position %v", *replCmd.position) + if err := rt.tm.startReplication(ctx, *replCmd.position, rt.prevTabletType); err != nil { + return vterrors.Wrapf(err, "failed to start replication with position %q", replCmd.position.String()) + } + case replicationActionNone: + fallthrough + default: + } + + // Transition to next tablet type. 
+ if err := rt.tm.tmState.ChangeTabletType(context.Background(), rt.nextTabletType, DBActionNone); err != nil { + return vterrors.Wrapf(err, "failed to change tablet type to %q", topoproto.TabletTypeLString(rt.nextTabletType)) + } + + // Mark the state as done. + rt.state = restoreDone + + return nil +} + +// start the restore by changing the tablet type to RESTORE. +// +// Returns true with nil error if the tablet type is successfully changed to +// RESTORE. +// +// Returns false with nil error if the restore cannot be started for a benign +// reason, such as data already existing. +// +// Returns false with error if the restore cannot be started at this time (e.g. +// failed to check if data already exists) or the tablet type could not be +// changed to RESTORE. +func (rt *restoreStateManager) start(ctx context.Context) (bool, error) { + if rt.state != restoreNotStarted { + return false, vterrors.New(vtrpcpb.Code_INTERNAL, "restore cannot be started in current state") + } + + if rt.tm.Cnf == nil { + return false, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified") + } + + // Check whether we're going to restore before changing to RESTORE type, + // so we keep our PrimaryTermStartTime (if any) if we aren't actually restoring. + ok, err := mysqlctl.ShouldRestore(ctx, rt.logger, rt.tm.Cnf, rt.tm.MysqlDaemon, + topoproto.TabletDbName(rt.tm.Tablet()), rt.deleteBeforeRestore) + if err != nil { + return false, vterrors.Wrap(err, "failed to check if should restore") + } + if !ok { + rt.logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.") + return false, nil + } + + // Store previous tablet type so we can revert back to it later. + prevTabletType := rt.tm.Tablet().Type + // We should not become primary after restore, because that would incorrectly + // start a new primary term, and it's likely our data dir will be out of date. 
+ if prevTabletType == topodatapb.TabletType_PRIMARY { + prevTabletType = rt.tm.baseTabletType + } + rt.prevTabletType = prevTabletType + + // Prepare next tablet type to transition to from RESTORE state. + nextTabletType := prevTabletType + // If we had type BACKUP or RESTORE it's better to set our type to the init-tablet-type to make result of the restore + // similar to completely clean start from scratch. + if (prevTabletType == topodatapb.TabletType_BACKUP || prevTabletType == topodatapb.TabletType_RESTORE) && initTabletType != "" { + initType, err := topoproto.ParseTabletType(initTabletType) + if err == nil { + nextTabletType = initType + } + } + rt.nextTabletType = nextTabletType + + // Transition to RESTORE state. + if err := rt.tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE, DBActionNone); err != nil { + return false, err + } + + // Mark the state as started. + rt.state = restoreStarted + + return true, nil +} + +func (rt *restoreStateManager) setNextTabletType(nextTabletType topodatapb.TabletType) { + rt.nextTabletType = nextTabletType +} diff --git a/go/vt/vttablet/tabletmanager/rpc_backup.go b/go/vt/vttablet/tabletmanager/rpc_backup.go index 60444a03545..9bd6469e9df 100644 --- a/go/vt/vttablet/tabletmanager/rpc_backup.go +++ b/go/vt/vttablet/tabletmanager/rpc_backup.go @@ -207,7 +207,7 @@ func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.L l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // Now we can run restore. - err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, request, mysqlShutdownTimeout) + err = tm.restoreBackupLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, request, mysqlShutdownTimeout) // Re-run health check to be sure to capture any replication delay. 
tm.QueryServiceControl.BroadcastHealth() diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index 35986a4209a..fa411a8ab96 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -881,16 +881,26 @@ func (tm *TabletManager) initTablet(ctx context.Context) error { func (tm *TabletManager) handleRestore(ctx context.Context, config *tabletenv.TabletConfig) (bool, error) { // Sanity check for inconsistent flags - if tm.Cnf == nil && restoreFromBackup { - return false, errors.New("you cannot enable --restore-from-backup without a my.cnf file") + if tm.Cnf == nil && (restoreFromBackup || restoreWithClone) { + return false, errors.New("you cannot enable --restore-from-backup or --restore-with-clone without a my.cnf file") + } + if restoreFromBackup && restoreWithClone { + return false, errors.New("--restore-from-backup and --restore-with-clone are mutually exclusive") } if restoreToTimestampStr != "" && restoreToPos != "" { return false, errors.New("--restore-to-timestamp and --restore-to-pos are mutually exclusive") } + if !restoreFromBackup || restoreWithClone { + return false, nil + } + // Restore in the background - if restoreFromBackup { - go func() { + go func() { + logger := logutil.NewConsoleLogger() + + switch { + case restoreFromBackup: // Zero date will cause us to use the latest, which is the default backupTime := time.Time{} // Or if a backup timestamp was specified then we use the last backup taken at or before that time @@ -913,23 +923,26 @@ func (tm *TabletManager) handleRestore(ctx context.Context, config *tabletenv.Ta // restoreFromBackup will just be a regular action // (same as if it was triggered remotely) - if err := tm.RestoreData(ctx, logutil.NewConsoleLogger(), waitForBackupInterval, false /* deleteBeforeRestore */, backupTime, restoreToTimestamp, restoreToPos, restoreFromBackupAllowedEngines, mysqlShutdownTimeout); err != nil { + if err := 
tm.RestoreBackup(ctx, logutil.NewConsoleLogger(), waitForBackupInterval, false /* deleteBeforeRestore */, backupTime, restoreToTimestamp, restoreToPos, restoreFromBackupAllowedEngines, mysqlShutdownTimeout); err != nil { log.Exitf("RestoreFromBackup failed: %v", err) } - - // Make sure we have the correct privileges for the DBA user before we start the state manager. - err := tm.waitForDBAGrants(config, mysqlctl.DbaGrantWaitTime) - if err != nil { - log.Exitf("Failed waiting for DBA grants: %v", err) + case restoreWithClone: + if err := tm.restoreFromClone(ctx, logger, false /*deleteBeforeRestore*/); err != nil { + log.Exitf("restoreFromClone failed: %v", err) } + } - // Open the state manager after restore is done. - tm.tmState.Open() - }() - return true, nil - } + // Make sure we have the correct privileges for the DBA user before we start the state manager. + err := tm.waitForDBAGrants(config, mysqlctl.DbaGrantWaitTime) + if err != nil { + log.Exitf("Failed waiting for DBA grants: %v", err) + } + + // Open the state manager after restore is done. + tm.tmState.Open() + }() - return false, nil + return true, nil } // waitForDBAGrants waits for DBA user to have the required privileges to function properly. 
diff --git a/go/vt/wrangler/testlib/backup_test.go b/go/vt/wrangler/testlib/backup_test.go index b540fc9f8f0..ca5b6dec40e 100644 --- a/go/vt/wrangler/testlib/backup_test.go +++ b/go/vt/wrangler/testlib/backup_test.go @@ -263,7 +263,7 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { RelayLogInfoPath: path.Join(root, "relay-log.info"), } - err = destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* backupTime */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) + err = destTablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* backupTime */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) if err != nil { return err } @@ -303,7 +303,7 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { primary.FakeMysqlDaemon.SetReplicationPositionPos = primary.FakeMysqlDaemon.GetPrimaryPositionLocked() // restore primary from latest backup - require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), + require.NoError(t, primary.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), "RestoreData failed") // tablet was created as PRIMARY, so it's baseTabletType is PRIMARY assert.Equal(t, topodatapb.TabletType_PRIMARY, primary.Tablet.Type) @@ -319,7 +319,7 @@ func 
testBackupRestore(t *testing.T, cDetails *compressionDetails) error { } // Test restore with the backup timestamp - require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, backupTime, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), + require.NoError(t, primary.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, backupTime, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), "RestoreData with backup timestamp failed") assert.Equal(t, topodatapb.TabletType_PRIMARY, primary.Tablet.Type) assert.False(t, primary.FakeMysqlDaemon.Replicating) @@ -521,7 +521,7 @@ func TestBackupRestoreLagged(t *testing.T) { errCh = make(chan error, 1) go func(ctx context.Context, tablet *FakeTablet) { - errCh <- tablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) + errCh <- tablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) }(ctx, destTablet) timer = time.NewTicker(1 * time.Second) @@ -715,7 +715,7 @@ func TestRestoreUnreachablePrimary(t *testing.T) { defer cancel() // Restore will return an error while trying to contact the primary for its position, but otherwise will succeed. // The replication won't be running however, since we can't run errant GTID detection without the primary being online. 
- err = destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) + err = destTablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) require.ErrorContains(t, err, "DeadlineExceeded") // verify the full status require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed") @@ -871,7 +871,7 @@ func TestDisableActiveReparents(t *testing.T) { RelayLogInfoPath: path.Join(root, "relay-log.info"), } - require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout)) + require.NoError(t, destTablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout)) // verify the full status require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed") assert.False(t, destTablet.FakeMysqlDaemon.Replicating) diff --git a/test/config.json b/test/config.json index ab1fa3d00ac..2c14b490276 100644 --- a/test/config.json +++ b/test/config.json @@ -91,6 +91,15 @@ "RetryMax": 1, "Tags": [] }, + "backup_clone": { + "File": "unused.go", + "Args": 
["vitess.io/vitess/go/test/endtoend/backup/clone", "-timeout", "30m"], + "Command": [], + "Manual": false, + "Shard": "21", + "RetryMax": 1, + "Tags": [] + }, "backup_pitr": { "File": "unused.go", "Args": ["vitess.io/vitess/go/test/endtoend/backup/pitr", "-timeout", "30m"], From d4f5da87f33924507b3cc010f2bfe422993cbd9e Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 26 Dec 2025 06:46:34 -0500 Subject: [PATCH 20/33] rm redundant clone capability check Signed-off-by: Max Englander --- go/test/endtoend/backup/clone/main_test.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/go/test/endtoend/backup/clone/main_test.go b/go/test/endtoend/backup/clone/main_test.go index 02ec9bc0b96..662a950fd46 100644 --- a/go/test/endtoend/backup/clone/main_test.go +++ b/go/test/endtoend/backup/clone/main_test.go @@ -225,20 +225,11 @@ func mysqlVersionSupportsClone(versionStr string) bool { versionPart := parts[0] // Parse the version - flavor, version, err := mysqlctl.ParseVersionString(versionPart) + _, version, err := mysqlctl.ParseVersionString(versionPart) if err != nil { return false } - // Clone is only supported on MySQL 8.0.17+ - if flavor != mysqlctl.FlavorMySQL && flavor != mysqlctl.FlavorPercona { - return false - } - if version.Major < 8 || (version.Major == 8 && version.Minor == 0 && version.Patch < 17) { - return false - } - - // Verify clone capability cleanVersion := fmt.Sprintf("%d.%d.%d", version.Major, version.Minor, version.Patch) capableOf := mysql.ServerVersionCapableOf(cleanVersion) if capableOf == nil { From 48d394019153766e19ac6909ca513cf91ea99866 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 26 Dec 2025 07:23:13 -0500 Subject: [PATCH 21/33] fix bug, tests Signed-off-by: Max Englander --- go/test/endtoend/backup/clone/main_test.go | 77 +++++++------------ go/test/endtoend/backup/clone/restore_test.go | 28 ++++--- go/vt/vttablet/tabletmanager/tm_init.go | 2 +- 3 files changed, 45 insertions(+), 62 deletions(-) diff 
--git a/go/test/endtoend/backup/clone/main_test.go b/go/test/endtoend/backup/clone/main_test.go index 662a950fd46..994c2436d6b 100644 --- a/go/test/endtoend/backup/clone/main_test.go +++ b/go/test/endtoend/backup/clone/main_test.go @@ -27,30 +27,29 @@ import ( "strings" "testing" - "vitess.io/vitess/go/mysql" + "github.com/stretchr/testify/require" "vitess.io/vitess/go/mysql/capabilities" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/utils" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/mysqlctl" vtutils "vitess.io/vitess/go/vt/utils" ) var ( - primary *cluster.Vttablet - replica1 *cluster.Vttablet - replica2 *cluster.Vttablet - localCluster *cluster.LocalProcessCluster - newInitDBFile string - cell = cluster.DefaultCell - hostname = "localhost" - keyspaceName = "ks" - shardName = "0" - dbPassword = "VtDbaPass" - shardKsName = fmt.Sprintf("%s/%s", keyspaceName, shardName) - dbCredentialFile string - commonTabletArg = []string{ + primary *cluster.Vttablet + replica1 *cluster.Vttablet + replica2 *cluster.Vttablet + localCluster *cluster.LocalProcessCluster + newInitDBFile string + cell = cluster.DefaultCell + hostname = "localhost" + keyspaceName = "ks" + shardName = "0" + dbPassword = "VtDbaPass" + shardKsName = fmt.Sprintf("%s/%s", keyspaceName, shardName) + dbCredentialFile string + vttabletExtraArgs = []string{ vtutils.GetFlagVariantForTests("--vreplication-retry-delay"), "1s", vtutils.GetFlagVariantForTests("--degraded-threshold"), "5s", vtutils.GetFlagVariantForTests("--lock-tables-timeout"), "5s", @@ -124,8 +123,10 @@ func TestMain(m *testing.M) { return 1, err } - extraArgs := []string{"--db-credentials-file", dbCredentialFile} - commonTabletArg = append(commonTabletArg, "--db-credentials-file", dbCredentialFile) + mysqlctlExtraArgs := []string{"--db-credentials-file", dbCredentialFile} + vttabletExtraArgs = append(vttabletExtraArgs, + "--db-credentials-file", dbCredentialFile, + 
"--mysql-clone-enabled") primary = localCluster.NewVttabletInstance("replica", 0, "") replica1 = localCluster.NewVttabletInstance("replica", 0, "") @@ -137,7 +138,7 @@ func TestMain(m *testing.M) { for _, tablet := range shard.Vttablets { tablet.VttabletProcess = localCluster.VtprocessInstanceFromVttablet(tablet, shard.Name, keyspaceName) tablet.VttabletProcess.DbPassword = dbPassword - tablet.VttabletProcess.ExtraArgs = commonTabletArg + tablet.VttabletProcess.ExtraArgs = vttabletExtraArgs tablet.VttabletProcess.SupportsBackup = true mysqlctlProcess, err := cluster.MysqlCtlProcessInstance(tablet.TabletUID, tablet.MySQLPort, localCluster.TmpDirectory) @@ -146,7 +147,7 @@ func TestMain(m *testing.M) { } tablet.MysqlctlProcess = *mysqlctlProcess tablet.MysqlctlProcess.InitDBFile = newInitDBFile - tablet.MysqlctlProcess.ExtraArgs = extraArgs + tablet.MysqlctlProcess.ExtraArgs = mysqlctlExtraArgs proc, err := tablet.MysqlctlProcess.StartProcess() if err != nil { return 1, err @@ -204,39 +205,13 @@ func setupExtraMyCnf() error { return nil } -// getMySQLVersion retrieves the MySQL version from a running tablet -func getMySQLVersion(t *testing.T, tablet *cluster.Vttablet) string { - qr, err := tablet.VttabletProcess.QueryTablet("SELECT VERSION()", keyspaceName, true) - if err != nil { - t.Logf("Failed to get MySQL version: %v", err) - return "" - } - if len(qr.Rows) == 0 { - return "" - } - return qr.Rows[0][0].ToString() -} - // mysqlVersionSupportsClone checks if the MySQL version supports CLONE plugin -func mysqlVersionSupportsClone(versionStr string) bool { - // Parse version string to extract numeric version - // Format might be: "8.0.35-27" or "8.0.35" - parts := strings.Split(versionStr, "-") - versionPart := parts[0] - - // Parse the version - _, version, err := mysqlctl.ParseVersionString(versionPart) - if err != nil { - return false - } - - cleanVersion := fmt.Sprintf("%d.%d.%d", version.Major, version.Minor, version.Patch) - capableOf := 
mysql.ServerVersionCapableOf(cleanVersion) - if capableOf == nil { - return false - } - hasClone, err := capableOf(capabilities.MySQLClonePluginFlavorCapability) - return err == nil && hasClone +func mysqlVersionSupportsClone(t *testing.T, tablet *cluster.Vttablet) bool { + conn, err := tablet.VttabletProcess.TabletConn(keyspaceName, false) + require.NoError(t, err, "failed to get tablet connection") + ok, err := conn.SupportsCapability(capabilities.MySQLClonePluginFlavorCapability) + require.NoError(t, err, "failed to check clone capability") + return ok } // clonePluginAvailable checks if the clone plugin is installed and active diff --git a/go/test/endtoend/backup/clone/restore_test.go b/go/test/endtoend/backup/clone/restore_test.go index 62362b2508a..e599ad900c5 100644 --- a/go/test/endtoend/backup/clone/restore_test.go +++ b/go/test/endtoend/backup/clone/restore_test.go @@ -47,9 +47,8 @@ func TestCloneRestore(t *testing.T) { time.Sleep(2 * time.Second) // Now check if MySQL version supports clone (need vttablet running to query). - mysqlVersion := getMySQLVersion(t, primary) - if !mysqlVersionSupportsClone(mysqlVersion) { - t.Skipf("Skipping clone test: MySQL version %s does not support CLONE (requires 8.0.17+)", mysqlVersion) + if !mysqlVersionSupportsClone(t, primary) { + t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") } // Check if clone plugin is available. @@ -75,13 +74,20 @@ func TestCloneRestore(t *testing.T) { // Bring up replica2 using clone from primary. err = localCluster.InitTablet(replica2, keyspaceName, shardName) require.NoError(t, err) - restoreWithClone(t, replica2, "replica", "SERVING", true) - - // Wait for MySQL to restart after clone. - time.Sleep(10 * time.Second) + restoreWithClone(t, replica2, "replica", "SERVING") + + // Wait for data to exist. 
+ require.Eventually(t, func() bool { + qr, _ := replica2.VttabletProcess.QueryTablet("select * from vt_insert_test", keyspaceName, true) + if qr != nil { + if len(qr.Rows) == 3 { + return true + } + } + return false + }, 5*time.Minute, 10*time.Second) - // Verify clone worked: data exists, clone_status confirms, replication is set up. - cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 3) + // Verify clone worked: clone_status confirms, replication is set up. verifyClonedData(t, replica2) verifyCloneWasUsed(t, replica2) verifyReplicationTopology(t, replica2) @@ -103,10 +109,12 @@ func TestCloneRestore(t *testing.T) { } // restoreWithClone starts a tablet that will use MySQL CLONE to get its data. -func restoreWithClone(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string, cloneFromPrimary bool) { +func restoreWithClone(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string) { tablet.VttabletProcess.ExtraArgs = []string{ "--db-credentials-file", dbCredentialFile, + "--mysql-clone-enabled", // Enable restore with clone - this triggers the clone logic. + "--restore-from-backup=false", "--restore-with-clone", // Clone configuration - tells vttablet to clone instead of restoring from backup. 
"--clone-from-primary", diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index fa411a8ab96..9e628c510bc 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -891,7 +891,7 @@ func (tm *TabletManager) handleRestore(ctx context.Context, config *tabletenv.Ta return false, errors.New("--restore-to-timestamp and --restore-to-pos are mutually exclusive") } - if !restoreFromBackup || restoreWithClone { + if !restoreFromBackup && !restoreWithClone { return false, nil } From b44ad0f7945edbde67c0d76ed61c014ceb25a4cf Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 26 Dec 2025 08:39:19 -0500 Subject: [PATCH 22/33] rm redundant clone sql init Signed-off-by: Max Englander --- go/vt/mysqlctl/mysqld.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index a54e7e0013e..37dde908366 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -830,12 +830,6 @@ func (mysqld *Mysqld) Init(ctx context.Context, cnf *Mycnf, initDBSQLFile string if err := mysqld.executeMysqlScript(ctx, params, string(script)); err != nil { return fmt.Errorf("can't run init-db-sql-file (%v): %v", initDBSQLFile, err) } - // Execute clone-specific init SQL if enabled - if mysqlCloneEnabled { - if err := mysqld.executeMysqlScript(ctx, params, config.InitClone); err != nil { - return fmt.Errorf("failed to initialize clone support: %v", err) - } - } return nil } From a50414a12973edffdaf00720a92cefa2bac81eb2 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 26 Dec 2025 09:49:46 -0500 Subject: [PATCH 23/33] fix imports Signed-off-by: Max Englander --- go/test/endtoend/backup/clone/main_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/test/endtoend/backup/clone/main_test.go b/go/test/endtoend/backup/clone/main_test.go index 994c2436d6b..31f60c51fea 100644 --- a/go/test/endtoend/backup/clone/main_test.go +++ 
b/go/test/endtoend/backup/clone/main_test.go @@ -28,6 +28,7 @@ import ( "testing" "github.com/stretchr/testify/require" + "vitess.io/vitess/go/mysql/capabilities" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/test/endtoend/cluster" From dbbf495992c520f513c53966728cf332f7cf13dd Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 26 Dec 2025 10:43:45 -0500 Subject: [PATCH 24/33] fix tests Signed-off-by: Max Englander --- go/test/endtoend/backup/clone/restore_test.go | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/go/test/endtoend/backup/clone/restore_test.go b/go/test/endtoend/backup/clone/restore_test.go index e599ad900c5..bfd26afaf53 100644 --- a/go/test/endtoend/backup/clone/restore_test.go +++ b/go/test/endtoend/backup/clone/restore_test.go @@ -18,6 +18,7 @@ package clone import ( "fmt" + "os" "testing" "time" @@ -31,6 +32,9 @@ import ( // --restore-with-clone flag. This simulates the workflow where a new replica // is provisioned by cloning data from the primary instead of restoring from backup. func TestCloneRestore(t *testing.T) { + t.Cleanup(func() { removeBackups(t) }) + t.Cleanup(tearDownRestoreTest) + // Initialize primary and replica1 first (need replica for semi-sync durability). for _, tablet := range []*cluster.Vttablet{primary, replica1} { err := localCluster.InitTablet(tablet, keyspaceName, shardName) @@ -71,6 +75,17 @@ func TestCloneRestore(t *testing.T) { // Verify data exists on primary. cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) + // Clean up replica2's MySQL data from previous test so clone can work. + // Stop MySQL, remove data directory, and restart with fresh data. 
+ err = replica2.MysqlctlProcess.Stop() + require.NoError(t, err) + err = os.RemoveAll(replica2.VttabletProcess.Directory) + require.NoError(t, err) + proc, err := replica2.MysqlctlProcess.StartProcess() + require.NoError(t, err) + err = proc.Wait() + require.NoError(t, err) + // Bring up replica2 using clone from primary. err = localCluster.InitTablet(replica2, keyspaceName, shardName) require.NoError(t, err) @@ -103,9 +118,6 @@ func TestCloneRestore(t *testing.T) { cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 8) verifyPostCloneReplication(t, replica2) - - // Cleanup. - tearDownRestoreTest() } // restoreWithClone starts a tablet that will use MySQL CLONE to get its data. From cf1a804e33c4ed2e5fc0fff60e470619ea311d04 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 26 Dec 2025 10:48:03 -0500 Subject: [PATCH 25/33] fix tests Signed-off-by: Max Englander --- go/test/endtoend/backup/clone/main_test.go | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/go/test/endtoend/backup/clone/main_test.go b/go/test/endtoend/backup/clone/main_test.go index 31f60c51fea..ac96ac42fde 100644 --- a/go/test/endtoend/backup/clone/main_test.go +++ b/go/test/endtoend/backup/clone/main_test.go @@ -264,3 +264,33 @@ func parseVersionFromRow(row []sqltypes.Value) (int, int, int, error) { return major, minor, patch, nil } + +// removeBackups removes all backups for the test shard. +func removeBackups(t *testing.T) { + backups, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("GetBackups", shardKsName) + require.NoError(t, err) + for _, backup := range splitLines(backups) { + if backup != "" { + _, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("RemoveBackup", shardKsName, backup) + require.NoError(t, err) + } + } +} + +// splitLines splits a string by newlines, filtering out empty lines. 
+func splitLines(s string) []string { + var result []string + start := 0 + for i, c := range s { + if c == '\n' { + if i > start { + result = append(result, s[start:i]) + } + start = i + 1 + } + } + if start < len(s) { + result = append(result, s[start:]) + } + return result +} From 3c58666901e50107bc718c4a3ae369e29d83c9a4 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Tue, 13 Jan 2026 21:09:20 -0500 Subject: [PATCH 26/33] go/cmd/vtbackup: add `--restore-from-clone` support to `vtbackup` (#19089) Signed-off-by: Max Englander --- go/cmd/vtbackup/cli/vtbackup.go | 81 +++++---- go/flags/endtoend/vtbackup.txt | 6 +- go/test/endtoend/backup/clone/backup_test.go | 164 +++++++++++++++++++ go/vt/mysqlctl/clone.go | 3 +- go/vt/mysqlctl/mysqld.go | 6 + 5 files changed, 221 insertions(+), 39 deletions(-) create mode 100644 go/test/endtoend/backup/clone/backup_test.go diff --git a/go/cmd/vtbackup/cli/vtbackup.go b/go/cmd/vtbackup/cli/vtbackup.go index a8bbadd87ba..f661d9aac2d 100644 --- a/go/cmd/vtbackup/cli/vtbackup.go +++ b/go/cmd/vtbackup/cli/vtbackup.go @@ -93,6 +93,7 @@ var ( initShard string concurrency = 4 incrementalFromPos string + restoreWithClone bool // mysqlctld-like flags mysqlPort = 3306 @@ -157,7 +158,7 @@ When run periodically for each shard, vtbackup can ensure these configurable pol * Old backups for the shard are removed. Whatever system launches vtbackup is responsible for the following: - - Running vtbackup with similar flags that would be used for a vttablet and + - Running vtbackup with similar flags that would be used for a vttablet and mysqlctld in the target shard to be backed up. - Provisioning as much disk space for vtbackup as would be given to vttablet. 
@@ -226,6 +227,7 @@ func init() { utils.SetFlagStringVar(Main.Flags(), &initShard, "init-shard", initShard, "(init parameter) shard to use for this tablet") Main.Flags().IntVar(&concurrency, "concurrency", concurrency, "(init restore parameter) how many concurrent files to restore at once") utils.SetFlagStringVar(Main.Flags(), &incrementalFromPos, "incremental-from-pos", incrementalFromPos, "Position, or name of backup from which to create an incremental backup. Default: empty. If given, then this backup becomes an incremental backup from given position or given backup. If value is 'auto', this backup will be taken from the last successful backup position.") + Main.Flags().BoolVar(&restoreWithClone, "restore-with-clone", restoreWithClone, "(init parameter) will perform the restore phase with MySQL CLONE, requires either --clone-from-primary or --clone-from-tablet") // mysqlctld-like flags utils.SetFlagIntVar(Main.Flags(), &mysqlPort, "mysql-port", mysqlPort, "MySQL port") @@ -457,42 +459,49 @@ func takeBackup(ctx, backgroundCtx context.Context, topoServer *topo.Server, bac return nil } - phase.Set(phaseNameRestoreLastBackup, int64(1)) - defer phase.Set(phaseNameRestoreLastBackup, int64(0)) - backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) - log.Infof("Restoring latest backup from directory %v", backupDir) - restoreAt := time.Now() - params := mysqlctl.RestoreParams{ - Cnf: mycnf, - Mysqld: mysqld, - Logger: logutil.NewConsoleLogger(), - Concurrency: concurrency, - HookExtraEnv: extraEnv, - DeleteBeforeRestore: true, - DbName: dbName, - Keyspace: initKeyspace, - Shard: initShard, - Stats: backupstats.RestoreStats(), - MysqlShutdownTimeout: mysqlShutdownTimeout, - } - backupManifest, err := mysqlctl.Restore(ctx, params) var restorePos replication.Position - switch err { - case nil: - // if err is nil, we expect backupManifest to be non-nil - restorePos = backupManifest.Position - log.Infof("Successfully restored from backup at replication position %v", 
restorePos) - case mysqlctl.ErrNoBackup: - // There is no backup found, but we may be taking the initial backup of a shard - if !allowFirstBackup { - return errors.New("no backup found; not starting up empty since --initial_backup flag was not enabled") - } - restorePos = replication.Position{} - default: - return fmt.Errorf("can't restore from backup: %v", err) - } - deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) - phase.Set(phaseNameRestoreLastBackup, int64(0)) + if restoreWithClone { + restorePos, err = mysqlctl.CloneFromDonor(ctx, topoServer, mysqld, initKeyspace, initShard) + if err != nil { + return fmt.Errorf("restore with clone failed: %v", err) + } + } else { + phase.Set(phaseNameRestoreLastBackup, int64(1)) + defer phase.Set(phaseNameRestoreLastBackup, int64(0)) + backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) + log.Infof("Restoring latest backup from directory %v", backupDir) + restoreAt := time.Now() + params := mysqlctl.RestoreParams{ + Cnf: mycnf, + Mysqld: mysqld, + Logger: logutil.NewConsoleLogger(), + Concurrency: concurrency, + HookExtraEnv: extraEnv, + DeleteBeforeRestore: true, + DbName: dbName, + Keyspace: initKeyspace, + Shard: initShard, + Stats: backupstats.RestoreStats(), + MysqlShutdownTimeout: mysqlShutdownTimeout, + } + backupManifest, err := mysqlctl.Restore(ctx, params) + switch err { + case nil: + // if err is nil, we expect backupManifest to be non-nil + restorePos = backupManifest.Position + log.Infof("Successfully restored from backup at replication position %v", restorePos) + case mysqlctl.ErrNoBackup: + // There is no backup found, but we may be taking the initial backup of a shard + if !allowFirstBackup { + return errors.New("no backup found; not starting up empty since --initial_backup flag was not enabled") + } + restorePos = replication.Position{} + default: + return fmt.Errorf("can't restore from backup: %v", err) + } + 
deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) + phase.Set(phaseNameRestoreLastBackup, int64(0)) + } // As of MySQL 8.0.21, you can disable redo logging using the ALTER INSTANCE // DISABLE INNODB REDO_LOG statement. This functionality is intended for diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 87bb94fd43c..38f9f72eddc 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -6,7 +6,7 @@ When run periodically for each shard, vtbackup can ensure these configurable pol * Old backups for the shard are removed. Whatever system launches vtbackup is responsible for the following: - - Running vtbackup with similar flags that would be used for a vttablet and + - Running vtbackup with similar flags that would be used for a vttablet and mysqlctld in the target shard to be backed up. - Provisioning as much disk space for vtbackup as would be given to vttablet. @@ -68,6 +68,8 @@ Flags: --builtinbackup-mysqld-timeout duration how long to wait for mysqld to shutdown at the start of the backup. (default 10m0s) --builtinbackup-progress duration how often to send progress updates when backing up large files. (default 5s) --ceph-backup-storage-config string Path to JSON config file for ceph backup storage. (default "ceph_backup_config.json") + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. 
(default 1) --concurrency int (init restore parameter) how many concurrent files to restore at once (default 4) @@ -189,6 +191,7 @@ Flags: --mycnf-slow-log-path string mysql slow query log path --mycnf-socket-file string mysql socket file --mycnf-tmp-dir string mysql tmp directory + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-shell-backup-location string location where the backup will be stored @@ -207,6 +210,7 @@ Flags: --purge-logs-interval duration how often try to remove old logs (default 1h0m0s) --remote-operation-timeout duration time to wait for a remote operation (default 15s) --restart-before-backup Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs. + --restore-with-clone (init parameter) will perform the restore phase with MySQL CLONE, requires either --clone-from-primary or --clone-from-tablet --s3-backup-aws-endpoint string endpoint of the S3 backend (region must be provided). --s3-backup-aws-min-partsize int Minimum part size to use, defaults to 5MiB but can be increased due to the dataset size. (default 5242880) --s3-backup-aws-region string AWS region to use. (default "us-east-1") diff --git a/go/test/endtoend/backup/clone/backup_test.go b/go/test/endtoend/backup/clone/backup_test.go new file mode 100644 index 00000000000..33eb01be722 --- /dev/null +++ b/go/test/endtoend/backup/clone/backup_test.go @@ -0,0 +1,164 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clone + +import ( + "os" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/vt/log" + vtutils "vitess.io/vitess/go/vt/utils" +) + +func TestCloneBackup(t *testing.T) { + t.Cleanup(func() { removeBackups(t) }) + t.Cleanup(tearDown) + + // Initialize tablets first so we can connect to MySQL. + for _, tablet := range []*cluster.Vttablet{primary, replica1} { + err := localCluster.InitTablet(tablet, keyspaceName, shardName) + require.NoError(t, err) + err = tablet.VttabletProcess.Setup() + require.NoError(t, err) + } + + // Initialize shard primary. + err := localCluster.VtctldClientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) + require.NoError(t, err) + + // Now check if MySQL version supports clone (need vttablet running to query). + if !mysqlVersionSupportsClone(t, primary) { + t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + } + + // Check if clone plugin is available. + if !clonePluginAvailable(t, primary) { + t.Skip("Skipping clone test: clone plugin not available") + } + + // Set up clean test data (table may have data from previous tests). 
+ _, err = primary.VttabletProcess.QueryTablet(vtInsertTest, keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("TRUNCATE TABLE vt_insert_test", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_test_1')", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_test_2')", keyspaceName, true) + require.NoError(t, err) + + // Verify data exists on primary. + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 2) + + // Wait for replica to catch up. + time.Sleep(2 * time.Second) + cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 2) + + // Take a backup using clone from primary. + log.Infof("Starting vtbackup with --clone-from-primary") + err = vtbackupWithClone(t) + require.NoError(t, err) + + // Verify a backup was created. + backups := verifyBackupCount(t, shardKsName, 1) + assert.NotEmpty(t, backups) + + // Insert more data AFTER the backup was taken. + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('after_backup')", keyspaceName, true) + require.NoError(t, err) + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) + + // Now bring up replica2 and restore from the backup we just created. + // This verifies the clone-based backup actually contains the data. + log.Infof("Restoring replica2 from backup to verify clone worked") + err = localCluster.InitTablet(replica2, keyspaceName, shardName) + require.NoError(t, err) + restore(t, replica2, "replica", "SERVING") + + // Give replica2 time to catch up via replication. + time.Sleep(5 * time.Second) + + // Verify replica2 has ALL the data (2 rows from before backup + 1 from after). + // The 2 pre-backup rows prove the clone-based backup worked. + // The 3rd row proves replication is working after restore. 
+ cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 3) + log.Infof("Clone backup verification successful: replica2 has all data") +} + +func vtbackupWithClone(t *testing.T) error { + mysqlSocket, err := os.CreateTemp("", "vtbackup_clone_test_mysql.sock") + require.NoError(t, err) + defer os.Remove(mysqlSocket.Name()) + + extraArgs := []string{ + "--allow_first_backup", + "--db-credentials-file", dbCredentialFile, + "--mysql-clone-enabled", + vtutils.GetFlagVariantForTests("--mysql-socket"), mysqlSocket.Name(), + // Clone from primary instead of restoring from backup. + "--restore-with-clone", + "--clone-from-primary", + // Clone credentials - use vt_clone user which is created with @'%' host + // and BACKUP_ADMIN privilege in init_db.sql (no password). + "--db-clone-user", "vt_clone", + "--db-clone-password", "", + "--db-clone-use-ssl=false", + } + + log.Infof("Starting vtbackup with clone args: %v", extraArgs) + return localCluster.StartVtbackup(newInitDBFile, false, keyspaceName, shardName, cell, extraArgs...) +} + +func verifyBackupCount(t *testing.T, shardKsName string, expected int) []string { + backups, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("GetBackups", shardKsName) + require.NoError(t, err) + + var result []string + for _, line := range splitLines(backups) { + if line != "" { + result = append(result, line) + } + } + assert.Equalf(t, expected, len(result), "expected %d backups, got %d", expected, len(result)) + return result +} + +func restore(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string) { + // Start tablet with restore enabled. MySQL is already running from TestMain. 
+ log.Infof("restoring tablet %s", time.Now()) + tablet.VttabletProcess.ExtraArgs = []string{"--db-credentials-file", dbCredentialFile} + tablet.VttabletProcess.TabletType = tabletType + tablet.VttabletProcess.ServingStatus = waitForState + tablet.VttabletProcess.SupportsBackup = true + err := tablet.VttabletProcess.Setup() + require.NoError(t, err) +} + +func tearDown() { + for _, tablet := range []*cluster.Vttablet{primary, replica1, replica2} { + if tablet != nil && tablet.VttabletProcess != nil { + _ = tablet.VttabletProcess.TearDown() + } + if tablet != nil { + _ = localCluster.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", tablet.Alias) + } + } +} diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 295af64898d..5b4e0252528 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -43,8 +43,7 @@ var ( ) func init() { - // TODO: enable these flags for vtbackup. - for _, cmd := range []string{"vttablet" /*, "vtbackup"*/} { + for _, cmd := range []string{"vttablet", "vtbackup"} { servenv.OnParseFor(cmd, registerCloneFlags) } } diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index 37dde908366..c032dff2db8 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -138,6 +138,9 @@ func init() { for _, cmd := range []string{"mysqlctl", "mysqlctld", "vtcombo", "vttablet", "vttestserver"} { servenv.OnParseFor(cmd, registerPoolFlags) } + for _, cmd := range []string{"mysqlctl", "mysqlctld", "vtcombo", "vttablet", "vttestserver", "vtbackup"} { + servenv.OnParseFor(cmd, registerMySQLDCloneFlags) + } } func registerMySQLDFlags(fs *pflag.FlagSet) { @@ -145,6 +148,9 @@ func registerMySQLDFlags(fs *pflag.FlagSet) { utils.SetFlagStringVar(fs, &mycnfTemplateFile, "mysqlctl-mycnf-template", mycnfTemplateFile, "template file to use for generating the my.cnf file during server init") utils.SetFlagStringVar(fs, &socketFile, "mysqlctl-socket", socketFile, "socket file to use for remote mysqlctl 
actions (empty for local actions)") utils.SetFlagDurationVar(fs, &replicationConnectRetry, "replication-connect-retry", replicationConnectRetry, "how long to wait in between replica reconnect attempts. Only precise to the second.") +} + +func registerMySQLDCloneFlags(fs *pflag.FlagSet) { utils.SetFlagBoolVar(fs, &mysqlCloneEnabled, "mysql-clone-enabled", mysqlCloneEnabled, "Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+)") } From 05bb719ecdaeccaa0081feaa4d9e06d109845b90 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Wed, 14 Jan 2026 00:32:53 -0500 Subject: [PATCH 27/33] update flags Signed-off-by: Max Englander --- go/flags/endtoend/mysqlctl.txt | 1 - go/flags/endtoend/mysqlctld.txt | 1 - go/flags/endtoend/vtbackup.txt | 1 + go/flags/endtoend/vtcombo.txt | 1 - go/flags/endtoend/vttablet.txt | 1 + go/flags/endtoend/vttestserver.txt | 1 - 6 files changed, 2 insertions(+), 4 deletions(-) diff --git a/go/flags/endtoend/mysqlctl.txt b/go/flags/endtoend/mysqlctl.txt index e252218fd95..36c87e9d6ad 100644 --- a/go/flags/endtoend/mysqlctl.txt +++ b/go/flags/endtoend/mysqlctl.txt @@ -70,7 +70,6 @@ Flags: --log_dir string If non-empty, write log files in this directory --logtostderr log to standard error instead of files --max-stack-size int configure the maximum stack size in bytes (default 67108864) - --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port. (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-socket string Path to the mysqld socket file. 
diff --git a/go/flags/endtoend/mysqlctld.txt b/go/flags/endtoend/mysqlctld.txt index d5c977ce411..e728e1561d2 100644 --- a/go/flags/endtoend/mysqlctld.txt +++ b/go/flags/endtoend/mysqlctld.txt @@ -99,7 +99,6 @@ Flags: --log_dir string If non-empty, write log files in this directory --logtostderr log to standard error instead of files --max-stack-size int configure the maximum stack size in bytes (default 67108864) - --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-socket string Path to the mysqld socket file diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 38f9f72eddc..ed50ef78a65 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -70,6 +70,7 @@ Flags: --ceph-backup-storage-config string Path to JSON config file for ceph backup storage. (default "ceph_backup_config.json") --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. 
(default 1) --concurrency int (init restore parameter) how many concurrent files to restore at once (default 4) diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index e2e6d2d0900..70fb7f1d710 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -223,7 +223,6 @@ Flags: --mycnf-tmp-dir string mysql tmp directory --mysql-allow-clear-text-without-tls If set, the server will allow the use of a clear text password over non-SSL connections. --mysql-auth-server-impl string Which auth server implementation to use. Options: none, ldap, clientcert, static, vault. (default "static") - --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-default-workload string Default session workload (OLTP, OLAP, DBA) (default "OLTP") --mysql-port int mysql port (default 3306) --mysql-server-bind-address string Binds on this address when listening to MySQL binary protocol. Useful to restrict listening to 'localhost' only for instance. diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 068f1e029c9..e8e7c9eacf2 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -72,6 +72,7 @@ Flags: --ceph-backup-storage-config string Path to JSON config file for ceph backup storage. (default "ceph_backup_config.json") --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. 
(default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. diff --git a/go/flags/endtoend/vttestserver.txt b/go/flags/endtoend/vttestserver.txt index d41a6b2c2bc..ad05efc4b19 100644 --- a/go/flags/endtoend/vttestserver.txt +++ b/go/flags/endtoend/vttestserver.txt @@ -91,7 +91,6 @@ Flags: --max-table-shard-size int The maximum number of initial rows in a table shard. Ignored if--initialize-with-random-data is false. The actual number is chosen randomly (default 10000) --min-table-shard-size int The minimum number of initial rows in a table shard. Ignored if--initialize-with-random-data is false. The actual number is chosen randomly. (default 1000) --mysql-bind-host string which host to bind vtgate mysql listener to (default "localhost") - --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-only If this flag is set only mysql is initialized. The rest of the vitess components are not started. Also, the output specifies the mysql unix socket instead of the vtgate port. --mysql-server-version string MySQL server version to advertise. 
(default "8.4.6-Vitess") --mysql-shell-backup-location string location where the backup will be stored From 3f157ae849f76e90bcb305e4f43af6935476a64d Mon Sep 17 00:00:00 2001 From: Max Englander Date: Wed, 14 Jan 2026 11:02:46 -0500 Subject: [PATCH 28/33] tidy Signed-off-by: Max Englander --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 7f549194a8e..0cee85073b1 100644 --- a/go.mod +++ b/go.mod @@ -102,7 +102,6 @@ require ( github.com/kr/pretty v0.3.1 github.com/kr/text v0.2.0 github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 - github.com/olekukonko/errors v1.1.0 github.com/shirou/gopsutil/v4 v4.25.8 github.com/spf13/afero v1.15.0 github.com/spf13/jwalterweatherman v1.1.0 @@ -140,6 +139,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6 // indirect + github.com/olekukonko/errors v1.1.0 // indirect github.com/olekukonko/ll v0.1.1 // indirect github.com/opencontainers/runtime-spec v1.2.1 // indirect github.com/puzpuzpuz/xsync/v3 v3.5.1 // indirect From 4d539b752f4ef9fba339dd91c1904b7e47918226 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Wed, 14 Jan 2026 14:07:02 -0500 Subject: [PATCH 29/33] flags Signed-off-by: Max Englander --- go/flags/endtoend/vtcombo.txt | 4 ++++ go/flags/endtoend/vttestserver.txt | 4 ++++ go/vt/mysqlctl/clone.go | 2 +- go/vt/mysqlctl/mysqld.go | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index 70fb7f1d710..823ff74c535 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -36,6 +36,9 @@ Flags: --builtinbackup-progress duration how often to send progress updates when backing up large files. 
(default 5s) --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --cell string cell to use + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. @@ -223,6 +226,7 @@ Flags: --mycnf-tmp-dir string mysql tmp directory --mysql-allow-clear-text-without-tls If set, the server will allow the use of a clear text password over non-SSL connections. --mysql-auth-server-impl string Which auth server implementation to use. Options: none, ldap, clientcert, static, vault. (default "static") + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-default-workload string Default session workload (OLTP, OLAP, DBA) (default "OLTP") --mysql-port int mysql port (default 3306) --mysql-server-bind-address string Binds on this address when listening to MySQL binary protocol. Useful to restrict listening to 'localhost' only for instance. 
diff --git a/go/flags/endtoend/vttestserver.txt b/go/flags/endtoend/vttestserver.txt index ad05efc4b19..f9f6750d71a 100644 --- a/go/flags/endtoend/vttestserver.txt +++ b/go/flags/endtoend/vttestserver.txt @@ -19,6 +19,9 @@ Flags: --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --cells strings Comma separated list of cells (default [test]) --charset string MySQL charset (default "utf8mb4") + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. @@ -91,6 +94,7 @@ Flags: --max-table-shard-size int The maximum number of initial rows in a table shard. Ignored if--initialize-with-random-data is false. The actual number is chosen randomly (default 10000) --min-table-shard-size int The minimum number of initial rows in a table shard. Ignored if--initialize-with-random-data is false. The actual number is chosen randomly. (default 1000) --mysql-bind-host string which host to bind vtgate mysql listener to (default "localhost") + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-only If this flag is set only mysql is initialized. The rest of the vitess components are not started. 
Also, the output specifies the mysql unix socket instead of the vtgate port. --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-shell-backup-location string location where the backup will be stored diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index ede6d25be64..8d29b6be3a5 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -54,7 +54,7 @@ var ( ) func init() { - for _, cmd := range []string{"vttablet", "vtbackup"} { + for _, cmd := range []string{"vtcombo", "vttablet", "vtbackup", "vttestserver"} { servenv.OnParseFor(cmd, registerCloneFlags) } } diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index 9a36d3a0b3d..e67139f4556 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -138,7 +138,7 @@ func init() { for _, cmd := range []string{"mysqlctl", "mysqlctld", "vtcombo", "vttablet", "vttestserver"} { servenv.OnParseFor(cmd, registerPoolFlags) } - for _, cmd := range []string{"vttablet", "vtbackup"} { + for _, cmd := range []string{"vtcombo", "vttablet", "vtbackup", "vttestserver"} { servenv.OnParseFor(cmd, registerMySQLDCloneFlags) } } From fc6b8e720e8b652cba81c577add48b4f32388996 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Fri, 16 Jan 2026 21:58:30 -0500 Subject: [PATCH 30/33] address cr feedback Signed-off-by: Max Englander --- go/cmd/vtbackup/cli/vtbackup.go | 7 +- go/test/endtoend/backup/clone/backup_test.go | 40 ++++-- go/test/endtoend/backup/clone/main_test.go | 81 +++++------- go/test/endtoend/backup/clone/restore_test.go | 115 +++++++++--------- 4 files changed, 119 insertions(+), 124 deletions(-) diff --git a/go/cmd/vtbackup/cli/vtbackup.go b/go/cmd/vtbackup/cli/vtbackup.go index f661d9aac2d..9a977b23103 100644 --- a/go/cmd/vtbackup/cli/vtbackup.go +++ b/go/cmd/vtbackup/cli/vtbackup.go @@ -53,6 +53,7 @@ import ( tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + 
"vitess.io/vitess/go/vt/proto/vtrpc" ) const ( @@ -463,7 +464,7 @@ func takeBackup(ctx, backgroundCtx context.Context, topoServer *topo.Server, bac if restoreWithClone { restorePos, err = mysqlctl.CloneFromDonor(ctx, topoServer, mysqld, initKeyspace, initShard) if err != nil { - return fmt.Errorf("restore with clone failed: %v", err) + return vterrors.Wrap(err, "restore with clone failed") } } else { phase.Set(phaseNameRestoreLastBackup, int64(1)) @@ -493,11 +494,11 @@ func takeBackup(ctx, backgroundCtx context.Context, topoServer *topo.Server, bac case mysqlctl.ErrNoBackup: // There is no backup found, but we may be taking the initial backup of a shard if !allowFirstBackup { - return errors.New("no backup found; not starting up empty since --initial_backup flag was not enabled") + return vterrors.New(vtrpc.Code_FAILED_PRECONDITION, "no backup found; not starting up empty since --initial_backup flag was not enabled") } restorePos = replication.Position{} default: - return fmt.Errorf("can't restore from backup: %v", err) + return vterrors.Wrap(err, "can't restore from backup") } deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) phase.Set(phaseNameRestoreLastBackup, int64(0)) diff --git a/go/test/endtoend/backup/clone/backup_test.go b/go/test/endtoend/backup/clone/backup_test.go index c7115105f82..c6e9cf90d2a 100644 --- a/go/test/endtoend/backup/clone/backup_test.go +++ b/go/test/endtoend/backup/clone/backup_test.go @@ -18,6 +18,7 @@ package clone import ( "os" + "strings" "testing" "time" @@ -33,6 +34,10 @@ func TestCloneBackup(t *testing.T) { t.Cleanup(func() { removeBackups(t) }) t.Cleanup(tearDown) + // Disable VTOrc recoveries, so that it's not racing with InitShardPrimary + // call to set the primary. + localCluster.DisableVTOrcRecoveries(t) + // Initialize tablets first so we can connect to MySQL. 
for _, tablet := range []*cluster.Vttablet{primary, replica1} { err := localCluster.InitTablet(tablet, keyspaceName, shardName) @@ -47,7 +52,12 @@ func TestCloneBackup(t *testing.T) { // Now check if MySQL version supports clone (need vttablet running to query). if !mysqlVersionSupportsClone(t, primary) { - t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + ci, ok := os.LookupEnv("CI") + if !ok || strings.ToLower(ci) != "true" { + t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + } else { + require.FailNow(t, "CI should be running versions of mysqld that support CLONE") + } } // Check if clone plugin is available. @@ -69,8 +79,13 @@ func TestCloneBackup(t *testing.T) { cluster.VerifyRowsInTablet(t, primary, keyspaceName, 2) // Wait for replica to catch up. - time.Sleep(2 * time.Second) - cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 2) + waitInsertedRows( + t, + replica1, + []string{"clone_test_1", "clone_test_2"}, + 30*time.Second, + 100*time.Millisecond, + ) // Take a backup using clone from primary. log.Infof("Starting vtbackup with --clone-from-primary") @@ -93,13 +108,16 @@ func TestCloneBackup(t *testing.T) { require.NoError(t, err) restore(t, replica2, "replica", "SERVING") - // Give replica2 time to catch up via replication. - time.Sleep(5 * time.Second) - // Verify replica2 has ALL the data (2 rows from before backup + 1 from after). // The 2 pre-backup rows prove the clone-based backup worked. // The 3rd row proves replication is working after restore. 
- cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 3) + waitInsertedRows( + t, + replica2, + []string{"clone_test_1", "clone_test_2", "after_backup"}, + 30*time.Second, + 100*time.Millisecond, + ) log.Infof("Clone backup verification successful: replica2 has all data") } @@ -109,7 +127,7 @@ func vtbackupWithClone(t *testing.T) error { defer os.Remove(mysqlSocket.Name()) extraArgs := []string{ - "--allow_first_backup", + "--allow-first-backup", "--db-credentials-file", dbCredentialFile, "--mysql-clone-enabled", vtutils.GetFlagVariantForTests("--mysql-socket"), mysqlSocket.Name(), @@ -132,19 +150,19 @@ func verifyBackupCount(t *testing.T, shardKsName string, expected int) []string require.NoError(t, err) var result []string - for _, line := range splitLines(backups) { + for line := range strings.SplitSeq(backups, "\n") { if line != "" { result = append(result, line) } } - assert.Equalf(t, expected, len(result), "expected %d backups, got %d", expected, len(result)) + assert.Len(t, result, expected, "expected %d backups, got %d", expected, len(result)) return result } func restore(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string) { // Start tablet with restore enabled. MySQL is already running from TestMain. log.Infof("restoring tablet %s", time.Now()) - tablet.VttabletProcess.ExtraArgs = []string{"--db-credentials-file", dbCredentialFile} + tablet.VttabletProcess.ExtraArgs = vttabletExtraArgs tablet.VttabletProcess.TabletType = tabletType tablet.VttabletProcess.ServingStatus = waitForState tablet.VttabletProcess.SupportsBackup = true diff --git a/go/test/endtoend/backup/clone/main_test.go b/go/test/endtoend/backup/clone/main_test.go index 0ae21e65e07..231023b7e09 100644 --- a/go/test/endtoend/backup/clone/main_test.go +++ b/go/test/endtoend/backup/clone/main_test.go @@ -17,20 +17,18 @@ limitations under the License. 
package clone import ( - "errors" "flag" "fmt" "os" "os/exec" "path" - "strconv" "strings" "testing" + "time" "github.com/stretchr/testify/require" "vitess.io/vitess/go/mysql/capabilities" - "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/utils" "vitess.io/vitess/go/vt/log" @@ -231,45 +229,11 @@ func clonePluginAvailable(t *testing.T, tablet *cluster.Vttablet) bool { return status == "ACTIVE" } -// parseVersionFromRow parses MySQL version from a result row -func parseVersionFromRow(row []sqltypes.Value) (int, int, int, error) { - if len(row) == 0 { - return 0, 0, 0, errors.New("empty row") - } - - versionStr := row[0].ToString() - // Version format: "8.0.35" or "8.0.35-27" - parts := strings.Split(versionStr, "-") - versionPart := parts[0] - - versionNums := strings.Split(versionPart, ".") - if len(versionNums) < 3 { - return 0, 0, 0, fmt.Errorf("invalid version format: %s", versionStr) - } - - major, err := strconv.Atoi(versionNums[0]) - if err != nil { - return 0, 0, 0, fmt.Errorf("invalid major version: %s", versionNums[0]) - } - - minor, err := strconv.Atoi(versionNums[1]) - if err != nil { - return 0, 0, 0, fmt.Errorf("invalid minor version: %s", versionNums[1]) - } - - patch, err := strconv.Atoi(versionNums[2]) - if err != nil { - return 0, 0, 0, fmt.Errorf("invalid patch version: %s", versionNums[2]) - } - - return major, minor, patch, nil -} - // removeBackups removes all backups for the test shard. 
func removeBackups(t *testing.T) { backups, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("GetBackups", shardKsName) require.NoError(t, err) - for _, backup := range splitLines(backups) { + for backup := range strings.SplitSeq(backups, "\n") { if backup != "" { _, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("RemoveBackup", shardKsName, backup) require.NoError(t, err) @@ -277,20 +241,33 @@ func removeBackups(t *testing.T) { } } -// splitLines splits a string by newlines, filtering out empty lines. -func splitLines(s string) []string { - var result []string - start := 0 - for i, c := range s { - if c == '\n' { - if i > start { - result = append(result, s[start:i]) +// waitInsertedRows checks that the specific test data we inserted on primary +// exists on the cloned replica. This proves data was actually transferred. +func waitInsertedRows( + t *testing.T, + tablet *cluster.Vttablet, + expectedValues []string, + waitFor time.Duration, + tickInterval time.Duration, +) { + require.Eventually(t, func() bool { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT msg FROM vt_insert_test ORDER BY id", + keyspaceName, + true, + ) + if err != nil { + return false + } + if len(qr.Rows) != len(expectedValues) { + return false + } + + for i, row := range qr.Rows { + if row[0].ToString() != expectedValues[i] { + return false } - start = i + 1 } - } - if start < len(s) { - result = append(result, s[start:]) - } - return result + return true + }, waitFor, tickInterval) } diff --git a/go/test/endtoend/backup/clone/restore_test.go b/go/test/endtoend/backup/clone/restore_test.go index 7d743d5a67a..0f46eb754e9 100644 --- a/go/test/endtoend/backup/clone/restore_test.go +++ b/go/test/endtoend/backup/clone/restore_test.go @@ -19,6 +19,7 @@ package clone import ( "fmt" "os" + "strings" "testing" "time" @@ -35,6 +36,10 @@ func TestCloneRestore(t *testing.T) { t.Cleanup(func() { removeBackups(t) }) t.Cleanup(tearDownRestoreTest) + // Disable 
VTOrc recoveries, so that it's not racing with InitShardPrimary + // call to set the primary. + localCluster.DisableVTOrcRecoveries(t) + // Initialize primary and replica1 first (need replica for semi-sync durability). for _, tablet := range []*cluster.Vttablet{primary, replica1} { err := localCluster.InitTablet(tablet, keyspaceName, shardName) @@ -47,12 +52,14 @@ func TestCloneRestore(t *testing.T) { err := localCluster.VtctldClientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) require.NoError(t, err) - // Wait for replica1 to catch up. - time.Sleep(2 * time.Second) - // Now check if MySQL version supports clone (need vttablet running to query). if !mysqlVersionSupportsClone(t, primary) { - t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + ci, ok := os.LookupEnv("CI") + if !ok || strings.ToLower(ci) != "true" { + t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + } else { + require.FailNow(t, "CI should be running versions of mysqld that support CLONE") + } } // Check if clone plugin is available. @@ -91,21 +98,16 @@ func TestCloneRestore(t *testing.T) { require.NoError(t, err) restoreWithClone(t, replica2, "replica", "SERVING") - // Wait for data to exist. - require.Eventually(t, func() bool { - qr, _ := replica2.VttabletProcess.QueryTablet("select * from vt_insert_test", keyspaceName, true) - if qr != nil { - if len(qr.Rows) == 3 { - return true - } - } - return false - }, 5*time.Minute, 10*time.Second) - // Verify clone worked: clone_status confirms, replication is set up. - verifyClonedData(t, replica2) + waitInsertedRows( + t, + replica2, + []string{"clone_restore_1", "clone_restore_2", "clone_restore_3"}, + 30*time.Second, + 100*time.Millisecond, + ) verifyCloneWasUsed(t, replica2) - verifyReplicationTopology(t, replica2) + waitReplicationTopology(t, replica2) // Insert rows on primary and verify they replicate to the cloned replica. 
for i := 1; i <= 5; i++ { @@ -114,17 +116,25 @@ func TestCloneRestore(t *testing.T) { keyspaceName, true) require.NoError(t, err) } - time.Sleep(5 * time.Second) - cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 8) + // Wait for replica to catch up. + waitInsertedRows( + t, + replica2, + []string{"clone_restore_1", "clone_restore_2", "clone_restore_3", "after_clone_1", "after_clone_2", "after_clone_3", "after_clone_4", "after_clone_5"}, + 30*time.Second, + 100*time.Millisecond, + ) + verifyPostCloneReplication(t, replica2) } // restoreWithClone starts a tablet that will use MySQL CLONE to get its data. func restoreWithClone(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string) { - tablet.VttabletProcess.ExtraArgs = []string{ - "--db-credentials-file", dbCredentialFile, - "--mysql-clone-enabled", + // Start with the base vttablet flags (includes replication flags) + cloneArgs := append([]string{}, vttabletExtraArgs...) + // Add clone-specific flags + cloneArgs = append(cloneArgs, // Enable restore with clone - this triggers the clone logic. "--restore-from-backup=false", "--restore-with-clone", @@ -133,7 +143,8 @@ func restoreWithClone(t *testing.T, tablet *cluster.Vttablet, tabletType string, "--db-clone-user", "vt_clone", "--db-clone-password", "", "--db-clone-use-ssl=false", - } + ) + tablet.VttabletProcess.ExtraArgs = cloneArgs tablet.VttabletProcess.TabletType = tabletType tablet.VttabletProcess.ServingStatus = waitForState tablet.VttabletProcess.SupportsBackup = true @@ -142,44 +153,32 @@ func restoreWithClone(t *testing.T, tablet *cluster.Vttablet, tabletType string, require.NoError(t, err) } -// verifyClonedData checks that the specific test data we inserted on primary -// exists on the cloned replica. This proves data was actually transferred. 
-func verifyClonedData(t *testing.T, tablet *cluster.Vttablet) { - qr, err := tablet.VttabletProcess.QueryTablet( - "SELECT msg FROM vt_insert_test ORDER BY id", - keyspaceName, - true, - ) - require.NoError(t, err) - require.Len(t, qr.Rows, 3, "Expected 3 rows from clone") - - expectedValues := []string{"clone_restore_1", "clone_restore_2", "clone_restore_3"} - for i, row := range qr.Rows { - assert.Equal(t, expectedValues[i], row[0].ToString()) - } -} - -// verifyReplicationTopology checks that the cloned replica has properly joined +// waitReplicationTopology checks that the cloned replica has properly joined // the replication topology and is replicating from the primary. -func verifyReplicationTopology(t *testing.T, tablet *cluster.Vttablet) { - qr, err := tablet.VttabletProcess.QueryTablet("SHOW REPLICA STATUS", keyspaceName, true) - require.NoError(t, err) - require.NotEmpty(t, qr.Rows, "Replica status is empty - not replicating") - - // Find column indices. - var ioRunningIdx, sqlRunningIdx = -1, -1 - for i, field := range qr.Fields { - switch field.Name { - case "Replica_IO_Running": - ioRunningIdx = i - case "Replica_SQL_Running": - sqlRunningIdx = i +func waitReplicationTopology(t *testing.T, tablet *cluster.Vttablet) { + require.Eventually(t, func() bool { + qr, err := tablet.VttabletProcess.QueryTablet("SHOW REPLICA STATUS", keyspaceName, true) + if err != nil { + return false + } + if len(qr.Rows) == 0 { + return false } - } - row := qr.Rows[0] - assert.Equal(t, "Yes", row[ioRunningIdx].ToString(), "Replica IO thread not running") - assert.Equal(t, "Yes", row[sqlRunningIdx].ToString(), "Replica SQL thread not running") + // Find column indices. 
+ var ioRunningIdx, sqlRunningIdx = -1, -1 + for i, field := range qr.Fields { + switch field.Name { + case "Replica_IO_Running": + ioRunningIdx = i + case "Replica_SQL_Running": + sqlRunningIdx = i + } + } + + row := qr.Rows[0] + return row[ioRunningIdx].ToString() == "Yes" && row[sqlRunningIdx].ToString() == "Yes" + }, 10*time.Second, 100*time.Millisecond) } // verifyPostCloneReplication checks that data inserted after the clone From 4f94b9e7effb33dd28deadaa1a5700835a66db5f Mon Sep 17 00:00:00 2001 From: Max Englander Date: Sat, 17 Jan 2026 13:47:30 -0500 Subject: [PATCH 31/33] cr: use logger var Signed-off-by: Max Englander --- go/vt/vttablet/tabletmanager/tm_init.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index 9e628c510bc..ec2e995fc2d 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -923,7 +923,7 @@ func (tm *TabletManager) handleRestore(ctx context.Context, config *tabletenv.Ta // restoreFromBackup will just be a regular action // (same as if it was triggered remotely) - if err := tm.RestoreBackup(ctx, logutil.NewConsoleLogger(), waitForBackupInterval, false /* deleteBeforeRestore */, backupTime, restoreToTimestamp, restoreToPos, restoreFromBackupAllowedEngines, mysqlShutdownTimeout); err != nil { + if err := tm.RestoreBackup(ctx, logger, waitForBackupInterval, false /* deleteBeforeRestore */, backupTime, restoreToTimestamp, restoreToPos, restoreFromBackupAllowedEngines, mysqlShutdownTimeout); err != nil { log.Exitf("RestoreFromBackup failed: %v", err) } case restoreWithClone: From ae9855fa5cd66f5c9f7bf530de8209f83fba6739 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Tue, 20 Jan 2026 16:18:42 -0500 Subject: [PATCH 32/33] cr: case insensitive, flag validation Signed-off-by: Max Englander --- go/test/endtoend/backup/clone/restore_test.go | 2 +- go/vt/vttablet/tabletmanager/tm_init.go | 12 
++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/backup/clone/restore_test.go b/go/test/endtoend/backup/clone/restore_test.go index 0f46eb754e9..ee64d9d82fa 100644 --- a/go/test/endtoend/backup/clone/restore_test.go +++ b/go/test/endtoend/backup/clone/restore_test.go @@ -177,7 +177,7 @@ func waitReplicationTopology(t *testing.T, tablet *cluster.Vttablet) { } row := qr.Rows[0] - return row[ioRunningIdx].ToString() == "Yes" && row[sqlRunningIdx].ToString() == "Yes" + return strings.EqualFold(row[ioRunningIdx].ToString(), "Yes") && strings.EqualFold(row[sqlRunningIdx].ToString(), "Yes") }, 10*time.Second, 100*time.Millisecond) } diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index ec2e995fc2d..9c05fb939a4 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -887,6 +887,18 @@ func (tm *TabletManager) handleRestore(ctx context.Context, config *tabletenv.Ta if restoreFromBackup && restoreWithClone { return false, errors.New("--restore-from-backup and --restore-with-clone are mutually exclusive") } + if len(restoreFromBackupAllowedEngines) > 0 && restoreWithClone { + return false, errors.New("--restore-from-backup-allowed-engines and --restore-with-clone are mutually exclusive") + } + if restoreFromBackupTsStr != "" && restoreWithClone { + return false, errors.New("--restore-from-backup-ts and --restore-with-clone are mutually exclusive") + } + if restoreToPos != "" && restoreWithClone { + return false, errors.New("--restore-to-pos and --restore-with-clone are mutually exclusive") + } + if restoreToTimestampStr != "" && restoreWithClone { + return false, errors.New("--restore-to-timestamp and --restore-with-clone are mutually exclusive") + } if restoreToTimestampStr != "" && restoreToPos != "" { return false, errors.New("--restore-to-timestamp and --restore-to-pos are mutually exclusive") } From 
0493abb6603e3b8adc8cfe74ee540242a8bfdf4a Mon Sep 17 00:00:00 2001 From: Max Englander Date: Tue, 20 Jan 2026 16:21:55 -0500 Subject: [PATCH 33/33] cr: remove default action Signed-off-by: Max Englander --- go/vt/vttablet/tabletmanager/restore.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/restore.go b/go/vt/vttablet/tabletmanager/restore.go index cb5137f2439..9c09facf89f 100644 --- a/go/vt/vttablet/tabletmanager/restore.go +++ b/go/vt/vttablet/tabletmanager/restore.go @@ -525,9 +525,6 @@ func (rt *restoreStateManager) finish(ctx context.Context, replCmd replicationCo if err := rt.tm.startReplication(ctx, *replCmd.position, rt.prevTabletType); err != nil { return vterrors.Wrapf(err, "failed to start replication with position %q", replCmd.position.String()) } - case replicationActionNone: - fallthrough - default: } // Transition to next tablet type.