diff --git a/go/cmd/vtbackup/cli/vtbackup.go b/go/cmd/vtbackup/cli/vtbackup.go index a8bbadd87ba..9a977b23103 100644 --- a/go/cmd/vtbackup/cli/vtbackup.go +++ b/go/cmd/vtbackup/cli/vtbackup.go @@ -53,6 +53,7 @@ import ( tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/proto/vtrpc" ) const ( @@ -93,6 +94,7 @@ var ( initShard string concurrency = 4 incrementalFromPos string + restoreWithClone bool // mysqlctld-like flags mysqlPort = 3306 @@ -157,7 +159,7 @@ When run periodically for each shard, vtbackup can ensure these configurable pol * Old backups for the shard are removed. Whatever system launches vtbackup is responsible for the following: - - Running vtbackup with similar flags that would be used for a vttablet and + - Running vtbackup with similar flags that would be used for a vttablet and mysqlctld in the target shard to be backed up. - Provisioning as much disk space for vtbackup as would be given to vttablet. @@ -226,6 +228,7 @@ func init() { utils.SetFlagStringVar(Main.Flags(), &initShard, "init-shard", initShard, "(init parameter) shard to use for this tablet") Main.Flags().IntVar(&concurrency, "concurrency", concurrency, "(init restore parameter) how many concurrent files to restore at once") utils.SetFlagStringVar(Main.Flags(), &incrementalFromPos, "incremental-from-pos", incrementalFromPos, "Position, or name of backup from which to create an incremental backup. Default: empty. If given, then this backup becomes an incremental backup from given position or given backup. If value is 'auto', this backup will be taken from the last successful backup position.") + Main.Flags().BoolVar(&restoreWithClone, "restore-with-clone", restoreWithClone, "(init parameter) will perform the restore phase with MySQL CLONE, requires either --clone-from-primary or --clone-from-tablet") // mysqlctld-like flags utils.SetFlagIntVar(Main.Flags(), &mysqlPort, "mysql-port", mysqlPort, "MySQL port") @@ -457,42 +460,49 @@ func takeBackup(ctx, backgroundCtx context.Context, topoServer *topo.Server, bac return nil } - phase.Set(phaseNameRestoreLastBackup, int64(1)) - defer phase.Set(phaseNameRestoreLastBackup, int64(0)) - backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) - log.Infof("Restoring latest backup from directory %v", backupDir) - restoreAt := time.Now() - params := mysqlctl.RestoreParams{ - Cnf: mycnf, - Mysqld: mysqld, - Logger: logutil.NewConsoleLogger(), - Concurrency: concurrency, - HookExtraEnv: extraEnv, - DeleteBeforeRestore: true, - DbName: dbName, - Keyspace: initKeyspace, - Shard: initShard, - Stats: backupstats.RestoreStats(), - MysqlShutdownTimeout: mysqlShutdownTimeout, - } - backupManifest, err := mysqlctl.Restore(ctx, params) var restorePos replication.Position - switch err { - case nil: - // if err is nil, we expect backupManifest to be non-nil - restorePos = backupManifest.Position - log.Infof("Successfully restored from backup at replication position %v", restorePos) - case mysqlctl.ErrNoBackup: - // There is no backup found, but we may be taking the initial backup of a shard - if !allowFirstBackup { - return errors.New("no backup found; not starting up empty since --initial_backup flag was not enabled") - } - restorePos = replication.Position{} - default: - return fmt.Errorf("can't restore from backup: %v", err) - } - deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) - phase.Set(phaseNameRestoreLastBackup, int64(0)) + if restoreWithClone { + restorePos, err = mysqlctl.CloneFromDonor(ctx, topoServer, mysqld, initKeyspace, initShard) + if err != nil { + return vterrors.Wrap(err, "restore with clone failed") + } + } else { + phase.Set(phaseNameRestoreLastBackup, int64(1)) + defer phase.Set(phaseNameRestoreLastBackup, int64(0)) + backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) + log.Infof("Restoring latest backup from directory %v", backupDir) + restoreAt := time.Now() + params := mysqlctl.RestoreParams{ + Cnf: mycnf, + Mysqld: mysqld, + Logger: logutil.NewConsoleLogger(), + Concurrency: concurrency, + HookExtraEnv: extraEnv, + DeleteBeforeRestore: true, + DbName: dbName, + Keyspace: initKeyspace, + Shard: initShard, + Stats: backupstats.RestoreStats(), + MysqlShutdownTimeout: mysqlShutdownTimeout, + } + backupManifest, err := mysqlctl.Restore(ctx, params) + switch err { + case nil: + // if err is nil, we expect backupManifest to be non-nil + restorePos = backupManifest.Position + log.Infof("Successfully restored from backup at replication position %v", restorePos) + case mysqlctl.ErrNoBackup: + // There is no backup found, but we may be taking the initial backup of a shard + if !allowFirstBackup { + return vterrors.New(vtrpc.Code_FAILED_PRECONDITION, "no backup found; not starting up empty since --initial_backup flag was not enabled") + } + restorePos = replication.Position{} + default: + return vterrors.Wrap(err, "can't restore from backup") + } + deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) + phase.Set(phaseNameRestoreLastBackup, int64(0)) + } // As of MySQL 8.0.21, you can disable redo logging using the ALTER INSTANCE // DISABLE INNODB REDO_LOG statement. This functionality is intended for diff --git a/go/cmd/vttablet/cli/cli_test.go b/go/cmd/vttablet/cli/cli_test.go index 305e736f4c4..da8cc9aa0bf 100644 --- a/go/cmd/vttablet/cli/cli_test.go +++ b/go/cmd/vttablet/cli/cli_test.go @@ -69,5 +69,5 @@ func TestRunFailsToStartTabletManager(t *testing.T) { defer cancel() err := Main.ExecuteContext(ctx) - require.ErrorContains(t, err, "you cannot enable --restore-from-backup without a my.cnf file") + require.ErrorContains(t, err, "you cannot enable --restore-from-backup or --restore-with-clone without a my.cnf file") } diff --git a/go/flags/endtoend/mysqlctl.txt b/go/flags/endtoend/mysqlctl.txt index e252218fd95..36c87e9d6ad 100644 --- a/go/flags/endtoend/mysqlctl.txt +++ b/go/flags/endtoend/mysqlctl.txt @@ -70,7 +70,6 @@ Flags: --log_dir string If non-empty, write log files in this directory --logtostderr log to standard error instead of files --max-stack-size int configure the maximum stack size in bytes (default 67108864) - --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port. (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-socket string Path to the mysqld socket file. diff --git a/go/flags/endtoend/mysqlctld.txt b/go/flags/endtoend/mysqlctld.txt index d5c977ce411..e728e1561d2 100644 --- a/go/flags/endtoend/mysqlctld.txt +++ b/go/flags/endtoend/mysqlctld.txt @@ -99,7 +99,6 @@ Flags: --log_dir string If non-empty, write log files in this directory --logtostderr log to standard error instead of files --max-stack-size int configure the maximum stack size in bytes (default 67108864) - --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-socket string Path to the mysqld socket file diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 87bb94fd43c..ed50ef78a65 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -6,7 +6,7 @@ When run periodically for each shard, vtbackup can ensure these configurable pol * Old backups for the shard are removed. Whatever system launches vtbackup is responsible for the following: - - Running vtbackup with similar flags that would be used for a vttablet and + - Running vtbackup with similar flags that would be used for a vttablet and mysqlctld in the target shard to be backed up. - Provisioning as much disk space for vtbackup as would be given to vttablet. @@ -68,6 +68,9 @@ Flags: --builtinbackup-mysqld-timeout duration how long to wait for mysqld to shutdown at the start of the backup. (default 10m0s) --builtinbackup-progress duration how often to send progress updates when backing up large files. (default 5s) --ceph-backup-storage-config string Path to JSON config file for ceph backup storage. (default "ceph_backup_config.json") + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --concurrency int (init restore parameter) how many concurrent files to restore at once (default 4) @@ -189,6 +192,7 @@ Flags: --mycnf-slow-log-path string mysql slow query log path --mycnf-socket-file string mysql socket file --mycnf-tmp-dir string mysql tmp directory + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-shell-backup-location string location where the backup will be stored @@ -207,6 +211,7 @@ Flags: --purge-logs-interval duration how often try to remove old logs (default 1h0m0s) --remote-operation-timeout duration time to wait for a remote operation (default 15s) --restart-before-backup Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs. + --restore-with-clone (init parameter) will perform the restore phase with MySQL CLONE, requires either --clone-from-primary or --clone-from-tablet --s3-backup-aws-endpoint string endpoint of the S3 backend (region must be provided). --s3-backup-aws-min-partsize int Minimum part size to use, defaults to 5MiB but can be increased due to the dataset size. (default 5242880) --s3-backup-aws-region string AWS region to use. (default "us-east-1") diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index edfb1058bf4..823ff74c535 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -36,6 +36,9 @@ Flags: --builtinbackup-progress duration how often to send progress updates when backing up large files. (default 5s) --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --cell string cell to use + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. @@ -323,6 +326,7 @@ Flags: --restore-from-backup-ts string (init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050' --restore-to-pos string (init incremental restore parameter) if set, run a point in time recovery that ends with the given position. This will attempt to use one full backup followed by zero or more incremental backups --restore-to-timestamp string (init incremental restore parameter) if set, run a point in time recovery that restores up to the given timestamp, if possible. Given timestamp in RFC3339 format. Example: '2006-01-02T15:04:05Z07:00' + --restore-with-clone (init restore parameter) will restore from a clone, requires either --clone-from-primary or --clone-from-tablet, mutually exclusive with --restore-from-backup --retain-online-ddl-tables duration How long should vttablet keep an old migrated table before purging it (default 24h0m0s) --sanitize-log-messages Remove potentially sensitive information in tablet INFO, WARNING, and ERROR log messages such as query parameters. --schema-change-reload-timeout duration query server schema change reload timeout, this is how long to wait for the signaled schema reload operation to complete before giving up (default 30s) diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index d9a27b62ea6..e8e7c9eacf2 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -70,6 +70,9 @@ Flags: --builtinbackup-progress duration how often to send progress updates when backing up large files. (default 5s) --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --ceph-backup-storage-config string Path to JSON config file for ceph backup storage. (default "ceph_backup_config.json") + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. @@ -314,6 +317,7 @@ Flags: --restore-from-backup-ts string (init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050' --restore-to-pos string (init incremental restore parameter) if set, run a point in time recovery that ends with the given position. This will attempt to use one full backup followed by zero or more incremental backups --restore-to-timestamp string (init incremental restore parameter) if set, run a point in time recovery that restores up to the given timestamp, if possible. Given timestamp in RFC3339 format. Example: '2006-01-02T15:04:05Z07:00' + --restore-with-clone (init restore parameter) will restore from a clone, requires either --clone-from-primary or --clone-from-tablet, mutually exclusive with --restore-from-backup --retain-online-ddl-tables duration How long should vttablet keep an old migrated table before purging it (default 24h0m0s) --s3-backup-aws-endpoint string endpoint of the S3 backend (region must be provided). --s3-backup-aws-min-partsize int Minimum part size to use, defaults to 5MiB but can be increased due to the dataset size. (default 5242880) diff --git a/go/flags/endtoend/vttestserver.txt b/go/flags/endtoend/vttestserver.txt index d41a6b2c2bc..f9f6750d71a 100644 --- a/go/flags/endtoend/vttestserver.txt +++ b/go/flags/endtoend/vttestserver.txt @@ -19,6 +19,9 @@ Flags: --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --cells strings Comma separated list of cells (default [test]) --charset string MySQL charset (default "utf8mb4") + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. + --clone-restart-wait-timeout duration Timeout for waiting for MySQL to restart after CLONE REMOTE. (default 5m0s) --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. diff --git a/go/test/endtoend/backup/clone/backup_test.go b/go/test/endtoend/backup/clone/backup_test.go new file mode 100644 index 00000000000..c6e9cf90d2a --- /dev/null +++ b/go/test/endtoend/backup/clone/backup_test.go @@ -0,0 +1,182 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clone + +import ( + "os" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/vt/log" + vtutils "vitess.io/vitess/go/vt/utils" +) + +func TestCloneBackup(t *testing.T) { + t.Cleanup(func() { removeBackups(t) }) + t.Cleanup(tearDown) + + // Disable VTOrc recoveries, so that it's not racing with InitShardPrimary + // call to set the primary. + localCluster.DisableVTOrcRecoveries(t) + + // Initialize tablets first so we can connect to MySQL. + for _, tablet := range []*cluster.Vttablet{primary, replica1} { + err := localCluster.InitTablet(tablet, keyspaceName, shardName) + require.NoError(t, err) + err = tablet.VttabletProcess.Setup() + require.NoError(t, err) + } + + // Initialize shard primary. + err := localCluster.VtctldClientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) + require.NoError(t, err) + + // Now check if MySQL version supports clone (need vttablet running to query). + if !mysqlVersionSupportsClone(t, primary) { + ci, ok := os.LookupEnv("CI") + if !ok || strings.ToLower(ci) != "true" { + t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + } else { + require.FailNow(t, "CI should be running versions of mysqld that support CLONE") + } + } + + // Check if clone plugin is available. + if !clonePluginAvailable(t, primary) { + t.Skip("Skipping clone test: clone plugin not available") + } + + // Set up clean test data (table may have data from previous tests). + _, err = primary.VttabletProcess.QueryTablet(vtInsertTest, keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("TRUNCATE TABLE vt_insert_test", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_test_1')", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_test_2')", keyspaceName, true) + require.NoError(t, err) + + // Verify data exists on primary. + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 2) + + // Wait for replica to catch up. + waitInsertedRows( + t, + replica1, + []string{"clone_test_1", "clone_test_2"}, + 30*time.Second, + 100*time.Millisecond, + ) + + // Take a backup using clone from primary. + log.Infof("Starting vtbackup with --clone-from-primary") + err = vtbackupWithClone(t) + require.NoError(t, err) + + // Verify a backup was created. + backups := verifyBackupCount(t, shardKsName, 1) + assert.NotEmpty(t, backups) + + // Insert more data AFTER the backup was taken. + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('after_backup')", keyspaceName, true) + require.NoError(t, err) + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) + + // Now bring up replica2 and restore from the backup we just created. + // This verifies the clone-based backup actually contains the data. + log.Infof("Restoring replica2 from backup to verify clone worked") + err = localCluster.InitTablet(replica2, keyspaceName, shardName) + require.NoError(t, err) + restore(t, replica2, "replica", "SERVING") + + // Verify replica2 has ALL the data (2 rows from before backup + 1 from after). + // The 2 pre-backup rows prove the clone-based backup worked. + // The 3rd row proves replication is working after restore. + waitInsertedRows( + t, + replica2, + []string{"clone_test_1", "clone_test_2", "after_backup"}, + 30*time.Second, + 100*time.Millisecond, + ) + log.Infof("Clone backup verification successful: replica2 has all data") +} + +func vtbackupWithClone(t *testing.T) error { + mysqlSocket, err := os.CreateTemp("", "vtbackup_clone_test_mysql.sock") + require.NoError(t, err) + defer os.Remove(mysqlSocket.Name()) + + extraArgs := []string{ + "--allow-first-backup", + "--db-credentials-file", dbCredentialFile, + "--mysql-clone-enabled", + vtutils.GetFlagVariantForTests("--mysql-socket"), mysqlSocket.Name(), + // Clone from primary instead of restoring from backup. + "--restore-with-clone", + "--clone-from-primary", + // Clone credentials - use vt_clone user which is created with @'%' host + // and BACKUP_ADMIN privilege in init_db.sql (no password). + "--db-clone-user", "vt_clone", + "--db-clone-password", "", + "--db-clone-use-ssl=false", + } + + log.Infof("Starting vtbackup with clone args: %v", extraArgs) + return localCluster.StartVtbackup(newInitDBFile, false, keyspaceName, shardName, cell, extraArgs...) +} + +func verifyBackupCount(t *testing.T, shardKsName string, expected int) []string { + backups, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("GetBackups", shardKsName) + require.NoError(t, err) + + var result []string + for line := range strings.SplitSeq(backups, "\n") { + if line != "" { + result = append(result, line) + } + } + assert.Len(t, result, expected, "expected %d backups, got %d", expected, len(result)) + return result +} + +func restore(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string) { + // Start tablet with restore enabled. MySQL is already running from TestMain. + log.Infof("restoring tablet %s", time.Now()) + tablet.VttabletProcess.ExtraArgs = vttabletExtraArgs + tablet.VttabletProcess.TabletType = tabletType + tablet.VttabletProcess.ServingStatus = waitForState + tablet.VttabletProcess.SupportsBackup = true + err := tablet.VttabletProcess.Setup() + require.NoError(t, err) +} + +func tearDown() { + for _, tablet := range []*cluster.Vttablet{primary, replica1, replica2} { + if tablet != nil && tablet.VttabletProcess != nil { + _ = tablet.VttabletProcess.TearDown() + } + if tablet != nil { + _ = localCluster.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", tablet.Alias) + } + } +} diff --git a/go/test/endtoend/backup/clone/main_test.go b/go/test/endtoend/backup/clone/main_test.go new file mode 100644 index 00000000000..231023b7e09 --- /dev/null +++ b/go/test/endtoend/backup/clone/main_test.go @@ -0,0 +1,273 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clone + +import ( + "flag" + "fmt" + "os" + "os/exec" + "path" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql/capabilities" + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/utils" + "vitess.io/vitess/go/vt/log" + vtutils "vitess.io/vitess/go/vt/utils" +) + +var ( + primary *cluster.Vttablet + replica1 *cluster.Vttablet + replica2 *cluster.Vttablet + localCluster *cluster.LocalProcessCluster + newInitDBFile string + cell = cluster.DefaultCell + hostname = "localhost" + keyspaceName = "ks" + shardName = "0" + dbPassword = "VtDbaPass" + shardKsName = fmt.Sprintf("%s/%s", keyspaceName, shardName) + dbCredentialFile string + vttabletExtraArgs = []string{ + vtutils.GetFlagVariantForTests("--vreplication-retry-delay"), "1s", + vtutils.GetFlagVariantForTests("--degraded-threshold"), "5s", + vtutils.GetFlagVariantForTests("--lock-tables-timeout"), "5s", + vtutils.GetFlagVariantForTests("--watch-replication-stream"), + vtutils.GetFlagVariantForTests("--enable-replication-reporter"), + vtutils.GetFlagVariantForTests("--serving-state-grace-period"), "1s", + } + vtInsertTest = ` + create table if not exists vt_insert_test ( + id bigint auto_increment, + msg varchar(64), + primary key (id) + ) Engine=InnoDB;` +) + +func TestMain(m *testing.M) { + flag.Parse() + + exitCode, err := func() (int, error) { + localCluster = cluster.NewCluster(cell, hostname) + defer localCluster.Teardown() + + // Setup EXTRA_MY_CNF for clone plugin + if err := setupExtraMyCnf(); err != nil { + log.Errorf("Failed to setup extra MySQL config: %v", err) + return 1, err + } + + // Start topo server + err := localCluster.StartTopo() + if err != nil { + return 1, err + } + + // Start keyspace + localCluster.Keyspaces = []cluster.Keyspace{ + { + Name: keyspaceName, + Shards: []cluster.Shard{ + { + Name: shardName, + }, + }, + }, + } + shard := &localCluster.Keyspaces[0].Shards[0] + vtctldClientProcess := cluster.VtctldClientProcessInstance(localCluster.VtctldProcess.GrpcPort, localCluster.TopoPort, "localhost", localCluster.TmpDirectory) + _, err = vtctldClientProcess.ExecuteCommandWithOutput("CreateKeyspace", keyspaceName, "--durability-policy=semi_sync") + if err != nil { + return 1, err + } + + // Create a new init_db.sql file that sets up passwords for all users and clone user + dbCredentialFile = cluster.WriteDbCredentialToTmp(localCluster.TmpDirectory) + initDb, _ := os.ReadFile(path.Join(os.Getenv("VTROOT"), "/config/init_db.sql")) + initClone, err := os.ReadFile(path.Join(os.Getenv("VTROOT"), "/config/init_clone.sql")) + if err != nil { + log.Warningf("init_clone.sql not found, clone tests may fail: %v", err) + initClone = []byte("") + } + + sql := string(initDb) + // The original init_db.sql does not have any passwords. Here we update the init file with passwords + sql, err = utils.GetInitDBSQL(sql, cluster.GetPasswordUpdateSQL(localCluster), string(initClone)) + if err != nil { + return 1, err + } + newInitDBFile = path.Join(localCluster.TmpDirectory, "init_db_with_passwords_and_clone.sql") + err = os.WriteFile(newInitDBFile, []byte(sql), 0666) + if err != nil { + return 1, err + } + + mysqlctlExtraArgs := []string{"--db-credentials-file", dbCredentialFile} + vttabletExtraArgs = append(vttabletExtraArgs, + "--db-credentials-file", dbCredentialFile, + "--mysql-clone-enabled") + + primary = localCluster.NewVttabletInstance("replica", 0, "") + replica1 = localCluster.NewVttabletInstance("replica", 0, "") + replica2 = localCluster.NewVttabletInstance("replica", 0, "") + shard.Vttablets = []*cluster.Vttablet{primary, replica1, replica2} + + // Start MySql processes + var mysqlProcs []*exec.Cmd + for _, tablet := range shard.Vttablets { + tablet.VttabletProcess = localCluster.VtprocessInstanceFromVttablet(tablet, shard.Name, keyspaceName) + tablet.VttabletProcess.DbPassword = dbPassword + tablet.VttabletProcess.ExtraArgs = vttabletExtraArgs + tablet.VttabletProcess.SupportsBackup = true + + mysqlctlProcess, err := cluster.MysqlCtlProcessInstance(tablet.TabletUID, tablet.MySQLPort, localCluster.TmpDirectory) + if err != nil { + return 1, err + } + tablet.MysqlctlProcess = *mysqlctlProcess + tablet.MysqlctlProcess.InitDBFile = newInitDBFile + tablet.MysqlctlProcess.ExtraArgs = mysqlctlExtraArgs + proc, err := tablet.MysqlctlProcess.StartProcess() + if err != nil { + return 1, err + } + mysqlProcs = append(mysqlProcs, proc) + } + for _, proc := range mysqlProcs { + if err := proc.Wait(); err != nil { + return 1, err + } + } + + if localCluster.VtTabletMajorVersion >= 16 { + // If vttablets are any lower than version 16, then they are running the replication manager. + // Running VTOrc and replication manager sometimes creates the situation where VTOrc has set up semi-sync on the primary, + // but the replication manager starts replication on the replica without setting semi-sync. This hangs the primary. + // Even if VTOrc fixes it, since there is no ongoing traffic, the state remains blocked. + if err := localCluster.StartVTOrc(cell, keyspaceName); err != nil { + return 1, err + } + } + + return m.Run(), nil + }() + + if err != nil { + log.Error(err.Error()) + os.Exit(1) + } else { + os.Exit(exitCode) + } +} + +// setupExtraMyCnf sets EXTRA_MY_CNF to include clone plugin configuration +func setupExtraMyCnf() error { + cloneCnfPath := path.Join(os.Getenv("VTROOT"), "config", "mycnf", "clone.cnf") + if _, err := os.Stat(cloneCnfPath); os.IsNotExist(err) { + return fmt.Errorf("clone.cnf not found at %s", cloneCnfPath) + } + + // Check if EXTRA_MY_CNF is already set + existing := os.Getenv("EXTRA_MY_CNF") + if existing != "" { + // Append clone.cnf to existing + if err := os.Setenv("EXTRA_MY_CNF", existing+":"+cloneCnfPath); err != nil { + return fmt.Errorf("failed to set EXTRA_MY_CNF: %v", err) + } + } else { + if err := os.Setenv("EXTRA_MY_CNF", cloneCnfPath); err != nil { + return fmt.Errorf("failed to set EXTRA_MY_CNF: %v", err) + } + } + + log.Infof("Set EXTRA_MY_CNF to include clone plugin: %s", os.Getenv("EXTRA_MY_CNF")) + return nil +} + +// mysqlVersionSupportsClone checks if the MySQL version supports CLONE plugin +func mysqlVersionSupportsClone(t *testing.T, tablet *cluster.Vttablet) bool { + conn, err := tablet.VttabletProcess.TabletConn(keyspaceName, false) + require.NoError(t, err, "failed to get tablet connection") + ok, err := conn.SupportsCapability(capabilities.MySQLClonePluginFlavorCapability) + require.NoError(t, err, "failed to check clone capability") + return ok +} + +// clonePluginAvailable checks if the clone plugin is installed and active +func clonePluginAvailable(t *testing.T, tablet *cluster.Vttablet) bool { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT PLUGIN_STATUS FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME = 'clone'", + keyspaceName, true) + if err != nil { + t.Logf("Failed to check clone plugin: %v", err) + return false + } + if len(qr.Rows) == 0 { + return false + } + status := qr.Rows[0][0].ToString() + return status == "ACTIVE" +} + +// removeBackups removes all backups for the test shard. +func removeBackups(t *testing.T) { + backups, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("GetBackups", shardKsName) + require.NoError(t, err) + for backup := range strings.SplitSeq(backups, "\n") { + if backup != "" { + _, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("RemoveBackup", shardKsName, backup) + require.NoError(t, err) + } + } +} + +// waitInsertedRows checks that the specific test data we inserted on primary +// exists on the cloned replica. This proves data was actually transferred. +func waitInsertedRows( + t *testing.T, + tablet *cluster.Vttablet, + expectedValues []string, + waitFor time.Duration, + tickInterval time.Duration, +) { + require.Eventually(t, func() bool { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT msg FROM vt_insert_test ORDER BY id", + keyspaceName, + true, + ) + if err != nil { + return false + } + if len(qr.Rows) != len(expectedValues) { + return false + } + + for i, row := range qr.Rows { + if row[0].ToString() != expectedValues[i] { + return false + } + } + return true + }, waitFor, tickInterval) +} diff --git a/go/test/endtoend/backup/clone/restore_test.go b/go/test/endtoend/backup/clone/restore_test.go new file mode 100644 index 00000000000..0f46eb754e9 --- /dev/null +++ b/go/test/endtoend/backup/clone/restore_test.go @@ -0,0 +1,228 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clone + +import ( + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" +) + +// TestCloneRestore tests clone-based replica provisioning via vttablet's +// --restore-with-clone flag. This simulates the workflow where a new replica +// is provisioned by cloning data from the primary instead of restoring from backup. +func TestCloneRestore(t *testing.T) { + t.Cleanup(func() { removeBackups(t) }) + t.Cleanup(tearDownRestoreTest) + + // Disable VTOrc recoveries, so that it's not racing with InitShardPrimary + // call to set the primary. + localCluster.DisableVTOrcRecoveries(t) + + // Initialize primary and replica1 first (need replica for semi-sync durability). + for _, tablet := range []*cluster.Vttablet{primary, replica1} { + err := localCluster.InitTablet(tablet, keyspaceName, shardName) + require.NoError(t, err) + err = tablet.VttabletProcess.Setup() + require.NoError(t, err) + } + + // Initialize shard primary. + err := localCluster.VtctldClientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) + require.NoError(t, err) + + // Now check if MySQL version supports clone (need vttablet running to query). + if !mysqlVersionSupportsClone(t, primary) { + ci, ok := os.LookupEnv("CI") + if !ok || strings.ToLower(ci) != "true" { + t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + } else { + require.FailNow(t, "CI should be running versions of mysqld that support CLONE") + } + } + + // Check if clone plugin is available. + if !clonePluginAvailable(t, primary) { + t.Skip("Skipping clone test: clone plugin not available") + } + + // Set up clean test data (table may have data from previous tests). + _, err = primary.VttabletProcess.QueryTablet(vtInsertTest, keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("TRUNCATE TABLE vt_insert_test", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_restore_1')", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_restore_2')", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_restore_3')", keyspaceName, true) + require.NoError(t, err) + + // Verify data exists on primary. + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) + + // Clean up replica2's MySQL data from previous test so clone can work. + // Stop MySQL, remove data directory, and restart with fresh data. + err = replica2.MysqlctlProcess.Stop() + require.NoError(t, err) + err = os.RemoveAll(replica2.VttabletProcess.Directory) + require.NoError(t, err) + proc, err := replica2.MysqlctlProcess.StartProcess() + require.NoError(t, err) + err = proc.Wait() + require.NoError(t, err) + + // Bring up replica2 using clone from primary. + err = localCluster.InitTablet(replica2, keyspaceName, shardName) + require.NoError(t, err) + restoreWithClone(t, replica2, "replica", "SERVING") + + // Verify clone worked: clone_status confirms, replication is set up. + waitInsertedRows( + t, + replica2, + []string{"clone_restore_1", "clone_restore_2", "clone_restore_3"}, + 30*time.Second, + 100*time.Millisecond, + ) + verifyCloneWasUsed(t, replica2) + waitReplicationTopology(t, replica2) + + // Insert rows on primary and verify they replicate to the cloned replica. + for i := 1; i <= 5; i++ { + _, err = primary.VttabletProcess.QueryTablet( + fmt.Sprintf("insert into vt_insert_test (msg) values ('after_clone_%d')", i), + keyspaceName, true) + require.NoError(t, err) + } + + // Wait for replica to catch up. + waitInsertedRows( + t, + replica2, + []string{"clone_restore_1", "clone_restore_2", "clone_restore_3", "after_clone_1", "after_clone_2", "after_clone_3", "after_clone_4", "after_clone_5"}, + 30*time.Second, + 100*time.Millisecond, + ) + + verifyPostCloneReplication(t, replica2) +} + +// restoreWithClone starts a tablet that will use MySQL CLONE to get its data. +func restoreWithClone(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string) { + // Start with the base vttablet flags (includes replication flags) + cloneArgs := append([]string{}, vttabletExtraArgs...) + // Add clone-specific flags + cloneArgs = append(cloneArgs, + // Enable restore with clone - this triggers the clone logic. + "--restore-from-backup=false", + "--restore-with-clone", + // Clone configuration - tells vttablet to clone instead of restoring from backup. + "--clone-from-primary", + "--db-clone-user", "vt_clone", + "--db-clone-password", "", + "--db-clone-use-ssl=false", + ) + tablet.VttabletProcess.ExtraArgs = cloneArgs + tablet.VttabletProcess.TabletType = tabletType + tablet.VttabletProcess.ServingStatus = waitForState + tablet.VttabletProcess.SupportsBackup = true + + err := tablet.VttabletProcess.Setup() + require.NoError(t, err) +} + +// waitReplicationTopology checks that the cloned replica has properly joined +// the replication topology and is replicating from the primary. +func waitReplicationTopology(t *testing.T, tablet *cluster.Vttablet) { + require.Eventually(t, func() bool { + qr, err := tablet.VttabletProcess.QueryTablet("SHOW REPLICA STATUS", keyspaceName, true) + if err != nil { + return false + } + if len(qr.Rows) == 0 { + return false + } + + // Find column indices. + var ioRunningIdx, sqlRunningIdx = -1, -1 + for i, field := range qr.Fields { + switch field.Name { + case "Replica_IO_Running": + ioRunningIdx = i + case "Replica_SQL_Running": + sqlRunningIdx = i + } + } + + row := qr.Rows[0] + return row[ioRunningIdx].ToString() == "Yes" && row[sqlRunningIdx].ToString() == "Yes" + }, 10*time.Second, 100*time.Millisecond) +} + +// verifyPostCloneReplication checks that data inserted after the clone +// was properly replicated to the cloned replica. +func verifyPostCloneReplication(t *testing.T, tablet *cluster.Vttablet) { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT msg FROM vt_insert_test WHERE msg LIKE 'after_clone_%' ORDER BY id", + keyspaceName, + true, + ) + require.NoError(t, err) + require.Len(t, qr.Rows, 5, "Expected 5 post-clone rows via replication") + + for i, row := range qr.Rows { + expected := fmt.Sprintf("after_clone_%d", i+1) + assert.Equal(t, expected, row[0].ToString()) + } +} + +// verifyCloneWasUsed checks performance_schema.clone_status to verify that +// MySQL CLONE was actually used to restore the tablet. +func verifyCloneWasUsed(t *testing.T, tablet *cluster.Vttablet) { + qr, err := tablet.VttabletProcess.QueryTablet( + "SELECT STATE, SOURCE, ERROR_NO FROM performance_schema.clone_status", + keyspaceName, + true, + ) + require.NoError(t, err) + require.NotEmpty(t, qr.Rows, "clone_status is empty - CLONE was not used") + + row := qr.Rows[0] + assert.Equal(t, "Completed", row[0].ToString(), "Clone did not complete") + assert.NotEmpty(t, row[1].ToString(), "Clone source is empty") + assert.Equal(t, "0", row[2].ToString(), "Clone had an error") +} + +// tearDownRestoreTest cleans up tablets created during the restore test. +func tearDownRestoreTest() { + for _, tablet := range []*cluster.Vttablet{primary, replica1, replica2} { + if tablet != nil && tablet.VttabletProcess != nil { + _ = tablet.VttabletProcess.TearDown() + } + if tablet != nil { + _ = localCluster.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", tablet.Alias) + } + } +} diff --git a/go/vt/mysqlctl/backup.go b/go/vt/mysqlctl/backup.go index 97af3224894..0c4abdb1a39 100644 --- a/go/vt/mysqlctl/backup.go +++ b/go/vt/mysqlctl/backup.go @@ -315,21 +315,22 @@ func removeExistingFiles(cnf *Mycnf) error { // ShouldRestore checks whether a database with tables already exists // and returns whether a restore action should be performed -func ShouldRestore(ctx context.Context, params RestoreParams) (bool, error) { - if params.DeleteBeforeRestore || RestoreWasInterrupted(params.Cnf) { +func ShouldRestore(ctx context.Context, logger logutil.Logger, cnf *Mycnf, mysqld MysqlDaemon, + dbName string, deleteBeforeRestore bool) (bool, error) { + if deleteBeforeRestore || RestoreWasInterrupted(cnf) { return true, nil } - params.Logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState) + logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState) // Wait for mysqld to be ready, in case it was launched in parallel with us. // If this doesn't succeed, we should not attempt a restore - if err := params.Mysqld.Wait(ctx, params.Cnf); err != nil { + if err := mysqld.Wait(ctx, cnf); err != nil { return false, err } - if err := params.Mysqld.WaitForDBAGrants(ctx, DbaGrantWaitTime); err != nil { - params.Logger.Errorf("error waiting for the grants: %v", err) + if err := mysqld.WaitForDBAGrants(ctx, DbaGrantWaitTime); err != nil { + logger.Errorf("error waiting for the grants: %v", err) return false, err } - return checkNoDB(ctx, params.Mysqld, params.DbName) + return checkNoDB(ctx, mysqld, dbName) } // ensureRestoredGTIDPurgedMatchesManifest sees the following: when you restore a full backup, you want the MySQL server to have diff --git a/go/vt/mysqlctl/backup_test.go b/go/vt/mysqlctl/backup_test.go index 380ee762d24..cbdaf047ddf 100644 --- a/go/vt/mysqlctl/backup_test.go +++ b/go/vt/mysqlctl/backup_test.go @@ -693,12 +693,14 @@ func TestParseBackupName(t *testing.T) { func TestShouldRestore(t *testing.T) { env := createFakeBackupRestoreEnv(t) - b, err := ShouldRestore(env.ctx, env.restoreParams) + b, err := ShouldRestore(env.ctx, env.restoreParams.Logger, env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.False(t, b) assert.Error(t, err) env.restoreParams.DeleteBeforeRestore = true - b, err = ShouldRestore(env.ctx, env.restoreParams) + b, err = ShouldRestore(env.ctx, env.restoreParams.Logger, env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.True(t, b) assert.NoError(t, err) env.restoreParams.DeleteBeforeRestore = false @@ -706,14 +708,16 @@ func TestShouldRestore(t *testing.T) { env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ "SHOW DATABASES": {Rows: [][]sqltypes.Value{{sqltypes.NewVarBinary("any_db")}}}, } - b, err = ShouldRestore(env.ctx, env.restoreParams) + b, err = ShouldRestore(env.ctx, env.restoreParams.Logger, env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.NoError(t, err) assert.True(t, b) env.mysqld.FetchSuperQueryMap = map[string]*sqltypes.Result{ "SHOW DATABASES": {Rows: [][]sqltypes.Value{{sqltypes.NewVarBinary("test")}}}, } - b, err = ShouldRestore(env.ctx, env.restoreParams) + b, err = ShouldRestore(env.ctx, env.restoreParams.Logger, env.restoreParams.Cnf, + env.restoreParams.Mysqld, env.restoreParams.DbName, env.restoreParams.DeleteBeforeRestore) assert.False(t, b) assert.NoError(t, err) } diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 928853742e7..8d29b6be3a5 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -54,8 +54,7 @@ var ( ) func init() { - // TODO: enable these flags for vttablet and vtbackup. - for _, cmd := range []string{ /*"vttablet", "vtbackup"*/ } { + for _, cmd := range []string{"vtcombo", "vttablet", "vtbackup", "vttestserver"} { servenv.OnParseFor(cmd, registerCloneFlags) } } diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index b255fc32f4c..e67139f4556 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -138,6 +138,9 @@ func init() { for _, cmd := range []string{"mysqlctl", "mysqlctld", "vtcombo", "vttablet", "vttestserver"} { servenv.OnParseFor(cmd, registerPoolFlags) } + for _, cmd := range []string{"vtcombo", "vttablet", "vtbackup", "vttestserver"} { + servenv.OnParseFor(cmd, registerMySQLDCloneFlags) + } } func registerMySQLDFlags(fs *pflag.FlagSet) { @@ -145,7 +148,6 @@ func registerMySQLDFlags(fs *pflag.FlagSet) { utils.SetFlagStringVar(fs, &mycnfTemplateFile, "mysqlctl-mycnf-template", mycnfTemplateFile, "template file to use for generating the my.cnf file during server init") utils.SetFlagStringVar(fs, &socketFile, "mysqlctl-socket", socketFile, "socket file to use for remote mysqlctl actions (empty for local actions)") utils.SetFlagDurationVar(fs, &replicationConnectRetry, "replication-connect-retry", replicationConnectRetry, "how long to wait in between replica reconnect attempts. Only precise to the second.") - utils.SetFlagBoolVar(fs, &mysqlCloneEnabled, "mysql-clone-enabled", mysqlCloneEnabled, "Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+)") } // MySQLCloneEnabled returns whether MySQL CLONE support is enabled. @@ -158,6 +160,10 @@ func SetMySQLCloneEnabled(enabled bool) { mysqlCloneEnabled = enabled } +func registerMySQLDCloneFlags(fs *pflag.FlagSet) { + utils.SetFlagBoolVar(fs, &mysqlCloneEnabled, "mysql-clone-enabled", mysqlCloneEnabled, "Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+)") +} + func registerReparentFlags(fs *pflag.FlagSet) { utils.SetFlagBoolVar(fs, &DisableActiveReparents, "disable-active-reparents", DisableActiveReparents, "if set, do not allow active reparents. Use this to protect a cluster using external reparents.") } diff --git a/go/vt/vttablet/tabletmanager/restore.go b/go/vt/vttablet/tabletmanager/restore.go index a648f01efe4..cb5137f2439 100644 --- a/go/vt/vttablet/tabletmanager/restore.go +++ b/go/vt/vttablet/tabletmanager/restore.go @@ -18,7 +18,6 @@ package tabletmanager import ( "context" - "errors" "fmt" "time" @@ -52,9 +51,10 @@ var ( restoreFromBackupTsStr string restoreConcurrency = 4 waitForBackupInterval time.Duration + restoreWithClone bool - statsRestoreBackupTime *stats.String - statsRestoreBackupPosition *stats.String + statsRestoreBackupTime *stats.String + statsRestoreBackup *stats.String ) func registerRestoreFlags(fs *pflag.FlagSet) { @@ -63,6 +63,7 @@ func registerRestoreFlags(fs *pflag.FlagSet) { utils.SetFlagStringVar(fs, &restoreFromBackupTsStr, "restore-from-backup-ts", restoreFromBackupTsStr, "(init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050'") utils.SetFlagIntVar(fs, &restoreConcurrency, "restore-concurrency", restoreConcurrency, "(init restore parameter) how many concurrent files to restore at once") utils.SetFlagDurationVar(fs, &waitForBackupInterval, "wait-for-backup-interval", waitForBackupInterval, "(init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear") + utils.SetFlagBoolVar(fs, &restoreWithClone, "restore-with-clone", restoreWithClone, "(init restore parameter) will restore from a clone, requires either --clone-from-primary or --clone-from-tablet, mutually exclusive with --restore-from-backup") } var ( @@ -84,14 +85,13 @@ func init() { servenv.OnParseFor("vttablet", registerIncrementalRestoreFlags) statsRestoreBackupTime = stats.NewString("RestoredBackupTime") - statsRestoreBackupPosition = stats.NewString("RestorePosition") + statsRestoreBackup = stats.NewString("RestorePosition") } -// RestoreData is the main entry point for backup restore. -// It will either work, fail gracefully, or return -// an error in case of a non-recoverable error. +// RestoreBackup is the main entry point for backup restore. It will either +// work, fail gracefully, or return an error in case of a non-recoverable error. // It takes the action lock so no RPC interferes. -func (tm *TabletManager) RestoreData( +func (tm *TabletManager) RestoreBackup( ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, @@ -105,9 +105,6 @@ func (tm *TabletManager) RestoreData( return err } defer tm.unlock() - if tm.Cnf == nil { - return errors.New("cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified") - } var ( err error @@ -115,31 +112,7 @@ func (tm *TabletManager) RestoreData( ) defer func() { - stopTime := time.Now() - - h := hook.NewSimpleHook("vttablet_restore_done") - h.ExtraEnv = tm.hookExtraEnv() - h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339) - h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339) - h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String() - - if err != nil { - h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error() - } - - // vttablet_restore_done is best-effort (for now?). - go func() { - // Package vthook already logs the stdout/stderr of hooks when they - // are run, so we don't duplicate that here. - hr := h.Execute() - switch hr.ExitStatus { - case hook.HOOK_SUCCESS: - case hook.HOOK_DOES_NOT_EXIST: - log.Info("No vttablet_restore_done hook.") - default: - log.Warning("vttablet_restore_done hook failed") - } - }() + tm.invokeRestoreDoneHook(startTime, err) }() startTime = time.Now() @@ -150,16 +123,16 @@ func (tm *TabletManager) RestoreData( RestoreToTimestamp: protoutil.TimeToProto(restoreToTimetamp), AllowedBackupEngines: allowedBackupEngines, } - err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req, mysqlShutdownTimeout) + err = tm.restoreBackupLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req, mysqlShutdownTimeout) if err != nil { return err } + return nil } -func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest, mysqlShutdownTimeout time.Duration) error { +func (tm *TabletManager) restoreBackupLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest, mysqlShutdownTimeout time.Duration) error { tablet := tm.Tablet() - originalType := tablet.Type // Try to restore. Depending on the reason for failure, we may be ok. // If we're not ok, return an error and the tm will log.Fatalf, // causing the process to be restarted and the restore retried. @@ -177,7 +150,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("snapshot keyspace %v has no base_keyspace set", tablet.Keyspace)) } keyspace = keyspaceInfo.BaseKeyspace - log.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, protoutil.TimeFromProto(request.BackupTime).UTC()) + logger.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, protoutil.TimeFromProto(request.BackupTime).UTC()) } startTime := protoutil.TimeFromProto(request.BackupTime).UTC() @@ -216,35 +189,27 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L // Restore to given timestamp params.RestoreToTimestamp = restoreToTimestamp } - params.Logger.Infof("Restore: original tablet type=%v", originalType) - // Check whether we're going to restore before changing to RESTORE type, - // so we keep our PrimaryTermStartTime (if any) if we aren't actually restoring. - ok, err := mysqlctl.ShouldRestore(ctx, params) - if err != nil { - return err - } - if !ok { - params.Logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.") + rsm := tm.newRestoreStateManager(logger, deleteBeforeRestore) + + if ok, err := rsm.start(ctx); !ok || err != nil { + if err != nil { + return vterrors.Wrap(err, "failed to start restore") + } + // Restore cannot be started for a benign reason, e.g. mysqld already + // has data. return nil } - // We should not become primary after restore, because that would incorrectly - // start a new primary term, and it's likely our data dir will be out of date. - if originalType == topodatapb.TabletType_PRIMARY { - originalType = tm.baseTabletType - } - if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE, DBActionNone); err != nil { - return err - } + // Loop until a backup exists, unless we were told to give up immediately. var backupManifest *mysqlctl.BackupManifest for { backupManifest, err = mysqlctl.Restore(ctx, params) if backupManifest != nil { - statsRestoreBackupPosition.Set(replication.EncodePosition(backupManifest.Position)) + statsRestoreBackup.Set(replication.EncodePosition(backupManifest.Position)) statsRestoreBackupTime.Set(backupManifest.BackupTime) } - params.Logger.Infof("Restore: got a restore manifest: %v, err=%v, waitForBackupInterval=%v", backupManifest, err, waitForBackupInterval) + logger.Infof("Restore: got a restore manifest: %v, err=%v, waitForBackupInterval=%v", backupManifest, err, waitForBackupInterval) if waitForBackupInterval == 0 { break } @@ -264,8 +229,10 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L var pos replication.Position if backupManifest != nil { pos = backupManifest.Position - params.Logger.Infof("Restore: pos=%v", replication.EncodePosition(pos)) + logger.Infof("Restore: pos=%v", replication.EncodePosition(pos)) } + + var replCmd replicationCommand switch { case err == nil && backupManifest != nil: // Starting from here we won't be able to recover if we get stopped by a cancelled @@ -274,53 +241,96 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L // The whole point of point-in-time recovery is that we want to restore up to a given position, // and to NOT proceed from that position. We want to disable replication and NOT let the replica catch // up with the primary. - params.Logger.Infof("Restore: disabling replication") - if err := tm.disableReplication(context.Background()); err != nil { - return err - } + replCmd.action = replicationActionDisable } else if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL { // Reconnect to primary only for "NORMAL" keyspaces - params.Logger.Infof("Restore: starting replication at position %v", pos) - if err := tm.startReplication(ctx, pos, originalType); err != nil { - return err - } + replCmd.action = replicationActionStart + replCmd.position = &pos } case err == mysqlctl.ErrNoBackup: // Starting with empty database. // We just need to initialize replication - _, err := tm.initializeReplication(ctx, originalType) - if err != nil { - return err - } + replCmd.action = replicationActionInitialize case err == nil && params.DryRun: // Do nothing here, let the rest of code run - params.Logger.Infof("Dry run. No changes made") + logger.Infof("Dry run. No changes made") default: - bgCtx := context.Background() - // If anything failed, we should reset the original tablet type - if err := tm.tmState.ChangeTabletType(bgCtx, originalType, DBActionNone); err != nil { - log.Errorf("Could not change back to original tablet type %v: %v", originalType, err) + if err := rsm.abort(); err != nil { + logger.Errorf("Failed to abort restore: %v", err) } - return vterrors.Wrap(err, "Can't restore backup") + return vterrors.Wrap(err, "can't restore backup") } - // If we had type BACKUP or RESTORE it's better to set our type to the init-tablet-type to make result of the restore - // similar to completely clean start from scratch. - if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && initTabletType != "" { - initType, err := topoproto.ParseTabletType(initTabletType) - if err == nil { - originalType = initType - } - } if params.IsIncrementalRecovery() && !params.DryRun { // override - params.Logger.Infof("Restore: will set tablet type to DRAINED as this is a point in time recovery") - originalType = topodatapb.TabletType_DRAINED + logger.Infof("Restore: will set tablet type to DRAINED as this is a point in time recovery") + rsm.setNextTabletType(topodatapb.TabletType_DRAINED) + } + + if err := rsm.finish(ctx, replCmd); err != nil { + return vterrors.Wrap(err, "failed to finish restore") } - params.Logger.Infof("Restore: changing tablet type to %v for %s", originalType, tm.tabletAlias.String()) - // Change type back to original type if we're ok to serve. - bgCtx := context.Background() - return tm.tmState.ChangeTabletType(bgCtx, originalType, DBActionNone) + + return nil +} + +func (tm *TabletManager) restoreFromClone(ctx context.Context, logger logutil.Logger, deleteBeforeRestore bool) error { + if err := tm.lock(ctx); err != nil { + return err + } + defer tm.unlock() + + var ( + err error + startTime time.Time + ) + + defer func() { + tm.invokeRestoreDoneHook(startTime, err) + }() + + startTime = time.Now() + + err = tm.restoreFromCloneLocked(ctx, logger, deleteBeforeRestore) + if err != nil { + return err + } + + return nil +} + +func (tm *TabletManager) restoreFromCloneLocked( + ctx context.Context, + logger logutil.Logger, + deleteBeforeRestore bool) error { + rsm := tm.newRestoreStateManager(logger, deleteBeforeRestore) + + if ok, err := rsm.start(ctx); !ok || err != nil { + if err != nil { + return vterrors.Wrap(err, "failed to start restore") + } + // Restore cannot be started for a benign reason, e.g. mysqld already + // has data. + return nil + } + + tablet := tm.Tablet() + pos, err := mysqlctl.CloneFromDonor(ctx, tm.TopoServer, tm.MysqlDaemon, tablet.Keyspace, tablet.Shard) + if err != nil { + err = vterrors.Wrap(err, "failed to clone from donor") + if err := rsm.abort(); err != nil { + logger.Errorf("Failed to abort restore: %v", err) + } + return err + } + + statsRestoreBackup.Set(replication.EncodePosition(pos)) + + if err := rsm.finish(ctx, replicationCommand{action: replicationActionStart, position: &pos}); err != nil { + return vterrors.Wrap(err, "failed to finish restore") + } + + return nil } // disableReplication stops and resets replication on the mysql server. It moreover sets impossible replication @@ -387,3 +397,214 @@ func (tm *TabletManager) startReplication(ctx context.Context, pos replication.P return nil } + +func (tm *TabletManager) invokeRestoreDoneHook(startTime time.Time, err error) { + stopTime := time.Now() + + h := hook.NewSimpleHook("vttablet_restore_done") + h.ExtraEnv = tm.hookExtraEnv() + h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339) + h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339) + h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String() + + if err != nil { + h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error() + } + + // vttablet_restore_done is best-effort (for now?). + go func() { + // Package vthook already logs the stdout/stderr of hooks when they + // are run, so we don't duplicate that here. + hr := h.Execute() + switch hr.ExitStatus { + case hook.HOOK_SUCCESS: + case hook.HOOK_DOES_NOT_EXIST: + log.Info("No vttablet_restore_done hook.") + default: + log.Warning("vttablet_restore_done hook failed") + } + }() +} + +type replicationAction int + +const ( + replicationActionNone replicationAction = iota + replicationActionDisable + replicationActionInitialize + replicationActionStart +) + +// newRestoreStateManager returns a new restoreStateManager, used to perform +// restore functionality that is common across restore methods. +func (tm *TabletManager) newRestoreStateManager(logger logutil.Logger, deleteBeforeRestore bool) *restoreStateManager { + return &restoreStateManager{ + deleteBeforeRestore: deleteBeforeRestore, + logger: logger, + tm: tm, + } +} + +// replicationCommand contains instructions for initializing, starting, or +// disabling replication. +type replicationCommand struct { + // action is the replication action to take. + action replicationAction + // position is used by the replicationActionStart action. + position *replication.Position +} + +// restoreState represents the state of a restoreStateManager. +type restoreState int + +const ( + // restoreNotStarted is the initial state of a restore. + restoreNotStarted restoreState = iota + // restoreStarted is used to indicate a restore has started. + restoreStarted + // restoreDone is used to indicate a restore is either + // finished or aborted. + restoreDone +) + +// restoreStateManager is used by restore methods (RestoreBackup, restoreClone) +// to perform common routines, such as transitioning the tablet type to and from +// RESTORE, and setting up replication. +type restoreStateManager struct { + deleteBeforeRestore bool + logger logutil.Logger + tm *TabletManager + + state restoreState + + prevTabletType topodatapb.TabletType + nextTabletType topodatapb.TabletType +} + +// abort reverts the tablet type to its previous state. +func (rt *restoreStateManager) abort() error { + if rt.state != restoreStarted { + return vterrors.New(vtrpcpb.Code_INTERNAL, "restore cannot be aborted in current state") + } + + // Transition to previous tablet type. + if err := rt.tm.tmState.ChangeTabletType(context.Background(), rt.prevTabletType, DBActionNone); err != nil { + return vterrors.Wrapf(err, "failed to change tablet type to %q", topoproto.TabletTypeLString(rt.prevTabletType)) + } + + // Mark the state as done. + rt.state = restoreDone + + return nil +} + +// finish completes the restore by reverting the tablet type to its next state +// (either previous state or a different state requested by setNextTabletType), +// and performs the supplied replication command. +func (rt *restoreStateManager) finish(ctx context.Context, replCmd replicationCommand) error { + if rt.state != restoreStarted { + return vterrors.New(vtrpcpb.Code_INTERNAL, "restore cannot be finished in current state") + } + + // Perform replication command. + switch replCmd.action { + case replicationActionDisable: + rt.logger.Infof("Restore: disabling replication") + if err := rt.tm.disableReplication(context.Background()); err != nil { + return vterrors.Wrap(err, "failed to disable replication") + } + case replicationActionInitialize: + if _, err := rt.tm.initializeReplication(ctx, rt.prevTabletType); err != nil { + return vterrors.Wrap(err, "failed to initialize replication") + } + case replicationActionStart: + if replCmd.position == nil { + return vterrors.New(vtrpcpb.Code_INTERNAL, "cannot start replication with nil position") + } + rt.logger.Infof("Restore: starting replication at position %v", *replCmd.position) + if err := rt.tm.startReplication(ctx, *replCmd.position, rt.prevTabletType); err != nil { + return vterrors.Wrapf(err, "failed to start replication with position %q", replCmd.position.String()) + } + case replicationActionNone: + fallthrough + default: + } + + // Transition to next tablet type. + if err := rt.tm.tmState.ChangeTabletType(context.Background(), rt.nextTabletType, DBActionNone); err != nil { + return vterrors.Wrapf(err, "failed to change tablet type to %q", topoproto.TabletTypeLString(rt.nextTabletType)) + } + + // Mark the state as done. + rt.state = restoreDone + + return nil +} + +// start the restore by changing the tablet type to RESTORE. +// +// Returns true with nil error if the tablet type is successfully changed to +// RESTORE. +// +// Returns false with nil error if the restore cannot be restarted for a benign +// reason, such as data already exists. +// +// Returns false with error if the restore cannot be started at this time (e.g. +// failed to check if data already exists) or the tablet type could not be +// changed to restore. +func (rt *restoreStateManager) start(ctx context.Context) (bool, error) { + if rt.state != restoreNotStarted { + return false, vterrors.New(vtrpcpb.Code_INTERNAL, "restore cannot be started in current state") + } + + if rt.tm.Cnf == nil { + return false, vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, "cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified") + } + + // Check whether we're going to restore before changing to RESTORE type, + // so we keep our PrimaryTermStartTime (if any) if we aren't actually restoring. + ok, err := mysqlctl.ShouldRestore(ctx, rt.logger, rt.tm.Cnf, rt.tm.MysqlDaemon, + topoproto.TabletDbName(rt.tm.Tablet()), rt.deleteBeforeRestore) + if err != nil { + return false, vterrors.Wrap(err, "failed to check if should restore") + } + if !ok { + rt.logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.") + return false, nil + } + + // Store previous tablet type so we can revert back to it later. + prevTabletType := rt.tm.Tablet().Type + // We should not become primary after restore, because that would incorrectly + // start a new primary term, and it's likely our data dir will be out of date. + if prevTabletType == topodatapb.TabletType_PRIMARY { + prevTabletType = rt.tm.baseTabletType + } + rt.prevTabletType = prevTabletType + + // Prepare next tablet type to transition to from RESTORE state. + nextTabletType := prevTabletType + // If we had type BACKUP or RESTORE it's better to set our type to the init-tablet-type to make result of the restore + // similar to completely clean start from scratch. + if (prevTabletType == topodatapb.TabletType_BACKUP || prevTabletType == topodatapb.TabletType_RESTORE) && initTabletType != "" { + initType, err := topoproto.ParseTabletType(initTabletType) + if err == nil { + nextTabletType = initType + } + } + rt.nextTabletType = nextTabletType + + // Transition to RESTORE state. + if err := rt.tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE, DBActionNone); err != nil { + return false, err + } + + // Mark the state as started. + rt.state = restoreStarted + + return true, nil +} + +func (rt *restoreStateManager) setNextTabletType(nextTabletType topodatapb.TabletType) { + rt.nextTabletType = nextTabletType +} diff --git a/go/vt/vttablet/tabletmanager/rpc_backup.go b/go/vt/vttablet/tabletmanager/rpc_backup.go index 60444a03545..9bd6469e9df 100644 --- a/go/vt/vttablet/tabletmanager/rpc_backup.go +++ b/go/vt/vttablet/tabletmanager/rpc_backup.go @@ -207,7 +207,7 @@ func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.L l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // Now we can run restore. - err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, request, mysqlShutdownTimeout) + err = tm.restoreBackupLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, request, mysqlShutdownTimeout) // Re-run health check to be sure to capture any replication delay. tm.QueryServiceControl.BroadcastHealth() diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index 35986a4209a..ec2e995fc2d 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -881,16 +881,26 @@ func (tm *TabletManager) initTablet(ctx context.Context) error { func (tm *TabletManager) handleRestore(ctx context.Context, config *tabletenv.TabletConfig) (bool, error) { // Sanity check for inconsistent flags - if tm.Cnf == nil && restoreFromBackup { - return false, errors.New("you cannot enable --restore-from-backup without a my.cnf file") + if tm.Cnf == nil && (restoreFromBackup || restoreWithClone) { + return false, errors.New("you cannot enable --restore-from-backup or --restore-with-clone without a my.cnf file") + } + if restoreFromBackup && restoreWithClone { + return false, errors.New("--restore-from-backup and --restore-with-clone are mutually exclusive") } if restoreToTimestampStr != "" && restoreToPos != "" { return false, errors.New("--restore-to-timestamp and --restore-to-pos are mutually exclusive") } + if !restoreFromBackup && !restoreWithClone { + return false, nil + } + // Restore in the background - if restoreFromBackup { - go func() { + go func() { + logger := logutil.NewConsoleLogger() + + switch { + case restoreFromBackup: // Zero date will cause us to use the latest, which is the default backupTime := time.Time{} // Or if a backup timestamp was specified then we use the last backup taken at or before that time @@ -913,23 +923,26 @@ func (tm *TabletManager) handleRestore(ctx context.Context, config *tabletenv.Ta // restoreFromBackup will just be a regular action // (same as if it was triggered remotely) - if err := tm.RestoreData(ctx, logutil.NewConsoleLogger(), waitForBackupInterval, false /* deleteBeforeRestore */, backupTime, restoreToTimestamp, restoreToPos, restoreFromBackupAllowedEngines, mysqlShutdownTimeout); err != nil { + if err := tm.RestoreBackup(ctx, logger, waitForBackupInterval, false /* deleteBeforeRestore */, backupTime, restoreToTimestamp, restoreToPos, restoreFromBackupAllowedEngines, mysqlShutdownTimeout); err != nil { log.Exitf("RestoreFromBackup failed: %v", err) } - - // Make sure we have the correct privileges for the DBA user before we start the state manager. - err := tm.waitForDBAGrants(config, mysqlctl.DbaGrantWaitTime) - if err != nil { - log.Exitf("Failed waiting for DBA grants: %v", err) + case restoreWithClone: + if err := tm.restoreFromClone(ctx, logger, false /*deleteBeforeRestore*/); err != nil { + log.Exitf("restoreFromClone failed: %v", err) } + } - // Open the state manager after restore is done. - tm.tmState.Open() - }() - return true, nil - } + // Make sure we have the correct privileges for the DBA user before we start the state manager. + err := tm.waitForDBAGrants(config, mysqlctl.DbaGrantWaitTime) + if err != nil { + log.Exitf("Failed waiting for DBA grants: %v", err) + } + + // Open the state manager after restore is done. + tm.tmState.Open() + }() - return false, nil + return true, nil } // waitForDBAGrants waits for DBA user to have the required privileges to function properly. diff --git a/go/vt/wrangler/testlib/backup_test.go b/go/vt/wrangler/testlib/backup_test.go index b540fc9f8f0..ca5b6dec40e 100644 --- a/go/vt/wrangler/testlib/backup_test.go +++ b/go/vt/wrangler/testlib/backup_test.go @@ -263,7 +263,7 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { RelayLogInfoPath: path.Join(root, "relay-log.info"), } - err = destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* backupTime */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) + err = destTablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* backupTime */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) if err != nil { return err } @@ -303,7 +303,7 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { primary.FakeMysqlDaemon.SetReplicationPositionPos = primary.FakeMysqlDaemon.GetPrimaryPositionLocked() // restore primary from latest backup - require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), + require.NoError(t, primary.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), "RestoreData failed") // tablet was created as PRIMARY, so it's baseTabletType is PRIMARY assert.Equal(t, topodatapb.TabletType_PRIMARY, primary.Tablet.Type) @@ -319,7 +319,7 @@ func testBackupRestore(t *testing.T, cDetails *compressionDetails) error { } // Test restore with the backup timestamp - require.NoError(t, primary.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, backupTime, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), + require.NoError(t, primary.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, backupTime, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout), "RestoreData with backup timestamp failed") assert.Equal(t, topodatapb.TabletType_PRIMARY, primary.Tablet.Type) assert.False(t, primary.FakeMysqlDaemon.Replicating) @@ -521,7 +521,7 @@ func TestBackupRestoreLagged(t *testing.T) { errCh = make(chan error, 1) go func(ctx context.Context, tablet *FakeTablet) { - errCh <- tablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) + errCh <- tablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) }(ctx, destTablet) timer = time.NewTicker(1 * time.Second) @@ -715,7 +715,7 @@ func TestRestoreUnreachablePrimary(t *testing.T) { defer cancel() // Restore will return an error while trying to contact the primary for its position, but otherwise will succeed. // The replication won't be running however, since we can't run errant GTID detection without the primary being online. - err = destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) + err = destTablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout) require.ErrorContains(t, err, "DeadlineExceeded") // verify the full status require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed") @@ -871,7 +871,7 @@ func TestDisableActiveReparents(t *testing.T) { RelayLogInfoPath: path.Join(root, "relay-log.info"), } - require.NoError(t, destTablet.TM.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout)) + require.NoError(t, destTablet.TM.RestoreBackup(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */, time.Time{} /* restoreFromBackupTs */, time.Time{} /* restoreToTimestamp */, "" /* restoreToPos */, []string{} /* ignoreBackupEngines */, mysqlShutdownTimeout)) // verify the full status require.NoError(t, destTablet.FakeMysqlDaemon.CheckSuperQueryList(), "destTablet.FakeMysqlDaemon.CheckSuperQueryList failed") assert.False(t, destTablet.FakeMysqlDaemon.Replicating) diff --git a/test/config.json b/test/config.json index 25b33ec5f1e..1d62c793c71 100644 --- a/test/config.json +++ b/test/config.json @@ -91,6 +91,15 @@ "RetryMax": 1, "Tags": [] }, + "backup_clone": { + "File": "unused.go", + "Args": ["vitess.io/vitess/go/test/endtoend/backup/clone", "-timeout", "30m"], + "Command": [], + "Manual": false, + "Shard": "21", + "RetryMax": 1, + "Tags": [] + }, "backup_pitr": { "File": "unused.go", "Args": ["vitess.io/vitess/go/test/endtoend/backup/pitr", "-timeout", "30m"],