diff --git a/go/cmd/vtbackup/cli/vtbackup.go b/go/cmd/vtbackup/cli/vtbackup.go index a8bbadd87ba..f661d9aac2d 100644 --- a/go/cmd/vtbackup/cli/vtbackup.go +++ b/go/cmd/vtbackup/cli/vtbackup.go @@ -93,6 +93,7 @@ var ( initShard string concurrency = 4 incrementalFromPos string + restoreWithClone bool // mysqlctld-like flags mysqlPort = 3306 @@ -157,7 +158,7 @@ When run periodically for each shard, vtbackup can ensure these configurable pol * Old backups for the shard are removed. Whatever system launches vtbackup is responsible for the following: - - Running vtbackup with similar flags that would be used for a vttablet and + - Running vtbackup with similar flags that would be used for a vttablet and mysqlctld in the target shard to be backed up. - Provisioning as much disk space for vtbackup as would be given to vttablet. @@ -226,6 +227,7 @@ func init() { utils.SetFlagStringVar(Main.Flags(), &initShard, "init-shard", initShard, "(init parameter) shard to use for this tablet") Main.Flags().IntVar(&concurrency, "concurrency", concurrency, "(init restore parameter) how many concurrent files to restore at once") utils.SetFlagStringVar(Main.Flags(), &incrementalFromPos, "incremental-from-pos", incrementalFromPos, "Position, or name of backup from which to create an incremental backup. Default: empty. If given, then this backup becomes an incremental backup from given position or given backup. If value is 'auto', this backup will be taken from the last successful backup position.") + Main.Flags().BoolVar(&restoreWithClone, "restore-with-clone", restoreWithClone, "(init parameter) will perform the restore phase with MySQL CLONE, requires either --clone-from-primary or --clone-from-tablet") // mysqlctld-like flags utils.SetFlagIntVar(Main.Flags(), &mysqlPort, "mysql-port", mysqlPort, "MySQL port") @@ -457,42 +459,49 @@ func takeBackup(ctx, backgroundCtx context.Context, topoServer *topo.Server, bac return nil } - phase.Set(phaseNameRestoreLastBackup, int64(1)) - defer phase.Set(phaseNameRestoreLastBackup, int64(0)) - backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) - log.Infof("Restoring latest backup from directory %v", backupDir) - restoreAt := time.Now() - params := mysqlctl.RestoreParams{ - Cnf: mycnf, - Mysqld: mysqld, - Logger: logutil.NewConsoleLogger(), - Concurrency: concurrency, - HookExtraEnv: extraEnv, - DeleteBeforeRestore: true, - DbName: dbName, - Keyspace: initKeyspace, - Shard: initShard, - Stats: backupstats.RestoreStats(), - MysqlShutdownTimeout: mysqlShutdownTimeout, - } - backupManifest, err := mysqlctl.Restore(ctx, params) var restorePos replication.Position - switch err { - case nil: - // if err is nil, we expect backupManifest to be non-nil - restorePos = backupManifest.Position - log.Infof("Successfully restored from backup at replication position %v", restorePos) - case mysqlctl.ErrNoBackup: - // There is no backup found, but we may be taking the initial backup of a shard - if !allowFirstBackup { - return errors.New("no backup found; not starting up empty since --initial_backup flag was not enabled") - } - restorePos = replication.Position{} - default: - return fmt.Errorf("can't restore from backup: %v", err) - } - deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) - phase.Set(phaseNameRestoreLastBackup, int64(0)) + if restoreWithClone { + restorePos, err = mysqlctl.CloneFromDonor(ctx, topoServer, mysqld, initKeyspace, initShard) + if err != nil { + return fmt.Errorf("restore with clone failed: %v", err) + } + } else { + phase.Set(phaseNameRestoreLastBackup, int64(1)) + defer phase.Set(phaseNameRestoreLastBackup, int64(0)) + backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) + log.Infof("Restoring latest backup from directory %v", backupDir) + restoreAt := time.Now() + params := mysqlctl.RestoreParams{ + Cnf: mycnf, + Mysqld: mysqld, + Logger: logutil.NewConsoleLogger(), + Concurrency: concurrency, + HookExtraEnv: extraEnv, + DeleteBeforeRestore: true, + DbName: dbName, + Keyspace: initKeyspace, + Shard: initShard, + Stats: backupstats.RestoreStats(), + MysqlShutdownTimeout: mysqlShutdownTimeout, + } + backupManifest, err := mysqlctl.Restore(ctx, params) + switch err { + case nil: + // if err is nil, we expect backupManifest to be non-nil + restorePos = backupManifest.Position + log.Infof("Successfully restored from backup at replication position %v", restorePos) + case mysqlctl.ErrNoBackup: + // There is no backup found, but we may be taking the initial backup of a shard + if !allowFirstBackup { + return errors.New("no backup found; not starting up empty since --initial_backup flag was not enabled") + } + restorePos = replication.Position{} + default: + return fmt.Errorf("can't restore from backup: %v", err) + } + deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) + phase.Set(phaseNameRestoreLastBackup, int64(0)) + } // As of MySQL 8.0.21, you can disable redo logging using the ALTER INSTANCE // DISABLE INNODB REDO_LOG statement. This functionality is intended for diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 87bb94fd43c..38f9f72eddc 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -6,7 +6,7 @@ When run periodically for each shard, vtbackup can ensure these configurable pol * Old backups for the shard are removed. Whatever system launches vtbackup is responsible for the following: - - Running vtbackup with similar flags that would be used for a vttablet and + - Running vtbackup with similar flags that would be used for a vttablet and mysqlctld in the target shard to be backed up. - Provisioning as much disk space for vtbackup as would be given to vttablet. @@ -68,6 +68,8 @@ Flags: --builtinbackup-mysqld-timeout duration how long to wait for mysqld to shutdown at the start of the backup. (default 10m0s) --builtinbackup-progress duration how often to send progress updates when backing up large files. (default 5s) --ceph-backup-storage-config string Path to JSON config file for ceph backup storage. (default "ceph_backup_config.json") + --clone-from-primary Clone data from the primary tablet in the shard using MySQL CLONE REMOTE instead of restoring from backup. Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-tablet. + --clone-from-tablet string Clone data from this tablet using MySQL CLONE REMOTE instead of restoring from backup (tablet alias, e.g., zone1-123). Requires MySQL 8.0.17+. Mutually exclusive with --clone-from-primary. --compression-engine-name string compressor engine used for compression. (default "pargzip") --compression-level int what level to pass to the compressor. (default 1) --concurrency int (init restore parameter) how many concurrent files to restore at once (default 4) @@ -189,6 +191,7 @@ Flags: --mycnf-slow-log-path string mysql slow query log path --mycnf-socket-file string mysql socket file --mycnf-tmp-dir string mysql tmp directory + --mysql-clone-enabled Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+) --mysql-port int MySQL port (default 3306) --mysql-server-version string MySQL server version to advertise. (default "8.4.6-Vitess") --mysql-shell-backup-location string location where the backup will be stored @@ -207,6 +210,7 @@ Flags: --purge-logs-interval duration how often try to remove old logs (default 1h0m0s) --remote-operation-timeout duration time to wait for a remote operation (default 15s) --restart-before-backup Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs. + --restore-with-clone (init parameter) will perform the restore phase with MySQL CLONE, requires either --clone-from-primary or --clone-from-tablet --s3-backup-aws-endpoint string endpoint of the S3 backend (region must be provided). --s3-backup-aws-min-partsize int Minimum part size to use, defaults to 5MiB but can be increased due to the dataset size. (default 5242880) --s3-backup-aws-region string AWS region to use. (default "us-east-1") diff --git a/go/test/endtoend/backup/clone/backup_test.go b/go/test/endtoend/backup/clone/backup_test.go new file mode 100644 index 00000000000..33eb01be722 --- /dev/null +++ b/go/test/endtoend/backup/clone/backup_test.go @@ -0,0 +1,164 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package clone + +import ( + "os" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/vt/log" + vtutils "vitess.io/vitess/go/vt/utils" +) + +func TestCloneBackup(t *testing.T) { + t.Cleanup(func() { removeBackups(t) }) + t.Cleanup(tearDown) + + // Initialize tablets first so we can connect to MySQL. + for _, tablet := range []*cluster.Vttablet{primary, replica1} { + err := localCluster.InitTablet(tablet, keyspaceName, shardName) + require.NoError(t, err) + err = tablet.VttabletProcess.Setup() + require.NoError(t, err) + } + + // Initialize shard primary. + err := localCluster.VtctldClientProcess.InitShardPrimary(keyspaceName, shardName, cell, primary.TabletUID) + require.NoError(t, err) + + // Now check if MySQL version supports clone (need vttablet running to query). + if !mysqlVersionSupportsClone(t, primary) { + t.Skip("Skipping clone test: MySQL version does not support CLONE (requires 8.0.17+)") + } + + // Check if clone plugin is available. + if !clonePluginAvailable(t, primary) { + t.Skip("Skipping clone test: clone plugin not available") + } + + // Set up clean test data (table may have data from previous tests). + _, err = primary.VttabletProcess.QueryTablet(vtInsertTest, keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("TRUNCATE TABLE vt_insert_test", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_test_1')", keyspaceName, true) + require.NoError(t, err) + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('clone_test_2')", keyspaceName, true) + require.NoError(t, err) + + // Verify data exists on primary. + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 2) + + // Wait for replica to catch up. + time.Sleep(2 * time.Second) + cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 2) + + // Take a backup using clone from primary. + log.Infof("Starting vtbackup with --clone-from-primary") + err = vtbackupWithClone(t) + require.NoError(t, err) + + // Verify a backup was created. + backups := verifyBackupCount(t, shardKsName, 1) + assert.NotEmpty(t, backups) + + // Insert more data AFTER the backup was taken. + _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('after_backup')", keyspaceName, true) + require.NoError(t, err) + cluster.VerifyRowsInTablet(t, primary, keyspaceName, 3) + + // Now bring up replica2 and restore from the backup we just created. + // This verifies the clone-based backup actually contains the data. + log.Infof("Restoring replica2 from backup to verify clone worked") + err = localCluster.InitTablet(replica2, keyspaceName, shardName) + require.NoError(t, err) + restore(t, replica2, "replica", "SERVING") + + // Give replica2 time to catch up via replication. + time.Sleep(5 * time.Second) + + // Verify replica2 has ALL the data (2 rows from before backup + 1 from after). + // The 2 pre-backup rows prove the clone-based backup worked. + // The 3rd row proves replication is working after restore. + cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 3) + log.Infof("Clone backup verification successful: replica2 has all data") +} + +func vtbackupWithClone(t *testing.T) error { + mysqlSocket, err := os.CreateTemp("", "vtbackup_clone_test_mysql.sock") + require.NoError(t, err) + defer os.Remove(mysqlSocket.Name()) + + extraArgs := []string{ + "--allow_first_backup", + "--db-credentials-file", dbCredentialFile, + "--mysql-clone-enabled", + vtutils.GetFlagVariantForTests("--mysql-socket"), mysqlSocket.Name(), + // Clone from primary instead of restoring from backup. + "--restore-with-clone", + "--clone-from-primary", + // Clone credentials - use vt_clone user which is created with @'%' host + // and BACKUP_ADMIN privilege in init_db.sql (no password). + "--db-clone-user", "vt_clone", + "--db-clone-password", "", + "--db-clone-use-ssl=false", + } + + log.Infof("Starting vtbackup with clone args: %v", extraArgs) + return localCluster.StartVtbackup(newInitDBFile, false, keyspaceName, shardName, cell, extraArgs...) +} + +func verifyBackupCount(t *testing.T, shardKsName string, expected int) []string { + backups, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("GetBackups", shardKsName) + require.NoError(t, err) + + var result []string + for _, line := range splitLines(backups) { + if line != "" { + result = append(result, line) + } + } + assert.Equalf(t, expected, len(result), "expected %d backups, got %d", expected, len(result)) + return result +} + +func restore(t *testing.T, tablet *cluster.Vttablet, tabletType string, waitForState string) { + // Start tablet with restore enabled. MySQL is already running from TestMain. + log.Infof("restoring tablet %s", time.Now()) + tablet.VttabletProcess.ExtraArgs = []string{"--db-credentials-file", dbCredentialFile} + tablet.VttabletProcess.TabletType = tabletType + tablet.VttabletProcess.ServingStatus = waitForState + tablet.VttabletProcess.SupportsBackup = true + err := tablet.VttabletProcess.Setup() + require.NoError(t, err) +} + +func tearDown() { + for _, tablet := range []*cluster.Vttablet{primary, replica1, replica2} { + if tablet != nil && tablet.VttabletProcess != nil { + _ = tablet.VttabletProcess.TearDown() + } + if tablet != nil { + _ = localCluster.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", tablet.Alias) + } + } +} diff --git a/go/vt/mysqlctl/clone.go b/go/vt/mysqlctl/clone.go index 295af64898d..5b4e0252528 100644 --- a/go/vt/mysqlctl/clone.go +++ b/go/vt/mysqlctl/clone.go @@ -43,8 +43,7 @@ var ( ) func init() { - // TODO: enable these flags for vtbackup. - for _, cmd := range []string{"vttablet" /*, "vtbackup"*/} { + for _, cmd := range []string{"vttablet", "vtbackup"} { servenv.OnParseFor(cmd, registerCloneFlags) } } diff --git a/go/vt/mysqlctl/mysqld.go b/go/vt/mysqlctl/mysqld.go index 37dde908366..c032dff2db8 100644 --- a/go/vt/mysqlctl/mysqld.go +++ b/go/vt/mysqlctl/mysqld.go @@ -138,6 +138,9 @@ func init() { for _, cmd := range []string{"mysqlctl", "mysqlctld", "vtcombo", "vttablet", "vttestserver"} { servenv.OnParseFor(cmd, registerPoolFlags) } + for _, cmd := range []string{"mysqlctl", "mysqlctld", "vtcombo", "vttablet", "vttestserver", "vtbackup"} { + servenv.OnParseFor(cmd, registerMySQLDCloneFlags) + } } func registerMySQLDFlags(fs *pflag.FlagSet) { @@ -145,6 +148,9 @@ func registerMySQLDFlags(fs *pflag.FlagSet) { utils.SetFlagStringVar(fs, &mycnfTemplateFile, "mysqlctl-mycnf-template", mycnfTemplateFile, "template file to use for generating the my.cnf file during server init") utils.SetFlagStringVar(fs, &socketFile, "mysqlctl-socket", socketFile, "socket file to use for remote mysqlctl actions (empty for local actions)") utils.SetFlagDurationVar(fs, &replicationConnectRetry, "replication-connect-retry", replicationConnectRetry, "how long to wait in between replica reconnect attempts. Only precise to the second.") +} + +func registerMySQLDCloneFlags(fs *pflag.FlagSet) { utils.SetFlagBoolVar(fs, &mysqlCloneEnabled, "mysql-clone-enabled", mysqlCloneEnabled, "Enable MySQL CLONE plugin and user for backup/replica provisioning (requires MySQL 8.0.17+)") }