From f23c45bf182fea85b7451004eac79d051ee77e95 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 10 Dec 2024 16:54:02 +0800 Subject: [PATCH 1/4] support changing batch for slow score scheduler Signed-off-by: Ryan Leung --- pkg/schedule/schedulers/evict_slow_store.go | 34 +++++++++-- .../schedulers/evict_slow_store_test.go | 61 +++++++++++++++++++ .../pd-ctl/tests/scheduler/scheduler_test.go | 14 +++++ 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/pkg/schedule/schedulers/evict_slow_store.go b/pkg/schedule/schedulers/evict_slow_store.go index 8d8c014b110..11426d058ea 100644 --- a/pkg/schedule/schedulers/evict_slow_store.go +++ b/pkg/schedule/schedulers/evict_slow_store.go @@ -49,6 +49,7 @@ type evictSlowStoreSchedulerConfig struct { // Duration gap for recovering the candidate, unit: s. RecoveryDurationGap uint64 `json:"recovery-duration"` EvictedStores []uint64 `json:"evict-stores"` + Batch int `json:"batch"` } func initEvictSlowStoreSchedulerConfig() *evictSlowStoreSchedulerConfig { @@ -57,6 +58,7 @@ func initEvictSlowStoreSchedulerConfig() *evictSlowStoreSchedulerConfig { lastSlowStoreCaptureTS: time.Time{}, RecoveryDurationGap: defaultRecoveryDurationGap, EvictedStores: make([]uint64, 0), + Batch: EvictLeaderBatchSize, } } @@ -65,6 +67,7 @@ func (conf *evictSlowStoreSchedulerConfig) clone() *evictSlowStoreSchedulerConfi defer conf.RUnlock() return &evictSlowStoreSchedulerConfig{ RecoveryDurationGap: conf.RecoveryDurationGap, + Batch: conf.Batch, } } @@ -81,8 +84,10 @@ func (conf *evictSlowStoreSchedulerConfig) getKeyRangesByID(id uint64) []core.Ke return []core.KeyRange{core.NewKeyRange("", "")} } -func (*evictSlowStoreSchedulerConfig) getBatch() int { - return EvictLeaderBatchSize +func (conf *evictSlowStoreSchedulerConfig) getBatch() int { + conf.RLock() + defer conf.RUnlock() + return conf.Batch } func (conf *evictSlowStoreSchedulerConfig) evictStore() uint64 { @@ -145,22 +150,39 @@ func (handler *evictSlowStoreHandler) updateConfig(w http.ResponseWriter, r *htt return } recoveryDurationGapFloat, ok := input["recovery-duration"].(float64) - if !ok { + if input["recovery-duration"] != nil && !ok { handler.rd.JSON(w, http.StatusInternalServerError, errors.New("invalid argument for 'recovery-duration'").Error()) return } + batch := handler.config.getBatch() + batchFloat, ok := input["batch"].(float64) + if input["batch"] != nil && !ok { + handler.rd.JSON(w, http.StatusInternalServerError, errors.New("invalid argument for 'batch'").Error()) + return + } + if ok { + if batchFloat < 1 || batchFloat > 10 { + handler.rd.JSON(w, http.StatusBadRequest, "batch is invalid, it should be in [1, 10]") + return + } + batch = (int)(batchFloat) + } + handler.config.Lock() defer handler.config.Unlock() prevRecoveryDurationGap := handler.config.RecoveryDurationGap + prevBatch := handler.config.Batch recoveryDurationGap := uint64(recoveryDurationGapFloat) handler.config.RecoveryDurationGap = recoveryDurationGap + handler.config.Batch = batch if err := handler.config.save(); err != nil { handler.rd.JSON(w, http.StatusInternalServerError, err.Error()) handler.config.RecoveryDurationGap = prevRecoveryDurationGap + handler.config.Batch = prevBatch return } - log.Info("evict-slow-store-scheduler update 'recovery-duration' - unit: s", zap.Uint64("prev", prevRecoveryDurationGap), zap.Uint64("cur", recoveryDurationGap)) + log.Info("evict-slow-store-scheduler update config", zap.Uint64("prev-recovery-duration", prevRecoveryDurationGap), zap.Uint64("cur-recovery-duration", recoveryDurationGap), zap.Int("prev-batch", prevBatch), zap.Int("cur-batch", batch)) handler.rd.JSON(w, http.StatusOK, "Config updated.") } @@ -194,6 +216,9 @@ func (s *evictSlowStoreScheduler) ReloadConfig() error { if err := s.conf.load(newCfg); err != nil { return err } + if newCfg.Batch == 0 { + newCfg.Batch = EvictLeaderBatchSize + } old := make(map[uint64]struct{}) for _, id := range s.conf.EvictedStores { old[id] = struct{}{} @@ -205,6 +230,7 @@ func (s *evictSlowStoreScheduler) ReloadConfig() error { pauseAndResumeLeaderTransfer(s.conf.cluster, constant.In, old, new) s.conf.RecoveryDurationGap = newCfg.RecoveryDurationGap s.conf.EvictedStores = newCfg.EvictedStores + s.conf.Batch = newCfg.Batch return nil } diff --git a/pkg/schedule/schedulers/evict_slow_store_test.go b/pkg/schedule/schedulers/evict_slow_store_test.go index 636d856fc14..94c4f645331 100644 --- a/pkg/schedule/schedulers/evict_slow_store_test.go +++ b/pkg/schedule/schedulers/evict_slow_store_test.go @@ -18,6 +18,7 @@ import ( "context" "testing" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" "github.com/pingcap/failpoint" @@ -146,3 +147,63 @@ func (suite *evictSlowStoreTestSuite) TestEvictSlowStorePersistFail() { ops, _ = suite.es.Schedule(suite.tc, false) re.NotEmpty(ops) } + +func TestEvictSlowStoreBatch(t *testing.T) { + re := require.New(t) + cancel, _, tc, oc := prepareSchedulersTest() + defer cancel() + + // Add stores + tc.AddLeaderStore(1, 0) + tc.AddLeaderStore(2, 0) + tc.AddLeaderStore(3, 0) + // Add regions with leader in store 1 + for i := 0; i < 10000; i++ { + tc.AddLeaderRegion(uint64(i), 1, 2) + } + + storage := storage.NewStorageWithMemoryBackend() + es, err := CreateScheduler(types.EvictSlowStoreScheduler, oc, storage, ConfigSliceDecoder(types.EvictSlowStoreScheduler, []string{}), nil) + re.NoError(err) + re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/schedule/schedulers/transientRecoveryGap", "return(true)")) + storeInfo := tc.GetStore(1) + newStoreInfo := storeInfo.Clone(func(store *core.StoreInfo) { + store.GetStoreStats().SlowScore = 100 + }) + tc.PutStore(newStoreInfo) + re.True(es.IsScheduleAllowed(tc)) + // Add evict leader scheduler to store 1 + ops, _ := es.Schedule(tc, false) + re.Len(ops, 3) + operatorutil.CheckMultiTargetTransferLeader(re, ops[0], operator.OpLeader, 1, []uint64{2}) + re.Equal(types.EvictSlowStoreScheduler.String(), ops[0].Desc()) + + es.(*evictSlowStoreScheduler).conf.Batch = 5 + re.NoError(es.(*evictSlowStoreScheduler).conf.save()) + ops, _ = es.Schedule(tc, false) + re.Len(ops, 5) + + newStoreInfo = storeInfo.Clone(func(store *core.StoreInfo) { + store.GetStoreStats().SlowScore = 0 + }) + + tc.PutStore(newStoreInfo) + // no slow store need to evict. + ops, _ = es.Schedule(tc, false) + re.Empty(ops) + + es2, ok := es.(*evictSlowStoreScheduler) + re.True(ok) + re.Zero(es2.conf.evictStore()) + + // check the value from storage. + var persistValue evictSlowStoreSchedulerConfig + err = es2.conf.load(&persistValue) + re.NoError(err) + + re.Equal(es2.conf.EvictedStores, persistValue.EvictedStores) + re.Zero(persistValue.evictStore()) + re.True(persistValue.readyForRecovery()) + re.Equal(5, persistValue.Batch) + re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/schedule/schedulers/transientRecoveryGap")) +} diff --git a/tools/pd-ctl/tests/scheduler/scheduler_test.go b/tools/pd-ctl/tests/scheduler/scheduler_test.go index 9de62f2b1db..521b64cd5bf 100644 --- a/tools/pd-ctl/tests/scheduler/scheduler_test.go +++ b/tools/pd-ctl/tests/scheduler/scheduler_test.go @@ -566,6 +566,20 @@ func (suite *schedulerTestSuite) checkSchedulerConfig(cluster *pdTests.TestClust }) echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil) re.Contains(echo, "Success!") + + // test evict slow store scheduler + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "evict-slow-store-scheduler"}, nil) + re.Contains(echo, "Success!") + conf = make(map[string]any) + conf1 = make(map[string]any) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler", "show"}, &conf) + re.Equal(3., conf["batch"]) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler", "set", "batch", "10"}, nil) + re.Contains(echo, "Success!") + testutil.Eventually(re, func() bool { + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler"}, &conf1) + return conf1["batch"] == 10. + }) } func (suite *schedulerTestSuite) TestGrantHotRegionScheduler() { From 37dc8aef1f662a678956276d4cbfb471e2ec7ffc Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 10 Dec 2024 18:28:47 +0800 Subject: [PATCH 2/4] fix Signed-off-by: Ryan Leung --- pkg/schedule/schedulers/evict_slow_store_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/schedule/schedulers/evict_slow_store_test.go b/pkg/schedule/schedulers/evict_slow_store_test.go index 94c4f645331..29cf1c4f2fa 100644 --- a/pkg/schedule/schedulers/evict_slow_store_test.go +++ b/pkg/schedule/schedulers/evict_slow_store_test.go @@ -158,7 +158,7 @@ func TestEvictSlowStoreBatch(t *testing.T) { tc.AddLeaderStore(2, 0) tc.AddLeaderStore(3, 0) // Add regions with leader in store 1 - for i := 0; i < 10000; i++ { + for i := range 10000 { tc.AddLeaderRegion(uint64(i), 1, 2) } From 369044a0c60698aa83ef4e194d0a5cd52d3db6e2 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Wed, 11 Dec 2024 15:15:21 +0800 Subject: [PATCH 3/4] fix Signed-off-by: Ryan Leung --- tools/pd-ctl/tests/scheduler/scheduler_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/pd-ctl/tests/scheduler/scheduler_test.go b/tools/pd-ctl/tests/scheduler/scheduler_test.go index 521b64cd5bf..2b48e4abf21 100644 --- a/tools/pd-ctl/tests/scheduler/scheduler_test.go +++ b/tools/pd-ctl/tests/scheduler/scheduler_test.go @@ -572,8 +572,10 @@ func (suite *schedulerTestSuite) checkSchedulerConfig(cluster *pdTests.TestClust re.Contains(echo, "Success!") conf = make(map[string]any) conf1 = make(map[string]any) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler", "show"}, &conf) - re.Equal(3., conf["batch"]) + testutil.Eventually(re, func() bool { + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler", "show"}, &conf) + return conf["batch"] == 3. + }) echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler", "set", "batch", "10"}, nil) re.Contains(echo, "Success!") testutil.Eventually(re, func() bool { From 02ea6ecd896b0aa454842a92d1c60ea29adc7054 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Fri, 17 Jan 2025 17:14:13 +0800 Subject: [PATCH 4/4] add TODO Signed-off-by: Ryan Leung --- pkg/schedule/schedulers/evict_slow_store.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/schedule/schedulers/evict_slow_store.go b/pkg/schedule/schedulers/evict_slow_store.go index 11426d058ea..e3e09430359 100644 --- a/pkg/schedule/schedulers/evict_slow_store.go +++ b/pkg/schedule/schedulers/evict_slow_store.go @@ -49,7 +49,9 @@ type evictSlowStoreSchedulerConfig struct { // Duration gap for recovering the candidate, unit: s. RecoveryDurationGap uint64 `json:"recovery-duration"` EvictedStores []uint64 `json:"evict-stores"` - Batch int `json:"batch"` + // TODO: We only add batch for evict-slow-store-scheduler now. + // If necessary, we also need to support evict-slow-trend-scheduler. + Batch int `json:"batch"` } func initEvictSlowStoreSchedulerConfig() *evictSlowStoreSchedulerConfig {