Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: support changing batch for slow score scheduler #8888

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 32 additions & 4 deletions pkg/schedule/schedulers/evict_slow_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@
// Duration gap for recovering the candidate, unit: s.
RecoveryDurationGap uint64 `json:"recovery-duration"`
EvictedStores []uint64 `json:"evict-stores"`
// TODO: We only add batch for evict-slow-store-scheduler now.
// If necessary, we also need to support evict-slow-trend-scheduler.
Batch int `json:"batch"`
}

func initEvictSlowStoreSchedulerConfig() *evictSlowStoreSchedulerConfig {
Expand All @@ -57,6 +60,7 @@
lastSlowStoreCaptureTS: time.Time{},
RecoveryDurationGap: defaultRecoveryDurationGap,
EvictedStores: make([]uint64, 0),
Batch: EvictLeaderBatchSize,
}
}

Expand All @@ -65,6 +69,7 @@
defer conf.RUnlock()
return &evictSlowStoreSchedulerConfig{
RecoveryDurationGap: conf.RecoveryDurationGap,
Batch: conf.Batch,
}
}

Expand All @@ -81,8 +86,10 @@
return []core.KeyRange{core.NewKeyRange("", "")}
}

func (*evictSlowStoreSchedulerConfig) getBatch() int {
return EvictLeaderBatchSize
// getBatch returns the configured leader-eviction batch size, guarded by a
// read lock so concurrent config updates are observed consistently.
func (conf *evictSlowStoreSchedulerConfig) getBatch() int {
	conf.RLock()
	batch := conf.Batch
	conf.RUnlock()
	return batch
}

func (conf *evictSlowStoreSchedulerConfig) evictStore() uint64 {
Expand Down Expand Up @@ -145,22 +152,39 @@
return
}
recoveryDurationGapFloat, ok := input["recovery-duration"].(float64)
if !ok {
if input["recovery-duration"] != nil && !ok {
handler.rd.JSON(w, http.StatusInternalServerError, errors.New("invalid argument for 'recovery-duration'").Error())
return
}

batch := handler.config.getBatch()
batchFloat, ok := input["batch"].(float64)
if input["batch"] != nil && !ok {
handler.rd.JSON(w, http.StatusInternalServerError, errors.New("invalid argument for 'batch'").Error())
return
}

Check warning on line 165 in pkg/schedule/schedulers/evict_slow_store.go

View check run for this annotation

Codecov / codecov/patch

pkg/schedule/schedulers/evict_slow_store.go#L163-L165

Added lines #L163 - L165 were not covered by tests
if ok {
if batchFloat < 1 || batchFloat > 10 {
handler.rd.JSON(w, http.StatusBadRequest, "batch is invalid, it should be in [1, 10]")
return
}

Check warning on line 170 in pkg/schedule/schedulers/evict_slow_store.go

View check run for this annotation

Codecov / codecov/patch

pkg/schedule/schedulers/evict_slow_store.go#L168-L170

Added lines #L168 - L170 were not covered by tests
batch = (int)(batchFloat)
}

handler.config.Lock()
defer handler.config.Unlock()
prevRecoveryDurationGap := handler.config.RecoveryDurationGap
prevBatch := handler.config.Batch
recoveryDurationGap := uint64(recoveryDurationGapFloat)
handler.config.RecoveryDurationGap = recoveryDurationGap
handler.config.Batch = batch
if err := handler.config.save(); err != nil {
handler.rd.JSON(w, http.StatusInternalServerError, err.Error())
handler.config.RecoveryDurationGap = prevRecoveryDurationGap
handler.config.Batch = prevBatch

Check warning on line 184 in pkg/schedule/schedulers/evict_slow_store.go

View check run for this annotation

Codecov / codecov/patch

pkg/schedule/schedulers/evict_slow_store.go#L184

Added line #L184 was not covered by tests
return
}
log.Info("evict-slow-store-scheduler update 'recovery-duration' - unit: s", zap.Uint64("prev", prevRecoveryDurationGap), zap.Uint64("cur", recoveryDurationGap))
log.Info("evict-slow-store-scheduler update config", zap.Uint64("prev-recovery-duration", prevRecoveryDurationGap), zap.Uint64("cur-recovery-duration", recoveryDurationGap), zap.Int("prev-batch", prevBatch), zap.Int("cur-batch", batch))
handler.rd.JSON(w, http.StatusOK, "Config updated.")
}

Expand Down Expand Up @@ -194,6 +218,9 @@
if err := s.conf.load(newCfg); err != nil {
return err
}
if newCfg.Batch == 0 {
newCfg.Batch = EvictLeaderBatchSize
}

Check warning on line 223 in pkg/schedule/schedulers/evict_slow_store.go

View check run for this annotation

Codecov / codecov/patch

pkg/schedule/schedulers/evict_slow_store.go#L222-L223

Added lines #L222 - L223 were not covered by tests
old := make(map[uint64]struct{})
for _, id := range s.conf.EvictedStores {
old[id] = struct{}{}
Expand All @@ -205,6 +232,7 @@
pauseAndResumeLeaderTransfer(s.conf.cluster, constant.In, old, new)
s.conf.RecoveryDurationGap = newCfg.RecoveryDurationGap
s.conf.EvictedStores = newCfg.EvictedStores
s.conf.Batch = newCfg.Batch
return nil
}

Expand Down
61 changes: 61 additions & 0 deletions pkg/schedule/schedulers/evict_slow_store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"context"
"testing"

"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"

"github.com/pingcap/failpoint"
Expand Down Expand Up @@ -146,3 +147,63 @@ func (suite *evictSlowStoreTestSuite) TestEvictSlowStorePersistFail() {
ops, _ = suite.es.Schedule(suite.tc, false)
re.NotEmpty(ops)
}

// TestEvictSlowStoreBatch verifies that the evict-slow-store scheduler honors
// the configurable "batch" size: the default batch yields 3 operators per
// schedule round, and raising Batch to 5 yields 5. It also checks that the
// batch value survives a round-trip through persistent storage.
func TestEvictSlowStoreBatch(t *testing.T) {
	re := require.New(t)
	cancel, _, tc, oc := prepareSchedulersTest()
	defer cancel()

	// Add stores
	tc.AddLeaderStore(1, 0)
	tc.AddLeaderStore(2, 0)
	tc.AddLeaderStore(3, 0)
	// Add regions with leader in store 1 so there is plenty of work to batch.
	for i := range 10000 {
		tc.AddLeaderRegion(uint64(i), 1, 2)
	}

	// NOTE: renamed from `storage` to avoid shadowing the imported storage package.
	memStorage := storage.NewStorageWithMemoryBackend()
	es, err := CreateScheduler(types.EvictSlowStoreScheduler, oc, memStorage, ConfigSliceDecoder(types.EvictSlowStoreScheduler, []string{}), nil)
	re.NoError(err)
	re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/schedule/schedulers/transientRecoveryGap", "return(true)"))
	// Disable via defer so the failpoint is cleaned up even if an assertion
	// below fails and aborts the test early; previously it was only disabled
	// at the end of the happy path.
	defer func() {
		re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/schedule/schedulers/transientRecoveryGap"))
	}()

	// Mark store 1 as slow so the scheduler starts evicting its leaders.
	storeInfo := tc.GetStore(1)
	newStoreInfo := storeInfo.Clone(func(store *core.StoreInfo) {
		store.GetStoreStats().SlowScore = 100
	})
	tc.PutStore(newStoreInfo)
	re.True(es.IsScheduleAllowed(tc))
	// Default batch: expect 3 transfer-leader operators per schedule round.
	ops, _ := es.Schedule(tc, false)
	re.Len(ops, 3)
	operatorutil.CheckMultiTargetTransferLeader(re, ops[0], operator.OpLeader, 1, []uint64{2})
	re.Equal(types.EvictSlowStoreScheduler.String(), ops[0].Desc())

	// Raise the batch to 5 and persist; the next round should emit 5 operators.
	es.(*evictSlowStoreScheduler).conf.Batch = 5
	re.NoError(es.(*evictSlowStoreScheduler).conf.save())
	ops, _ = es.Schedule(tc, false)
	re.Len(ops, 5)

	// Recover the store's slow score.
	newStoreInfo = storeInfo.Clone(func(store *core.StoreInfo) {
		store.GetStoreStats().SlowScore = 0
	})
	tc.PutStore(newStoreInfo)
	// no slow store need to evict.
	ops, _ = es.Schedule(tc, false)
	re.Empty(ops)

	es2, ok := es.(*evictSlowStoreScheduler)
	re.True(ok)
	re.Zero(es2.conf.evictStore())

	// Reload the persisted config and confirm it matches the in-memory state,
	// including the updated batch size.
	var persistValue evictSlowStoreSchedulerConfig
	err = es2.conf.load(&persistValue)
	re.NoError(err)

	re.Equal(es2.conf.EvictedStores, persistValue.EvictedStores)
	re.Zero(persistValue.evictStore())
	re.True(persistValue.readyForRecovery())
	re.Equal(5, persistValue.Batch)
}
16 changes: 16 additions & 0 deletions tools/pd-ctl/tests/scheduler/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,22 @@ func (suite *schedulerTestSuite) checkSchedulerConfig(cluster *pdTests.TestClust
})
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil)
re.Contains(echo, "Success!")

// test evict slow store scheduler
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "evict-slow-store-scheduler"}, nil)
re.Contains(echo, "Success!")
conf = make(map[string]any)
conf1 = make(map[string]any)
testutil.Eventually(re, func() bool {
mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler", "show"}, &conf)
return conf["batch"] == 3.
})
echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler", "set", "batch", "10"}, nil)
re.Contains(echo, "Success!")
testutil.Eventually(re, func() bool {
mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-slow-store-scheduler"}, &conf1)
return conf1["batch"] == 10.
})
}

func (suite *schedulerTestSuite) TestGrantHotRegionScheduler() {
Expand Down
Loading