*: reset prepare checker once the cache is reset #8860

Open
wants to merge 18 commits into base: master
1 change: 1 addition & 0 deletions pkg/mcs/scheduling/server/apis/v1/api.go
@@ -282,6 +282,7 @@ func deleteAllRegionCache(c *gin.Context) {
return
}
cluster.ResetRegionCache()
cluster.ResetPrepared()
c.String(http.StatusOK, "All regions are removed from server cache.")
}

14 changes: 6 additions & 8 deletions pkg/mcs/scheduling/server/cluster.go
@@ -8,7 +8,6 @@ import (
"time"

"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
"github.com/pingcap/kvproto/pkg/metapb"
"github.com/pingcap/kvproto/pkg/pdpb"
"github.com/pingcap/kvproto/pkg/schedulingpb"
@@ -67,7 +66,6 @@ type Cluster struct {
const (
regionLabelGCInterval = time.Hour
requestTimeout = 3 * time.Second
collectWaitTime = time.Minute

// heartbeat relative const
heartbeatTaskRunner = "heartbeat-task-runner"
@@ -491,12 +489,7 @@ func (c *Cluster) runUpdateStoreStats() {
func (c *Cluster) runCoordinator() {
defer logutil.LogPanic()
defer c.wg.Done()
// force wait for 1 minute to make prepare checker won't be directly skipped
runCollectWaitTime := collectWaitTime
failpoint.Inject("changeRunCollectWaitTime", func() {
runCollectWaitTime = 1 * time.Second
})
c.coordinator.RunUntilStop(runCollectWaitTime)
c.coordinator.RunUntilStop()
}

func (c *Cluster) runMetricsCollectionJob() {
@@ -706,6 +699,11 @@ func (c *Cluster) SetPrepared() {
c.coordinator.GetPrepareChecker().SetPrepared()
}

// ResetPrepared resets the prepare checker.
func (c *Cluster) ResetPrepared() {
c.coordinator.GetPrepareChecker().ResetPrepared()
}

// IsSchedulingHalted returns whether the scheduling is halted.
// Currently, the microservice scheduling is halted when:
// - The `HaltScheduling` persist option is set to true.
3 changes: 3 additions & 0 deletions pkg/mock/mockcluster/mockcluster.go
@@ -909,3 +909,6 @@ func (mc *Cluster) ObserveRegionsStats() {
storeIDs, writeBytesRates, writeKeysRates := mc.BasicCluster.GetStoresWriteRate()
mc.HotStat.ObserveRegionsStats(storeIDs, writeBytesRates, writeKeysRates)
}

// ResetPrepared mocks method.
func (*Cluster) ResetPrepared() {}
7 changes: 6 additions & 1 deletion pkg/schedule/checker/checker_controller.go
@@ -89,10 +89,11 @@ type Controller struct {
// patrolRegionScanLimit is the limit of regions to scan.
// It is calculated by the number of regions.
patrolRegionScanLimit int
prepareChecker *sche.PrepareChecker
}

// NewController create a new Controller.
func NewController(ctx context.Context, cluster sche.CheckerCluster, conf config.CheckerConfigProvider, ruleManager *placement.RuleManager, labeler *labeler.RegionLabeler, opController *operator.Controller) *Controller {
func NewController(ctx context.Context, cluster sche.CheckerCluster, conf config.CheckerConfigProvider, ruleManager *placement.RuleManager, labeler *labeler.RegionLabeler, opController *operator.Controller, prepareChecker *sche.PrepareChecker) *Controller {
Contributor: BTW, this function has too many parameters. Do we have any good way to solve it?
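One common Go pattern, sketched below only as an illustration and not part of this PR, is to bundle the constructor's dependencies into a parameter struct so the signature stays stable as dependencies grow; the struct and helper names here are hypothetical:

// Hypothetical sketch in the same package as NewController; not part of the diff.
type controllerDeps struct {
	cluster        sche.CheckerCluster
	conf           config.CheckerConfigProvider
	ruleManager    *placement.RuleManager
	labeler        *labeler.RegionLabeler
	opController   *operator.Controller
	prepareChecker *sche.PrepareChecker
}

// newControllerFromDeps forwards to the existing constructor unchanged.
func newControllerFromDeps(ctx context.Context, d controllerDeps) *Controller {
	return NewController(ctx, d.cluster, d.conf, d.ruleManager, d.labeler, d.opController, d.prepareChecker)
}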

pendingProcessedRegions := cache.NewIDTTL(ctx, time.Minute, 3*time.Minute)
c := &Controller{
ctx: ctx,
@@ -111,6 +112,7 @@ func NewController(ctx context.Context, cluster sche.CheckerCluster, conf config
patrolRegionContext: &PatrolRegionContext{},
interval: cluster.GetCheckerConfig().GetPatrolRegionInterval(),
patrolRegionScanLimit: calculateScanLimit(cluster),
prepareChecker: prepareChecker,
}
c.duration.Store(time.Duration(0))
return c
@@ -134,6 +136,9 @@ func (c *Controller) PatrolRegions() {
case <-ticker.C:
c.updateTickerIfNeeded(ticker)
c.updatePatrolWorkersIfNeeded()
if !c.prepareChecker.Check(c.cluster.GetBasicCluster()) {
continue
}
if c.cluster.IsSchedulingHalted() {
for len(c.patrolRegionContext.regionChan) > 0 {
<-c.patrolRegionContext.regionChan
35 changes: 11 additions & 24 deletions pkg/schedule/coordinator.go
@@ -44,7 +44,6 @@ import (

const (
runSchedulerCheckInterval = 3 * time.Second
collectTimeout = 5 * time.Minute
maxLoadConfigRetries = 10
// pushOperatorTickInterval is the interval try to push the operator.
pushOperatorTickInterval = 500 * time.Millisecond
@@ -66,7 +65,7 @@ type Coordinator struct {
schedulersInitialized bool

cluster sche.ClusterInformer
prepareChecker *prepareChecker
prepareChecker *sche.PrepareChecker
checkers *checker.Controller
regionScatterer *scatter.RegionScatterer
regionSplitter *splitter.RegionSplitter
@@ -80,15 +79,16 @@
// NewCoordinator creates a new Coordinator.
func NewCoordinator(parentCtx context.Context, cluster sche.ClusterInformer, hbStreams *hbstream.HeartbeatStreams) *Coordinator {
ctx, cancel := context.WithCancel(parentCtx)
prepareChecker := sche.NewPrepareChecker()
opController := operator.NewController(ctx, cluster.GetBasicCluster(), cluster.GetSharedConfig(), hbStreams)
schedulers := schedulers.NewController(ctx, cluster, cluster.GetStorage(), opController)
checkers := checker.NewController(ctx, cluster, cluster.GetCheckerConfig(), cluster.GetRuleManager(), cluster.GetRegionLabeler(), opController)
schedulers := schedulers.NewController(ctx, cluster, cluster.GetStorage(), opController, prepareChecker)
checkers := checker.NewController(ctx, cluster, cluster.GetCheckerConfig(), cluster.GetRuleManager(), cluster.GetRegionLabeler(), opController, prepareChecker)
return &Coordinator{
ctx: ctx,
cancel: cancel,
schedulersInitialized: false,
cluster: cluster,
prepareChecker: newPrepareChecker(),
prepareChecker: prepareChecker,
checkers: checkers,
regionScatterer: scatter.NewRegionScatterer(ctx, cluster, opController, checkers.AddPendingProcessedRegions),
regionSplitter: splitter.NewRegionSplitter(cluster, splitter.NewSplitRegionsHandler(cluster, opController), checkers.AddPendingProcessedRegions),
@@ -204,8 +204,8 @@ func (c *Coordinator) driveSlowNodeScheduler() {
}

// RunUntilStop runs the coordinator until receiving the stop signal.
func (c *Coordinator) RunUntilStop(collectWaitTime ...time.Duration) {
c.Run(collectWaitTime...)
func (c *Coordinator) RunUntilStop() {
c.Run()
<-c.ctx.Done()
log.Info("coordinator is stopping")
c.GetSchedulersController().Wait()
@@ -214,25 +214,12 @@ func (c *Coordinator) RunUntilStop(collectWaitTime ...time.Duration) {
}

// Run starts coordinator.
func (c *Coordinator) Run(collectWaitTime ...time.Duration) {
func (c *Coordinator) Run() {
ticker := time.NewTicker(runSchedulerCheckInterval)
failpoint.Inject("changeCoordinatorTicker", func() {
ticker.Reset(100 * time.Millisecond)
})
defer ticker.Stop()
log.Info("coordinator starts to collect cluster information")
for {
if c.ShouldRun(collectWaitTime...) {
log.Info("coordinator has finished cluster information preparation")
break
}
select {
case <-ticker.C:
case <-c.ctx.Done():
log.Info("coordinator stops running")
return
}
Contributor: Do we need a similar log?

}
log.Info("coordinator starts to run schedulers")
c.InitSchedulers(true)

@@ -547,8 +534,8 @@ func ResetHotSpotMetrics() {
}

// ShouldRun returns true if the coordinator should run.
func (c *Coordinator) ShouldRun(collectWaitTime ...time.Duration) bool {
return c.prepareChecker.check(c.cluster.GetBasicCluster(), collectWaitTime...)
func (c *Coordinator) ShouldRun() bool {
return c.prepareChecker.Check(c.cluster.GetBasicCluster())
}

// GetSchedulersController returns the schedulers controller.
@@ -616,7 +603,7 @@ func (c *Coordinator) GetRuleChecker() *checker.RuleChecker {
}

// GetPrepareChecker returns the prepare checker.
func (c *Coordinator) GetPrepareChecker() *prepareChecker {
func (c *Coordinator) GetPrepareChecker() *sche.PrepareChecker {
return c.prepareChecker
}

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

package schedule
package core

import (
"time"
@@ -23,32 +23,34 @@ import (
"go.uber.org/zap"
)

type prepareChecker struct {
const collectTimeout = 5 * time.Minute

// PrepareChecker is used to check if the coordinator has finished cluster information preparation.
type PrepareChecker struct {
syncutil.RWMutex
start time.Time
prepared bool
}

func newPrepareChecker() *prepareChecker {
return &prepareChecker{
// NewPrepareChecker creates a new PrepareChecker.
func NewPrepareChecker() *PrepareChecker {
return &PrepareChecker{
start: time.Now(),
}
}

// Before starting up the scheduler, we need to take the proportion of the regions on each store into consideration.
func (checker *prepareChecker) check(c *core.BasicCluster, collectWaitTime ...time.Duration) bool {
checker.Lock()
defer checker.Unlock()
if checker.prepared {
// Check checks if the coordinator has finished cluster information preparation.
func (checker *PrepareChecker) Check(c *core.BasicCluster) bool {
if checker.IsPrepared() {
return true
}
checker.Lock()
defer checker.Unlock()

if time.Since(checker.start) > collectTimeout {
checker.prepared = true
return true
}
if len(collectWaitTime) > 0 && time.Since(checker.start) < collectWaitTime[0] {
return false
}
notLoadedFromRegionsCnt := c.GetClusterNotFromStorageRegionsCnt()
totalRegionsCnt := c.GetTotalRegionCount()
// The number of active regions should be more than total region of all stores * core.CollectFactor
@@ -61,7 +63,7 @@ func (checker *prepareChecker) check(c *core.BasicCluster, collectWaitTime ...ti
}
storeID := store.GetID()
// It is used to avoid sudden scheduling when scheduling service is just started.
if len(collectWaitTime) > 0 && (float64(store.GetStoreStats().GetRegionCount())*core.CollectFactor > float64(c.GetNotFromStorageRegionsCntByStore(storeID))) {
if float64(store.GetStoreStats().GetRegionCount())*core.CollectFactor > float64(c.GetNotFromStorageRegionsCntByStore(storeID)) {
return false
}
if !c.IsStorePrepared(storeID) {
@@ -74,15 +76,23 @@ func (checker *prepareChecker) check(c *core.BasicCluster, collectWaitTime ...ti
}

// IsPrepared returns whether the coordinator is prepared.
func (checker *prepareChecker) IsPrepared() bool {
func (checker *PrepareChecker) IsPrepared() bool {
checker.RLock()
defer checker.RUnlock()
return checker.prepared
}

// SetPrepared is for test purpose
func (checker *prepareChecker) SetPrepared() {
func (checker *PrepareChecker) SetPrepared() {
checker.Lock()
defer checker.Unlock()
checker.prepared = true
}

// ResetPrepared is for test purpose
func (checker *PrepareChecker) ResetPrepared() {
checker.Lock()
defer checker.Unlock()
checker.prepared = false
checker.start = time.Now()
}
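For orientation, not as part of the diff: ResetPrepared simply flips `prepared` back to false and restarts the collection clock, so Check has to pass again (enough not-from-storage regions relative to `totalRegionsCnt * core.CollectFactor`, or the 5-minute `collectTimeout`) before patrol and schedulers resume. A minimal standalone sketch of the exported lifecycle, assuming the checker now lives in the schedule core package that the rest of this PR imports as `sche`:

package main

import (
	"fmt"

	sche "github.com/tikv/pd/pkg/schedule/core" // assumed new home of PrepareChecker
)

func main() {
	checker := sche.NewPrepareChecker()

	// Tests (or callers that know the cluster is already warmed up) can force the prepared state.
	checker.SetPrepared()
	fmt.Println(checker.IsPrepared()) // true

	// Dropping the region cache resets the checker, so region collection has to
	// satisfy Check again before scheduling resumes.
	checker.ResetPrepared()
	fmt.Println(checker.IsPrepared()) // false
}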
7 changes: 6 additions & 1 deletion pkg/schedule/schedulers/scheduler_controller.go
@@ -56,17 +56,19 @@ type Controller struct {
// which will only be initialized and used in the API service mode now.
schedulerHandlers map[string]http.Handler
opController *operator.Controller
prepareChecker *sche.PrepareChecker
}

// NewController creates a scheduler controller.
func NewController(ctx context.Context, cluster sche.SchedulerCluster, storage endpoint.ConfigStorage, opController *operator.Controller) *Controller {
func NewController(ctx context.Context, cluster sche.SchedulerCluster, storage endpoint.ConfigStorage, opController *operator.Controller, prepareChecker *sche.PrepareChecker) *Controller {
return &Controller{
ctx: ctx,
cluster: cluster,
storage: storage,
schedulers: make(map[string]*ScheduleController),
schedulerHandlers: make(map[string]http.Handler),
opController: opController,
prepareChecker: prepareChecker,
}
}

@@ -368,6 +370,9 @@ func (c *Controller) runScheduler(s *ScheduleController) {
for {
select {
case <-ticker.C:
if !c.prepareChecker.Check(c.cluster.GetBasicCluster()) {
continue
}
diagnosable := s.IsDiagnosticAllowed()
if !s.AllowSchedule(diagnosable) {
continue
2 changes: 2 additions & 0 deletions pkg/unsaferecovery/unsafe_recovery_controller.go
@@ -107,6 +107,7 @@ const (
type cluster interface {
core.StoreSetInformer

ResetPrepared()
ResetRegionCache()
AllocID() (uint64, error)
BuryStore(storeID uint64, forceBury bool) error
@@ -545,6 +546,7 @@ func (u *Controller) changeStage(stage stage) {
if u.step > 1 {
// == 1 means no operation has done, no need to invalid cache
u.cluster.ResetRegionCache()
u.cluster.ResetPrepared()
}
output.Info = "Unsafe recovery Finished"
output.Details = u.getAffectedTableDigest()
1 change: 1 addition & 0 deletions server/api/admin.go
@@ -124,6 +124,7 @@ func (h *adminHandler) DeleteAllRegionCache(w http.ResponseWriter, r *http.Reque
var err error
rc := getCluster(r)
rc.ResetRegionCache()
rc.ResetPrepared()
msg := "All regions are removed from server cache."
if rc.IsServiceIndependent(constant.SchedulingServiceName) {
err = h.deleteRegionCacheInSchedulingServer()
7 changes: 7 additions & 0 deletions server/cluster/scheduling_controller.go
@@ -480,6 +480,13 @@ func (sc *schedulingController) SetPrepared() {
sc.coordinator.GetPrepareChecker().SetPrepared()
}

// ResetPrepared resets the prepare checker.
func (sc *schedulingController) ResetPrepared() {
sc.mu.RLock()
defer sc.mu.RUnlock()
sc.coordinator.GetPrepareChecker().ResetPrepared()
}

// IsSchedulingControllerRunning returns whether the scheduling controller is running. Only for test purpose.
func (sc *schedulingController) IsSchedulingControllerRunning() bool {
sc.mu.RLock()
2 changes: 0 additions & 2 deletions tests/integrations/mcs/scheduling/api_test.go
@@ -41,15 +41,13 @@ func TestAPI(t *testing.T) {
func (suite *apiTestSuite) SetupSuite() {
re := suite.Require()
re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/schedule/changeCoordinatorTicker", `return(true)`))
re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/mcs/scheduling/server/changeRunCollectWaitTime", `return(true)`))
suite.env = tests.NewSchedulingTestEnvironment(suite.T())
}

func (suite *apiTestSuite) TearDownSuite() {
suite.env.Cleanup()
re := suite.Require()
re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/schedule/changeCoordinatorTicker"))
re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/mcs/scheduling/server/changeRunCollectWaitTime"))
}

func (suite *apiTestSuite) TestGetCheckerByName() {