From 7731ee4df6098257322a339c102b59655055a19f Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 21 May 2024 17:22:17 +0800 Subject: [PATCH 01/21] pkg: reduce the allocation of observe (#8188) ref tikv/pd#7897 Signed-off-by: Ryan Leung --- pkg/statistics/region_collection.go | 124 +++++++++++++++++----------- 1 file changed, 78 insertions(+), 46 deletions(-) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index e4c159cf22d..30197dd43ea 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -222,61 +222,93 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store // Better to make sure once any of these conditions changes, it will trigger the heartbeat `save_cache`. // Otherwise, the state may be out-of-date for a long time, which needs another way to apply the change ASAP. // For example, see `RegionStatsNeedUpdate` above to know how `OversizedRegion` and `UndersizedRegion` are updated. - conditions := map[RegionStatisticType]bool{ - MissPeer: len(peers) < desiredReplicas, - ExtraPeer: len(peers) > desiredReplicas, - DownPeer: len(downPeers) > 0, - PendingPeer: len(pendingPeers) > 0, - OfflinePeer: func() bool { - for _, store := range stores { - if store.IsRemoving() { - peer := region.GetStorePeer(store.GetID()) - if peer != nil { - return true - } - } + var conditions RegionStatisticType + if len(peers) < desiredReplicas { + conditions |= MissPeer + } + if len(peers) > desiredReplicas { + conditions |= ExtraPeer + } + if len(downPeers) > 0 { + conditions |= DownPeer + } + if len(pendingPeers) > 0 { + conditions |= PendingPeer + } + for _, store := range stores { + if store.IsRemoving() { + peer := region.GetStorePeer(store.GetID()) + if peer != nil { + conditions |= OfflinePeer + break } - return false - }(), - LearnerPeer: len(learners) > 0, - EmptyRegion: regionSize <= core.EmptyRegionApproximateSize, - OversizedRegion: region.IsOversized(regionMaxSize, regionMaxKeys), - UndersizedRegion: region.NeedMerge(maxMergeRegionSize, maxMergeRegionKeys), - WitnessLeader: leaderIsWitness, + } + } + if len(learners) > 0 { + conditions |= LearnerPeer + } + if regionSize <= core.EmptyRegionApproximateSize { + conditions |= EmptyRegion + } + if region.IsOversized(regionMaxSize, regionMaxKeys) { + conditions |= OversizedRegion + } + if region.NeedMerge(maxMergeRegionSize, maxMergeRegionKeys) { + conditions |= UndersizedRegion + } + if leaderIsWitness { + conditions |= WitnessLeader } // Check if the region meets any of the conditions and update the corresponding info. 
regionID := region.GetID() - for typ, c := range conditions { - if c { - info := r.stats[typ][regionID] - if typ == DownPeer { - if info == nil { - info = &RegionInfoWithTS{} - } - if info.(*RegionInfoWithTS).startDownPeerTS != 0 { - regionDownPeerDuration.Observe(float64(time.Now().Unix() - info.(*RegionInfoWithTS).startDownPeerTS)) + for i := 0; i < len(regionStatisticTypes); i++ { + condition := RegionStatisticType(1 << i) + if conditions&condition == 0 { + continue + } + info := r.stats[condition][regionID] + // The condition is met + switch condition { + case MissPeer: + if info == nil { + info = &RegionInfoWithTS{} + } + if len(voters) < desiredVoters { + if info.(*RegionInfoWithTS).startMissVoterPeerTS != 0 { + regionMissVoterPeerDuration.Observe(float64(time.Now().Unix() - info.(*RegionInfoWithTS).startMissVoterPeerTS)) } else { - info.(*RegionInfoWithTS).startDownPeerTS = time.Now().Unix() - logDownPeerWithNoDisconnectedStore(region, stores) - } - } else if typ == MissPeer { - if info == nil { - info = &RegionInfoWithTS{} - } - if len(voters) < desiredVoters { - if info.(*RegionInfoWithTS).startMissVoterPeerTS != 0 { - regionMissVoterPeerDuration.Observe(float64(time.Now().Unix() - info.(*RegionInfoWithTS).startMissVoterPeerTS)) - } else { - info.(*RegionInfoWithTS).startMissVoterPeerTS = time.Now().Unix() - } + info.(*RegionInfoWithTS).startMissVoterPeerTS = time.Now().Unix() } + } + case DownPeer: + if info == nil { + info = &RegionInfoWithTS{} + } + if info.(*RegionInfoWithTS).startDownPeerTS != 0 { + regionDownPeerDuration.Observe(float64(time.Now().Unix() - info.(*RegionInfoWithTS).startDownPeerTS)) } else { - info = struct{}{} + info.(*RegionInfoWithTS).startDownPeerTS = time.Now().Unix() + logDownPeerWithNoDisconnectedStore(region, stores) } - - r.stats[typ][regionID] = info - peerTypeIndex |= typ + case ExtraPeer: + fallthrough + case PendingPeer: + fallthrough + case OfflinePeer: + fallthrough + case LearnerPeer: + fallthrough + case EmptyRegion: + fallthrough + case OversizedRegion: + fallthrough + case UndersizedRegion: + fallthrough + case WitnessLeader: + info = struct{}{} } + r.stats[condition][regionID] = info + peerTypeIndex |= condition } // Remove the info if any of the conditions are not met any more. 
if oldIndex, ok := r.index[regionID]; ok && oldIndex > emptyStatistic { From 58e7580209f001248c3d530ef2d315ab3c6fd767 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 21 May 2024 18:11:47 +0800 Subject: [PATCH 02/21] *: use a separate runner for updating subtree (#8158) ref tikv/pd#7897 Signed-off-by: Ryan Leung --- metrics/grafana/pd.json | 10 ++-- pkg/core/context.go | 2 + pkg/core/region.go | 10 ++-- pkg/core/region_test.go | 6 +-- pkg/mcs/scheduling/server/cluster.go | 45 ++++++++++-------- pkg/ratelimit/metrics.go | 32 +++++++++---- pkg/ratelimit/runner.go | 70 ++++++++++++++++------------ pkg/syncer/client.go | 2 +- server/cluster/cluster.go | 58 ++++++++++++----------- server/cluster/cluster_worker.go | 8 ++-- 10 files changed, 142 insertions(+), 101 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index e6d314c2e00..54a047e612e 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -11651,12 +11651,12 @@ "targets": [ { "exemplar": true, - "expr": "pd_ratelimit_runner_task_pending_tasks{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "expr": "pd_ratelimit_runner_pending_tasks{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, - "legendFormat": "{{task_type}}_({{runner_name}})", + "legendFormat": "{{task_type}}_{{runner_name}}", "refId": "A", "step": 4 } @@ -11768,12 +11768,12 @@ "targets": [ { "exemplar": true, - "expr": "rate(pd_ratelimit_runner_task_failed_tasks_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*60", + "expr": "rate(pd_ratelimit_runner_failed_tasks_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*60", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, - "legendFormat": "failed-tasks-({{runner_name}})", + "legendFormat": "failed-tasks-{{runner_name}}", "refId": "A", "step": 4 }, @@ -11782,7 +11782,7 @@ "expr": "pd_ratelimit_runner_task_max_waiting_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", "hide": false, "interval": "", - "legendFormat": "max-wait-duration-({{runner_name}})", + "legendFormat": "max-wait-duration-{{runner_name}}", "refId": "B" } ], diff --git a/pkg/core/context.go b/pkg/core/context.go index a0f51e55680..7410f8394c2 100644 --- a/pkg/core/context.go +++ b/pkg/core/context.go @@ -25,6 +25,7 @@ type MetaProcessContext struct { context.Context Tracer RegionHeartbeatProcessTracer TaskRunner ratelimit.Runner + MiscRunner ratelimit.Runner LogRunner ratelimit.Runner } @@ -35,6 +36,7 @@ func ContextTODO() *MetaProcessContext { Context: context.TODO(), Tracer: NewNoopHeartbeatProcessTracer(), TaskRunner: ratelimit.NewSyncRunner(), + MiscRunner: ratelimit.NewSyncRunner(), LogRunner: ratelimit.NewSyncRunner(), // Limit default is nil } diff --git a/pkg/core/region.go b/pkg/core/region.go index c9a8455d4de..a1a61d505a9 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -729,7 +729,7 @@ func (r *RegionInfo) isRegionRecreated() bool { // RegionGuideFunc is a function that determines which follow-up operations need to be performed based on the origin // and new region information. -type RegionGuideFunc func(ctx *MetaProcessContext, region, origin *RegionInfo) (saveKV, saveCache, needSync bool) +type RegionGuideFunc func(ctx *MetaProcessContext, region, origin *RegionInfo) (saveKV, saveCache, needSync, retained bool) // GenerateRegionGuideFunc is used to generate a RegionGuideFunc. 
Control the log output by specifying the log function. // nil means do not print the log. @@ -742,7 +742,7 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { } // Save to storage if meta is updated. // Save to cache if meta or leader is updated, or contains any down/pending peer. - return func(ctx *MetaProcessContext, region, origin *RegionInfo) (saveKV, saveCache, needSync bool) { + return func(ctx *MetaProcessContext, region, origin *RegionInfo) (saveKV, saveCache, needSync, retained bool) { logRunner := ctx.LogRunner // print log asynchronously debug, info := d, i @@ -772,7 +772,7 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { zap.Uint64("region-id", region.GetID()), logutil.ZapRedactStringer("meta-region", RegionToHexMeta(region.GetMeta()))) } - saveKV, saveCache = true, true + saveKV, saveCache, retained = true, true, true } else { r := region.GetRegionEpoch() o := origin.GetRegionEpoch() @@ -785,7 +785,7 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { zap.Uint64("new-version", r.GetVersion()), ) } - saveKV, saveCache = true, true + saveKV, saveCache, retained = true, true, true } if r.GetConfVer() > o.GetConfVer() { if log.GetLevel() <= zap.InfoLevel { @@ -796,7 +796,7 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { zap.Uint64("new-confver", r.GetConfVer()), ) } - saveKV, saveCache = true, true + saveKV, saveCache, retained = true, true, true } if region.GetLeader().GetId() != origin.GetLeader().GetId() { if origin.GetLeader().GetId() != 0 && log.GetLevel() <= zap.InfoLevel { diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index 1b8f20cf9b2..b09c1dfd601 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -363,7 +363,7 @@ func TestNeedSync(t *testing.T) { for _, testCase := range testCases { regionA := region.Clone(testCase.optionsA...) regionB := region.Clone(testCase.optionsB...) - _, _, needSync := RegionGuide(ContextTODO(), regionA, regionB) + _, _, needSync, _ := RegionGuide(ContextTODO(), regionA, regionB) re.Equal(testCase.needSync, needSync) } } @@ -1031,7 +1031,7 @@ func TestUpdateRegionEventualConsistency(t *testing.T) { regionsOld.AtomicCheckAndPutRegion(ctx, regionPendingItemA) re.Equal(int32(2), regionPendingItemA.GetRef()) // check new item - saveKV, saveCache, needSync := regionGuide(ctx, regionItemA, regionPendingItemA) + saveKV, saveCache, needSync, _ := regionGuide(ctx, regionItemA, regionPendingItemA) re.True(needSync) re.True(saveCache) re.False(saveKV) @@ -1060,7 +1060,7 @@ func TestUpdateRegionEventualConsistency(t *testing.T) { re.Equal(int32(1), regionPendingItemB.GetRef()) // heartbeat again, no need updates root tree - saveKV, saveCache, needSync := regionGuide(ctx, regionItemB, regionItemB) + saveKV, saveCache, needSync, _ := regionGuide(ctx, regionItemB, regionItemB) re.False(needSync) re.False(saveCache) re.False(saveKV) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index d3691516868..c6c365b03ad 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -54,8 +54,12 @@ type Cluster struct { clusterID uint64 running atomic.Bool - heartbeatRunnner ratelimit.Runner - logRunner ratelimit.Runner + // heartbeatRunner is used to process the subtree update task asynchronously. + heartbeatRunner ratelimit.Runner + // miscRunner is used to process the statistics and persistent tasks asynchronously. + miscRunner ratelimit.Runner + // logRunner is used to process the log asynchronously. 
+ logRunner ratelimit.Runner } const ( @@ -64,8 +68,9 @@ const ( collectWaitTime = time.Minute // heartbeat relative const - heartbeatTaskRunner = "heartbeat-task-runner" - logTaskRunner = "log-task-runner" + heartbeatTaskRunner = "heartbeat-task-runner" + statisticsTaskRunner = "statistics-task-runner" + logTaskRunner = "log-task-runner" ) var syncRunner = ratelimit.NewSyncRunner() @@ -93,8 +98,9 @@ func NewCluster(parentCtx context.Context, persistConfig *config.PersistConfig, clusterID: clusterID, checkMembershipCh: checkMembershipCh, - heartbeatRunnner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } c.coordinator = schedule.NewCoordinator(ctx, c, hbStreams) err = c.ruleManager.Initialize(persistConfig.GetMaxReplicas(), persistConfig.GetLocationLabels(), persistConfig.GetIsolationLevel()) @@ -531,7 +537,8 @@ func (c *Cluster) StartBackgroundJobs() { go c.runUpdateStoreStats() go c.runCoordinator() go c.runMetricsCollectionJob() - c.heartbeatRunnner.Start() + c.heartbeatRunner.Start() + c.miscRunner.Start() c.logRunner.Start() c.running.Store(true) } @@ -543,7 +550,8 @@ func (c *Cluster) StopBackgroundJobs() { } c.running.Store(false) c.coordinator.Stop() - c.heartbeatRunnner.Stop() + c.heartbeatRunner.Stop() + c.miscRunner.Stop() c.logRunner.Stop() c.cancel() c.wg.Wait() @@ -560,16 +568,18 @@ func (c *Cluster) HandleRegionHeartbeat(region *core.RegionInfo) error { if c.persistConfig.GetScheduleConfig().EnableHeartbeatBreakdownMetrics { tracer = core.NewHeartbeatProcessTracer() } - var taskRunner, logRunner ratelimit.Runner - taskRunner, logRunner = syncRunner, syncRunner + var taskRunner, miscRunner, logRunner ratelimit.Runner + taskRunner, miscRunner, logRunner = syncRunner, syncRunner, syncRunner if c.persistConfig.GetScheduleConfig().EnableHeartbeatConcurrentRunner { - taskRunner = c.heartbeatRunnner + taskRunner = c.heartbeatRunner + miscRunner = c.miscRunner logRunner = c.logRunner } ctx := &core.MetaProcessContext{ Context: c.ctx, Tracer: tracer, TaskRunner: taskRunner, + MiscRunner: miscRunner, LogRunner: logRunner, } tracer.Begin() @@ -591,19 +601,12 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c return err } region.Inherit(origin, c.GetStoreConfig().IsEnableRegionBucket()) - - ctx.TaskRunner.RunTask( - ctx, - ratelimit.HandleStatsAsync, - func(_ context.Context) { - cluster.HandleStatsAsync(c, region) - }, - ) + cluster.HandleStatsAsync(c, region) tracer.OnAsyncHotStatsFinished() hasRegionStats := c.regionStats != nil // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. 
- _, saveCache, _ := core.GenerateRegionGuideFunc(true)(ctx, region, origin) + _, saveCache, _, retained := core.GenerateRegionGuideFunc(true)(ctx, region, origin) if !saveCache { // Due to some config changes need to update the region stats as well, @@ -627,6 +630,7 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c func(_ context.Context) { c.CheckAndPutSubTree(region) }, + ratelimit.WithRetained(true), ) } return nil @@ -650,6 +654,7 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c func(_ context.Context) { c.CheckAndPutSubTree(region) }, + ratelimit.WithRetained(retained), ) tracer.OnUpdateSubTreeFinished() ctx.TaskRunner.RunTask( diff --git a/pkg/ratelimit/metrics.go b/pkg/ratelimit/metrics.go index 5d4443a1cc4..c5510e66b26 100644 --- a/pkg/ratelimit/metrics.go +++ b/pkg/ratelimit/metrics.go @@ -31,25 +31,41 @@ var ( Name: "runner_task_max_waiting_duration_seconds", Help: "The duration of tasks waiting in the runner.", }, []string{nameStr}) - - RunnerTaskPendingTasks = prometheus.NewGaugeVec( + RunnerPendingTasks = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "pd", Subsystem: "ratelimit", - Name: "runner_task_pending_tasks", + Name: "runner_pending_tasks", Help: "The number of pending tasks in the runner.", }, []string{nameStr, taskStr}) - RunnerTaskFailedTasks = prometheus.NewCounterVec( + RunnerFailedTasks = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "pd", Subsystem: "ratelimit", - Name: "runner_task_failed_tasks_total", + Name: "runner_failed_tasks_total", Help: "The number of failed tasks in the runner.", - }, []string{nameStr}) + }, []string{nameStr, taskStr}) + RunnerSucceededTasks = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "ratelimit", + Name: "runner_success_tasks_total", + Help: "The number of tasks in the runner.", + }, []string{nameStr, taskStr}) + RunnerTaskExecutionDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "pd", + Subsystem: "ratelimit", + Name: "runner_task_execution_duration_seconds", + Help: "Bucketed histogram of processing time (s) of finished tasks.", + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + }, []string{nameStr, taskStr}) ) func init() { prometheus.MustRegister(RunnerTaskMaxWaitingDuration) - prometheus.MustRegister(RunnerTaskPendingTasks) - prometheus.MustRegister(RunnerTaskFailedTasks) + prometheus.MustRegister(RunnerPendingTasks) + prometheus.MustRegister(RunnerFailedTasks) + prometheus.MustRegister(RunnerTaskExecutionDuration) + prometheus.MustRegister(RunnerSucceededTasks) } diff --git a/pkg/ratelimit/runner.go b/pkg/ratelimit/runner.go index 07233af238b..17a45067f3d 100644 --- a/pkg/ratelimit/runner.go +++ b/pkg/ratelimit/runner.go @@ -35,7 +35,10 @@ const ( SaveRegionToKV = "SaveRegionToKV" ) -const initialCapacity = 100 +const ( + initialCapacity = 10000 + maxPendingTaskNum = 20000000 +) // Runner is the interface for running tasks. type Runner interface { @@ -48,9 +51,10 @@ type Runner interface { type Task struct { ctx context.Context submittedAt time.Time - opts *TaskOpts f func(context.Context) name string + // retained indicates whether the task should be dropped if the task queue exceeds maxPendingDuration. + retained bool } // ErrMaxWaitingTasksExceeded is returned when the number of waiting tasks exceeds the maximum. 
@@ -67,7 +71,6 @@ type ConcurrentRunner struct { stopChan chan struct{} wg sync.WaitGroup pendingTaskCount map[string]int64 - failedTaskCount prometheus.Counter maxWaitingDuration prometheus.Gauge } @@ -79,18 +82,19 @@ func NewConcurrentRunner(name string, limiter *ConcurrencyLimiter, maxPendingDur maxPendingDuration: maxPendingDuration, taskChan: make(chan *Task), pendingTasks: make([]*Task, 0, initialCapacity), - failedTaskCount: RunnerTaskFailedTasks.WithLabelValues(name), pendingTaskCount: make(map[string]int64), maxWaitingDuration: RunnerTaskMaxWaitingDuration.WithLabelValues(name), } return s } -// TaskOpts is the options for RunTask. -type TaskOpts struct{} - // TaskOption configures TaskOp -type TaskOption func(opts *TaskOpts) +type TaskOption func(opts *Task) + +// WithRetained sets whether the task should be retained. +func WithRetained(retained bool) TaskOption { + return func(opts *Task) { opts.retained = retained } +} // Start starts the runner. func (cr *ConcurrentRunner) Start() { @@ -123,8 +127,8 @@ func (cr *ConcurrentRunner) Start() { if len(cr.pendingTasks) > 0 { maxDuration = time.Since(cr.pendingTasks[0].submittedAt) } - for name, cnt := range cr.pendingTaskCount { - RunnerTaskPendingTasks.WithLabelValues(cr.name, name).Set(float64(cnt)) + for taskName, cnt := range cr.pendingTaskCount { + RunnerPendingTasks.WithLabelValues(cr.name, taskName).Set(float64(cnt)) } cr.pendingMu.Unlock() cr.maxWaitingDuration.Set(maxDuration.Seconds()) @@ -134,26 +138,28 @@ func (cr *ConcurrentRunner) Start() { } func (cr *ConcurrentRunner) run(task *Task, token *TaskToken) { + start := time.Now() task.f(task.ctx) if token != nil { cr.limiter.ReleaseToken(token) cr.processPendingTasks() } + RunnerTaskExecutionDuration.WithLabelValues(cr.name, task.name).Observe(time.Since(start).Seconds()) + RunnerSucceededTasks.WithLabelValues(cr.name, task.name).Inc() } func (cr *ConcurrentRunner) processPendingTasks() { cr.pendingMu.Lock() defer cr.pendingMu.Unlock() - for len(cr.pendingTasks) > 0 { + if len(cr.pendingTasks) > 0 { task := cr.pendingTasks[0] select { case cr.taskChan <- task: cr.pendingTasks = cr.pendingTasks[1:] cr.pendingTaskCount[task.name]-- - return default: - return } + return } } @@ -165,34 +171,40 @@ func (cr *ConcurrentRunner) Stop() { // RunTask runs the task asynchronously. func (cr *ConcurrentRunner) RunTask(ctx context.Context, name string, f func(context.Context), opts ...TaskOption) error { - taskOpts := &TaskOpts{} - for _, opt := range opts { - opt(taskOpts) - } task := &Task{ ctx: ctx, name: name, f: f, - opts: taskOpts, } - + for _, opt := range opts { + opt(task) + } cr.processPendingTasks() - select { - case cr.taskChan <- task: - default: - cr.pendingMu.Lock() - defer cr.pendingMu.Unlock() - if len(cr.pendingTasks) > 0 { + cr.pendingMu.Lock() + defer func() { + cr.pendingMu.Unlock() + cr.processPendingTasks() + }() + + pendingTaskNum := len(cr.pendingTasks) + if pendingTaskNum > 0 { + if !task.retained { maxWait := time.Since(cr.pendingTasks[0].submittedAt) if maxWait > cr.maxPendingDuration { - cr.failedTaskCount.Inc() + RunnerFailedTasks.WithLabelValues(cr.name, task.name).Inc() return ErrMaxWaitingTasksExceeded } } - task.submittedAt = time.Now() - cr.pendingTasks = append(cr.pendingTasks, task) - cr.pendingTaskCount[task.name]++ + // We use the max task number to limit the memory usage. + // It occupies around 1.5GB memory when there is 20000000 pending task. 
+ if len(cr.pendingTasks) > maxPendingTaskNum { + RunnerFailedTasks.WithLabelValues(cr.name, task.name).Inc() + return ErrMaxWaitingTasksExceeded + } } + task.submittedAt = time.Now() + cr.pendingTasks = append(cr.pendingTasks, task) + cr.pendingTaskCount[task.name]++ return nil } diff --git a/pkg/syncer/client.go b/pkg/syncer/client.go index 8a2e757d5cd..00fa8dc389b 100644 --- a/pkg/syncer/client.go +++ b/pkg/syncer/client.go @@ -212,7 +212,7 @@ func (s *RegionSyncer) StartSyncWithLeader(addr string) { Tracer: core.NewNoopHeartbeatProcessTracer(), // no limit for followers. } - saveKV, _, _ := regionGuide(ctx, region, origin) + saveKV, _, _, _ := regionGuide(ctx, region, origin) overlaps := bc.PutRegion(region) if hasBuckets { diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index a8558051dfa..148b43541a2 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -107,8 +107,9 @@ const ( minSnapshotDurationSec = 5 // heartbeat relative const - heartbeatTaskRunner = "heartbeat-async" - logTaskRunner = "log-async" + heartbeatTaskRunner = "heartbeat-async" + statisticsTaskRunner = "statistics-async" + logTaskRunner = "log-async" ) // Server is the interface for cluster. @@ -173,8 +174,12 @@ type RaftCluster struct { independentServices sync.Map hbstreams *hbstream.HeartbeatStreams - heartbeatRunnner ratelimit.Runner - logRunner ratelimit.Runner + // heartbeatRunner is used to process the subtree update task asynchronously. + heartbeatRunner ratelimit.Runner + // miscRunner is used to process the statistics and persistent tasks asynchronously. + miscRunner ratelimit.Runner + // logRunner is used to process the log asynchronously. + logRunner ratelimit.Runner } // Status saves some state information. @@ -191,15 +196,16 @@ type Status struct { func NewRaftCluster(ctx context.Context, clusterID uint64, basicCluster *core.BasicCluster, storage storage.Storage, regionSyncer *syncer.RegionSyncer, etcdClient *clientv3.Client, httpClient *http.Client) *RaftCluster { return &RaftCluster{ - serverCtx: ctx, - clusterID: clusterID, - regionSyncer: regionSyncer, - httpClient: httpClient, - etcdClient: etcdClient, - core: basicCluster, - storage: storage, - heartbeatRunnner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + serverCtx: ctx, + clusterID: clusterID, + regionSyncer: regionSyncer, + httpClient: httpClient, + etcdClient: etcdClient, + core: basicCluster, + storage: storage, + heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } } @@ -357,7 +363,8 @@ func (c *RaftCluster) Start(s Server) error { go c.startGCTuner() c.running = true - c.heartbeatRunnner.Start() + c.heartbeatRunner.Start() + c.miscRunner.Start() c.logRunner.Start() return nil } @@ -752,7 +759,8 @@ func (c *RaftCluster) Stop() { if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { c.stopSchedulingJobs() } - c.heartbeatRunnner.Stop() + c.heartbeatRunner.Stop() + c.miscRunner.Stop() c.logRunner.Stop() c.Unlock() @@ -1024,19 
+1032,13 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio region.Inherit(origin, c.GetStoreConfig().IsEnableRegionBucket()) if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { - ctx.TaskRunner.RunTask( - ctx.Context, - ratelimit.HandleStatsAsync, - func(_ context.Context) { - cluster.HandleStatsAsync(c, region) - }, - ) + cluster.HandleStatsAsync(c, region) } tracer.OnAsyncHotStatsFinished() hasRegionStats := c.regionStats != nil // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. - saveKV, saveCache, needSync := regionGuide(ctx, region, origin) + saveKV, saveCache, needSync, retained := regionGuide(ctx, region, origin) tracer.OnRegionGuideFinished() if !saveKV && !saveCache { // Due to some config changes need to update the region stats as well, @@ -1045,7 +1047,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // region stats needs to be collected in API mode. // We need to think of a better way to reduce this part of the cost in the future. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { - ctx.TaskRunner.RunTask( + ctx.MiscRunner.RunTask( ctx.Context, ratelimit.ObserveRegionStatsAsync, func(_ context.Context) { @@ -1063,6 +1065,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio func(_ context.Context) { c.CheckAndPutSubTree(region) }, + ratelimit.WithRetained(true), ) } return nil @@ -1090,11 +1093,12 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio func(_ context.Context) { c.CheckAndPutSubTree(region) }, + ratelimit.WithRetained(retained), ) tracer.OnUpdateSubTreeFinished() if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { - ctx.TaskRunner.RunTask( + ctx.MiscRunner.RunTask( ctx.Context, ratelimit.HandleOverlaps, func(_ context.Context) { @@ -1107,7 +1111,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnSaveCacheFinished() // handle region stats - ctx.TaskRunner.RunTask( + ctx.MiscRunner.RunTask( ctx.Context, ratelimit.CollectRegionStatsAsync, func(_ context.Context) { @@ -1121,7 +1125,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnCollectRegionStatsFinished() if c.storage != nil { if saveKV { - ctx.TaskRunner.RunTask( + ctx.MiscRunner.RunTask( ctx.Context, ratelimit.SaveRegionToKV, func(_ context.Context) { diff --git a/server/cluster/cluster_worker.go b/server/cluster/cluster_worker.go index 43602dbb68d..39720e7d765 100644 --- a/server/cluster/cluster_worker.go +++ b/server/cluster/cluster_worker.go @@ -40,10 +40,11 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error { tracer = core.NewHeartbeatProcessTracer() } defer tracer.Release() - var taskRunner, logRunner ratelimit.Runner - taskRunner, logRunner = syncRunner, syncRunner + var taskRunner, miscRunner, logRunner ratelimit.Runner + taskRunner, miscRunner, logRunner = syncRunner, syncRunner, syncRunner if c.GetScheduleConfig().EnableHeartbeatConcurrentRunner { - taskRunner = c.heartbeatRunnner + taskRunner = c.heartbeatRunner + miscRunner = c.miscRunner logRunner = c.logRunner } @@ -51,6 +52,7 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error { Context: c.ctx, Tracer: tracer, TaskRunner: taskRunner, + MiscRunner: miscRunner, LogRunner: logRunner, } tracer.Begin() From 0e73b7aa3b95d871d52fdb2d3b3b3dffae6414d8 Mon Sep 17 
00:00:00 2001 From: Hu# Date: Tue, 21 May 2024 18:31:16 +0800 Subject: [PATCH 03/21] tests/tso: add IDAllocator to make keyspace test stable (#8202) close tikv/pd#8099 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/tso/keyspace_group_manager.go | 2 +- .../mcs/tso/keyspace_group_manager_test.go | 158 ++++++++++-------- 2 files changed, 90 insertions(+), 70 deletions(-) diff --git a/pkg/tso/keyspace_group_manager.go b/pkg/tso/keyspace_group_manager.go index 2930357e2b4..b2af48f08da 100644 --- a/pkg/tso/keyspace_group_manager.go +++ b/pkg/tso/keyspace_group_manager.go @@ -1439,7 +1439,7 @@ func (kgm *KeyspaceGroupManager) groupSplitPatroller() { defer kgm.wg.Done() patrolInterval := groupPatrolInterval failpoint.Inject("fastGroupSplitPatroller", func() { - patrolInterval = time.Second + patrolInterval = 3 * time.Second }) ticker := time.NewTicker(patrolInterval) defer ticker.Stop() diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go index 909972f0315..f7b892ce77d 100644 --- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go +++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go @@ -32,6 +32,7 @@ import ( "github.com/tikv/pd/pkg/errs" mcsutils "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/member" + "github.com/tikv/pd/pkg/mock/mockid" "github.com/tikv/pd/pkg/storage/endpoint" tsopkg "github.com/tikv/pd/pkg/tso" "github.com/tikv/pd/pkg/utils/etcdutil" @@ -56,6 +57,13 @@ type tsoKeyspaceGroupManagerTestSuite struct { pdLeaderServer *tests.TestServer // tsoCluster is the TSO service cluster. tsoCluster *tests.TestTSOCluster + + allocator *mockid.IDAllocator +} + +func (suite *tsoKeyspaceGroupManagerTestSuite) allocID() uint32 { + id, _ := suite.allocator.Alloc() + return uint32(id) } func TestTSOKeyspaceGroupManager(t *testing.T) { @@ -77,6 +85,8 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) SetupSuite() { re.NoError(suite.pdLeaderServer.BootstrapCluster()) suite.tsoCluster, err = tests.NewTestTSOCluster(suite.ctx, 2, suite.pdLeaderServer.GetAddr()) re.NoError(err) + suite.allocator = mockid.NewIDAllocator() + suite.allocator.SetBase(uint64(time.Now().Second())) } func (suite *tsoKeyspaceGroupManagerTestSuite) TearDownSuite() { @@ -166,9 +176,9 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestKeyspacesServedByNonDefaultKe keyspaceGroupID uint32 keyspaceIDs []uint32 }{ - {0, []uint32{0, 10}}, - {1, []uint32{1, 11}}, - {2, []uint32{2, 12}}, + {suite.allocID(), []uint32{0, 10}}, + {suite.allocID(), []uint32{1, 11}}, + {suite.allocID(), []uint32{2, 12}}, } for _, param := range params { @@ -242,51 +252,53 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestKeyspacesServedByNonDefaultKe func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupSplit() { re := suite.Require() - // Create the keyspace group 1 with keyspaces [111, 222, 333]. + // Create the keyspace group `oldID` with keyspaces [111, 222, 333]. 
+ oldID := suite.allocID() handlersutil.MustCreateKeyspaceGroup(re, suite.pdLeaderServer, &handlers.CreateKeyspaceGroupParams{ KeyspaceGroups: []*endpoint.KeyspaceGroup{ { - ID: 1, + ID: oldID, UserKind: endpoint.Standard.String(), Members: suite.tsoCluster.GetKeyspaceGroupMember(), Keyspaces: []uint32{111, 222, 333}, }, }, }) - kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, 1) - re.Equal(uint32(1), kg1.ID) + kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, oldID) + re.Equal(oldID, kg1.ID) re.Equal([]uint32{111, 222, 333}, kg1.Keyspaces) re.False(kg1.IsSplitting()) - // Get a TSO from the keyspace group 1. + // Get a TSO from the keyspace group `oldID`. var ( ts pdpb.Timestamp err error ) testutil.Eventually(re, func() bool { - ts, err = suite.requestTSO(re, 222, 1) + ts, err = suite.requestTSO(re, 222, oldID) return err == nil && tsoutil.CompareTimestamp(&ts, &pdpb.Timestamp{}) > 0 }) ts.Physical += time.Hour.Milliseconds() - // Set the TSO of the keyspace group 1 to a large value. - err = suite.tsoCluster.GetPrimaryServer(222, 1).ResetTS(tsoutil.GenerateTS(&ts), false, true, 1) + // Set the TSO of the keyspace group `oldID` to a large value. + err = suite.tsoCluster.GetPrimaryServer(222, oldID).ResetTS(tsoutil.GenerateTS(&ts), false, true, oldID) re.NoError(err) - // Split the keyspace group 1 to 2. - handlersutil.MustSplitKeyspaceGroup(re, suite.pdLeaderServer, 1, &handlers.SplitKeyspaceGroupByIDParams{ - NewID: 2, + // Split the keyspace group `oldID` to `newID`. + newID := suite.allocID() + handlersutil.MustSplitKeyspaceGroup(re, suite.pdLeaderServer, oldID, &handlers.SplitKeyspaceGroupByIDParams{ + NewID: newID, Keyspaces: []uint32{222, 333}, }) // Wait for the split to complete automatically even there is no TSO request from the outside. testutil.Eventually(re, func() bool { - kg2, code := handlersutil.TryLoadKeyspaceGroupByID(re, suite.pdLeaderServer, 2) + kg2, code := handlersutil.TryLoadKeyspaceGroupByID(re, suite.pdLeaderServer, newID) if code != http.StatusOK { return false } - re.Equal(uint32(2), kg2.ID) + re.Equal(newID, kg2.ID) re.Equal([]uint32{222, 333}, kg2.Keyspaces) return !kg2.IsSplitting() }) - // Check the split TSO from keyspace group 2 now. - splitTS, err := suite.requestTSO(re, 222, 2) + // Check the split TSO from keyspace group `newID` now. + splitTS, err := suite.requestTSO(re, 222, newID) re.NoError(err) re.Greater(tsoutil.CompareTimestamp(&splitTS, &ts), 0) } @@ -304,60 +316,62 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) requestTSO( func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupSplitElection() { re := suite.Require() - // Create the keyspace group 1 with keyspaces [111, 222, 333]. + // Create the keyspace group `oldID` with keyspaces [111, 222, 333]. + oldID := suite.allocID() handlersutil.MustCreateKeyspaceGroup(re, suite.pdLeaderServer, &handlers.CreateKeyspaceGroupParams{ KeyspaceGroups: []*endpoint.KeyspaceGroup{ { - ID: 1, + ID: oldID, UserKind: endpoint.Standard.String(), Members: suite.tsoCluster.GetKeyspaceGroupMember(), Keyspaces: []uint32{111, 222, 333}, }, }, }) - kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, 1) - re.Equal(uint32(1), kg1.ID) + kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, oldID) + re.Equal(oldID, kg1.ID) re.Equal([]uint32{111, 222, 333}, kg1.Keyspaces) re.False(kg1.IsSplitting()) - // Split the keyspace group 1 to 2. 
- handlersutil.MustSplitKeyspaceGroup(re, suite.pdLeaderServer, 1, &handlers.SplitKeyspaceGroupByIDParams{ - NewID: 2, + // Split the keyspace group `oldID` to `newID`. + newID := suite.allocID() + handlersutil.MustSplitKeyspaceGroup(re, suite.pdLeaderServer, oldID, &handlers.SplitKeyspaceGroupByIDParams{ + NewID: newID, Keyspaces: []uint32{222, 333}, }) - kg2 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, 2) - re.Equal(uint32(2), kg2.ID) + kg2 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, newID) + re.Equal(newID, kg2.ID) re.Equal([]uint32{222, 333}, kg2.Keyspaces) re.True(kg2.IsSplitTarget()) // Check the leadership. - member1, err := suite.tsoCluster.WaitForPrimaryServing(re, 111, 1).GetMember(111, 1) + member1, err := suite.tsoCluster.WaitForPrimaryServing(re, 111, oldID).GetMember(111, oldID) re.NoError(err) re.NotNil(member1) - member2, err := suite.tsoCluster.WaitForPrimaryServing(re, 222, 2).GetMember(222, 2) + member2, err := suite.tsoCluster.WaitForPrimaryServing(re, 222, newID).GetMember(222, newID) re.NoError(err) re.NotNil(member2) - // Wait for the leader of the keyspace group 1 and 2 to be elected. + // Wait for the leader of the keyspace group `oldID` and `newID` to be elected. testutil.Eventually(re, func() bool { return len(member1.GetLeaderListenUrls()) > 0 && len(member2.GetLeaderListenUrls()) > 0 }) - // Check if the leader of the keyspace group 1 and 2 are the same. + // Check if the leader of the keyspace group `oldID` and `newID` are the same. re.Equal(member1.GetLeaderListenUrls(), member2.GetLeaderListenUrls()) - // Resign and block the leader of the keyspace group 1 from being elected. + // Resign and block the leader of the keyspace group `oldID` from being elected. member1.(*member.Participant).SetCampaignChecker(func(*election.Leadership) bool { return false }) member1.ResetLeader() - // The leader of the keyspace group 2 should be resigned also. + // The leader of the keyspace group `newID` should be resigned also. testutil.Eventually(re, func() bool { return member2.IsLeader() == false }) - // Check if the leader of the keyspace group 1 and 2 are the same again. + // Check if the leader of the keyspace group `oldID` and `newID` are the same again. member1.(*member.Participant).SetCampaignChecker(nil) testutil.Eventually(re, func() bool { return len(member1.GetLeaderListenUrls()) > 0 && len(member2.GetLeaderListenUrls()) > 0 }) re.Equal(member1.GetLeaderListenUrls(), member2.GetLeaderListenUrls()) // Wait for the keyspace groups to finish the split. - waitFinishSplit(re, suite.pdLeaderServer, 1, 2, []uint32{111}, []uint32{222, 333}) + waitFinishSplit(re, suite.pdLeaderServer, oldID, newID, []uint32{111}, []uint32{222, 333}) } func waitFinishSplit( @@ -390,30 +404,32 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupSplitClient() re := suite.Require() // Enable the failpoint to slow down the system time to test whether the TSO is monotonic. re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/tso/systemTimeSlow", `return(true)`)) - // Create the keyspace group 1 with keyspaces [444, 555, 666]. + // Create the keyspace group `oldID` with keyspaces [444, 555, 666]. 
+ oldID := suite.allocID() handlersutil.MustCreateKeyspaceGroup(re, suite.pdLeaderServer, &handlers.CreateKeyspaceGroupParams{ KeyspaceGroups: []*endpoint.KeyspaceGroup{ { - ID: 1, + ID: oldID, UserKind: endpoint.Standard.String(), Members: suite.tsoCluster.GetKeyspaceGroupMember(), Keyspaces: []uint32{444, 555, 666}, }, }, }) - kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, 1) - re.Equal(uint32(1), kg1.ID) + kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, oldID) + re.Equal(oldID, kg1.ID) re.Equal([]uint32{444, 555, 666}, kg1.Keyspaces) re.False(kg1.IsSplitting()) // Request the TSO for keyspace 555 concurrently via client. - cancel := suite.dispatchClient(re, 555, 1) - // Split the keyspace group 1 to 2. - handlersutil.MustSplitKeyspaceGroup(re, suite.pdLeaderServer, 1, &handlers.SplitKeyspaceGroupByIDParams{ - NewID: 2, + cancel := suite.dispatchClient(re, 555, oldID) + // Split the keyspace group `oldID` to `newID`. + newID := suite.allocID() + handlersutil.MustSplitKeyspaceGroup(re, suite.pdLeaderServer, oldID, &handlers.SplitKeyspaceGroupByIDParams{ + NewID: newID, Keyspaces: []uint32{555, 666}, }) // Wait for the keyspace groups to finish the split. - waitFinishSplit(re, suite.pdLeaderServer, 1, 2, []uint32{444}, []uint32{555, 666}) + waitFinishSplit(re, suite.pdLeaderServer, oldID, newID, []uint32{444}, []uint32{555, 666}) // Stop the client. cancel() re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/tso/systemTimeSlow")) @@ -569,48 +585,49 @@ func TestTwiceSplitKeyspaceGroup(t *testing.T) { func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMerge() { re := suite.Require() - // Create the keyspace group 1 and 2 with keyspaces [111, 222] and [333]. + // Create the keyspace group `firstID` and `secondID` with keyspaces [111, 222] and [333]. + firstID, secondID := suite.allocID(), suite.allocID() handlersutil.MustCreateKeyspaceGroup(re, suite.pdLeaderServer, &handlers.CreateKeyspaceGroupParams{ KeyspaceGroups: []*endpoint.KeyspaceGroup{ { - ID: 1, + ID: firstID, UserKind: endpoint.Standard.String(), Members: suite.tsoCluster.GetKeyspaceGroupMember(), Keyspaces: []uint32{111, 222}, }, { - ID: 2, + ID: secondID, UserKind: endpoint.Standard.String(), Members: suite.tsoCluster.GetKeyspaceGroupMember(), Keyspaces: []uint32{333}, }, }, }) - // Get a TSO from the keyspace group 1. + // Get a TSO from the keyspace group `firstID`. var ( ts pdpb.Timestamp err error ) testutil.Eventually(re, func() bool { - ts, err = suite.requestTSO(re, 222, 1) + ts, err = suite.requestTSO(re, 222, firstID) return err == nil && tsoutil.CompareTimestamp(&ts, &pdpb.Timestamp{}) > 0 }) ts.Physical += time.Hour.Milliseconds() - // Set the TSO of the keyspace group 1 to a large value. - err = suite.tsoCluster.GetPrimaryServer(222, 1).ResetTS(tsoutil.GenerateTS(&ts), false, true, 1) + // Set the TSO of the keyspace group `firstID` to a large value. + err = suite.tsoCluster.GetPrimaryServer(222, firstID).ResetTS(tsoutil.GenerateTS(&ts), false, true, firstID) re.NoError(err) - // Merge the keyspace group 1 and 2 to the default keyspace group. + // Merge the keyspace group `firstID` and `secondID` to the default keyspace group. handlersutil.MustMergeKeyspaceGroup(re, suite.pdLeaderServer, mcsutils.DefaultKeyspaceGroupID, &handlers.MergeKeyspaceGroupsParams{ - MergeList: []uint32{1, 2}, + MergeList: []uint32{firstID, secondID}, }) - // Check the keyspace group 1 and 2 are merged to the default keyspace group. 
+ // Check the keyspace group `firstID` and `secondID` are merged to the default keyspace group. kg := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, mcsutils.DefaultKeyspaceGroupID) re.Equal(mcsutils.DefaultKeyspaceGroupID, kg.ID) for _, keyspaceID := range []uint32{111, 222, 333} { re.Contains(kg.Keyspaces, keyspaceID) } re.True(kg.IsMergeTarget()) - // Check the merged TSO from the default keyspace group is greater than the TSO from the keyspace group 1. + // Check the merged TSO from the default keyspace group is greater than the TSO from the keyspace group`firstID`. var mergedTS pdpb.Timestamp testutil.Eventually(re, func() bool { mergedTS, err = suite.requestTSO(re, 333, mcsutils.DefaultKeyspaceGroupID) @@ -624,26 +641,27 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMerge() { func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMergeClient() { re := suite.Require() - // Create the keyspace group 1 with keyspaces [111, 222, 333]. + // Create the keyspace group `id` with keyspaces [111, 222, 333]. + id := suite.allocID() handlersutil.MustCreateKeyspaceGroup(re, suite.pdLeaderServer, &handlers.CreateKeyspaceGroupParams{ KeyspaceGroups: []*endpoint.KeyspaceGroup{ { - ID: 1, + ID: id, UserKind: endpoint.Standard.String(), Members: suite.tsoCluster.GetKeyspaceGroupMember(), Keyspaces: []uint32{111, 222, 333}, }, }, }) - kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, 1) - re.Equal(uint32(1), kg1.ID) + kg1 := handlersutil.MustLoadKeyspaceGroupByID(re, suite.pdLeaderServer, id) + re.Equal(id, kg1.ID) re.Equal([]uint32{111, 222, 333}, kg1.Keyspaces) re.False(kg1.IsMerging()) // Request the TSO for keyspace 222 concurrently via client. - cancel := suite.dispatchClient(re, 222, 1) + cancel := suite.dispatchClient(re, 222, id) // Merge the keyspace group 1 to the default keyspace group. handlersutil.MustMergeKeyspaceGroup(re, suite.pdLeaderServer, mcsutils.DefaultKeyspaceGroupID, &handlers.MergeKeyspaceGroupsParams{ - MergeList: []uint32{1}, + MergeList: []uint32{id}, }) // Wait for the default keyspace group to finish the merge. waitFinishMerge(re, suite.pdLeaderServer, mcsutils.DefaultKeyspaceGroupID, []uint32{111, 222, 333}) @@ -671,24 +689,25 @@ func waitFinishMerge( func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMergeBeforeInitTSO() { re := suite.Require() - // Make sure the TSO of keyspace group 1 won't be initialized before it's merged. + // Make sure the TSO of keyspace group `id` won't be initialized before it's merged. re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/tso/failedToSaveTimestamp", `return(true)`)) // Request the TSO for the default keyspace concurrently via client. + id := suite.allocID() cancel := suite.dispatchClient(re, mcsutils.DefaultKeyspaceID, mcsutils.DefaultKeyspaceGroupID) // Create the keyspace group 1 with keyspaces [111, 222, 333]. handlersutil.MustCreateKeyspaceGroup(re, suite.pdLeaderServer, &handlers.CreateKeyspaceGroupParams{ KeyspaceGroups: []*endpoint.KeyspaceGroup{ { - ID: 1, + ID: id, UserKind: endpoint.Standard.String(), Members: suite.tsoCluster.GetKeyspaceGroupMember(), Keyspaces: []uint32{111, 222, 333}, }, }, }) - // Merge the keyspace group 1 to the default keyspace group. + // Merge the keyspace group `id` to the default keyspace group. 
handlersutil.MustMergeKeyspaceGroup(re, suite.pdLeaderServer, mcsutils.DefaultKeyspaceGroupID, &handlers.MergeKeyspaceGroupsParams{ - MergeList: []uint32{1}, + MergeList: []uint32{id}, }) // Wait for the default keyspace group to finish the merge. waitFinishMerge(re, suite.pdLeaderServer, mcsutils.DefaultKeyspaceGroupID, []uint32{111, 222, 333}) @@ -775,12 +794,13 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestKeyspaceGroupMergeIntoDefault keyspaces = make([]uint32, 0, keyspaceGroupNum) ) for i := 1; i <= keyspaceGroupNum; i++ { + id := suite.allocID() keyspaceGroups = append(keyspaceGroups, &endpoint.KeyspaceGroup{ - ID: uint32(i), + ID: id, UserKind: endpoint.UserKind(rand.Intn(int(endpoint.UserKindCount))).String(), - Keyspaces: []uint32{uint32(i)}, + Keyspaces: []uint32{id}, }) - keyspaces = append(keyspaces, uint32(i)) + keyspaces = append(keyspaces, id) if i != keyspaceGroupNum { continue } @@ -797,7 +817,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestKeyspaceGroupMergeIntoDefault re.NotNil(svr) for i := 1; i < keyspaceGroupNum; i++ { // Check if the keyspace group is served. - svr = suite.tsoCluster.WaitForPrimaryServing(re, uint32(i), uint32(i)) + svr = suite.tsoCluster.WaitForPrimaryServing(re, keyspaceGroups[i].ID, keyspaceGroups[i].ID) re.NotNil(svr) } // Merge all the keyspace groups into the default keyspace group. From 43e9492ceb760eae0a2026b13b2805b845315114 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Wed, 22 May 2024 10:37:16 +0800 Subject: [PATCH 04/21] pkg/statistics: reduce the memory alloc for hot statistics (#8208) ref tikv/pd#7897, close tikv/pd#8207 pkg/statistics: reduce the memory alloc for hot statistics Signed-off-by: nolouch --- go.mod | 2 +- go.sum | 4 ++-- tests/integrations/go.mod | 2 +- tests/integrations/go.sum | 4 ++-- tools/go.mod | 2 +- tools/go.sum | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index ff0cb20069a..90c5639c936 100644 --- a/go.mod +++ b/go.mod @@ -42,7 +42,7 @@ require ( github.com/prometheus/common v0.51.1 github.com/sasha-s/go-deadlock v0.2.0 github.com/shirou/gopsutil/v3 v3.23.3 - github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072 + github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99 github.com/soheilhy/cmux v0.1.5 github.com/spf13/cobra v1.8.0 github.com/spf13/pflag v1.0.5 diff --git a/go.sum b/go.sum index 8c77a4b84da..6ec1baa72c4 100644 --- a/go.sum +++ b/go.sum @@ -432,8 +432,8 @@ github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6Mwd github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072 h1:Txo4SXVJq/OgEjwgkWoxkMoTjGlcrgsQE/XSghjmu0w= -github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072/go.mod h1:+4nWMF0+CqEcU74SnX2NxaGqZ8zX4pcQ8Jcs77DbX5A= +github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99 h1:fmanhZtn5RKRljCjX46H+Q9/PECsHbflXm0RdrnK9e4= +github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99/go.mod h1:+4nWMF0+CqEcU74SnX2NxaGqZ8zX4pcQ8Jcs77DbX5A= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/soheilhy/cmux v0.1.5 
h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= diff --git a/tests/integrations/go.mod b/tests/integrations/go.mod index bb231f747b7..7d07b668c80 100644 --- a/tests/integrations/go.mod +++ b/tests/integrations/go.mod @@ -139,7 +139,7 @@ require ( github.com/shoenig/go-m1cpu v0.1.5 // indirect github.com/shurcooL/httpgzip v0.0.0-20190720172056-320755c1c1b0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect - github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072 // indirect + github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99 // indirect github.com/soheilhy/cmux v0.1.5 // indirect github.com/spf13/cobra v1.8.0 // indirect github.com/spf13/pflag v1.0.5 // indirect diff --git a/tests/integrations/go.sum b/tests/integrations/go.sum index eeb2d73ba7f..0701b42aea7 100644 --- a/tests/integrations/go.sum +++ b/tests/integrations/go.sum @@ -427,8 +427,8 @@ github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6Mwd github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072 h1:Txo4SXVJq/OgEjwgkWoxkMoTjGlcrgsQE/XSghjmu0w= -github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072/go.mod h1:+4nWMF0+CqEcU74SnX2NxaGqZ8zX4pcQ8Jcs77DbX5A= +github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99 h1:fmanhZtn5RKRljCjX46H+Q9/PECsHbflXm0RdrnK9e4= +github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99/go.mod h1:+4nWMF0+CqEcU74SnX2NxaGqZ8zX4pcQ8Jcs77DbX5A= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= diff --git a/tools/go.mod b/tools/go.mod index 2febbe1ad68..220cc7a5036 100644 --- a/tools/go.mod +++ b/tools/go.mod @@ -141,7 +141,7 @@ require ( github.com/shoenig/go-m1cpu v0.1.5 // indirect github.com/shurcooL/httpgzip v0.0.0-20190720172056-320755c1c1b0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect - github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072 // indirect + github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99 // indirect github.com/soheilhy/cmux v0.1.5 // indirect github.com/stretchr/objx v0.5.0 // indirect github.com/swaggo/files v0.0.0-20210815190702-a29dd2bc99b2 // indirect diff --git a/tools/go.sum b/tools/go.sum index a3c41c16420..535ea668b97 100644 --- a/tools/go.sum +++ b/tools/go.sum @@ -426,8 +426,8 @@ github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6Mwd github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072 h1:Txo4SXVJq/OgEjwgkWoxkMoTjGlcrgsQE/XSghjmu0w= -github.com/smallnest/chanx v0.0.0-20221229104322-eb4c998d2072/go.mod h1:+4nWMF0+CqEcU74SnX2NxaGqZ8zX4pcQ8Jcs77DbX5A= +github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99 h1:fmanhZtn5RKRljCjX46H+Q9/PECsHbflXm0RdrnK9e4= +github.com/smallnest/chanx v1.2.1-0.20240521153536-01121e21ff99/go.mod 
h1:+4nWMF0+CqEcU74SnX2NxaGqZ8zX4pcQ8Jcs77DbX5A= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= From d7c7a41959c3c08bc75b3e6881ad9192b5fa85f9 Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 22 May 2024 13:50:46 +0800 Subject: [PATCH 05/21] tools/ut: add alloc server for `pd-ut` (#8203) ref tikv/pd#7969 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/utils/tempurl/tempurl.go | 32 ++++++++++++++ tools/go.mod | 2 +- tools/pd-ut/alloc/check_env_dummy.go | 21 +++++++++ tools/pd-ut/alloc/check_env_linux.go | 42 ++++++++++++++++++ tools/pd-ut/alloc/server.go | 56 ++++++++++++++++++++++++ tools/pd-ut/alloc/tempurl.go | 65 ++++++++++++++++++++++++++++ tools/pd-ut/ut.go | 13 +++++- 7 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 tools/pd-ut/alloc/check_env_dummy.go create mode 100644 tools/pd-ut/alloc/check_env_linux.go create mode 100644 tools/pd-ut/alloc/server.go create mode 100644 tools/pd-ut/alloc/tempurl.go diff --git a/pkg/utils/tempurl/tempurl.go b/pkg/utils/tempurl/tempurl.go index 421513ff001..cd5cd498f95 100644 --- a/pkg/utils/tempurl/tempurl.go +++ b/pkg/utils/tempurl/tempurl.go @@ -16,7 +16,10 @@ package tempurl import ( "fmt" + "io" "net" + "net/http" + "os" "time" "github.com/pingcap/log" @@ -29,6 +32,9 @@ var ( testAddrMap = make(map[string]struct{}) ) +// reference: /pd/tools/pd-ut/alloc/server.go +const AllocURLFromUT = "allocURLFromUT" + // Alloc allocates a local URL for testing. func Alloc() string { for i := 0; i < 10; i++ { @@ -42,6 +48,9 @@ func Alloc() string { } func tryAllocTestURL() string { + if url := getFromUT(); url != "" { + return url + } l, err := net.Listen("tcp", "127.0.0.1:0") if err != nil { log.Fatal("listen failed", errs.ZapError(err)) @@ -63,3 +72,26 @@ func tryAllocTestURL() string { testAddrMap[addr] = struct{}{} return addr } + +func getFromUT() string { + addr := os.Getenv(AllocURLFromUT) + if addr == "" { + return "" + } + + req, err := http.NewRequest(http.MethodGet, addr, nil) + if err != nil { + return "" + } + resp, err := http.DefaultClient.Do(req) + if err != nil || resp.StatusCode != http.StatusOK { + return "" + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return "" + } + url := string(body) + return url +} diff --git a/tools/go.mod b/tools/go.mod index 220cc7a5036..eb2c279e7fa 100644 --- a/tools/go.mod +++ b/tools/go.mod @@ -9,6 +9,7 @@ replace ( require ( github.com/BurntSushi/toml v0.3.1 + github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5 github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e github.com/coreos/go-semver v0.3.1 github.com/docker/go-units v0.4.0 @@ -64,7 +65,6 @@ require ( github.com/bitly/go-simplejson v0.5.0 // indirect github.com/breeswish/gin-jwt/v2 v2.6.4-jwt-patch // indirect github.com/bytedance/sonic v1.9.1 // indirect - github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5 // indirect github.com/cenkalti/backoff/v4 v4.0.2 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect diff --git a/tools/pd-ut/alloc/check_env_dummy.go b/tools/pd-ut/alloc/check_env_dummy.go new file mode 100644 index 00000000000..b9b8eb4827a --- /dev/null +++ 
b/tools/pd-ut/alloc/check_env_dummy.go @@ -0,0 +1,21 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//go:build !linux +// +build !linux + +package alloc + +func environmentCheck(_ string) bool { + return true +} diff --git a/tools/pd-ut/alloc/check_env_linux.go b/tools/pd-ut/alloc/check_env_linux.go new file mode 100644 index 00000000000..1a51f8075cf --- /dev/null +++ b/tools/pd-ut/alloc/check_env_linux.go @@ -0,0 +1,42 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//go:build linux +// +build linux + +package alloc + +import ( + "github.com/cakturk/go-netstat/netstat" + "github.com/pingcap/log" + "go.uber.org/zap" +) + +func environmentCheck(addr string) bool { + valid, err := checkAddr(addr[len("http://"):]) + if err != nil { + log.Error("check port status failed", zap.Error(err)) + return false + } + return valid +} + +func checkAddr(addr string) (bool, error) { + tabs, err := netstat.TCPSocks(func(s *netstat.SockTabEntry) bool { + return s.RemoteAddr.String() == addr || s.LocalAddr.String() == addr + }) + if err != nil { + return false, err + } + return len(tabs) < 1, nil +} diff --git a/tools/pd-ut/alloc/server.go b/tools/pd-ut/alloc/server.go new file mode 100644 index 00000000000..aced73467fb --- /dev/null +++ b/tools/pd-ut/alloc/server.go @@ -0,0 +1,56 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package alloc + +import ( + "errors" + "flag" + "fmt" + "net/http" + "os" + "time" + + "github.com/gin-gonic/gin" + "github.com/pingcap/log" + "github.com/tikv/pd/pkg/utils/tempurl" + "go.uber.org/zap" +) + +var statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") + +func RunHTTPServer() *http.Server { + err := os.Setenv(tempurl.AllocURLFromUT, fmt.Sprintf("http://%s/alloc", *statusAddress)) + if err != nil { + fmt.Println(err) + } + + gin.SetMode(gin.ReleaseMode) + engine := gin.New() + engine.Use(gin.Recovery()) + + engine.GET("alloc", func(c *gin.Context) { + addr := Alloc() + c.String(http.StatusOK, addr) + }) + + srv := &http.Server{Addr: *statusAddress, Handler: engine.Handler(), ReadHeaderTimeout: 3 * time.Second} + go func() { + if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + log.Fatal("server listen error", zap.Error(err)) + } + }() + + return srv +} diff --git a/tools/pd-ut/alloc/tempurl.go b/tools/pd-ut/alloc/tempurl.go new file mode 100644 index 00000000000..6be69dfe056 --- /dev/null +++ b/tools/pd-ut/alloc/tempurl.go @@ -0,0 +1,65 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package alloc + +import ( + "fmt" + "net" + "sync" + "time" + + "github.com/pingcap/log" + "github.com/tikv/pd/pkg/errs" +) + +var ( + testAddrMutex sync.Mutex + testAddrMap = make(map[string]struct{}) +) + +// Alloc allocates a local URL for testing. +func Alloc() string { + for i := 0; i < 50; i++ { + if u := tryAllocTestURL(); u != "" { + return u + } + time.Sleep(200 * time.Millisecond) + } + log.Fatal("failed to alloc test URL") + return "" +} + +func tryAllocTestURL() string { + l, err := net.Listen("tcp", "127.0.0.1:") + if err != nil { + return "" + } + addr := fmt.Sprintf("http://%s", l.Addr()) + err = l.Close() + if err != nil { + log.Fatal("close failed", errs.ZapError(err)) + } + + testAddrMutex.Lock() + defer testAddrMutex.Unlock() + if _, ok := testAddrMap[addr]; ok { + return "" + } + if !environmentCheck(addr) { + return "" + } + testAddrMap[addr] = struct{}{} + return addr +} diff --git a/tools/pd-ut/ut.go b/tools/pd-ut/ut.go index 9419363c152..7781ab4ee3b 100644 --- a/tools/pd-ut/ut.go +++ b/tools/pd-ut/ut.go @@ -16,6 +16,7 @@ package main import ( "bytes" + "context" "encoding/xml" "errors" "fmt" @@ -32,6 +33,9 @@ import ( "sync" "time" + "github.com/tikv/pd/tools/pd-ut/alloc" + "go.uber.org/zap" + // Set the correct value when it runs inside docker. 
_ "go.uber.org/automaxprocs" ) @@ -128,6 +132,13 @@ func main() { fmt.Println("os.Getwd() error", err) } + srv := alloc.RunHTTPServer() + defer func() { + if err := srv.Shutdown(context.Background()); err != nil { + log.Fatal("server shutdown error", zap.Error(err)) + } + }() + var isSucceed bool // run all tests if len(os.Args) == 1 { @@ -684,7 +695,7 @@ func buildTestBinaryMulti(pkgs []string) error { } // go test --exec=xprog --tags=tso_function_test,deadlock -vet=off --count=0 $(pkgs) - // workPath just like `/data/nvme0n1/husharp/proj/pd/tests/integrations` + // workPath just like `/pd/tests/integrations` xprogPath := path.Join(workDir, "bin/xprog") if strings.Contains(workDir, integrationsTestPath) { xprogPath = path.Join(workDir[:strings.LastIndex(workDir, integrationsTestPath)], "bin/xprog") From b871b57f828761d9298d40f25cedaba10763c127 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Wed, 22 May 2024 15:44:47 +0800 Subject: [PATCH 06/21] metrics: update grafana template to add heartbeat latency overview (#8209) ref tikv/pd#7897 metrics: update grafana template to add heartbeat latency overview Signed-off-by: nolouch --- metrics/grafana/pd.json | 309 ++++++++++++++++++++++++++++------------ 1 file changed, 221 insertions(+), 88 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 54a047e612e..a2c3d31a4b0 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -11170,10 +11170,15 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The region heartbeat handle duration in .99", + "description": "The region heartbeat handle duration by levels", "editable": true, "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 0, + "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, @@ -11181,7 +11186,8 @@ "x": 12, "y": 23 }, - "id": 1302, + "hiddenSeries": false, + "id": 1610, "legend": { "alignAsTable": true, "avg": false, @@ -11199,8 +11205,12 @@ "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.17", "pointradius": 5, "points": false, "renderer": "flot", @@ -11210,20 +11220,46 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (address, store, le))", + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (le))", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 2, - "legendFormat": "{{address}}-store-{{store}}", + "legendFormat": "0.99", "refId": "A", "step": 4 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "0.9", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.8, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "0.8", + "refId": "C" + }, + { + "exemplar": true, + "expr": 
"rate(pd_scheduler_handle_region_heartbeat_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m]) / rate(pd_scheduler_handle_region_heartbeat_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])", + "hide": false, + "interval": "", + "legendFormat": "avg", + "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "99% Region heartbeat handle latency", + "title": "Region heartbeat handle latency overview", "tooltip": { "msResolution": false, "shared": true, @@ -11381,15 +11417,14 @@ }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The breakdown metric about heartbeat", + "description": "The region heartbeat handle duration in .99 by store", "editable": true, "error": false, "fill": 0, - "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, @@ -11397,77 +11432,49 @@ "x": 12, "y": 31 }, - "hiddenSeries": false, - "id": 1335, + "id": 1302, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, "hideEmpty": true, - "hideZero": true, + "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", - "options": { - "alertThreshold": true - }, "paceLength": 10, "percentage": false, - "pluginVersion": "8.5.27", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "WaitRegionsLock", - "bars": false, - "lines": true, - "linewidth": 2, - "stack": false - }, - { - "alias": "WaitSubRegionsLock", - "bars": false, - "lines": true, - "linewidth": 2, - "stack": false - } - ], + "seriesOverrides": [], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(pd_core_region_heartbeat_breakdown_handle_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (name)", + "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) by (address, store, le))", "format": "time_series", "hide": false, "intervalFactor": 2, - "legendFormat": "{{name}}", - "range": true, + "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 - }, - { - "expr": "sum(rate(pd_core_acquire_regions_lock_wait_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (type)", - "hide": false, - "legendFormat": "{{type}}", - "range": true, - "refId": "B" } ], "thresholds": [], + "timeFrom": null, "timeRegions": [], - "title": "Heartbeat Performance Duration BreakDown (Accumulation)", + "timeShift": null, + "title": "99% Region heartbeat handle latency by store", "tooltip": { "msResolution": false, "shared": true, @@ -11476,25 +11483,33 @@ }, "type": "graph", "xaxis": { + "buckets": null, "mode": "time", + "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", + "label": null, "logBase": 1, + "max": null, "min": "0", "show": true }, { "format": "s", + "label": null, "logBase": 1, + "max": null, + "min": null, "show": true } ], "yaxis": { - "align": false + "align": false, + "alignLevel": null } }, { @@ -11594,6 +11609,124 @@ "alignLevel": null } }, + { + 
"aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The breakdown metric about heartbeat", + "editable": true, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "hiddenSeries": false, + "id": 1335, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "8.5.27", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "WaitRegionsLock", + "bars": false, + "lines": true, + "linewidth": 2, + "stack": false + }, + { + "alias": "WaitSubRegionsLock", + "bars": false, + "lines": true, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(pd_core_region_heartbeat_breakdown_handle_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "range": true, + "refId": "A", + "step": 4 + }, + { + "expr": "sum(rate(pd_core_acquire_regions_lock_wait_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (type)", + "hide": false, + "legendFormat": "{{type}}", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Heartbeat Performance Duration BreakDown (Accumulation)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "s", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, { "aliasColors": {}, "bars": false, @@ -11613,11 +11746,11 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 39 + "x": 0, + "y": 47 }, "hiddenSeries": false, - "id": 1608, + "id": 1609, "legend": { "alignAsTable": true, "avg": true, @@ -11644,7 +11777,15 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/max-wait-duration.*/", + "bars": true, + "lines": false, + "transform": "negative-Y", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -11659,13 +11800,21 @@ "legendFormat": "{{task_type}}_{{runner_name}}", "refId": "A", "step": 4 + }, + { + "exemplar": true, + "expr": "pd_ratelimit_runner_task_max_waiting_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "hide": false, + "interval": "", + "legendFormat": "max-wait-duration-{{runner_name}}", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Heartbeat Runner Pending Task", + "title": "Concurrent Runner Pending Task", "tooltip": { "msResolution": false, "shared": true, @@ -11682,8 +11831,9 @@ }, "yaxes": [ { - "format": "opm", - "label": null, + "decimals": null, + "format": "none", + "label": "", "logBase": 1, "max": null, "min": "0", @@ 
-11722,11 +11872,11 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 47 }, "hiddenSeries": false, - "id": 1609, + "id": 1608, "legend": { "alignAsTable": true, "avg": true, @@ -11753,15 +11903,7 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/max-wait-duration.*/", - "bars": true, - "lines": false, - "transform": "negative-Y", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, @@ -11776,14 +11918,6 @@ "legendFormat": "failed-tasks-{{runner_name}}", "refId": "A", "step": 4 - }, - { - "exemplar": true, - "expr": "pd_ratelimit_runner_task_max_waiting_duration_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", - "hide": false, - "interval": "", - "legendFormat": "max-wait-duration-{{runner_name}}", - "refId": "B" } ], "thresholds": [], @@ -11807,9 +11941,8 @@ }, "yaxes": [ { - "decimals": null, "format": "opm", - "label": "", + "label": null, "logBase": 1, "max": null, "min": "0", @@ -11843,8 +11976,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 47 + "x": 0, + "y": 55 }, "id": 1305, "legend": { @@ -11937,7 +12070,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 55 }, "id": 1306, @@ -12027,8 +12160,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 55 + "x": 0, + "y": 63 }, "id": 1307, "legend": { @@ -12120,7 +12253,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 63 }, "id": 1308, @@ -12217,8 +12350,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 63 + "x": 0, + "y": 71 }, "id": 1309, "legend": { @@ -12314,7 +12447,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 71 }, "id": 1310, @@ -12411,8 +12544,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 71 + "x": 0, + "y": 79 }, "id": 1311, "legend": { @@ -12508,7 +12641,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 79 }, "id": 1312, From 0056569b67e271c32c6965c9ada9090814a11cb3 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 22 May 2024 15:52:17 +0800 Subject: [PATCH 07/21] core/region: optimize the efficiency of random regions selecting (#8205) ref tikv/pd#7897 Optimize the efficiency of random regions selecting. Signed-off-by: JmPotato Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/region.go | 39 ++-------- pkg/core/region_test.go | 58 +++++++++++---- pkg/core/region_tree.go | 137 ++++++++++++++++++++++------------- pkg/core/region_tree_test.go | 48 ++++++------ 4 files changed, 161 insertions(+), 121 deletions(-) diff --git a/pkg/core/region.go b/pkg/core/region.go index a1a61d505a9..19c1d0d4794 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -727,6 +727,11 @@ func (r *RegionInfo) isRegionRecreated() bool { return r.GetRegionEpoch().GetVersion() == 1 && r.GetRegionEpoch().GetConfVer() == 1 && (len(r.GetStartKey()) != 0 || len(r.GetEndKey()) != 0) } +func (r *RegionInfo) Contains(key []byte) bool { + start, end := r.GetStartKey(), r.GetEndKey() + return bytes.Compare(key, start) >= 0 && (len(end) == 0 || bytes.Compare(key, end) < 0) +} + // RegionGuideFunc is a function that determines which follow-up operations need to be performed based on the origin // and new region information. 
type RegionGuideFunc func(ctx *MetaProcessContext, region, origin *RegionInfo) (saveKV, saveCache, needSync, retained bool) @@ -1673,13 +1678,6 @@ func (r *RegionsInfo) GetStoreWitnessCount(storeID uint64) int { return r.witnesses[storeID].length() } -// RandPendingRegion randomly gets a store's region with a pending peer. -func (r *RegionsInfo) RandPendingRegion(storeID uint64, ranges []KeyRange) *RegionInfo { - r.st.RLock() - defer r.st.RUnlock() - return r.pendingPeers[storeID].RandomRegion(ranges) -} - // RandPendingRegions randomly gets a store's n regions with a pending peer. func (r *RegionsInfo) RandPendingRegions(storeID uint64, ranges []KeyRange) []*RegionInfo { r.st.RLock() @@ -1687,11 +1685,11 @@ func (r *RegionsInfo) RandPendingRegions(storeID uint64, ranges []KeyRange) []*R return r.pendingPeers[storeID].RandomRegions(randomRegionMaxRetry, ranges) } -// RandLeaderRegion randomly gets a store's leader region. -func (r *RegionsInfo) RandLeaderRegion(storeID uint64, ranges []KeyRange) *RegionInfo { +// This function is used for test only. +func (r *RegionsInfo) randLeaderRegion(storeID uint64, ranges []KeyRange) { r.st.RLock() defer r.st.RUnlock() - return r.leaders[storeID].RandomRegion(ranges) + _ = r.leaders[storeID].randomRegion(ranges) } // RandLeaderRegions randomly gets a store's n leader regions. @@ -1701,13 +1699,6 @@ func (r *RegionsInfo) RandLeaderRegions(storeID uint64, ranges []KeyRange) []*Re return r.leaders[storeID].RandomRegions(randomRegionMaxRetry, ranges) } -// RandFollowerRegion randomly gets a store's follower region. -func (r *RegionsInfo) RandFollowerRegion(storeID uint64, ranges []KeyRange) *RegionInfo { - r.st.RLock() - defer r.st.RUnlock() - return r.followers[storeID].RandomRegion(ranges) -} - // RandFollowerRegions randomly gets a store's n follower regions. func (r *RegionsInfo) RandFollowerRegions(storeID uint64, ranges []KeyRange) []*RegionInfo { r.st.RLock() @@ -1715,13 +1706,6 @@ func (r *RegionsInfo) RandFollowerRegions(storeID uint64, ranges []KeyRange) []* return r.followers[storeID].RandomRegions(randomRegionMaxRetry, ranges) } -// RandLearnerRegion randomly gets a store's learner region. -func (r *RegionsInfo) RandLearnerRegion(storeID uint64, ranges []KeyRange) *RegionInfo { - r.st.RLock() - defer r.st.RUnlock() - return r.learners[storeID].RandomRegion(ranges) -} - // RandLearnerRegions randomly gets a store's n learner regions. func (r *RegionsInfo) RandLearnerRegions(storeID uint64, ranges []KeyRange) []*RegionInfo { r.st.RLock() @@ -1729,13 +1713,6 @@ func (r *RegionsInfo) RandLearnerRegions(storeID uint64, ranges []KeyRange) []*R return r.learners[storeID].RandomRegions(randomRegionMaxRetry, ranges) } -// RandWitnessRegion randomly gets a store's witness region. -func (r *RegionsInfo) RandWitnessRegion(storeID uint64, ranges []KeyRange) *RegionInfo { - r.st.RLock() - defer r.st.RUnlock() - return r.witnesses[storeID].RandomRegion(ranges) -} - // RandWitnessRegions randomly gets a store's n witness regions. 
func (r *RegionsInfo) RandWitnessRegions(storeID uint64, ranges []KeyRange) []*RegionInfo { r.st.RLock() diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index b09c1dfd601..8956bd8a357 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -642,21 +642,49 @@ func BenchmarkUpdateBuckets(b *testing.B) { } func BenchmarkRandomRegion(b *testing.B) { - regions := NewRegionsInfo() - for i := 0; i < 5000000; i++ { - peer := &metapb.Peer{StoreId: 1, Id: uint64(i + 1)} - region := NewRegionInfo(&metapb.Region{ - Id: uint64(i + 1), - Peers: []*metapb.Peer{peer}, - StartKey: []byte(fmt.Sprintf("%20d", i)), - EndKey: []byte(fmt.Sprintf("%20d", i+1)), - }, peer) - origin, overlaps, rangeChanged := regions.SetRegion(region) - regions.UpdateSubTree(region, origin, overlaps, rangeChanged) - } - b.ResetTimer() - for i := 0; i < b.N; i++ { - regions.RandLeaderRegion(1, nil) + for _, size := range []int{10, 100, 1000, 10000, 100000, 1000000, 10000000} { + regions := NewRegionsInfo() + for i := 0; i < size; i++ { + peer := &metapb.Peer{StoreId: 1, Id: uint64(i + 1)} + region := NewRegionInfo(&metapb.Region{ + Id: uint64(i + 1), + Peers: []*metapb.Peer{peer}, + StartKey: []byte(fmt.Sprintf("%20d", i)), + EndKey: []byte(fmt.Sprintf("%20d", i+1)), + }, peer) + origin, overlaps, rangeChanged := regions.SetRegion(region) + regions.UpdateSubTree(region, origin, overlaps, rangeChanged) + } + b.Run(fmt.Sprintf("random region whole range with size %d", size), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + regions.randLeaderRegion(1, nil) + } + }) + b.Run(fmt.Sprintf("random regions whole range with size %d", size), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + regions.RandLeaderRegions(1, nil) + } + }) + ranges := []KeyRange{ + NewKeyRange(fmt.Sprintf("%20d", 0), fmt.Sprintf("%20d", size/4)), + NewKeyRange(fmt.Sprintf("%20d", size/4), fmt.Sprintf("%20d", size/2)), + NewKeyRange(fmt.Sprintf("%20d", size/2), fmt.Sprintf("%20d", size*3/4)), + NewKeyRange(fmt.Sprintf("%20d", size*3/4), fmt.Sprintf("%20d", size)), + } + b.Run(fmt.Sprintf("random region given ranges with size %d", size), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + regions.randLeaderRegion(1, ranges) + } + }) + b.Run(fmt.Sprintf("random regions given ranges with size %d", size), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + regions.RandLeaderRegions(1, ranges) + } + }) } } diff --git a/pkg/core/region_tree.go b/pkg/core/region_tree.go index 6c3c71c5158..e6d05d443a1 100644 --- a/pkg/core/region_tree.go +++ b/pkg/core/region_tree.go @@ -52,11 +52,6 @@ func (r *regionItem) Less(other *regionItem) bool { return bytes.Compare(left, right) < 0 } -func (r *regionItem) Contains(key []byte) bool { - start, end := r.GetStartKey(), r.GetEndKey() - return bytes.Compare(key, start) >= 0 && (len(end) == 0 || bytes.Compare(key, end) < 0) -} - const ( defaultBTreeDegree = 64 ) @@ -328,62 +323,100 @@ func (t *regionTree) getAdjacentItem(item *regionItem) (prev *regionItem, next * return prev, next } -// RandomRegion is used to get a random region within ranges. -func (t *regionTree) RandomRegion(ranges []KeyRange) *RegionInfo { - if t.length() == 0 { +func (t *regionTree) randomRegion(ranges []KeyRange) *RegionInfo { + regions := t.RandomRegions(1, ranges) + if len(regions) == 0 { return nil } + return regions[0] +} - if len(ranges) == 0 { - ranges = []KeyRange{NewKeyRange("", "")} +// RandomRegions get n random regions within the given ranges. 
+func (t *regionTree) RandomRegions(n int, ranges []KeyRange) []*RegionInfo { + treeLen := t.length() + if treeLen == 0 || n < 1 { + return nil } - - for _, i := range rand.Perm(len(ranges)) { - var endIndex int - startKey, endKey := ranges[i].StartKey, ranges[i].EndKey - startRegion, startIndex := t.tree.GetWithIndex(®ionItem{RegionInfo: &RegionInfo{meta: &metapb.Region{StartKey: startKey}}}) - - if len(endKey) != 0 { - _, endIndex = t.tree.GetWithIndex(®ionItem{RegionInfo: &RegionInfo{meta: &metapb.Region{StartKey: endKey}}}) - } else { - endIndex = t.tree.Len() + // Pre-allocate the variables to reduce the temporary memory allocations. + var ( + startKey, endKey []byte + startIndex, endIndex, randIndex int + startItem *regionItem + pivotItem = ®ionItem{&RegionInfo{meta: &metapb.Region{}}} + region *RegionInfo + regions = make([]*RegionInfo, 0, n) + rangeLen, curLen = len(ranges), len(regions) + // setStartEndIndices is a helper function to set `startIndex` and `endIndex` + // according to the `startKey` and `endKey`. + // TODO: maybe we could cache the `startIndex` and `endIndex` for each range. + setStartEndIndices = func() { + pivotItem.meta.StartKey = startKey + startItem, startIndex = t.tree.GetWithIndex(pivotItem) + if len(endKey) != 0 { + pivotItem.meta.StartKey = endKey + _, endIndex = t.tree.GetWithIndex(pivotItem) + } else { + endIndex = treeLen + } + // Consider that the item in the tree may not be continuous, + // we need to check if the previous item contains the key. + if startIndex != 0 && startItem == nil { + region = t.tree.GetAt(startIndex - 1).RegionInfo + if region.Contains(startKey) { + startIndex-- + } + } } - - // Consider that the item in the tree may not be continuous, - // we need to check if the previous item contains the key. - if startIndex != 0 && startRegion == nil && t.tree.GetAt(startIndex-1).Contains(startKey) { - startIndex-- + ) + // If no ranges specified, select randomly from the whole tree. + // This is a fast path to reduce the unnecessary iterations. + if rangeLen == 0 { + startKey, endKey = []byte(""), []byte("") + setStartEndIndices() + for curLen < n { + randIndex = rand.Intn(endIndex-startIndex) + startIndex + region = t.tree.GetAt(randIndex).RegionInfo + if region.isInvolved(startKey, endKey) { + regions = append(regions, region) + curLen++ + } + // No region found, directly break to avoid infinite loop. + if curLen == 0 { + break + } } + return regions + } + // When there are multiple ranges provided, + // keep retrying until we get enough regions. + for curLen < n { + // Shuffle the ranges to increase the randomness. 
+ for _, i := range rand.Perm(rangeLen) { + startKey, endKey = ranges[i].StartKey, ranges[i].EndKey + setStartEndIndices() + if endIndex <= startIndex { + if len(endKey) > 0 && bytes.Compare(startKey, endKey) > 0 { + log.Error("wrong range keys", + logutil.ZapRedactString("start-key", string(HexRegionKey(startKey))), + logutil.ZapRedactString("end-key", string(HexRegionKey(endKey))), + errs.ZapError(errs.ErrWrongRangeKeys)) + } + continue + } - if endIndex <= startIndex { - if len(endKey) > 0 && bytes.Compare(startKey, endKey) > 0 { - log.Error("wrong range keys", - logutil.ZapRedactString("start-key", string(HexRegionKey(startKey))), - logutil.ZapRedactString("end-key", string(HexRegionKey(endKey))), - errs.ZapError(errs.ErrWrongRangeKeys)) + randIndex = rand.Intn(endIndex-startIndex) + startIndex + region = t.tree.GetAt(randIndex).RegionInfo + if region.isInvolved(startKey, endKey) { + regions = append(regions, region) + curLen++ + if curLen == n { + return regions + } } - continue - } - index := rand.Intn(endIndex-startIndex) + startIndex - region := t.tree.GetAt(index).RegionInfo - if region.isInvolved(startKey, endKey) { - return region } - } - - return nil -} - -func (t *regionTree) RandomRegions(n int, ranges []KeyRange) []*RegionInfo { - if t.length() == 0 { - return nil - } - - regions := make([]*RegionInfo, 0, n) - - for i := 0; i < n; i++ { - if region := t.RandomRegion(ranges); region != nil { - regions = append(regions, region) + // No region found, directly break to avoid infinite loop. + if curLen == 0 { + break } } return regions diff --git a/pkg/core/region_tree_test.go b/pkg/core/region_tree_test.go index 3f2ca0c1fb8..a86f2f52c47 100644 --- a/pkg/core/region_tree_test.go +++ b/pkg/core/region_tree_test.go @@ -274,13 +274,15 @@ func TestRegionTreeSplitAndMerge(t *testing.T) { func TestRandomRegion(t *testing.T) { re := require.New(t) tree := newRegionTree() - r := tree.RandomRegion(nil) + r := tree.randomRegion(nil) re.Nil(r) regionA := NewTestRegionInfo(1, 1, []byte(""), []byte("g")) updateNewItem(tree, regionA) - ra := tree.RandomRegion([]KeyRange{NewKeyRange("", "")}) + ra := tree.randomRegion([]KeyRange{NewKeyRange("", "")}) re.Equal(regionA, ra) + ra2 := tree.RandomRegions(2, []KeyRange{NewKeyRange("", "")}) + re.Equal([]*RegionInfo{regionA, regionA}, ra2) regionB := NewTestRegionInfo(2, 2, []byte("g"), []byte("n")) regionC := NewTestRegionInfo(3, 3, []byte("n"), []byte("t")) @@ -289,20 +291,20 @@ func TestRandomRegion(t *testing.T) { updateNewItem(tree, regionC) updateNewItem(tree, regionD) - rb := tree.RandomRegion([]KeyRange{NewKeyRange("g", "n")}) + rb := tree.randomRegion([]KeyRange{NewKeyRange("g", "n")}) re.Equal(regionB, rb) - rc := tree.RandomRegion([]KeyRange{NewKeyRange("n", "t")}) + rc := tree.randomRegion([]KeyRange{NewKeyRange("n", "t")}) re.Equal(regionC, rc) - rd := tree.RandomRegion([]KeyRange{NewKeyRange("t", "")}) + rd := tree.randomRegion([]KeyRange{NewKeyRange("t", "")}) re.Equal(regionD, rd) - rf := tree.RandomRegion([]KeyRange{NewKeyRange("", "a")}) + rf := tree.randomRegion([]KeyRange{NewKeyRange("", "a")}) re.Nil(rf) - rf = tree.RandomRegion([]KeyRange{NewKeyRange("o", "s")}) + rf = tree.randomRegion([]KeyRange{NewKeyRange("o", "s")}) re.Nil(rf) - rf = tree.RandomRegion([]KeyRange{NewKeyRange("", "a")}) + rf = tree.randomRegion([]KeyRange{NewKeyRange("", "a")}) re.Nil(rf) - rf = tree.RandomRegion([]KeyRange{NewKeyRange("z", "")}) + rf = tree.randomRegion([]KeyRange{NewKeyRange("z", "")}) re.Nil(rf) checkRandomRegion(re, tree, 
[]*RegionInfo{regionA, regionB, regionC, regionD}, []KeyRange{NewKeyRange("", "")}) @@ -315,43 +317,43 @@ func TestRandomRegion(t *testing.T) { func TestRandomRegionDiscontinuous(t *testing.T) { re := require.New(t) tree := newRegionTree() - r := tree.RandomRegion([]KeyRange{NewKeyRange("c", "f")}) + r := tree.randomRegion([]KeyRange{NewKeyRange("c", "f")}) re.Nil(r) // test for single region regionA := NewTestRegionInfo(1, 1, []byte("c"), []byte("f")) updateNewItem(tree, regionA) - ra := tree.RandomRegion([]KeyRange{NewKeyRange("c", "e")}) + ra := tree.randomRegion([]KeyRange{NewKeyRange("c", "e")}) re.Nil(ra) - ra = tree.RandomRegion([]KeyRange{NewKeyRange("c", "f")}) + ra = tree.randomRegion([]KeyRange{NewKeyRange("c", "f")}) re.Equal(regionA, ra) - ra = tree.RandomRegion([]KeyRange{NewKeyRange("c", "g")}) + ra = tree.randomRegion([]KeyRange{NewKeyRange("c", "g")}) re.Equal(regionA, ra) - ra = tree.RandomRegion([]KeyRange{NewKeyRange("a", "e")}) + ra = tree.randomRegion([]KeyRange{NewKeyRange("a", "e")}) re.Nil(ra) - ra = tree.RandomRegion([]KeyRange{NewKeyRange("a", "f")}) + ra = tree.randomRegion([]KeyRange{NewKeyRange("a", "f")}) re.Equal(regionA, ra) - ra = tree.RandomRegion([]KeyRange{NewKeyRange("a", "g")}) + ra = tree.randomRegion([]KeyRange{NewKeyRange("a", "g")}) re.Equal(regionA, ra) regionB := NewTestRegionInfo(2, 2, []byte("n"), []byte("x")) updateNewItem(tree, regionB) - rb := tree.RandomRegion([]KeyRange{NewKeyRange("g", "x")}) + rb := tree.randomRegion([]KeyRange{NewKeyRange("g", "x")}) re.Equal(regionB, rb) - rb = tree.RandomRegion([]KeyRange{NewKeyRange("g", "y")}) + rb = tree.randomRegion([]KeyRange{NewKeyRange("g", "y")}) re.Equal(regionB, rb) - rb = tree.RandomRegion([]KeyRange{NewKeyRange("n", "y")}) + rb = tree.randomRegion([]KeyRange{NewKeyRange("n", "y")}) re.Equal(regionB, rb) - rb = tree.RandomRegion([]KeyRange{NewKeyRange("o", "y")}) + rb = tree.randomRegion([]KeyRange{NewKeyRange("o", "y")}) re.Nil(rb) regionC := NewTestRegionInfo(3, 3, []byte("z"), []byte("")) updateNewItem(tree, regionC) - rc := tree.RandomRegion([]KeyRange{NewKeyRange("y", "")}) + rc := tree.randomRegion([]KeyRange{NewKeyRange("y", "")}) re.Equal(regionC, rc) regionD := NewTestRegionInfo(4, 4, []byte(""), []byte("a")) updateNewItem(tree, regionD) - rd := tree.RandomRegion([]KeyRange{NewKeyRange("", "b")}) + rd := tree.randomRegion([]KeyRange{NewKeyRange("", "b")}) re.Equal(regionD, rd) checkRandomRegion(re, tree, []*RegionInfo{regionA, regionB, regionC, regionD}, []KeyRange{NewKeyRange("", "")}) @@ -365,7 +367,7 @@ func updateNewItem(tree *regionTree, region *RegionInfo) { func checkRandomRegion(re *require.Assertions, tree *regionTree, regions []*RegionInfo, ranges []KeyRange) { keys := make(map[string]struct{}) for i := 0; i < 10000 && len(keys) < len(regions); i++ { - re := tree.RandomRegion(ranges) + re := tree.randomRegion(ranges) if re == nil { continue } From 0f3e1f90cb3c5e3694c637900a1cc732c08572f7 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 23 May 2024 10:08:47 +0800 Subject: [PATCH 08/21] tools/ut: add a parallel parameter (#8186) ref tikv/pd#7969 add parallel parameter Signed-off-by: husharp --- tools/pd-ut/ut.go | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/tools/pd-ut/ut.go b/tools/pd-ut/ut.go index 7781ab4ee3b..fbf2a640651 100644 --- a/tools/pd-ut/ut.go +++ b/tools/pd-ut/ut.go @@ -95,8 +95,7 @@ var ( var ( // runtime - p int - buildParallel int + parallel int workDir string coverFileTempDir string // arguments @@ 
-108,6 +107,7 @@ var ( func main() { race = handleFlag("--race") + parallelStr := stripFlag("--parallel") junitFile = stripFlag("--junitfile") coverProfile = stripFlag("--coverprofile") ignoreDir = stripFlag("--ignore") @@ -122,11 +122,21 @@ func main() { defer os.RemoveAll(coverFileTempDir) } - // Get the correct count of CPU if it's in docker. - p = runtime.GOMAXPROCS(0) - // We use 2 * p for `go build` to make it faster. - buildParallel = p * 2 var err error + procs := runtime.GOMAXPROCS(0) + if parallelStr == "" { + // Get the correct count of CPU if it's in docker. + parallel = procs + } else { + parallel, err = strconv.Atoi(parallelStr) + if err != nil { + fmt.Println("parse parallel error", err) + return + } + if parallel > procs { + fmt.Printf("Recommend to set parallel be same as the GOMAXPROCS=%d\n", procs) + } + } workDir, err = os.Getwd() if err != nil { fmt.Println("os.Getwd() error", err) @@ -353,12 +363,12 @@ func cmdRun(args ...string) bool { } } - fmt.Printf("building task finish, parallelism=%d, count=%d, takes=%v\n", buildParallel, len(tasks), time.Since(start)) + fmt.Printf("building task finish, parallelism=%d, count=%d, takes=%v\n", parallel*2, len(tasks), time.Since(start)) taskCh := make(chan task, 100) - works := make([]numa, p) + works := make([]numa, parallel) var wg sync.WaitGroup - for i := 0; i < p; i++ { + for i := 0; i < parallel; i++ { wg.Add(1) go works[i].worker(&wg, taskCh) } @@ -400,7 +410,7 @@ func cmdRun(args ...string) bool { // stripFlag strip the '--flag xxx' from the command line os.Args // Example of the os.Args changes -// Before: ut run pkg TestXXX --coverprofile xxx --junitfile yyy +// Before: ut run pkg TestXXX --coverprofile xxx --junitfile yyy --parallel 16 // After: ut run pkg TestXXX // The value of the flag is returned. func stripFlag(flag string) string { @@ -636,7 +646,7 @@ func (*numa) testCommand(pkg string, fn string) *exec.Cmd { args = append(args, "-test.coverprofile", tmpFile) } if strings.Contains(fn, "Suite") { - args = append(args, "-test.cpu", fmt.Sprint(p/2)) + args = append(args, "-test.cpu", fmt.Sprint(parallel/2)) } else { args = append(args, "-test.cpu", "1") } @@ -705,7 +715,8 @@ func buildTestBinaryMulti(pkgs []string) error { packages = append(packages, path.Join(modulePath, pkg)) } - p := strconv.Itoa(buildParallel) + // We use 2 * parallel for `go build` to make it faster. + p := strconv.Itoa(parallel * 2) cmd := exec.Command("go", "test", "-p", p, "--exec", xprogPath, "-vet", "off", "--tags=tso_function_test,deadlock") if coverProfile != "" { coverpkg := "./..." 
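For reference, a usage sketch of the `--parallel` flag added in the patch above; the package and test names are placeholders taken from the `stripFlag` doc comment, not a real invocation: ut run pkg TestXXX --junitfile yyy --parallel 16. When `--parallel` is omitted, the worker count falls back to runtime.GOMAXPROCS(0), and a warning is printed if the given value exceeds it; the same value drives the build step, where `buildTestBinaryMulti` passes `-p` set to parallel * 2 to the `go test --exec=xprog` command shown in the hunk above.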
From 6b8d22779efd2fa1496c62a834691cf6f1d963a2 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Thu, 23 May 2024 10:53:46 +0800 Subject: [PATCH 09/21] metrics: fix the duplicate avg metrics (#8210) ref tikv/pd#7897 metrics: fix the duplicate avg metrics Signed-off-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- metrics/grafana/pd.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index a2c3d31a4b0..69afb93f531 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -11248,7 +11248,7 @@ }, { "exemplar": true, - "expr": "rate(pd_scheduler_handle_region_heartbeat_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m]) / rate(pd_scheduler_handle_region_heartbeat_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])", + "expr": "sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m])) / sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\"}[1m]))", "hide": false, "interval": "", "legendFormat": "avg", From 4cd42b3cb8e7ae32c4d15d6fab7d7ab0552facf2 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 23 May 2024 17:40:19 +0800 Subject: [PATCH 10/21] core/region: optimize the logic of randomly selecting regions within a single range (#8211) ref tikv/pd#7897 In the case of only one key range, follow the fast path logic to avoid unnecessary random and loop operations. Signed-off-by: JmPotato --- pkg/core/region_test.go | 19 +++++++++-- pkg/core/region_tree.go | 61 ++++++++++++++++++++++-------------- pkg/core/region_tree_test.go | 6 ++++ 3 files changed, 61 insertions(+), 25 deletions(-) diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index 8956bd8a357..aaf440eeeea 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -668,18 +668,33 @@ func BenchmarkRandomRegion(b *testing.B) { } }) ranges := []KeyRange{ + NewKeyRange(fmt.Sprintf("%20d", size/4), fmt.Sprintf("%20d", size*3/4)), + } + b.Run(fmt.Sprintf("random region single range with size %d", size), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + regions.randLeaderRegion(1, ranges) + } + }) + b.Run(fmt.Sprintf("random regions single range with size %d", size), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + regions.RandLeaderRegions(1, ranges) + } + }) + ranges = []KeyRange{ NewKeyRange(fmt.Sprintf("%20d", 0), fmt.Sprintf("%20d", size/4)), NewKeyRange(fmt.Sprintf("%20d", size/4), fmt.Sprintf("%20d", size/2)), NewKeyRange(fmt.Sprintf("%20d", size/2), fmt.Sprintf("%20d", size*3/4)), NewKeyRange(fmt.Sprintf("%20d", size*3/4), fmt.Sprintf("%20d", size)), } - b.Run(fmt.Sprintf("random region given ranges with size %d", size), func(b *testing.B) { + b.Run(fmt.Sprintf("random region multiple ranges with size %d", size), func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { regions.randLeaderRegion(1, ranges) } }) - b.Run(fmt.Sprintf("random regions given ranges with size %d", size), func(b *testing.B) { + b.Run(fmt.Sprintf("random regions multiple ranges with size %d", size), func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { regions.RandLeaderRegions(1, ranges) diff --git a/pkg/core/region_tree.go b/pkg/core/region_tree.go index 
e6d05d443a1..d4ef4a880fc 100644 --- a/pkg/core/region_tree.go +++ b/pkg/core/region_tree.go @@ -339,20 +339,28 @@ func (t *regionTree) RandomRegions(n int, ranges []KeyRange) []*RegionInfo { } // Pre-allocate the variables to reduce the temporary memory allocations. var ( - startKey, endKey []byte - startIndex, endIndex, randIndex int - startItem *regionItem - pivotItem = ®ionItem{&RegionInfo{meta: &metapb.Region{}}} - region *RegionInfo - regions = make([]*RegionInfo, 0, n) - rangeLen, curLen = len(ranges), len(regions) + startKey, endKey []byte + // By default, we set the `startIndex` and `endIndex` to the whole tree range. + startIndex, endIndex = 0, treeLen + randIndex int + startItem *regionItem + pivotItem = ®ionItem{&RegionInfo{meta: &metapb.Region{}}} + region *RegionInfo + regions = make([]*RegionInfo, 0, n) + rangeLen, curLen = len(ranges), len(regions) // setStartEndIndices is a helper function to set `startIndex` and `endIndex` - // according to the `startKey` and `endKey`. + // according to the `startKey` and `endKey` and check if the range is invalid + // to skip the iteration. // TODO: maybe we could cache the `startIndex` and `endIndex` for each range. - setStartEndIndices = func() { + setAndCheckStartEndIndices = func() (skip bool) { + startKeyLen, endKeyLen := len(startKey), len(endKey) + if startKeyLen == 0 && endKeyLen == 0 { + startIndex, endIndex = 0, treeLen + return false + } pivotItem.meta.StartKey = startKey startItem, startIndex = t.tree.GetWithIndex(pivotItem) - if len(endKey) != 0 { + if endKeyLen > 0 { pivotItem.meta.StartKey = endKey _, endIndex = t.tree.GetWithIndex(pivotItem) } else { @@ -366,13 +374,27 @@ func (t *regionTree) RandomRegions(n int, ranges []KeyRange) []*RegionInfo { startIndex-- } } + // Check whether the `startIndex` and `endIndex` are valid. + if endIndex <= startIndex { + if endKeyLen > 0 && bytes.Compare(startKey, endKey) > 0 { + log.Error("wrong range keys", + logutil.ZapRedactString("start-key", string(HexRegionKey(startKey))), + logutil.ZapRedactString("end-key", string(HexRegionKey(endKey))), + errs.ZapError(errs.ErrWrongRangeKeys)) + } + return true + } + return false } ) - // If no ranges specified, select randomly from the whole tree. - // This is a fast path to reduce the unnecessary iterations. - if rangeLen == 0 { - startKey, endKey = []byte(""), []byte("") - setStartEndIndices() + // This is a fast path to reduce the unnecessary iterations when we only have one range. + if rangeLen <= 1 { + if rangeLen == 1 { + startKey, endKey = ranges[0].StartKey, ranges[0].EndKey + if setAndCheckStartEndIndices() { + return regions + } + } for curLen < n { randIndex = rand.Intn(endIndex-startIndex) + startIndex region = t.tree.GetAt(randIndex).RegionInfo @@ -393,14 +415,7 @@ func (t *regionTree) RandomRegions(n int, ranges []KeyRange) []*RegionInfo { // Shuffle the ranges to increase the randomness. 
for _, i := range rand.Perm(rangeLen) { startKey, endKey = ranges[i].StartKey, ranges[i].EndKey - setStartEndIndices() - if endIndex <= startIndex { - if len(endKey) > 0 && bytes.Compare(startKey, endKey) > 0 { - log.Error("wrong range keys", - logutil.ZapRedactString("start-key", string(HexRegionKey(startKey))), - logutil.ZapRedactString("end-key", string(HexRegionKey(endKey))), - errs.ZapError(errs.ErrWrongRangeKeys)) - } + if setAndCheckStartEndIndices() { continue } diff --git a/pkg/core/region_tree_test.go b/pkg/core/region_tree_test.go index a86f2f52c47..5886103191c 100644 --- a/pkg/core/region_tree_test.go +++ b/pkg/core/region_tree_test.go @@ -281,8 +281,12 @@ func TestRandomRegion(t *testing.T) { updateNewItem(tree, regionA) ra := tree.randomRegion([]KeyRange{NewKeyRange("", "")}) re.Equal(regionA, ra) + ra = tree.randomRegion(nil) + re.Equal(regionA, ra) ra2 := tree.RandomRegions(2, []KeyRange{NewKeyRange("", "")}) re.Equal([]*RegionInfo{regionA, regionA}, ra2) + ra2 = tree.RandomRegions(2, nil) + re.Equal([]*RegionInfo{regionA, regionA}, ra2) regionB := NewTestRegionInfo(2, 2, []byte("g"), []byte("n")) regionC := NewTestRegionInfo(3, 3, []byte("n"), []byte("t")) @@ -307,6 +311,7 @@ func TestRandomRegion(t *testing.T) { rf = tree.randomRegion([]KeyRange{NewKeyRange("z", "")}) re.Nil(rf) + checkRandomRegion(re, tree, []*RegionInfo{regionA, regionB, regionC, regionD}, nil) checkRandomRegion(re, tree, []*RegionInfo{regionA, regionB, regionC, regionD}, []KeyRange{NewKeyRange("", "")}) checkRandomRegion(re, tree, []*RegionInfo{regionA, regionB}, []KeyRange{NewKeyRange("", "n")}) checkRandomRegion(re, tree, []*RegionInfo{regionC, regionD}, []KeyRange{NewKeyRange("n", "")}) @@ -356,6 +361,7 @@ func TestRandomRegionDiscontinuous(t *testing.T) { rd := tree.randomRegion([]KeyRange{NewKeyRange("", "b")}) re.Equal(regionD, rd) + checkRandomRegion(re, tree, []*RegionInfo{regionA, regionB, regionC, regionD}, nil) checkRandomRegion(re, tree, []*RegionInfo{regionA, regionB, regionC, regionD}, []KeyRange{NewKeyRange("", "")}) } From 7cc3b4e08298d51d278c797f1989022bb2ae02e3 Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 24 May 2024 17:19:18 +0800 Subject: [PATCH 11/21] tools/ctl: add caller ID for `pd-ctl` (#8214) ref tikv/pd#7300 Signed-off-by: husharp --- tools/pd-ctl/pdctl/command/global.go | 8 ++--- tools/pd-ctl/tests/global_test.go | 45 +++++++++++++++++++++------- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/tools/pd-ctl/pdctl/command/global.go b/tools/pd-ctl/pdctl/command/global.go index fa77df6a101..f7c04c3ca5c 100644 --- a/tools/pd-ctl/pdctl/command/global.go +++ b/tools/pd-ctl/pdctl/command/global.go @@ -33,7 +33,7 @@ import ( ) const ( - pdControlCallerID = "pd-ctl" + PDControlCallerID = "pd-ctl" clusterPrefix = "pd/api/v1/cluster" ) @@ -107,7 +107,7 @@ func initNewPDClient(cmd *cobra.Command, opts ...pd.ClientOption) error { if PDCli != nil { PDCli.Close() } - PDCli = pd.NewClient(pdControlCallerID, getEndpoints(cmd), opts...) + PDCli = pd.NewClient(PDControlCallerID, getEndpoints(cmd), opts...).WithCallerID(PDControlCallerID) return nil } @@ -122,7 +122,7 @@ func initNewPDClientWithTLS(cmd *cobra.Command, caPath, certPath, keyPath string // TODO: replace dialClient with the PD HTTP client completely. 
var dialClient = &http.Client{ - Transport: apiutil.NewCallerIDRoundTripper(http.DefaultTransport, pdControlCallerID), + Transport: apiutil.NewCallerIDRoundTripper(http.DefaultTransport, PDControlCallerID), } // RequireHTTPSClient creates a HTTPS client if the related flags are set @@ -153,7 +153,7 @@ func initHTTPSClient(caPath, certPath, keyPath string) error { } dialClient = &http.Client{ Transport: apiutil.NewCallerIDRoundTripper( - &http.Transport{TLSClientConfig: tlsConfig}, pdControlCallerID), + &http.Transport{TLSClientConfig: tlsConfig}, PDControlCallerID), } return nil } diff --git a/tools/pd-ctl/tests/global_test.go b/tools/pd-ctl/tests/global_test.go index f4f55e2af89..6987267ea54 100644 --- a/tools/pd-ctl/tests/global_test.go +++ b/tools/pd-ctl/tests/global_test.go @@ -16,33 +16,44 @@ package tests import ( "context" + "encoding/json" "fmt" "net/http" "testing" - "github.com/pingcap/log" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/apiutil" "github.com/tikv/pd/pkg/utils/assertutil" "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" cmd "github.com/tikv/pd/tools/pd-ctl/pdctl" - "go.uber.org/zap" + "github.com/tikv/pd/tools/pd-ctl/pdctl/command" ) -const pdControlCallerID = "pd-ctl" - func TestSendAndGetComponent(t *testing.T) { re := require.New(t) handler := func(context.Context, *server.Server) (http.Handler, apiutil.APIServiceGroup, error) { mux := http.NewServeMux() + // check pd http sdk api + mux.HandleFunc("/pd/api/v1/cluster", func(w http.ResponseWriter, r *http.Request) { + callerID := apiutil.GetCallerIDOnHTTP(r) + re.Equal(command.PDControlCallerID, callerID) + cluster := &metapb.Cluster{Id: 1} + clusterBytes, err := json.Marshal(cluster) + re.NoError(err) + w.Write(clusterBytes) + }) + // check http client api + // TODO: remove this comment after replacing dialClient with the PD HTTP client completely. mux.HandleFunc("/pd/api/v1/health", func(w http.ResponseWriter, r *http.Request) { callerID := apiutil.GetCallerIDOnHTTP(r) - for k := range r.Header { - log.Info("header", zap.String("key", k)) - } - log.Info("caller id", zap.String("caller-id", callerID)) - re.Equal(pdControlCallerID, callerID) + re.Equal(command.PDControlCallerID, callerID) + fmt.Fprint(w, callerID) + }) + mux.HandleFunc("/pd/api/v1/stores", func(w http.ResponseWriter, r *http.Request) { + callerID := apiutil.GetCallerIDOnHTTP(r) + re.Equal(command.PDControlCallerID, callerID) fmt.Fprint(w, callerID) }) info := apiutil.APIServiceGroup{ @@ -64,8 +75,20 @@ func TestSendAndGetComponent(t *testing.T) { }() cmd := cmd.GetRootCmd() - args := []string{"-u", pdAddr, "health"} + args := []string{"-u", pdAddr, "cluster"} output, err := ExecuteCommand(cmd, args...) re.NoError(err) - re.Equal(fmt.Sprintf("%s\n", pdControlCallerID), string(output)) + re.Equal(fmt.Sprintf("%s\n", `{ + "id": 1 +}`), string(output)) + + args = []string{"-u", pdAddr, "health"} + output, err = ExecuteCommand(cmd, args...) + re.NoError(err) + re.Equal(fmt.Sprintf("%s\n", command.PDControlCallerID), string(output)) + + args = []string{"-u", pdAddr, "store"} + output, err = ExecuteCommand(cmd, args...) 
+ re.NoError(err) + re.Equal(fmt.Sprintf("%s\n", command.PDControlCallerID), string(output)) } From dd7f2a772dcd2075a69eb49fcbb3853cccfaaad2 Mon Sep 17 00:00:00 2001 From: okJiang Date: Mon, 27 May 2024 11:22:19 +0800 Subject: [PATCH 12/21] ctl: replace doRequest with HTTP client to get health status (#8212) ref tikv/pd#7300 Signed-off-by: okJiang <819421878@qq.com> --- client/http/api.go | 1 + client/http/interface.go | 15 +++++++++++++++ client/http/request_info.go | 1 + client/http/types.go | 9 +++++++++ .../uiserver/embedded_assets_rewriter.go | 1 + tests/integrations/client/http_client_test.go | 19 +++++++++++++++++++ tools/pd-ctl/pdctl/command/health_command.go | 17 ++++++----------- tools/pd-ctl/tests/global_test.go | 10 ---------- 8 files changed, 52 insertions(+), 21 deletions(-) diff --git a/client/http/api.go b/client/http/api.go index a1ca96b38f1..3376a48770d 100644 --- a/client/http/api.go +++ b/client/http/api.go @@ -41,6 +41,7 @@ const ( membersPrefix = "/pd/api/v1/members" leaderPrefix = "/pd/api/v1/leader" transferLeader = "/pd/api/v1/leader/transfer" + health = "/pd/api/v1/health" // Config Config = "/pd/api/v1/config" ClusterVersion = "/pd/api/v1/config/cluster-version" diff --git a/client/http/interface.go b/client/http/interface.go index 7b15291d9e7..11c24beaefd 100644 --- a/client/http/interface.go +++ b/client/http/interface.go @@ -50,6 +50,7 @@ type Client interface { GetStores(context.Context) (*StoresInfo, error) GetStore(context.Context, uint64) (*StoreInfo, error) SetStoreLabels(context.Context, int64, map[string]string) error + GetHealthStatus(context.Context) ([]Health, error) /* Config-related interfaces */ GetConfig(context.Context) (map[string]any, error) SetConfig(context.Context, map[string]any, ...float64) error @@ -337,6 +338,20 @@ func (c *client) SetStoreLabels(ctx context.Context, storeID int64, storeLabels WithBody(jsonInput)) } +// GetHealthStatus gets the health status of the cluster. +func (c *client) GetHealthStatus(ctx context.Context) ([]Health, error) { + var healths []Health + err := c.request(ctx, newRequestInfo(). + WithName(getHealthStatusName). + WithURI(health). + WithMethod(http.MethodGet). + WithResp(&healths)) + if err != nil { + return nil, err + } + return healths, nil +} + // GetConfig gets the configurations. func (c *client) GetConfig(ctx context.Context) (map[string]any, error) { var config map[string]any diff --git a/client/http/request_info.go b/client/http/request_info.go index 0ce7072d1ba..202eab1150f 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -39,6 +39,7 @@ const ( getStoresName = "GetStores" getStoreName = "GetStore" setStoreLabelsName = "SetStoreLabels" + getHealthStatusName = "GetHealthStatus" getConfigName = "GetConfig" setConfigName = "SetConfig" getScheduleConfigName = "GetScheduleConfig" diff --git a/client/http/types.go b/client/http/types.go index 31b2bfdaea7..f7273068b8c 100644 --- a/client/http/types.go +++ b/client/http/types.go @@ -661,3 +661,12 @@ func stringToKeyspaceState(str string) (keyspacepb.KeyspaceState, error) { return keyspacepb.KeyspaceState(0), fmt.Errorf("invalid KeyspaceState string: %s", str) } } + +// Health reflects the cluster's health. +// NOTE: This type is moved from `server/api/health.go`, maybe move them to the same place later. 
+type Health struct { + Name string `json:"name"` + MemberID uint64 `json:"member_id"` + ClientUrls []string `json:"client_urls"` + Health bool `json:"health"` +} diff --git a/pkg/dashboard/uiserver/embedded_assets_rewriter.go b/pkg/dashboard/uiserver/embedded_assets_rewriter.go index 2a5b4a5b3b6..d19db01936f 100644 --- a/pkg/dashboard/uiserver/embedded_assets_rewriter.go +++ b/pkg/dashboard/uiserver/embedded_assets_rewriter.go @@ -28,6 +28,7 @@ import ( var once sync.Once // Assets returns the Assets FileSystem of the dashboard UI +// NOTE: if you see "undefined: assets" error, please run `make dashboard-ui` in the root directory of the repository. func Assets(cfg *config.Config) http.FileSystem { once.Do(func() { resPath := distroutil.MustGetResPath() diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index d35b7f00584..9e712b808f3 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -811,3 +811,22 @@ func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, re.True(ok) re.Equal(expectGCManagementType, val) } + +func (suite *httpClientTestSuite) TestGetHealthStatus() { + suite.RunTestInTwoModes(suite.checkGetHealthStatus) +} + +func (suite *httpClientTestSuite) checkGetHealthStatus(mode mode, client pd.Client) { + re := suite.Require() + env := suite.env[mode] + + healths, err := client.GetHealthStatus(env.ctx) + re.NoError(err) + re.Len(healths, 2) + sort.Slice(healths, func(i, j int) bool { + return healths[i].Name < healths[j].Name + }) + re.Equal("pd1", healths[0].Name) + re.Equal("pd2", healths[1].Name) + re.True(healths[0].Health && healths[1].Health) +} diff --git a/tools/pd-ctl/pdctl/command/health_command.go b/tools/pd-ctl/pdctl/command/health_command.go index 50ac7763d28..a10ee118397 100644 --- a/tools/pd-ctl/pdctl/command/health_command.go +++ b/tools/pd-ctl/pdctl/command/health_command.go @@ -15,30 +15,25 @@ package command import ( - "net/http" - "github.com/spf13/cobra" ) -var ( - healthPrefix = "pd/api/v1/health" -) - // NewHealthCommand return a health subcommand of rootCmd func NewHealthCommand() *cobra.Command { m := &cobra.Command{ - Use: "health", - Short: "show all node's health information of the pd cluster", - Run: showHealthCommandFunc, + Use: "health", + Short: "show all node's health information of the PD cluster", + PersistentPreRunE: requirePDClient, + Run: showHealthCommandFunc, } return m } func showHealthCommandFunc(cmd *cobra.Command, _ []string) { - r, err := doRequest(cmd, healthPrefix, http.MethodGet, http.Header{}) + health, err := PDCli.GetHealthStatus(cmd.Context()) if err != nil { cmd.Println(err) return } - cmd.Println(r) + jsonPrint(cmd, health) } diff --git a/tools/pd-ctl/tests/global_test.go b/tools/pd-ctl/tests/global_test.go index 6987267ea54..766e357088e 100644 --- a/tools/pd-ctl/tests/global_test.go +++ b/tools/pd-ctl/tests/global_test.go @@ -46,11 +46,6 @@ func TestSendAndGetComponent(t *testing.T) { }) // check http client api // TODO: remove this comment after replacing dialClient with the PD HTTP client completely. 
- mux.HandleFunc("/pd/api/v1/health", func(w http.ResponseWriter, r *http.Request) { - callerID := apiutil.GetCallerIDOnHTTP(r) - re.Equal(command.PDControlCallerID, callerID) - fmt.Fprint(w, callerID) - }) mux.HandleFunc("/pd/api/v1/stores", func(w http.ResponseWriter, r *http.Request) { callerID := apiutil.GetCallerIDOnHTTP(r) re.Equal(command.PDControlCallerID, callerID) @@ -82,11 +77,6 @@ func TestSendAndGetComponent(t *testing.T) { "id": 1 }`), string(output)) - args = []string{"-u", pdAddr, "health"} - output, err = ExecuteCommand(cmd, args...) - re.NoError(err) - re.Equal(fmt.Sprintf("%s\n", command.PDControlCallerID), string(output)) - args = []string{"-u", pdAddr, "store"} output, err = ExecuteCommand(cmd, args...) re.NoError(err) From beb91c139d0a4ecf8d77c6820faf543e0cfbedc4 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 27 May 2024 15:44:49 +0800 Subject: [PATCH 13/21] apiutil/middleware: add retry logic for obtaining PD leader in redirector (#8216) close tikv/pd#8142 Add retry logic to improve PD HTTP request forwarding success rate during PD leader switch. Signed-off-by: JmPotato --- pkg/utils/apiutil/serverapi/middleware.go | 52 +++++++++++++++++++---- tests/server/api/api_test.go | 18 ++++++++ 2 files changed, 62 insertions(+), 8 deletions(-) diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 2432e15c967..18dd2f52155 100755 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -18,7 +18,9 @@ import ( "net/http" "net/url" "strings" + "time" + "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/log" "github.com/tikv/pd/pkg/errs" mcsutils "github.com/tikv/pd/pkg/mcs/utils" @@ -204,20 +206,19 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http clientUrls = append(clientUrls, targetAddr) // Add a header to the response, it is used to mark whether the request has been forwarded to the micro service. w.Header().Add(apiutil.XForwardedToMicroServiceHeader, "true") - } else { - leader := h.s.GetMember().GetLeader() + } else if name := r.Header.Get(apiutil.PDRedirectorHeader); len(name) == 0 { + leader := h.waitForLeader(r) if leader == nil { http.Error(w, "no leader", http.StatusServiceUnavailable) return } clientUrls = leader.GetClientUrls() - // Prevent more than one redirection among PD/API servers. - if name := r.Header.Get(apiutil.PDRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirect)) - http.Error(w, errs.ErrRedirectToNotLeader.FastGenByArgs().Error(), http.StatusInternalServerError) - return - } r.Header.Set(apiutil.PDRedirectorHeader, h.s.Name()) + } else { + // Prevent more than one redirection among PD/API servers. + log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirect)) + http.Error(w, errs.ErrRedirectToNotLeader.FastGenByArgs().Error(), http.StatusInternalServerError) + return } urls := make([]url.URL, 0, len(clientUrls)) @@ -233,3 +234,38 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http client := h.s.GetHTTPClient() apiutil.NewCustomReverseProxies(client, urls).ServeHTTP(w, r) } + +const ( + backoffMaxDelay = 3 * time.Second + backoffInterval = 100 * time.Millisecond +) + +// If current server does not have a leader, backoff to increase the chance of success. 
+func (h *redirector) waitForLeader(r *http.Request) (leader *pdpb.Member) { + var ( + interval = backoffInterval + maxDelay = backoffMaxDelay + curDelay = time.Duration(0) + ) + for { + leader = h.s.GetMember().GetLeader() + if leader != nil { + return + } + select { + case <-time.After(interval): + curDelay += interval + if curDelay >= maxDelay { + return + } + interval *= 2 + if curDelay+interval > maxDelay { + interval = maxDelay - curDelay + } + case <-r.Context().Done(): + return + case <-h.s.LoopContext().Done(): + return + } + } +} diff --git a/tests/server/api/api_test.go b/tests/server/api/api_test.go index 091d1488177..f59e85651f5 100644 --- a/tests/server/api/api_test.go +++ b/tests/server/api/api_test.go @@ -617,6 +617,24 @@ func (suite *redirectorTestSuite) TestRedirect() { re.Equal(h, header) } } + // Test redirect during leader election. + leader = suite.cluster.GetLeaderServer() + re.NotNil(leader) + err := leader.ResignLeader() + re.NoError(err) + for _, svr := range suite.cluster.GetServers() { + url := fmt.Sprintf("%s/pd/api/v1/version", svr.GetServer().GetAddr()) + testutil.Eventually(re, func() bool { + resp, err := tests.TestDialClient.Get(url) + re.NoError(err) + defer resp.Body.Close() + _, err = io.ReadAll(resp.Body) + re.NoError(err) + // Should not meet 503 since the retry logic ensure the request is sent to the new leader eventually. + re.NotEqual(http.StatusServiceUnavailable, resp.StatusCode) + return resp.StatusCode == http.StatusOK + }) + } } func (suite *redirectorTestSuite) TestAllowFollowerHandle() { From 9d580d03e62e0dad3627de3f050a4a32f8f3e207 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 28 May 2024 13:40:20 +0800 Subject: [PATCH 14/21] *: batch process peer task (#8213) ref tikv/pd#7897 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/cluster/cluster.go | 7 +- pkg/core/peer.go | 31 --------- pkg/mcs/scheduling/server/cluster.go | 4 +- pkg/mock/mockcluster/mockcluster.go | 26 +------ pkg/statistics/hot_cache.go | 9 +-- pkg/statistics/hot_cache_task.go | 43 +++++++++--- pkg/statistics/hot_peer_cache.go | 87 ++++++++++++------------ pkg/statistics/hot_peer_cache_test.go | 98 +++++++++++---------------- pkg/statistics/utils/kind.go | 10 +-- server/cluster/cluster.go | 3 +- server/cluster/cluster_test.go | 32 +++++++++ tools/pd-ctl/tests/hot/hot_test.go | 3 +- 12 files changed, 165 insertions(+), 188 deletions(-) diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 8bd2616f41f..ab97c7899db 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -35,12 +35,7 @@ type Cluster interface { func HandleStatsAsync(c Cluster, region *core.RegionInfo) { c.GetHotStat().CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) c.GetHotStat().CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) - reportInterval := region.GetInterval() - interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval) - c.GetHotStat().CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region)) - } + c.GetHotStat().CheckWriteAsync(statistics.NewCheckWritePeerTask(region)) c.GetCoordinator().GetSchedulersController().CheckTransferWitnessLeader(region) } diff --git a/pkg/core/peer.go b/pkg/core/peer.go index 659886e6d39..1f888ba58eb 100644 --- a/pkg/core/peer.go +++ b/pkg/core/peer.go @@ -77,34 +77,3 @@ func CountInJointState(peers ...*metapb.Peer) 
int { } return count } - -// PeerInfo provides peer information -type PeerInfo struct { - *metapb.Peer - loads []float64 - interval uint64 -} - -// NewPeerInfo creates PeerInfo -func NewPeerInfo(meta *metapb.Peer, loads []float64, interval uint64) *PeerInfo { - return &PeerInfo{ - Peer: meta, - loads: loads, - interval: interval, - } -} - -// GetLoads provides loads -func (p *PeerInfo) GetLoads() []float64 { - return p.loads -} - -// GetPeerID provides peer id -func (p *PeerInfo) GetPeerID() uint64 { - return p.GetId() -} - -// GetInterval returns reporting interval -func (p *PeerInfo) GetInterval() uint64 { - return p.interval -} diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index c6c365b03ad..d711ab2d4f6 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -9,6 +9,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/failpoint" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/log" @@ -442,8 +443,7 @@ func (c *Cluster) HandleStoreHeartbeat(heartbeat *schedulingpb.StoreHeartbeatReq utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - peerInfo := core.NewPeerInfo(peer, loads, interval) - c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) } // Here we will compare the reported regions with the previous hot peers to decide if it is still hot. diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index e5b3e39a502..3f9710c48fd 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -896,14 +896,7 @@ func (mc *Cluster) CheckRegionRead(region *core.RegionInfo) []*statistics.HotPee items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckReadPeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - return items + return append(items, mc.HotCache.CheckReadPeerSync(region, region.GetPeers(), region.GetLoads(), interval)...) } // CheckRegionWrite checks region write info with all peers @@ -913,14 +906,7 @@ func (mc *Cluster) CheckRegionWrite(region *core.RegionInfo) []*statistics.HotPe items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckWritePeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - return items + return append(items, mc.HotCache.CheckWritePeerSync(region, region.GetPeers(), region.GetLoads(), interval)...) } // CheckRegionLeaderRead checks region read info with leader peer @@ -930,13 +916,7 @@ func (mc *Cluster) CheckRegionLeaderRead(region *core.RegionInfo) []*statistics. items = append(items, expiredItems...) 
reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - peer := region.GetLeader() - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckReadPeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - return items + return append(items, mc.HotCache.CheckReadPeerSync(region, []*metapb.Peer{region.GetLeader()}, region.GetLoads(), interval)...) } // ObserveRegionsStats records the current stores stats from region stats. diff --git a/pkg/statistics/hot_cache.go b/pkg/statistics/hot_cache.go index 799fb240d10..26548c8b47e 100644 --- a/pkg/statistics/hot_cache.go +++ b/pkg/statistics/hot_cache.go @@ -17,6 +17,7 @@ package statistics import ( "context" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/smallnest/chanx" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/statistics/utils" @@ -172,14 +173,14 @@ func (w *HotCache) Update(item *HotPeerStat, kind utils.RWType) { // CheckWritePeerSync checks the write status, returns update items. // This is used for mockcluster, for test purpose. -func (w *HotCache) CheckWritePeerSync(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - return w.writeCache.checkPeerFlow(peer, region) +func (w *HotCache) CheckWritePeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { + return w.writeCache.checkPeerFlow(region, peers, loads, interval) } // CheckReadPeerSync checks the read status, returns update items. // This is used for mockcluster, for test purpose. -func (w *HotCache) CheckReadPeerSync(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - return w.readCache.checkPeerFlow(peer, region) +func (w *HotCache) CheckReadPeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { + return w.readCache.checkPeerFlow(region, peers, loads, interval) } // ExpiredReadItems returns the read items which are already expired. 
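
For reviewers, a condensed before/after view of the call-site change in this patch. It is an illustrative fragment only: every identifier in it (core.NewPeerInfo, statistics.NewCheckPeerTask, statistics.NewCheckWritePeerTask, statistics.NewCheckReadPeerTask) comes from the diffs in this patch, and the surrounding variables (c, region, peer, loads, interval) are assumed to be in scope as at the updated call sites.

	// Before this patch: one task per peer, each wrapping a core.PeerInfo.
	for _, peer := range region.GetPeers() {
		peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval)
		c.GetHotStat().CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region))
	}

	// After this patch: one write task per region; the per-peer loop moves
	// into hotPeerCache.checkPeerFlow (see hot_peer_cache.go below).
	c.GetHotStat().CheckWriteAsync(statistics.NewCheckWritePeerTask(region))

	// The read path (store heartbeat) still checks a single peer, but passes
	// the raw loads and interval instead of building a PeerInfo.
	c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval))
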
diff --git a/pkg/statistics/hot_cache_task.go b/pkg/statistics/hot_cache_task.go index fa224b522ff..01731f3fe4d 100644 --- a/pkg/statistics/hot_cache_task.go +++ b/pkg/statistics/hot_cache_task.go @@ -17,6 +17,7 @@ package statistics import ( "context" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" ) @@ -25,22 +26,46 @@ type FlowItemTask interface { runTask(cache *hotPeerCache) } -type checkPeerTask struct { - peerInfo *core.PeerInfo +type checkReadPeerTask struct { regionInfo *core.RegionInfo + peers []*metapb.Peer + loads []float64 + interval uint64 } -// NewCheckPeerTask creates task to update peerInfo -func NewCheckPeerTask(peerInfo *core.PeerInfo, regionInfo *core.RegionInfo) FlowItemTask { - return &checkPeerTask{ - peerInfo: peerInfo, +// NewCheckReadPeerTask creates task to update peerInfo +func NewCheckReadPeerTask(regionInfo *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) FlowItemTask { + return &checkReadPeerTask{ regionInfo: regionInfo, + peers: peers, + loads: loads, + interval: interval, } } -func (t *checkPeerTask) runTask(cache *hotPeerCache) { - stat := cache.checkPeerFlow(t.peerInfo, t.regionInfo) - if stat != nil { +func (t *checkReadPeerTask) runTask(cache *hotPeerCache) { + stats := cache.checkPeerFlow(t.regionInfo, t.peers, t.loads, t.interval) + for _, stat := range stats { + cache.updateStat(stat) + } +} + +type checkWritePeerTask struct { + region *core.RegionInfo +} + +// NewCheckWritePeerTask creates task to update peerInfo +func NewCheckWritePeerTask(region *core.RegionInfo) FlowItemTask { + return &checkWritePeerTask{ + region: region, + } +} + +func (t *checkWritePeerTask) runTask(cache *hotPeerCache) { + reportInterval := t.region.GetInterval() + interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() + stats := cache.checkPeerFlow(t.region, t.region.GetPeers(), t.region.GetWriteLoads(), interval) + for _, stat := range stats { cache.updateStat(stat) } } diff --git a/pkg/statistics/hot_peer_cache.go b/pkg/statistics/hot_peer_cache.go index cd27dcad4c8..3a3d3519bd9 100644 --- a/pkg/statistics/hot_peer_cache.go +++ b/pkg/statistics/hot_peer_cache.go @@ -174,58 +174,61 @@ func (f *hotPeerCache) collectExpiredItems(region *core.RegionInfo) []*HotPeerSt // checkPeerFlow checks the flow information of a peer. // Notice: checkPeerFlow couldn't be used concurrently. // checkPeerFlow will update oldItem's rollingLoads into newItem, thus we should use write lock here. 
-func (f *hotPeerCache) checkPeerFlow(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - interval := peer.GetInterval() +func (f *hotPeerCache) checkPeerFlow(region *core.RegionInfo, peers []*metapb.Peer, deltaLoads []float64, interval uint64) []*HotPeerStat { if Denoising && interval < HotRegionReportMinInterval { // for test or simulator purpose return nil } - storeID := peer.GetStoreId() - deltaLoads := peer.GetLoads() + f.collectPeerMetrics(deltaLoads, interval) // update metrics regionID := region.GetID() - oldItem := f.getOldHotPeerStat(regionID, storeID) - - // check whether the peer is allowed to be inherited - source := utils.Direct - if oldItem == nil { - for _, storeID := range f.getAllStoreIDs(region) { - oldItem = f.getOldHotPeerStat(regionID, storeID) - if oldItem != nil && oldItem.allowInherited { - source = utils.Inherit - break + + regionPeers := region.GetPeers() + stats := make([]*HotPeerStat, 0, len(peers)) + for _, peer := range peers { + storeID := peer.GetStoreId() + oldItem := f.getOldHotPeerStat(regionID, storeID) + + // check whether the peer is allowed to be inherited + source := utils.Direct + if oldItem == nil { + for _, storeID := range f.getAllStoreIDs(region) { + oldItem = f.getOldHotPeerStat(regionID, storeID) + if oldItem != nil && oldItem.allowInherited { + source = utils.Inherit + break + } } } - } - - // check new item whether is hot - if oldItem == nil { - regionStats := f.kind.RegionStats() - thresholds := f.calcHotThresholds(storeID) - isHot := slice.AnyOf(regionStats, func(i int) bool { - return deltaLoads[regionStats[i]]/float64(interval) >= thresholds[i] - }) - if !isHot { - return nil + // check new item whether is hot + if oldItem == nil { + regionStats := f.kind.RegionStats() + thresholds := f.calcHotThresholds(storeID) + isHot := slice.AnyOf(regionStats, func(i int) bool { + return deltaLoads[regionStats[i]]/float64(interval) >= thresholds[i] + }) + if !isHot { + continue + } } - } - - peers := region.GetPeers() - newItem := &HotPeerStat{ - StoreID: storeID, - RegionID: regionID, - Loads: f.kind.GetLoadRatesFromPeer(peer), - isLeader: region.GetLeader().GetStoreId() == storeID, - actionType: utils.Update, - stores: make([]uint64, len(peers)), - } - for i, peer := range peers { - newItem.stores[i] = peer.GetStoreId() - } - if oldItem == nil { - return f.updateNewHotPeerStat(newItem, deltaLoads, time.Duration(interval)*time.Second) + newItem := &HotPeerStat{ + StoreID: storeID, + RegionID: regionID, + Loads: f.kind.GetLoadRates(deltaLoads, interval), + isLeader: region.GetLeader().GetStoreId() == storeID, + actionType: utils.Update, + stores: make([]uint64, len(regionPeers)), + } + for i, peer := range regionPeers { + newItem.stores[i] = peer.GetStoreId() + } + if oldItem == nil { + stats = append(stats, f.updateNewHotPeerStat(newItem, deltaLoads, time.Duration(interval)*time.Second)) + continue + } + stats = append(stats, f.updateHotPeerStat(region, newItem, oldItem, deltaLoads, time.Duration(interval)*time.Second, source)) } - return f.updateHotPeerStat(region, newItem, oldItem, deltaLoads, time.Duration(interval)*time.Second, source) + return stats } // checkColdPeer checks the collect the un-heartbeat peer and maintain it. 
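
Because checkPeerFlow now returns a []*HotPeerStat instead of a single *HotPeerStat, callers switch from a nil check to ranging over the slice, exactly as the two runTask implementations above do. In the test diff that follows, this is why one re.Nil assertion becomes re.Empty and the later assertions index newItem[0]. A minimal consumer sketch, for orientation only, using the same names as the patch:

	stats := cache.checkPeerFlow(region, region.GetPeers(), region.GetLoads(), interval)
	for _, stat := range stats {
		cache.updateStat(stat)
	}
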
diff --git a/pkg/statistics/hot_peer_cache_test.go b/pkg/statistics/hot_peer_cache_test.go index 36f922d3830..c116e020f54 100644 --- a/pkg/statistics/hot_peer_cache_test.go +++ b/pkg/statistics/hot_peer_cache_test.go @@ -109,14 +109,7 @@ func checkFlow(cache *hotPeerCache, region *core.RegionInfo, peers []*metapb.Pee reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() res = append(res, cache.collectExpiredItems(region)...) - for _, peer := range peers { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := cache.checkPeerFlow(peerInfo, region) - if item != nil { - res = append(res, item) - } - } - return res + return append(res, cache.checkPeerFlow(region, peers, region.GetLoads(), interval)...) } func updateFlow(cache *hotPeerCache, res []*HotPeerStat) []*HotPeerStat { @@ -318,13 +311,13 @@ func TestUpdateHotPeerStat(t *testing.T) { }() // skip interval=0 - interval := 0 + interval := uint64(0) deltaLoads := []float64{0.0, 0.0, 0.0} utils.MinHotThresholds[utils.RegionReadBytes] = 0.0 utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem := cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + newItem := cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Nil(newItem) // new peer, interval is larger than report interval, but no hot @@ -333,8 +326,8 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 1.0 utils.MinHotThresholds[utils.RegionReadKeys] = 1.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 1.0 - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Nil(newItem) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Empty(newItem) // new peer, interval is less than report interval interval = 4 @@ -342,50 +335,49 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 0.0 utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.NotNil(newItem) - re.Equal(0, newItem.HotDegree) - re.Equal(0, newItem.AntiCount) + re.Equal(0, newItem[0].HotDegree) + re.Equal(0, newItem[0].AntiCount) // sum of interval is less than report interval - interval = 4 deltaLoads = []float64{60.0, 60.0, 60.0} - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(0, newItem.HotDegree) - re.Equal(0, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(0, newItem[0].HotDegree) + re.Equal(0, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot - newItem.AntiCount = utils.Read.DefaultAntiCount() - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + newItem[0].AntiCount = utils.Read.DefaultAntiCount() + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of 
interval is less than report interval - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot interval = 10 - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(2, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(2, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold utils.MinHotThresholds[utils.RegionReadBytes] = 10.0 utils.MinHotThresholds[utils.RegionReadKeys] = 10.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 10.0 - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m-1, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m-1, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold for i := 0; i < 2*m-1; i++ { - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) } - re.Less(newItem.HotDegree, 0) - re.Equal(0, newItem.AntiCount) - re.Equal(utils.Remove, newItem.actionType) + re.Less(newItem[0].HotDegree, 0) + re.Equal(0, newItem[0].AntiCount) + re.Equal(utils.Remove, newItem[0].actionType) } func TestThresholdWithUpdateHotPeerStat(t *testing.T) { @@ -688,9 +680,8 @@ func TestHotPeerCacheTopNThreshold(t *testing.T) { StartTimestamp: start, EndTimestamp: end, })) - newPeer := core.NewPeerInfo(meta.Peers[0], region.GetLoads(), end-start) - stat := cache.checkPeerFlow(newPeer, newRegion) - if stat != nil { + stats := cache.checkPeerFlow(newRegion, newRegion.GetPeers(), newRegion.GetLoads(), end-start) + for _, stat := range stats { cache.updateStat(stat) } } @@ -717,22 +708,11 @@ func TestHotPeerCacheTopNThreshold(t *testing.T) { func BenchmarkCheckRegionFlow(b *testing.B) { cache := NewHotPeerCache(context.Background(), utils.Read) region := buildRegion(utils.Read, 3, 10) - peerInfos := make([]*core.PeerInfo, 0) - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), 10) - peerInfos = append(peerInfos, peerInfo) - } b.ResetTimer() for i := 0; i < b.N; i++ { - items := make([]*HotPeerStat, 0) - for _, peerInfo := range peerInfos { - item := cache.checkPeerFlow(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - for _, ret := range items { - cache.updateStat(ret) + stats := cache.checkPeerFlow(region, region.GetPeers(), region.GetLoads(), 10) + for _, stat := range stats { + cache.updateStat(stat) } } } diff --git a/pkg/statistics/utils/kind.go b/pkg/statistics/utils/kind.go index 4d44b8d57e1..089732f759f 100644 --- a/pkg/statistics/utils/kind.go +++ b/pkg/statistics/utils/kind.go @@ -14,10 +14,6 @@ package utils -import ( 
- "github.com/tikv/pd/pkg/core" -) - const ( // BytePriority indicates hot-region-scheduler prefer byte dim BytePriority = "byte" @@ -230,10 +226,8 @@ func (rw RWType) DefaultAntiCount() int { } } -// GetLoadRatesFromPeer gets the load rates of the read or write type from PeerInfo. -func (rw RWType) GetLoadRatesFromPeer(peer *core.PeerInfo) []float64 { - deltaLoads := peer.GetLoads() - interval := peer.GetInterval() +// GetLoadRates gets the load rates of the read or write type. +func (rw RWType) GetLoadRates(deltaLoads []float64, interval uint64) []float64 { loads := make([]float64, DimLen) for dim, k := range rw.RegionStats() { loads[dim] = deltaLoads[k] / float64(interval) diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 148b43541a2..057814b718b 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -959,8 +959,7 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - peerInfo := core.NewPeerInfo(peer, loads, interval) - c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) } } for _, stat := range stats.GetSnapshotStats() { diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 945e354bb6c..0f08153c8ae 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/cluster" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/core/storelimit" @@ -3730,3 +3731,34 @@ func waitNoResponse(re *require.Assertions, stream mockhbstream.HeartbeatStream) return res == nil }) } + +func BenchmarkHandleStatsAsync(b *testing.B) { + // Setup: create a new instance of Cluster + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _, opt, _ := newTestScheduleConfig() + c := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) + c.coordinator = schedule.NewCoordinator(ctx, c, nil) + c.SetPrepared() + region := core.NewRegionInfo(&metapb.Region{ + Id: 1, + RegionEpoch: &metapb.RegionEpoch{ + ConfVer: 1, + Version: 1, + }, + StartKey: []byte{byte(2)}, + EndKey: []byte{byte(3)}, + Peers: []*metapb.Peer{{Id: 11, StoreId: uint64(1)}}, + }, nil, + core.SetApproximateSize(10), + core.SetReportInterval(0, 10), + ) + + // Reset timer after setup + b.ResetTimer() + // Run HandleStatsAsync b.N times + for i := 0; i < b.N; i++ { + cluster.HandleStatsAsync(c, region) + } +} diff --git a/tools/pd-ctl/tests/hot/hot_test.go b/tools/pd-ctl/tests/hot/hot_test.go index 7661704aa41..f65b811b36a 100644 --- a/tools/pd-ctl/tests/hot/hot_test.go +++ b/tools/pd-ctl/tests/hot/hot_test.go @@ -188,11 +188,10 @@ func (suite *hotTestSuite) checkHot(cluster *pdTests.TestCluster) { Id: 100 + regionIDCounter, StoreId: hotStoreID, } - peerInfo := core.NewPeerInfo(leader, loads, reportInterval) region := core.NewRegionInfo(&metapb.Region{ Id: hotRegionID, }, leader) - hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{leader}, loads, reportInterval)) testutil.Eventually(re, func() bool { hotPeerStat := getHotPeerStat(utils.Read, hotRegionID, hotStoreID) return hotPeerStat != 
nil From 5eb66e09360987ffe35b91cf1849d5628b2f52fb Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 28 May 2024 14:49:20 +0800 Subject: [PATCH 15/21] *: refactor store info (#6830) ref tikv/pd#7897 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/basic_cluster.go | 195 +---------- pkg/core/store.go | 194 +++++++---- pkg/keyspace/keyspace.go | 10 +- pkg/mcs/scheduling/server/apis/v1/api.go | 4 +- pkg/mcs/scheduling/server/cluster.go | 20 +- pkg/mock/mockcluster/mockcluster.go | 12 +- pkg/schedule/checker/rule_checker_test.go | 2 +- pkg/schedule/placement/fit_test.go | 2 +- pkg/schedule/scatter/region_scatterer_test.go | 2 +- pkg/schedule/schedulers/balance_test.go | 2 +- pkg/storage/leveldb_backend.go | 0 pkg/storage/storage_test.go | 4 +- .../unsafe_recovery_controller.go | 4 +- server/api/admin.go | 6 +- server/api/stats.go | 2 +- server/cluster/cluster.go | 305 +++--------------- server/cluster/cluster_test.go | 136 ++++---- server/cluster/scheduling_controller.go | 2 +- server/grpc_service.go | 2 +- server/server.go | 2 - tests/integrations/mcs/scheduling/api_test.go | 6 +- .../mcs/scheduling/config_test.go | 2 +- .../integrations/mcs/scheduling/meta_test.go | 4 +- .../mcs/scheduling/server_test.go | 2 +- tests/server/api/region_test.go | 2 +- tests/server/cluster/cluster_test.go | 2 +- 26 files changed, 293 insertions(+), 631 deletions(-) mode change 100644 => 100755 pkg/storage/leveldb_backend.go diff --git a/pkg/core/basic_cluster.go b/pkg/core/basic_cluster.go index d70b620db3b..2392b7ddac6 100644 --- a/pkg/core/basic_cluster.go +++ b/pkg/core/basic_cluster.go @@ -14,218 +14,43 @@ package core -import ( - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core/storelimit" - "github.com/tikv/pd/pkg/utils/syncutil" -) - // BasicCluster provides basic data member and interface for a tikv cluster. type BasicCluster struct { - Stores struct { - mu syncutil.RWMutex - *StoresInfo - } - + *StoresInfo *RegionsInfo } // NewBasicCluster creates a BasicCluster. func NewBasicCluster() *BasicCluster { return &BasicCluster{ - Stores: struct { - mu syncutil.RWMutex - *StoresInfo - }{StoresInfo: NewStoresInfo()}, - + StoresInfo: NewStoresInfo(), RegionsInfo: NewRegionsInfo(), } } -/* Stores read operations */ - -// GetStores returns all Stores in the cluster. -func (bc *BasicCluster) GetStores() []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStores() -} - -// GetMetaStores gets a complete set of metapb.Store. -func (bc *BasicCluster) GetMetaStores() []*metapb.Store { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetMetaStores() -} - -// GetStore searches for a store by ID. -func (bc *BasicCluster) GetStore(storeID uint64) *StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(storeID) -} - -// GetRegionStores returns all Stores that contains the region's peer. -func (bc *BasicCluster) GetRegionStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetStoreIDs() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetNonWitnessVoterStores returns all Stores that contains the non-witness's voter peer. 
-func (bc *BasicCluster) GetNonWitnessVoterStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetNonWitnessVoters() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetFollowerStores returns all Stores that contains the region's follower peer. -func (bc *BasicCluster) GetFollowerStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetFollowers() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetLeaderStore returns all Stores that contains the region's leader peer. -func (bc *BasicCluster) GetLeaderStore(region *RegionInfo) *StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(region.GetLeader().GetStoreId()) -} - -// GetStoreCount returns the total count of storeInfo. -func (bc *BasicCluster) GetStoreCount() int { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStoreCount() -} - -/* Stores Write operations */ - -// PauseLeaderTransfer prevents the store from been selected as source or -// target store of TransferLeader. -func (bc *BasicCluster) PauseLeaderTransfer(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.PauseLeaderTransfer(storeID) -} - -// ResumeLeaderTransfer cleans a store's pause state. The store can be selected -// as source or target of TransferLeader again. -func (bc *BasicCluster) ResumeLeaderTransfer(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.ResumeLeaderTransfer(storeID) -} - -// SlowStoreEvicted marks a store as a slow store and prevents transferring -// leader to the store -func (bc *BasicCluster) SlowStoreEvicted(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.SlowStoreEvicted(storeID) -} - -// SlowTrendEvicted marks a store as a slow store by trend and prevents transferring -// leader to the store -func (bc *BasicCluster) SlowTrendEvicted(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.SlowTrendEvicted(storeID) -} - -// SlowTrendRecovered cleans the evicted by slow trend state of a store. -func (bc *BasicCluster) SlowTrendRecovered(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SlowTrendRecovered(storeID) -} - -// SlowStoreRecovered cleans the evicted state of a store. -func (bc *BasicCluster) SlowStoreRecovered(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SlowStoreRecovered(storeID) -} - -// ResetStoreLimit resets the limit for a specific store. -func (bc *BasicCluster) ResetStoreLimit(storeID uint64, limitType storelimit.Type, ratePerSec ...float64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.ResetStoreLimit(storeID, limitType, ratePerSec...) -} - // UpdateStoreStatus updates the information of the store. 
func (bc *BasicCluster) UpdateStoreStatus(storeID uint64) { - leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize := bc.RegionsInfo.GetStoreStats(storeID) - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.UpdateStoreStatus(storeID, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize) -} - -// PutStore put a store. -func (bc *BasicCluster) PutStore(store *StoreInfo) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SetStore(store) -} - -// ResetStores resets the store cache. -func (bc *BasicCluster) ResetStores() { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.StoresInfo = NewStoresInfo() -} - -// DeleteStore deletes a store. -func (bc *BasicCluster) DeleteStore(store *StoreInfo) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.DeleteStore(store) + leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize := bc.GetStoreStats(storeID) + bc.StoresInfo.UpdateStoreStatus(storeID, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize) } /* Regions read operations */ // GetLeaderStoreByRegionID returns the leader store of the given region. func (bc *BasicCluster) GetLeaderStoreByRegionID(regionID uint64) *StoreInfo { - region := bc.RegionsInfo.GetRegion(regionID) + region := bc.GetRegion(regionID) if region == nil || region.GetLeader() == nil { return nil } - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(region.GetLeader().GetStoreId()) + return bc.GetStore(region.GetLeader().GetStoreId()) } func (bc *BasicCluster) getWriteRate( f func(storeID uint64) (bytesRate, keysRate float64), ) (storeIDs []uint64, bytesRates, keysRates []float64) { - bc.Stores.mu.RLock() - count := len(bc.Stores.stores) - storeIDs = make([]uint64, 0, count) - for _, store := range bc.Stores.stores { - storeIDs = append(storeIDs, store.GetID()) - } - bc.Stores.mu.RUnlock() + storeIDs = bc.GetStoreIDs() + count := len(storeIDs) bytesRates = make([]float64, 0, count) keysRates = make([]float64, 0, count) for _, id := range storeIDs { @@ -238,12 +63,12 @@ func (bc *BasicCluster) getWriteRate( // GetStoresLeaderWriteRate get total write rate of each store's leaders. func (bc *BasicCluster) GetStoresLeaderWriteRate() (storeIDs []uint64, bytesRates, keysRates []float64) { - return bc.getWriteRate(bc.RegionsInfo.GetStoreLeaderWriteRate) + return bc.getWriteRate(bc.GetStoreLeaderWriteRate) } // GetStoresWriteRate get total write rate of each store's regions. func (bc *BasicCluster) GetStoresWriteRate() (storeIDs []uint64, bytesRates, keysRates []float64) { - return bc.getWriteRate(bc.RegionsInfo.GetStoreWriteRate) + return bc.getWriteRate(bc.GetStoreWriteRate) } // UpdateAllStoreStatus updates the information of all stores. diff --git a/pkg/core/store.go b/pkg/core/store.go index 9b660754496..5baedafdb05 100644 --- a/pkg/core/store.go +++ b/pkg/core/store.go @@ -26,6 +26,7 @@ import ( "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/pkg/utils/typeutil" "go.uber.org/zap" ) @@ -639,6 +640,7 @@ func MergeLabels(origin []*metapb.StoreLabel, labels []*metapb.StoreLabel) []*me // StoresInfo contains information about all stores. 
type StoresInfo struct { + syncutil.RWMutex stores map[uint64]*StoreInfo } @@ -649,8 +651,12 @@ func NewStoresInfo() *StoresInfo { } } +/* Stores read operations */ + // GetStore returns a copy of the StoreInfo with the specified storeID. func (s *StoresInfo) GetStore(storeID uint64) *StoreInfo { + s.RLock() + defer s.RUnlock() store, ok := s.stores[storeID] if !ok { return nil @@ -658,13 +664,121 @@ func (s *StoresInfo) GetStore(storeID uint64) *StoreInfo { return store } -// SetStore sets a StoreInfo with storeID. -func (s *StoresInfo) SetStore(store *StoreInfo) { +// GetStores gets a complete set of StoreInfo. +func (s *StoresInfo) GetStores() []*StoreInfo { + s.RLock() + defer s.RUnlock() + stores := make([]*StoreInfo, 0, len(s.stores)) + for _, store := range s.stores { + stores = append(stores, store) + } + return stores +} + +// GetMetaStores gets a complete set of metapb.Store. +func (s *StoresInfo) GetMetaStores() []*metapb.Store { + s.RLock() + defer s.RUnlock() + stores := make([]*metapb.Store, 0, len(s.stores)) + for _, store := range s.stores { + stores = append(stores, store.GetMeta()) + } + return stores +} + +// GetStoreIDs returns a list of store ids. +func (s *StoresInfo) GetStoreIDs() []uint64 { + s.RLock() + defer s.RUnlock() + count := len(s.stores) + storeIDs := make([]uint64, 0, count) + for _, store := range s.stores { + storeIDs = append(storeIDs, store.GetID()) + } + return storeIDs +} + +// GetFollowerStores returns all Stores that contains the region's follower peer. +func (s *StoresInfo) GetFollowerStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetFollowers() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +// GetRegionStores returns all Stores that contains the region's peer. +func (s *StoresInfo) GetRegionStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetStoreIDs() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +// GetLeaderStore returns all Stores that contains the region's leader peer. +func (s *StoresInfo) GetLeaderStore(region *RegionInfo) *StoreInfo { + s.RLock() + defer s.RUnlock() + if store, ok := s.stores[region.GetLeader().GetStoreId()]; ok && store != nil { + return store + } + return nil +} + +// GetStoreCount returns the total count of storeInfo. +func (s *StoresInfo) GetStoreCount() int { + s.RLock() + defer s.RUnlock() + return len(s.stores) +} + +// GetNonWitnessVoterStores returns all Stores that contains the non-witness's voter peer. +func (s *StoresInfo) GetNonWitnessVoterStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetNonWitnessVoters() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +/* Stores write operations */ + +// PutStore sets a StoreInfo with storeID. +func (s *StoresInfo) PutStore(store *StoreInfo) { + s.Lock() + defer s.Unlock() + s.putStoreLocked(store) +} + +// putStoreLocked sets a StoreInfo with storeID. +func (s *StoresInfo) putStoreLocked(store *StoreInfo) { s.stores[store.GetID()] = store } +// ResetStores resets the store cache. 
+func (s *StoresInfo) ResetStores() { + s.Lock() + defer s.Unlock() + s.stores = make(map[uint64]*StoreInfo) +} + // PauseLeaderTransfer pauses a StoreInfo with storeID. func (s *StoresInfo) PauseLeaderTransfer(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -679,6 +793,8 @@ func (s *StoresInfo) PauseLeaderTransfer(storeID uint64) error { // ResumeLeaderTransfer cleans a store's pause state. The store can be selected // as source or target of TransferLeader again. func (s *StoresInfo) ResumeLeaderTransfer(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's pause state, but it is not found. It may be cleanup", @@ -691,6 +807,8 @@ func (s *StoresInfo) ResumeLeaderTransfer(storeID uint64) { // SlowStoreEvicted marks a store as a slow store and prevents transferring // leader to the store func (s *StoresInfo) SlowStoreEvicted(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -704,6 +822,8 @@ func (s *StoresInfo) SlowStoreEvicted(storeID uint64) error { // SlowStoreRecovered cleans the evicted state of a store. func (s *StoresInfo) SlowStoreRecovered(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's evicted as a slow store state, but it is not found. It may be cleanup", @@ -716,6 +836,8 @@ func (s *StoresInfo) SlowStoreRecovered(storeID uint64) { // SlowTrendEvicted marks a store as a slow trend and prevents transferring // leader to the store func (s *StoresInfo) SlowTrendEvicted(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -729,6 +851,8 @@ func (s *StoresInfo) SlowTrendEvicted(storeID uint64) error { // SlowTrendRecovered cleans the evicted by trend state of a store. func (s *StoresInfo) SlowTrendRecovered(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's evicted by trend as a slow store state, but it is not found. It may be cleanup", @@ -740,76 +864,24 @@ func (s *StoresInfo) SlowTrendRecovered(storeID uint64) { // ResetStoreLimit resets the limit for a specific store. func (s *StoresInfo) ResetStoreLimit(storeID uint64, limitType storelimit.Type, ratePerSec ...float64) { + s.Lock() + defer s.Unlock() if store, ok := s.stores[storeID]; ok { s.stores[storeID] = store.Clone(ResetStoreLimit(limitType, ratePerSec...)) } } -// GetStores gets a complete set of StoreInfo. -func (s *StoresInfo) GetStores() []*StoreInfo { - stores := make([]*StoreInfo, 0, len(s.stores)) - for _, store := range s.stores { - stores = append(stores, store) - } - return stores -} - -// GetMetaStores gets a complete set of metapb.Store. -func (s *StoresInfo) GetMetaStores() []*metapb.Store { - stores := make([]*metapb.Store, 0, len(s.stores)) - for _, store := range s.stores { - stores = append(stores, store.GetMeta()) - } - return stores -} - // DeleteStore deletes tombstone record form store func (s *StoresInfo) DeleteStore(store *StoreInfo) { + s.Lock() + defer s.Unlock() delete(s.stores, store.GetID()) } -// GetStoreCount returns the total count of storeInfo. -func (s *StoresInfo) GetStoreCount() int { - return len(s.stores) -} - -// SetLeaderCount sets the leader count to a storeInfo. 
-func (s *StoresInfo) SetLeaderCount(storeID uint64, leaderCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetLeaderCount(leaderCount)) - } -} - -// SetRegionCount sets the region count to a storeInfo. -func (s *StoresInfo) SetRegionCount(storeID uint64, regionCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetRegionCount(regionCount)) - } -} - -// SetPendingPeerCount sets the pending count to a storeInfo. -func (s *StoresInfo) SetPendingPeerCount(storeID uint64, pendingPeerCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetPendingPeerCount(pendingPeerCount)) - } -} - -// SetLeaderSize sets the leader size to a storeInfo. -func (s *StoresInfo) SetLeaderSize(storeID uint64, leaderSize int64) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetLeaderSize(leaderSize)) - } -} - -// SetRegionSize sets the region size to a storeInfo. -func (s *StoresInfo) SetRegionSize(storeID uint64, regionSize int64) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetRegionSize(regionSize)) - } -} - // UpdateStoreStatus updates the information of the store. func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount int, leaderSize int64, regionSize int64) { + s.Lock() + defer s.Unlock() if store, ok := s.stores[storeID]; ok { newStore := store.ShallowClone(SetLeaderCount(leaderCount), SetRegionCount(regionCount), @@ -818,7 +890,7 @@ func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount, regionCount, SetPendingPeerCount(pendingPeerCount), SetLeaderSize(leaderSize), SetRegionSize(regionSize)) - s.SetStore(newStore) + s.putStoreLocked(newStore) } } diff --git a/pkg/keyspace/keyspace.go b/pkg/keyspace/keyspace.go index d84b3698f69..b37ec7f0fca 100644 --- a/pkg/keyspace/keyspace.go +++ b/pkg/keyspace/keyspace.go @@ -343,20 +343,20 @@ func (manager *Manager) splitKeyspaceRegion(id uint32, waitRegionSplit bool) (er for { select { case <-ticker.C: - regionsInfo := manager.cluster.GetBasicCluster().RegionsInfo - region := regionsInfo.GetRegionByKey(rawLeftBound) + c := manager.cluster.GetBasicCluster() + region := c.GetRegionByKey(rawLeftBound) if region == nil || !bytes.Equal(region.GetStartKey(), rawLeftBound) { continue } - region = regionsInfo.GetRegionByKey(rawRightBound) + region = c.GetRegionByKey(rawRightBound) if region == nil || !bytes.Equal(region.GetStartKey(), rawRightBound) { continue } - region = regionsInfo.GetRegionByKey(txnLeftBound) + region = c.GetRegionByKey(txnLeftBound) if region == nil || !bytes.Equal(region.GetStartKey(), txnLeftBound) { continue } - region = regionsInfo.GetRegionByKey(txnRightBound) + region = c.GetRegionByKey(txnRightBound) if region == nil || !bytes.Equal(region.GetStartKey(), txnRightBound) { continue } diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index be3277f3fc6..39aa11927ca 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -272,7 +272,7 @@ func deleteAllRegionCache(c *gin.Context) { c.String(http.StatusInternalServerError, errs.ErrNotBootstrapped.GenWithStackByArgs().Error()) return } - cluster.DropCacheAllRegion() + cluster.ResetRegionCache() c.String(http.StatusOK, "All regions are removed from server cache.") } @@ -297,7 +297,7 @@ func deleteRegionCacheByID(c *gin.Context) { c.String(http.StatusBadRequest, 
err.Error()) return } - cluster.DropCacheRegion(regionID) + cluster.RemoveRegionIfExist(regionID) c.String(http.StatusOK, "The region is removed from server cache.") } diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index d711ab2d4f6..caaafe42c87 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -69,9 +69,9 @@ const ( collectWaitTime = time.Minute // heartbeat relative const - heartbeatTaskRunner = "heartbeat-task-runner" - statisticsTaskRunner = "statistics-task-runner" - logTaskRunner = "log-task-runner" + heartbeatTaskRunner = "heartbeat-task-runner" + miscTaskRunner = "misc-task-runner" + logTaskRunner = "log-task-runner" ) var syncRunner = ratelimit.NewSyncRunner() @@ -100,7 +100,7 @@ func NewCluster(parentCtx context.Context, persistConfig *config.PersistConfig, checkMembershipCh: checkMembershipCh, heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(miscTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } c.coordinator = schedule.NewCoordinator(ctx, c, hbStreams) @@ -521,7 +521,7 @@ func (c *Cluster) collectMetrics() { // collect hot cache metrics c.hotStat.CollectMetrics() // collect the lock metrics - c.RegionsInfo.CollectWaitLockMetrics() + c.CollectWaitLockMetrics() } func resetMetrics() { @@ -688,16 +688,6 @@ func (c *Cluster) SetPrepared() { c.coordinator.GetPrepareChecker().SetPrepared() } -// DropCacheAllRegion removes all cached regions. -func (c *Cluster) DropCacheAllRegion() { - c.ResetRegionCache() -} - -// DropCacheRegion removes a region from the cache. -func (c *Cluster) DropCacheRegion(id uint64) { - c.RemoveRegionIfExist(id) -} - // IsSchedulingHalted returns whether the scheduling is halted. // Currently, the microservice scheduling is halted when: // - The `HaltScheduling` persist option is set to true. diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index 3f9710c48fd..5d3aba2d2e8 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -138,11 +138,6 @@ func (mc *Cluster) GetStoresLoads() map[uint64][]float64 { return mc.HotStat.GetStoresLoads() } -// GetStore gets a store with a given store ID. -func (mc *Cluster) GetStore(storeID uint64) *core.StoreInfo { - return mc.Stores.GetStore(storeID) -} - // IsRegionHot checks if the region is hot. func (mc *Cluster) IsRegionHot(region *core.RegionInfo) bool { return mc.HotCache.IsRegionHot(region, mc.GetHotRegionCacheHitsThreshold()) @@ -561,11 +556,6 @@ func (mc *Cluster) AddLeaderRegionWithWriteInfo( return items } -// DropCacheAllRegion removes all regions from the cache. -func (mc *Cluster) DropCacheAllRegion() { - mc.ResetRegionCache() -} - // UpdateStoreLeaderWeight updates store leader weight. 
func (mc *Cluster) UpdateStoreLeaderWeight(storeID uint64, weight float64) { store := mc.GetStore(storeID) @@ -752,7 +742,7 @@ func (mc *Cluster) UpdateStoreStatus(id uint64) { pendingPeerCount := mc.GetStorePendingPeerCount(id) leaderSize := mc.GetStoreLeaderRegionSize(id) regionSize := mc.GetStoreRegionSize(id) - store := mc.Stores.GetStore(id) + store := mc.GetStore(id) stats := &pdpb.StoreStats{} stats.Capacity = defaultStoreCapacity stats.Available = stats.Capacity - uint64(store.GetRegionSize()*units.MiB) diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index e69b956134b..e1cc702fd36 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -1980,7 +1980,7 @@ func makeStores() placement.StoreSet { if zone == 1 && host == 1 { labels["type"] = "read" } - stores.SetStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now), core.SetStoreState(metapb.StoreState_Up))) + stores.PutStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now), core.SetStoreState(metapb.StoreState_Up))) } } } diff --git a/pkg/schedule/placement/fit_test.go b/pkg/schedule/placement/fit_test.go index aa5c66059f7..cc49d25640c 100644 --- a/pkg/schedule/placement/fit_test.go +++ b/pkg/schedule/placement/fit_test.go @@ -47,7 +47,7 @@ func makeStores() StoreSet { if id == 1111 || id == 2111 || id == 3111 { labels["disk"] = "ssd" } - stores.SetStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now))) + stores.PutStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now))) } } } diff --git a/pkg/schedule/scatter/region_scatterer_test.go b/pkg/schedule/scatter/region_scatterer_test.go index b0027e0e415..89e55e5c9c7 100644 --- a/pkg/schedule/scatter/region_scatterer_test.go +++ b/pkg/schedule/scatter/region_scatterer_test.go @@ -216,7 +216,7 @@ func scatterSpecial(re *require.Assertions, numOrdinaryStores, numSpecialStores, leaderStoreID := region.GetLeader().GetStoreId() for _, peer := range region.GetPeers() { storeID := peer.GetStoreId() - store := tc.Stores.GetStore(storeID) + store := tc.GetStore(storeID) if store.GetLabelValue("engine") == "tiflash" { countSpecialPeers[storeID]++ } else { diff --git a/pkg/schedule/schedulers/balance_test.go b/pkg/schedule/schedulers/balance_test.go index 234acfd6d26..26214ed5456 100644 --- a/pkg/schedule/schedulers/balance_test.go +++ b/pkg/schedule/schedulers/balance_test.go @@ -697,7 +697,7 @@ func (suite *balanceLeaderRangeSchedulerTestSuite) TestReSortStores() { suite.tc.AddLeaderStore(4, 100) suite.tc.AddLeaderStore(5, 100) suite.tc.AddLeaderStore(6, 0) - stores := suite.tc.Stores.GetStores() + stores := suite.tc.GetStores() sort.Slice(stores, func(i, j int) bool { return stores[i].GetID() < stores[j].GetID() }) diff --git a/pkg/storage/leveldb_backend.go b/pkg/storage/leveldb_backend.go old mode 100644 new mode 100755 diff --git a/pkg/storage/storage_test.go b/pkg/storage/storage_test.go index 4525ec6091c..460489ecd10 100644 --- a/pkg/storage/storage_test.go +++ b/pkg/storage/storage_test.go @@ -100,7 +100,7 @@ func TestLoadStores(t *testing.T) { n := 10 stores := mustSaveStores(re, storage, n) - re.NoError(storage.LoadStores(cache.SetStore)) + re.NoError(storage.LoadStores(cache.PutStore)) re.Equal(n, cache.GetStoreCount()) for _, store := range cache.GetMetaStores() { @@ -117,7 +117,7 @@ func TestStoreWeight(t *testing.T) { mustSaveStores(re, storage, n) re.NoError(storage.SaveStoreWeight(1, 
2.0, 3.0)) re.NoError(storage.SaveStoreWeight(2, 0.2, 0.3)) - re.NoError(storage.LoadStores(cache.SetStore)) + re.NoError(storage.LoadStores(cache.PutStore)) leaderWeights := []float64{1.0, 2.0, 0.2} regionWeights := []float64{1.0, 3.0, 0.3} for i := 0; i < n; i++ { diff --git a/pkg/unsaferecovery/unsafe_recovery_controller.go b/pkg/unsaferecovery/unsafe_recovery_controller.go index d2f6125c3f3..89cd6e6393c 100644 --- a/pkg/unsaferecovery/unsafe_recovery_controller.go +++ b/pkg/unsaferecovery/unsafe_recovery_controller.go @@ -107,7 +107,7 @@ const ( type cluster interface { core.StoreSetInformer - DropCacheAllRegion() + ResetRegionCache() AllocID() (uint64, error) BuryStore(storeID uint64, forceBury bool) error GetSchedulerConfig() sc.SchedulerConfigProvider @@ -544,7 +544,7 @@ func (u *Controller) changeStage(stage stage) { case Finished: if u.step > 1 { // == 1 means no operation has done, no need to invalid cache - u.cluster.DropCacheAllRegion() + u.cluster.ResetRegionCache() } output.Info = "Unsafe recovery Finished" output.Details = u.getAffectedTableDigest() diff --git a/server/api/admin.go b/server/api/admin.go index ab5ba882287..dd81985b514 100644 --- a/server/api/admin.go +++ b/server/api/admin.go @@ -60,7 +60,7 @@ func (h *adminHandler) DeleteRegionCache(w http.ResponseWriter, r *http.Request) h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } - rc.DropCacheRegion(regionID) + rc.RemoveRegionIfExist(regionID) if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer(regionID) } @@ -100,7 +100,7 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques return } // Remove region from cache. - rc.DropCacheRegion(regionID) + rc.RemoveRegionIfExist(regionID) if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer(regionID) } @@ -116,7 +116,7 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques func (h *adminHandler) DeleteAllRegionCache(w http.ResponseWriter, r *http.Request) { var err error rc := getCluster(r) - rc.DropCacheAllRegion() + rc.ResetRegionCache() if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer() } diff --git a/server/api/stats.go b/server/api/stats.go index 915d33ddfdf..5aa8fcb72a6 100644 --- a/server/api/stats.go +++ b/server/api/stats.go @@ -47,7 +47,7 @@ func (h *statsHandler) GetRegionStatus(w http.ResponseWriter, r *http.Request) { startKey, endKey := r.URL.Query().Get("start_key"), r.URL.Query().Get("end_key") var stats *statistics.RegionStats if r.URL.Query().Has("count") { - stats = rc.GetRegionCount([]byte(startKey), []byte(endKey)) + stats = rc.GetRegionStatsCount([]byte(startKey), []byte(endKey)) } else { stats = rc.GetRegionStatsByRange([]byte(startKey), []byte(endKey)) } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 057814b718b..70d6b46b980 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -107,9 +107,9 @@ const ( minSnapshotDurationSec = 5 // heartbeat relative const - heartbeatTaskRunner = "heartbeat-async" - statisticsTaskRunner = "statistics-async" - logTaskRunner = "log-async" + heartbeatTaskRunner = "heartbeat-async" + miscTaskRunner = "misc-async" + logTaskRunner = "log-async" ) // Server is the interface for cluster. 
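
The hunks below replace RaftCluster's private core field with an embedded *core.BasicCluster, which in turn embeds the now self-locking StoresInfo from earlier in this patch, so the long list of c.core.* delegation methods that follows can be deleted. A toy, self-contained sketch of that refactor direction; all names here are hypothetical and are not PD APIs:

	package main

	import (
		"fmt"
		"sync"
	)

	// registry plays the role of StoresInfo: it owns its own lock and
	// synchronizes inside each method.
	type registry struct {
		sync.RWMutex
		items map[uint64]string
	}

	func (r *registry) Put(id uint64, v string) {
		r.Lock()
		defer r.Unlock()
		r.items[id] = v
	}

	func (r *registry) Get(id uint64) string {
		r.RLock()
		defer r.RUnlock()
		return r.items[id]
	}

	// cluster plays the role of BasicCluster/RaftCluster: it embeds the
	// registry directly and no longer wraps every call in its own mutex
	// or re-exports each method by hand.
	type cluster struct {
		*registry
	}

	func main() {
		c := &cluster{registry: &registry{items: make(map[uint64]string)}}
		c.Put(1, "store-1")
		fmt.Println(c.Get(1)) // store-1
	}
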
@@ -143,6 +143,8 @@ type RaftCluster struct { ctx context.Context cancel context.CancelFunc + *core.BasicCluster // cached cluster info + etcdClient *clientv3.Client httpClient *http.Client @@ -159,7 +161,6 @@ type RaftCluster struct { // This below fields are all read-only, we cannot update itself after the raft cluster starts. clusterID uint64 id id.Allocator - core *core.BasicCluster // cached cluster info opt *config.PersistOptions limiter *StoreLimiter *schedulingController @@ -201,10 +202,10 @@ func NewRaftCluster(ctx context.Context, clusterID uint64, basicCluster *core.Ba regionSyncer: regionSyncer, httpClient: httpClient, etcdClient: etcdClient, - core: basicCluster, + BasicCluster: basicCluster, storage: storage, heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(miscTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } } @@ -251,10 +252,10 @@ func (c *RaftCluster) LoadClusterStatus() (*Status, error) { } func (c *RaftCluster) isInitialized() bool { - if c.core.GetTotalRegionCount() > 1 { + if c.GetTotalRegionCount() > 1 { return true } - region := c.core.GetRegionByKey(nil) + region := c.GetRegionByKey(nil) return region != nil && len(region.GetVoters()) >= int(c.opt.GetReplicationConfig().MaxReplicas) && len(region.GetPendingPeers()) == 0 @@ -295,7 +296,7 @@ func (c *RaftCluster) InitCluster( return err } } - c.schedulingController = newSchedulingController(c.ctx, c.core, c.opt, c.ruleManager) + c.schedulingController = newSchedulingController(c.ctx, c.BasicCluster, c.opt, c.ruleManager) return nil } @@ -644,9 +645,9 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { return nil, nil } - c.core.ResetStores() + c.ResetStores() start := time.Now() - if err := c.storage.LoadStores(c.core.PutStore); err != nil { + if err := c.storage.LoadStores(c.PutStore); err != nil { return nil, err } log.Info("load stores", @@ -657,11 +658,11 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { start = time.Now() // used to load region from kv storage to cache storage. - if err = storage.TryLoadRegionsOnce(c.ctx, c.storage, c.core.CheckAndPutRegion); err != nil { + if err = storage.TryLoadRegionsOnce(c.ctx, c.storage, c.CheckAndPutRegion); err != nil { return nil, err } log.Info("load regions", - zap.Int("count", c.core.GetTotalRegionCount()), + zap.Int("count", c.GetTotalRegionCount()), zap.Duration("cost", time.Since(start)), ) @@ -729,7 +730,7 @@ func (c *RaftCluster) runUpdateStoreStats() { case <-ticker.C: // Update related stores. 
start := time.Now() - c.core.UpdateAllStoreStatus() + c.UpdateAllStoreStatus() updateStoreStatsGauge.Set(time.Since(start).Seconds()) } } @@ -868,8 +869,6 @@ func (c *RaftCluster) GetUnsafeRecoveryController() *unsaferecovery.Controller { func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest, resp *pdpb.StoreHeartbeatResponse) error { stats := heartbeat.GetStats() storeID := stats.GetStoreId() - c.Lock() - defer c.Unlock() store := c.GetStore(storeID) if store == nil { return errors.Errorf("store %v not found", storeID) @@ -917,10 +916,10 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest newStore = newStore.Clone(core.SetLastPersistTime(nowTime)) } } - if store := c.core.GetStore(storeID); store != nil { + if store := c.GetStore(storeID); store != nil { statistics.UpdateStoreHeartbeatMetrics(store) } - c.core.PutStore(newStore) + c.PutStore(newStore) var ( regions map[uint64]*core.RegionInfo interval uint64 @@ -989,7 +988,7 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest // processReportBuckets update the bucket information. func (c *RaftCluster) processReportBuckets(buckets *metapb.Buckets) error { - region := c.core.GetRegion(buckets.GetRegionId()) + region := c.GetRegion(buckets.GetRegionId()) if region == nil { regionCacheMissCounter.Inc() return errors.Errorf("region %v not found", buckets.GetRegionId()) @@ -1022,7 +1021,7 @@ var syncRunner = ratelimit.NewSyncRunner() // processRegionHeartbeat updates the region information. func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *core.RegionInfo) error { tracer := ctx.Tracer - origin, _, err := c.core.PreCheckPutRegion(region) + origin, _, err := c.PreCheckPutRegion(region) tracer.OnPreCheckFinished() if err != nil { return err @@ -1082,7 +1081,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // check its validation again here. // // However, it can't solve the race condition of concurrent heartbeats from the same region. - if overlaps, err = c.core.CheckAndPutRootTree(ctx, region); err != nil { + if overlaps, err = c.CheckAndPutRootTree(ctx, region); err != nil { tracer.OnSaveCacheFinished() return err } @@ -1173,158 +1172,7 @@ func (c *RaftCluster) putMetaLocked(meta *metapb.Cluster) error { // GetBasicCluster returns the basic cluster. func (c *RaftCluster) GetBasicCluster() *core.BasicCluster { - return c.core -} - -// GetRegionByKey gets regionInfo by region key from cluster. -func (c *RaftCluster) GetRegionByKey(regionKey []byte) *core.RegionInfo { - return c.core.GetRegionByKey(regionKey) -} - -// GetPrevRegionByKey gets previous region and leader peer by the region key from cluster. -func (c *RaftCluster) GetPrevRegionByKey(regionKey []byte) *core.RegionInfo { - return c.core.GetPrevRegionByKey(regionKey) -} - -// ScanRegions scans region with start key, until the region contains endKey, or -// total number greater than limit. -func (c *RaftCluster) ScanRegions(startKey, endKey []byte, limit int) []*core.RegionInfo { - return c.core.ScanRegions(startKey, endKey, limit) -} - -// GetRegion searches for a region by ID. -func (c *RaftCluster) GetRegion(regionID uint64) *core.RegionInfo { - return c.core.GetRegion(regionID) -} - -// GetMetaRegions gets regions from cluster. -func (c *RaftCluster) GetMetaRegions() []*metapb.Region { - return c.core.GetMetaRegions() -} - -// GetRegions returns all regions' information in detail. 
-func (c *RaftCluster) GetRegions() []*core.RegionInfo { - return c.core.GetRegions() -} - -// ValidRegion is used to decide if the region is valid. -func (c *RaftCluster) ValidRegion(region *metapb.Region) error { - return c.core.ValidRegion(region) -} - -// GetTotalRegionCount returns total count of regions -func (c *RaftCluster) GetTotalRegionCount() int { - return c.core.GetTotalRegionCount() -} - -// GetStoreRegions returns all regions' information with a given storeID. -func (c *RaftCluster) GetStoreRegions(storeID uint64) []*core.RegionInfo { - return c.core.GetStoreRegions(storeID) -} - -// GetStoreRegions returns all regions' information with a given storeID. -func (c *RaftCluster) GetStoreRegionsByType(storeID uint64) []*core.RegionInfo { - return c.core.GetStoreRegions(storeID) -} - -// RandLeaderRegions returns some random regions that has leader on the store. -func (c *RaftCluster) RandLeaderRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandLeaderRegions(storeID, ranges) -} - -// RandFollowerRegions returns some random regions that has a follower on the store. -func (c *RaftCluster) RandFollowerRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandFollowerRegions(storeID, ranges) -} - -// RandPendingRegions returns some random regions that has a pending peer on the store. -func (c *RaftCluster) RandPendingRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandPendingRegions(storeID, ranges) -} - -// RandLearnerRegions returns some random regions that has a learner peer on the store. -func (c *RaftCluster) RandLearnerRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandLearnerRegions(storeID, ranges) -} - -// RandWitnessRegions returns some random regions that has a witness peer on the store. -func (c *RaftCluster) RandWitnessRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandWitnessRegions(storeID, ranges) -} - -// GetLeaderStore returns all stores that contains the region's leader peer. -func (c *RaftCluster) GetLeaderStore(region *core.RegionInfo) *core.StoreInfo { - return c.core.GetLeaderStore(region) -} - -// GetNonWitnessVoterStores returns all stores that contains the region's non-witness voter peer. -func (c *RaftCluster) GetNonWitnessVoterStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetNonWitnessVoterStores(region) -} - -// GetFollowerStores returns all stores that contains the region's follower peer. -func (c *RaftCluster) GetFollowerStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetFollowerStores(region) -} - -// GetRegionStores returns all stores that contains the region's peer. -func (c *RaftCluster) GetRegionStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetRegionStores(region) -} - -// GetStoreCount returns the count of stores. -func (c *RaftCluster) GetStoreCount() int { - return c.core.GetStoreCount() -} - -// GetStoreRegionCount returns the number of regions for a given store. -func (c *RaftCluster) GetStoreRegionCount(storeID uint64) int { - return c.core.GetStoreRegionCount(storeID) -} - -// GetAverageRegionSize returns the average region approximate size. -func (c *RaftCluster) GetAverageRegionSize() int64 { - return c.core.GetAverageRegionSize() -} - -// DropCacheRegion removes a region from the cache. 
-func (c *RaftCluster) DropCacheRegion(id uint64) { - c.core.RemoveRegionIfExist(id) -} - -// DropCacheAllRegion removes all regions from the cache. -func (c *RaftCluster) DropCacheAllRegion() { - c.core.ResetRegionCache() -} - -// GetMetaStores gets stores from cluster. -func (c *RaftCluster) GetMetaStores() []*metapb.Store { - return c.core.GetMetaStores() -} - -// GetStores returns all stores in the cluster. -func (c *RaftCluster) GetStores() []*core.StoreInfo { - return c.core.GetStores() -} - -// GetLeaderStoreByRegionID returns the leader store of the given region. -func (c *RaftCluster) GetLeaderStoreByRegionID(regionID uint64) *core.StoreInfo { - return c.core.GetLeaderStoreByRegionID(regionID) -} - -// GetStore gets store from cluster. -func (c *RaftCluster) GetStore(storeID uint64) *core.StoreInfo { - return c.core.GetStore(storeID) -} - -// GetAdjacentRegions returns regions' information that are adjacent with the specific region ID. -func (c *RaftCluster) GetAdjacentRegions(region *core.RegionInfo) (*core.RegionInfo, *core.RegionInfo) { - return c.core.GetAdjacentRegions(region) -} - -// GetRangeHoles returns all range holes, i.e the key ranges without any region info. -func (c *RaftCluster) GetRangeHoles() [][]string { - return c.core.GetRangeHoles() + return c.BasicCluster } // UpdateStoreLabels updates a store's location labels @@ -1360,8 +1208,8 @@ func (c *RaftCluster) DeleteStoreLabel(storeID uint64, labelKey string) error { return c.putStoreImpl(newStore, true) } -// PutStore puts a store. -func (c *RaftCluster) PutStore(store *metapb.Store) error { +// PutMetaStore puts a store. +func (c *RaftCluster) PutMetaStore(store *metapb.Store) error { if err := c.putStoreImpl(store, false); err != nil { return err } @@ -1374,9 +1222,6 @@ func (c *RaftCluster) PutStore(store *metapb.Store) error { // If 'force' is true, the store's labels will overwrite those labels which already existed in the store. // If 'force' is false, the store's labels will merge into those labels which already existed in the store. func (c *RaftCluster) putStoreImpl(store *metapb.Store, force bool) error { - c.Lock() - defer c.Unlock() - if store.GetId() == 0 { return errors.Errorf("invalid put store %v", store) } @@ -1418,7 +1263,7 @@ func (c *RaftCluster) putStoreImpl(store *metapb.Store, force bool) error { if err := c.checkStoreLabels(s); err != nil { return err } - return c.putStoreLocked(s) + return c.setStore(s) } func (c *RaftCluster) checkStoreVersion(store *metapb.Store) error { @@ -1463,9 +1308,6 @@ func (c *RaftCluster) checkStoreLabels(s *core.StoreInfo) error { // RemoveStore marks a store as offline in cluster. // State transition: Up -> Offline. 
func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1490,9 +1332,9 @@ func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) erro zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress()), zap.Bool("physically-destroyed", newStore.IsPhysicallyDestroyed())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { - regionSize := float64(c.core.GetStoreRegionSize(storeID)) + regionSize := float64(c.GetStoreRegionSize(storeID)) c.resetProgress(storeID, store.GetAddress()) c.progressManager.AddProgress(encodeRemovingProgressKey(storeID), regionSize, regionSize, nodeStateCheckJobInterval, progress.WindowDurationOption(c.GetCoordinator().GetPatrolRegionsDuration())) // record the current store limit in memory @@ -1555,9 +1397,6 @@ func (c *RaftCluster) getUpStores() []uint64 { // BuryStore marks a store as tombstone in cluster. // If forceBury is false, the store should be offlined and emptied before calling this func. func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1582,8 +1421,8 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { zap.String("store-address", newStore.GetAddress()), zap.String("state", store.GetState().String()), zap.Bool("physically-destroyed", store.IsPhysicallyDestroyed())) - err := c.putStoreLocked(newStore) - c.onStoreVersionChangeLocked() + err := c.setStore(newStore) + c.OnStoreVersionChange() if err == nil { // clean up the residual information. delete(c.prevStoreLimit, storeID) @@ -1599,40 +1438,6 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { return err } -// PauseLeaderTransfer prevents the store from been selected as source or -// target store of TransferLeader. -func (c *RaftCluster) PauseLeaderTransfer(storeID uint64) error { - return c.core.PauseLeaderTransfer(storeID) -} - -// ResumeLeaderTransfer cleans a store's pause state. The store can be selected -// as source or target of TransferLeader again. -func (c *RaftCluster) ResumeLeaderTransfer(storeID uint64) { - c.core.ResumeLeaderTransfer(storeID) -} - -// SlowStoreEvicted marks a store as a slow store and prevents transferring -// leader to the store -func (c *RaftCluster) SlowStoreEvicted(storeID uint64) error { - return c.core.SlowStoreEvicted(storeID) -} - -// SlowTrendEvicted marks a store as a slow store by trend and prevents transferring -// leader to the store -func (c *RaftCluster) SlowTrendEvicted(storeID uint64) error { - return c.core.SlowTrendEvicted(storeID) -} - -// SlowTrendRecovered cleans the evicted by slow trend state of a store. -func (c *RaftCluster) SlowTrendRecovered(storeID uint64) { - c.core.SlowTrendRecovered(storeID) -} - -// SlowStoreRecovered cleans the evicted state of a store. -func (c *RaftCluster) SlowStoreRecovered(storeID uint64) { - c.core.SlowStoreRecovered(storeID) -} - // NeedAwakenAllRegionsInStore checks whether we should do AwakenRegions operation. 
func (c *RaftCluster) NeedAwakenAllRegionsInStore(storeID uint64) (needAwaken bool, slowStoreIDs []uint64) { store := c.GetStore(storeID) @@ -1664,9 +1469,6 @@ func (c *RaftCluster) NeedAwakenAllRegionsInStore(storeID uint64) (needAwaken bo // UpStore up a store from offline func (c *RaftCluster) UpStore(storeID uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1697,7 +1499,7 @@ func (c *RaftCluster) UpStore(storeID uint64) error { log.Warn("store has been up", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { if exist { // persist the store limit @@ -1711,9 +1513,6 @@ func (c *RaftCluster) UpStore(storeID uint64) error { // ReadyToServe change store's node state to Serving. func (c *RaftCluster) ReadyToServe(storeID uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1735,7 +1534,7 @@ func (c *RaftCluster) ReadyToServe(storeID uint64) error { log.Info("store has changed to serving", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { c.resetProgress(storeID, store.GetAddress()) } @@ -1758,16 +1557,16 @@ func (c *RaftCluster) SetStoreWeight(storeID uint64, leaderWeight, regionWeight core.SetRegionWeight(regionWeight), ) - return c.putStoreLocked(newStore) + return c.setStore(newStore) } -func (c *RaftCluster) putStoreLocked(store *core.StoreInfo) error { +func (c *RaftCluster) setStore(store *core.StoreInfo) error { if c.storage != nil { if err := c.storage.SaveStoreMeta(store.GetMeta()); err != nil { return err } } - c.core.PutStore(store) + c.PutStore(store) if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { c.updateStoreStatistics(store.GetID(), store.IsSlow()) } @@ -1833,11 +1632,11 @@ func (c *RaftCluster) checkStores() { offlineStore := store.GetMeta() id := offlineStore.GetId() - regionSize := c.core.GetStoreRegionSize(id) + regionSize := c.GetStoreRegionSize(id) if c.IsPrepared() { c.updateProgress(id, store.GetAddress(), removingAction, float64(regionSize), float64(regionSize), false /* dec */) } - regionCount := c.core.GetStoreRegionCount(id) + regionCount := c.GetStoreRegionCount(id) // If the store is empty, it can be buried. 
if regionCount == 0 { if err := c.BuryStore(id, false); err != nil { @@ -1865,7 +1664,7 @@ func (c *RaftCluster) checkStores() { func (c *RaftCluster) getThreshold(stores []*core.StoreInfo, store *core.StoreInfo) float64 { start := time.Now() if !c.opt.IsPlacementRulesEnabled() { - regionSize := c.core.GetRegionSizeByRange([]byte(""), []byte("")) * int64(c.opt.GetMaxReplicas()) + regionSize := c.GetRegionSizeByRange([]byte(""), []byte("")) * int64(c.opt.GetMaxReplicas()) weight := getStoreTopoWeight(store, stores, c.opt.GetLocationLabels(), c.opt.GetMaxReplicas()) return float64(regionSize) * weight * 0.9 } @@ -1905,7 +1704,7 @@ func (c *RaftCluster) calculateRange(stores []*core.StoreInfo, store *core.Store matchStores = append(matchStores, s) } } - regionSize := c.core.GetRegionSizeByRange(startKey, endKey) * int64(rule.Count) + regionSize := c.GetRegionSizeByRange(startKey, endKey) * int64(rule.Count) weight := getStoreTopoWeight(store, matchStores, rule.LocationLabels, rule.Count) storeSize += float64(regionSize) * weight log.Debug("calculate range result", @@ -2071,13 +1870,10 @@ func encodePreparingProgressKey(storeID uint64) string { // RemoveTombStoneRecords removes the tombStone Records. func (c *RaftCluster) RemoveTombStoneRecords() error { - c.Lock() - defer c.Unlock() - var failedStores []uint64 for _, store := range c.GetStores() { if store.IsRemoved() { - if c.core.GetStoreRegionCount(store.GetID()) > 0 { + if c.GetStoreRegionCount(store.GetID()) > 0 { log.Warn("skip removing tombstone", zap.Stringer("store", store.GetMeta())) failedStores = append(failedStores, store.GetID()) continue @@ -2115,7 +1911,7 @@ func (c *RaftCluster) deleteStore(store *core.StoreInfo) error { return err } } - c.core.DeleteStore(store) + c.DeleteStore(store) return nil } @@ -2156,12 +1952,6 @@ func (c *RaftCluster) resetProgressIndicator() { // OnStoreVersionChange changes the version of the cluster when needed. func (c *RaftCluster) OnStoreVersionChange() { - c.RLock() - defer c.RUnlock() - c.onStoreVersionChangeLocked() -} - -func (c *RaftCluster) onStoreVersionChangeLocked() { var minVersion *semver.Version stores := c.GetStores() for _, s := range stores { @@ -2219,13 +2009,13 @@ func (c *RaftCluster) PutMetaCluster(meta *metapb.Cluster) error { // GetRegionStatsByRange returns region statistics from cluster. func (c *RaftCluster) GetRegionStatsByRange(startKey, endKey []byte) *statistics.RegionStats { - return statistics.GetRegionStats(c.core.ScanRegions(startKey, endKey, -1)) + return statistics.GetRegionStats(c.ScanRegions(startKey, endKey, -1)) } -// GetRegionCount returns the number of regions in the range. -func (c *RaftCluster) GetRegionCount(startKey, endKey []byte) *statistics.RegionStats { +// GetRegionStatsCount returns the number of regions in the range. 
+func (c *RaftCluster) GetRegionStatsCount(startKey, endKey []byte) *statistics.RegionStats { stats := &statistics.RegionStats{} - stats.Count = c.core.GetRegionCount(startKey, endKey) + stats.Count = c.GetRegionCount(startKey, endKey) return stats } @@ -2237,7 +2027,7 @@ func (c *RaftCluster) putRegion(region *core.RegionInfo) error { return err } } - c.core.PutRegion(region) + c.PutRegion(region) return nil } @@ -2292,7 +2082,7 @@ func (c *RaftCluster) AddStoreLimit(store *metapb.Store) { func (c *RaftCluster) RemoveStoreLimit(storeID uint64) { cfg := c.opt.GetScheduleConfig().Clone() for _, limitType := range storelimit.TypeNameValue { - c.core.ResetStoreLimit(storeID, limitType) + c.ResetStoreLimit(storeID, limitType) } delete(cfg.StoreLimit, storeID) c.opt.SetScheduleConfig(cfg) @@ -2312,16 +2102,13 @@ func (c *RaftCluster) RemoveStoreLimit(storeID uint64) { // SetMinResolvedTS sets up a store with min resolved ts. func (c *RaftCluster) SetMinResolvedTS(storeID, minResolvedTS uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) } newStore := store.Clone(core.SetMinResolvedTS(minResolvedTS)) - c.core.PutStore(newStore) + c.PutStore(newStore) return nil } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 0f08153c8ae..ee7c477476b 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -93,7 +93,7 @@ func TestStoreHeartbeat(t *testing.T) { } re.Error(cluster.HandleStoreHeartbeat(req, resp)) - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) re.Equal(i+1, cluster.GetStoreCount()) re.Equal(int64(0), store.GetLastHeartbeatTS().UnixNano()) @@ -215,7 +215,7 @@ func TestFilterUnhealthyStore(t *testing.T) { Available: 50, RegionCount: 1, } - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) re.NoError(cluster.HandleStoreHeartbeat(req, resp)) re.NotNil(cluster.hotStat.GetRollingStoreStats(store.GetID())) } @@ -228,7 +228,7 @@ func TestFilterUnhealthyStore(t *testing.T) { RegionCount: 1, } newStore := store.Clone(core.SetStoreState(metapb.StoreState_Tombstone)) - re.NoError(cluster.putStoreLocked(newStore)) + re.NoError(cluster.setStore(newStore)) re.NoError(cluster.HandleStoreHeartbeat(req, resp)) re.Nil(cluster.hotStat.GetRollingStoreStats(store.GetID())) } @@ -253,7 +253,7 @@ func TestSetOfflineStore(t *testing.T) { // Put 6 stores. for _, store := range newTestStores(6, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // store 1: up -> offline @@ -295,7 +295,7 @@ func TestSetOfflineStore(t *testing.T) { // test clean up tombstone store toCleanStore := cluster.GetStore(1).Clone().GetMeta() toCleanStore.LastHeartbeat = time.Now().Add(-40 * 24 * time.Hour).UnixNano() - cluster.PutStore(toCleanStore) + cluster.PutMetaStore(toCleanStore) cluster.checkStores() re.Nil(cluster.GetStore(1)) } @@ -312,7 +312,7 @@ func TestSetOfflineWithReplica(t *testing.T) { // Put 4 stores. for _, store := range newTestStores(4, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.NoError(cluster.RemoveStore(2, false)) @@ -351,7 +351,7 @@ func TestSetOfflineStoreWithEvictLeader(t *testing.T) { // Put 3 stores. 
for _, store := range newTestStores(3, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } _, err = addEvictLeaderScheduler(cluster, 1) @@ -378,7 +378,7 @@ func TestForceBuryStore(t *testing.T) { stores := newTestStores(2, "5.3.0") stores[1] = stores[1].Clone(core.SetLastHeartbeatTS(time.Now())) for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.NoError(cluster.BuryStore(uint64(1), true)) re.Error(cluster.BuryStore(uint64(2), true)) @@ -396,7 +396,7 @@ func TestReuseAddress(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) // Put 4 stores. for _, store := range newTestStores(4, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // store 1: up // store 2: offline @@ -420,9 +420,9 @@ func TestReuseAddress(t *testing.T) { if storeInfo.IsPhysicallyDestroyed() || storeInfo.IsRemoved() { // try to start a new store with the same address with store which is physically destroyed or tombstone should be success - re.NoError(cluster.PutStore(newStore)) + re.NoError(cluster.PutMetaStore(newStore)) } else { - re.Error(cluster.PutStore(newStore)) + re.Error(cluster.PutMetaStore(newStore)) } } } @@ -450,7 +450,7 @@ func TestUpStore(t *testing.T) { // Put 5 stores. for _, store := range newTestStores(5, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // set store 1 offline @@ -490,7 +490,7 @@ func TestRemovingProcess(t *testing.T) { // Put 5 stores. stores := newTestStores(5, "5.0.0") for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } regions := newTestRegions(100, 5, 1) var regionInStore1 []*core.RegionInfo @@ -518,7 +518,7 @@ func TestRemovingProcess(t *testing.T) { if i >= 5 { break } - cluster.DropCacheRegion(region.GetID()) + cluster.RemoveRegionIfExist(region.GetID()) i++ } cluster.checkStores() @@ -553,13 +553,13 @@ func TestDeleteStoreUpdatesClusterVersion(t *testing.T) { // Put 3 new 4.0.9 stores. for _, store := range newTestStores(3, "4.0.9") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Equal("4.0.9", cluster.GetClusterVersion()) // Upgrade 2 stores to 5.0.0. 
for _, store := range newTestStores(2, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Equal("4.0.9", cluster.GetClusterVersion()) @@ -582,14 +582,14 @@ func TestStoreClusterVersion(t *testing.T) { s1.Version = "5.0.1" s2.Version = "5.0.3" s3.Version = "5.0.5" - re.NoError(cluster.PutStore(s2)) + re.NoError(cluster.PutMetaStore(s2)) re.Equal(s2.Version, cluster.GetClusterVersion()) - re.NoError(cluster.PutStore(s1)) + re.NoError(cluster.PutMetaStore(s1)) // the cluster version should be 5.0.1(the min one) re.Equal(s1.Version, cluster.GetClusterVersion()) - re.NoError(cluster.PutStore(s3)) + re.NoError(cluster.PutMetaStore(s3)) // the cluster version should be 5.0.1(the min one) re.Equal(s1.Version, cluster.GetClusterVersion()) } @@ -679,7 +679,7 @@ func TestBucketHeartbeat(t *testing.T) { n, np := uint64(2), uint64(2) regions := newTestRegions(n, n, np) for _, store := range stores { - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) } re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), regions[0])) @@ -729,31 +729,31 @@ func TestRegionHeartbeat(t *testing.T) { regions := newTestRegions(n, n, np) for _, store := range stores { - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) } for i, region := range regions { // region does not exist. re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is the same, not updated. re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) origin := region // region is updated. region = origin.Clone(core.WithIncVersion()) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (Version). stale := origin.Clone(core.WithIncConfVer()) re.Error(cluster.processRegionHeartbeat(core.ContextTODO(), stale)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is updated @@ -763,13 +763,13 @@ func TestRegionHeartbeat(t *testing.T) { ) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (ConfVer). stale = origin.Clone(core.WithIncConfVer()) re.Error(cluster.processRegionHeartbeat(core.ContextTODO(), stale)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Add a down peer. @@ -781,38 +781,38 @@ func TestRegionHeartbeat(t *testing.T) { })) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Add a pending peer. 
region = region.Clone(core.WithPendingPeers([]*metapb.Peer{region.GetPeers()[rand.Intn(len(region.GetPeers()))]})) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Clear down peers. region = region.Clone(core.WithDownPeers(nil)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Clear pending peers. region = region.Clone(core.WithPendingPeers(nil)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Remove peers. origin = region region = origin.Clone(core.SetPeers(region.GetPeers()[:1])) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Add peers. region = origin regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Change one peer to witness @@ -822,47 +822,47 @@ func TestRegionHeartbeat(t *testing.T) { ) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change leader. region = region.Clone(core.WithLeader(region.GetPeers()[1])) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change ApproximateSize. region = region.Clone(core.SetApproximateSize(144)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change ApproximateKeys. region = region.Clone(core.SetApproximateKeys(144000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change bytes written. region = region.Clone(core.SetWrittenBytes(24000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change bytes read. 
region = region.Clone(core.SetReadBytes(1080000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Flashback region = region.Clone(core.WithFlashback(true, 1)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) region = region.Clone(core.WithFlashback(false, 0)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) } regionCounts := make(map[uint64]int) @@ -894,10 +894,10 @@ func TestRegionHeartbeat(t *testing.T) { time.Sleep(50 * time.Millisecond) for _, store := range cluster.GetStores() { - re.Equal(cluster.core.GetStoreLeaderCount(store.GetID()), store.GetLeaderCount()) - re.Equal(cluster.core.GetStoreRegionCount(store.GetID()), store.GetRegionCount()) - re.Equal(cluster.core.GetStoreLeaderRegionSize(store.GetID()), store.GetLeaderSize()) - re.Equal(cluster.core.GetStoreRegionSize(store.GetID()), store.GetRegionSize()) + re.Equal(cluster.GetStoreLeaderCount(store.GetID()), store.GetLeaderCount()) + re.Equal(cluster.GetStoreRegionCount(store.GetID()), store.GetRegionCount()) + re.Equal(cluster.GetStoreLeaderRegionSize(store.GetID()), store.GetLeaderSize()) + re.Equal(cluster.GetStoreRegionSize(store.GetID()), store.GetRegionSize()) } // Test with storage. @@ -1133,7 +1133,7 @@ func TestRegionLabelIsolationLevel(t *testing.T) { State: metapb.StoreState_Up, Labels: labels, } - re.NoError(cluster.putStoreLocked(core.NewStoreInfo(store))) + re.NoError(cluster.setStore(core.NewStoreInfo(store))) } peers := make([]*metapb.Peer, 0, 4) @@ -1296,7 +1296,7 @@ func TestOfflineAndMerge(t *testing.T) { // Put 4 stores. for _, store := range newTestStores(4, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } peers := []*metapb.Peer{ @@ -1351,7 +1351,7 @@ func TestStoreConfigUpdate(t *testing.T) { tc := newTestCluster(ctx, opt) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } re.Len(tc.getUpStores(), 5) // Case1: big region. @@ -1436,7 +1436,7 @@ func TestSyncConfigContext(t *testing.T) { })) stores := newTestStores(1, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } // trip schema header now := time.Now() @@ -1458,7 +1458,7 @@ func TestStoreConfigSync(t *testing.T) { tc := newTestCluster(ctx, opt) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } re.Len(tc.getUpStores(), 5) @@ -1503,7 +1503,7 @@ func TestUpdateStorePendingPeerCount(t *testing.T) { tc.RaftCluster.coordinator = schedule.NewCoordinator(ctx, tc.RaftCluster, nil) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } tc.RaftCluster.wg.Add(1) go tc.RaftCluster.runUpdateStoreStats() @@ -1678,7 +1678,7 @@ func TestCalculateStoreSize1(t *testing.T) { }, }...) 
s := store.Clone(core.SetStoreLabels(labels)) - re.NoError(cluster.PutStore(s.GetMeta())) + re.NoError(cluster.PutMetaStore(s.GetMeta())) } cluster.ruleManager.SetRule( @@ -1762,7 +1762,7 @@ func TestCalculateStoreSize2(t *testing.T) { } labels = append(labels, []*metapb.StoreLabel{{Key: "rack", Value: "r1"}, {Key: "host", Value: "h1"}}...) s := store.Clone(core.SetStoreLabels(labels)) - re.NoError(cluster.PutStore(s.GetMeta())) + re.NoError(cluster.PutMetaStore(s.GetMeta())) } cluster.ruleManager.SetRule( @@ -1812,7 +1812,7 @@ func TestStores(t *testing.T) { id := store.GetID() re.Nil(cache.GetStore(id)) re.Error(cache.PauseLeaderTransfer(id)) - cache.SetStore(store) + cache.PutStore(store) re.Equal(store, cache.GetStore(id)) re.Equal(i+1, cache.GetStoreCount()) re.NoError(cache.PauseLeaderTransfer(id)) @@ -1843,7 +1843,7 @@ func Test(t *testing.T) { _, opts, err := newTestScheduleConfig() re.NoError(err) tc := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opts, storage.NewStorageWithMemoryBackend()) - cache := tc.core + cache := tc.BasicCluster for i := uint64(0); i < n; i++ { region := regions[i] @@ -1961,7 +1961,7 @@ func TestAwakenStore(t *testing.T) { stores := newTestStores(n, "6.5.0") re.True(stores[0].NeedAwakenStore()) for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } for i := uint64(1); i <= n; i++ { re.False(cluster.slowStat.ExistsSlowStores()) @@ -1971,7 +1971,7 @@ func TestAwakenStore(t *testing.T) { now := time.Now() store4 := stores[0].Clone(core.SetLastHeartbeatTS(now), core.SetLastAwakenTime(now.Add(-11*time.Minute))) - re.NoError(cluster.putStoreLocked(store4)) + re.NoError(cluster.setStore(store4)) store1 := cluster.GetStore(1) re.True(store1.NeedAwakenStore()) @@ -2013,7 +2013,7 @@ func TestUpdateAndDeleteLabel(t *testing.T) { cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) stores := newTestStores(1, "6.5.1") for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Empty(cluster.GetStore(1).GetLabels()) // Update label. @@ -2105,7 +2105,7 @@ func TestUpdateAndDeleteLabel(t *testing.T) { newStore := typeutil.DeepClone(cluster.GetStore(1).GetMeta(), core.StoreFactory) newStore.Labels = nil // Store rebooting will call PutStore. - err = cluster.PutStore(newStore) + err = cluster.PutMetaStore(newStore) re.NoError(err) // Check the label after rebooting. 
re.Equal([]*metapb.StoreLabel{{Key: "mode", Value: "readonly"}}, cluster.GetStore(1).GetLabels()) @@ -2142,7 +2142,7 @@ func newTestRaftCluster( s storage.Storage, ) *RaftCluster { opt.GetScheduleConfig().EnableHeartbeatConcurrentRunner = false - rc := &RaftCluster{serverCtx: ctx, core: core.NewBasicCluster(), storage: s} + rc := &RaftCluster{serverCtx: ctx, BasicCluster: core.NewBasicCluster(), storage: s} rc.InitCluster(id, opt, nil, nil) rc.ruleManager = placement.NewRuleManager(ctx, storage.NewStorageWithMemoryBackend(), rc, opt) if opt.IsPlacementRulesEnabled() { @@ -2151,7 +2151,7 @@ func newTestRaftCluster( panic(err) } } - rc.schedulingController = newSchedulingController(rc.ctx, rc.core, rc.opt, rc.ruleManager) + rc.schedulingController = newSchedulingController(rc.ctx, rc.BasicCluster, rc.opt, rc.ruleManager) return rc } @@ -2324,7 +2324,7 @@ func (c *testCluster) addRegionStore(storeID uint64, regionCount int, regionSize c.SetStoreLimit(storeID, storelimit.RemovePeer, 60) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) addLeaderRegion(regionID uint64, leaderStoreID uint64, followerStoreIDs ...uint64) error { @@ -2347,7 +2347,7 @@ func (c *testCluster) updateLeaderCount(storeID uint64, leaderCount int) error { ) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) addLeaderStore(storeID uint64, leaderCount int) error { @@ -2363,7 +2363,7 @@ func (c *testCluster) addLeaderStore(storeID uint64, leaderCount int) error { c.SetStoreLimit(storeID, storelimit.RemovePeer, 60) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) setStoreDown(storeID uint64) error { @@ -2374,7 +2374,7 @@ func (c *testCluster) setStoreDown(storeID uint64) error { ) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) setStoreOffline(storeID uint64) error { @@ -2382,7 +2382,7 @@ func (c *testCluster) setStoreOffline(storeID uint64) error { newStore := store.Clone(core.SetStoreState(metapb.StoreState_Offline, false)) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) LoadRegion(regionID uint64, followerStoreIDs ...uint64) error { @@ -2966,7 +2966,7 @@ func TestShouldRun(t *testing.T) { nr := &metapb.Region{Id: 6, Peers: []*metapb.Peer{}} newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) re.Error(tc.processRegionHeartbeat(core.ContextTODO(), newRegion)) - re.Equal(7, tc.core.GetClusterNotFromStorageRegionsCnt()) + re.Equal(7, tc.GetClusterNotFromStorageRegionsCnt()) } func TestShouldRunWithNonLeaderRegions(t *testing.T) { @@ -3009,7 +3009,7 @@ func TestShouldRunWithNonLeaderRegions(t *testing.T) { nr := &metapb.Region{Id: 9, Peers: []*metapb.Peer{}} newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) re.Error(tc.processRegionHeartbeat(core.ContextTODO(), newRegion)) - re.Equal(9, tc.core.GetClusterNotFromStorageRegionsCnt()) + re.Equal(9, tc.GetClusterNotFromStorageRegionsCnt()) // Now, after server is prepared, there exist some regions with no leader. 
re.Equal(uint64(0), tc.GetRegion(10).GetLeader().GetStoreId()) diff --git a/server/cluster/scheduling_controller.go b/server/cluster/scheduling_controller.go index 20d5a6bceae..ca846eaa885 100644 --- a/server/cluster/scheduling_controller.go +++ b/server/cluster/scheduling_controller.go @@ -195,7 +195,7 @@ func (sc *schedulingController) collectSchedulingMetrics() { // collect hot cache metrics sc.hotStat.CollectMetrics() // collect the lock metrics - sc.RegionsInfo.CollectWaitLockMetrics() + sc.CollectWaitLockMetrics() } func (sc *schedulingController) removeStoreStatistics(storeID uint64) { diff --git a/server/grpc_service.go b/server/grpc_service.go index 2b3ee232686..acfc87fcf71 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -826,7 +826,7 @@ func (s *GrpcServer) PutStore(ctx context.Context, request *pdpb.PutStoreRequest }, nil } - if err := rc.PutStore(store); err != nil { + if err := rc.PutMetaStore(store); err != nil { return &pdpb.PutStoreResponse{ Header: s.wrapErrorToHeader(pdpb.ErrorType_UNKNOWN, err.Error()), }, nil diff --git a/server/server.go b/server/server.go index af9f48f8c9b..1d38a5ee495 100644 --- a/server/server.go +++ b/server/server.go @@ -1555,8 +1555,6 @@ func (s *Server) UpdateGRPCServiceRateLimiter(serviceLabel string, opts ...ratel // GetClusterStatus gets cluster status. func (s *Server) GetClusterStatus() (*cluster.Status, error) { - s.cluster.Lock() - defer s.cluster.Unlock() return s.cluster.LoadClusterStatus() } diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index cf2c6dd2508..365ab1ca493 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -498,19 +498,19 @@ func (suite *apiTestSuite) checkAdminRegionCacheForward(cluster *tests.TestClust apiServer := cluster.GetLeaderServer().GetServer() schedulingServer := cluster.GetSchedulingPrimaryServer() re.Equal(3, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(3, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(3, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) addr := cluster.GetLeaderServer().GetAddr() urlPrefix := fmt.Sprintf("%s/pd/api/v1/admin/cache/region", addr) err := testutil.CheckDelete(tests.TestDialClient, fmt.Sprintf("%s/%s", urlPrefix, "30"), testutil.StatusOK(re)) re.NoError(err) re.Equal(2, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(2, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(2, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) err = testutil.CheckDelete(tests.TestDialClient, urlPrefix+"s", testutil.StatusOK(re)) re.NoError(err) re.Equal(0, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(0, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(0, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) } func (suite *apiTestSuite) TestFollowerForward() { diff --git a/tests/integrations/mcs/scheduling/config_test.go b/tests/integrations/mcs/scheduling/config_test.go index d7883379731..54622d5c515 100644 --- a/tests/integrations/mcs/scheduling/config_test.go +++ b/tests/integrations/mcs/scheduling/config_test.go @@ -175,7 +175,7 @@ func (suite *configTestSuite) TestSchedulerConfigWatch() { }) assertEvictLeaderStoreIDs(re, storage, []uint64{1}) // Update the scheduler by adding a store. 
- err = suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + err = suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{ Id: 2, Address: "mock://2", diff --git a/tests/integrations/mcs/scheduling/meta_test.go b/tests/integrations/mcs/scheduling/meta_test.go index abc1efd9021..11782590ab9 100644 --- a/tests/integrations/mcs/scheduling/meta_test.go +++ b/tests/integrations/mcs/scheduling/meta_test.go @@ -79,7 +79,7 @@ func (suite *metaTestSuite) TestStoreWatch() { ) re.NoError(err) for i := uint64(1); i <= 4; i++ { - suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{Id: i, Address: fmt.Sprintf("mock-%d", i), State: metapb.StoreState_Up, NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano()}, ) } @@ -102,7 +102,7 @@ func (suite *metaTestSuite) TestStoreWatch() { }) // test synchronized store labels - suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{Id: 5, Address: "mock-5", State: metapb.StoreState_Up, NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano(), Labels: []*metapb.StoreLabel{{Key: "zone", Value: "z1"}}}, ) testutil.Eventually(re, func() bool { diff --git a/tests/integrations/mcs/scheduling/server_test.go b/tests/integrations/mcs/scheduling/server_test.go index 38c1cc6a41b..82da47d18f3 100644 --- a/tests/integrations/mcs/scheduling/server_test.go +++ b/tests/integrations/mcs/scheduling/server_test.go @@ -310,7 +310,7 @@ func (suite *serverTestSuite) TestSchedulerSync() { checkEvictLeaderSchedulerExist(re, schedulersController, true) checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1}) // Add a store_id to the evict-leader-scheduler through the API server. 
- err = suite.pdLeader.GetServer().GetRaftCluster().PutStore( + err = suite.pdLeader.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{ Id: 2, Address: "mock://2", diff --git a/tests/server/api/region_test.go b/tests/server/api/region_test.go index 2ff0b5d4b86..23ebceaefd6 100644 --- a/tests/server/api/region_test.go +++ b/tests/server/api/region_test.go @@ -407,7 +407,7 @@ func (suite *regionTestSuite) checkRegionsReplicated(cluster *tests.TestCluster) func checkRegionCount(re *require.Assertions, cluster *tests.TestCluster, count uint64) { leader := cluster.GetLeaderServer() tu.Eventually(re, func() bool { - return leader.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count == int(count) + return leader.GetRaftCluster().GetRegionCount([]byte{}, []byte{}) == int(count) }) if sche := cluster.GetSchedulingPrimaryServer(); sche != nil { tu.Eventually(re, func() bool { diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 61a4561c55a..07bcf3ee2a1 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -601,7 +601,7 @@ func TestRaftClusterMultipleRestart(t *testing.T) { store := newMetaStore(storeID, "127.0.0.1:4", "2.1.0", metapb.StoreState_Offline, getTestDeployPath(storeID)) rc := leaderServer.GetRaftCluster() re.NotNil(rc) - err = rc.PutStore(store) + err = rc.PutMetaStore(store) re.NoError(err) re.NotNil(tc) rc.Stop() From b1cbc7151f40e6e34c0582820aca1463c8e8c8c4 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 28 May 2024 15:58:21 +0800 Subject: [PATCH 16/21] tests: fix the testify usage of re.Positive/Negative (#8221) ref tikv/pd#4399 Signed-off-by: JmPotato --- pkg/schedule/schedulers/evict_slow_trend_test.go | 2 +- pkg/statistics/hot_peer_cache_test.go | 2 +- tests/integrations/client/http_client_test.go | 4 ++-- tests/integrations/mcs/tso/keyspace_group_manager_test.go | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/schedule/schedulers/evict_slow_trend_test.go b/pkg/schedule/schedulers/evict_slow_trend_test.go index 834ef337639..dd6807f4a85 100644 --- a/pkg/schedule/schedulers/evict_slow_trend_test.go +++ b/pkg/schedule/schedulers/evict_slow_trend_test.go @@ -105,7 +105,7 @@ func (suite *evictSlowTrendTestSuite) TestEvictSlowTrendBasicFuncs() { re.Equal(slowCandidate{}, es2.conf.evictCandidate) es2.conf.markCandidateRecovered() lastCapturedCandidate = es2.conf.lastCapturedCandidate() - re.Greater(lastCapturedCandidate.recoverTS.Compare(recoverTS), 0) + re.Positive(lastCapturedCandidate.recoverTS.Compare(recoverTS)) re.Equal(lastCapturedCandidate.storeID, store.GetID()) // Test capture another store 2 diff --git a/pkg/statistics/hot_peer_cache_test.go b/pkg/statistics/hot_peer_cache_test.go index c116e020f54..db215238604 100644 --- a/pkg/statistics/hot_peer_cache_test.go +++ b/pkg/statistics/hot_peer_cache_test.go @@ -375,7 +375,7 @@ func TestUpdateHotPeerStat(t *testing.T) { cache.updateStat(newItem[0]) newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) } - re.Less(newItem[0].HotDegree, 0) + re.Negative(newItem[0].HotDegree) re.Equal(0, newItem[0].AntiCount) re.Equal(utils.Remove, newItem[0].actionType) } diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index 9e712b808f3..33652da9be0 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -174,11 +174,11 @@ func (suite *httpClientTestSuite) checkMeta(mode mode, client 
pd.Client) { re.Equal("INPROGRESS", state) regionStats, err := client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) re.NoError(err) - re.Greater(regionStats.Count, 0) + re.Positive(regionStats.Count) re.NotEmpty(regionStats.StoreLeaderCount) regionStats, err = client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), true) re.NoError(err) - re.Greater(regionStats.Count, 0) + re.Positive(regionStats.Count) re.Empty(regionStats.StoreLeaderCount) hotReadRegions, err := client.GetHotReadRegions(env.ctx) re.NoError(err) diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go index f7b892ce77d..25d9516bf63 100644 --- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go +++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go @@ -300,7 +300,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupSplit() { // Check the split TSO from keyspace group `newID` now. splitTS, err := suite.requestTSO(re, 222, newID) re.NoError(err) - re.Greater(tsoutil.CompareTimestamp(&splitTS, &ts), 0) + re.Positive(tsoutil.CompareTimestamp(&splitTS, &ts)) } func (suite *tsoKeyspaceGroupManagerTestSuite) requestTSO( @@ -636,7 +636,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMerge() { } return err == nil && tsoutil.CompareTimestamp(&mergedTS, &pdpb.Timestamp{}) > 0 }, testutil.WithTickInterval(5*time.Second), testutil.WithWaitFor(time.Minute)) - re.Greater(tsoutil.CompareTimestamp(&mergedTS, &ts), 0) + re.Positive(tsoutil.CompareTimestamp(&mergedTS, &ts)) } func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMergeClient() { From b7d8b94060e3c60693829574a40286b08d444f16 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Tue, 28 May 2024 17:52:21 +0800 Subject: [PATCH 17/21] controller: fix error retry and add more metrics (#8219) close tikv/pd#8217 controller: fix error retry and add more metrics Signed-off-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../resource_group/controller/controller.go | 71 ++++++++++++------- .../controller/controller_test.go | 15 ++++ client/resource_group/controller/limiter.go | 27 ++++++- client/resource_group/controller/metrics.go | 18 ++++- 4 files changed, 101 insertions(+), 30 deletions(-) diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go index 11ea3f7997d..1910e37eff8 100755 --- a/client/resource_group/controller/controller.go +++ b/client/resource_group/controller/controller.go @@ -515,7 +515,7 @@ func (c *ResourceGroupsController) collectTokenBucketRequests(ctx context.Contex request := gc.collectRequestAndConsumption(typ) if request != nil { c.run.currentRequests = append(c.run.currentRequests, request) - gc.tokenRequestCounter.Inc() + gc.metrics.tokenRequestCounter.Inc() } return true }) @@ -632,13 +632,9 @@ type groupCostController struct { calculators []ResourceCalculator handleRespFunc func(*rmpb.TokenBucketResponse) - successfulRequestDuration prometheus.Observer - failedLimitReserveDuration prometheus.Observer - requestRetryCounter prometheus.Counter - failedRequestCounter prometheus.Counter - tokenRequestCounter prometheus.Counter - - mu struct { + // metrics + metrics *groupMetricsCollection + mu struct { sync.Mutex consumption *rmpb.Consumption storeCounter map[uint64]*rmpb.Consumption @@ -685,6 +681,30 @@ type groupCostController struct { tombstone bool } 
+type groupMetricsCollection struct { + successfulRequestDuration prometheus.Observer + failedLimitReserveDuration prometheus.Observer + requestRetryCounter prometheus.Counter + failedRequestCounterWithOthers prometheus.Counter + failedRequestCounterWithThrottled prometheus.Counter + tokenRequestCounter prometheus.Counter +} + +func initMetrics(oldName, name string) *groupMetricsCollection { + const ( + otherType = "others" + throttledType = "throttled" + ) + return &groupMetricsCollection{ + successfulRequestDuration: successfulRequestDuration.WithLabelValues(oldName, name), + failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(oldName, name), + failedRequestCounterWithOthers: failedRequestCounter.WithLabelValues(oldName, name, otherType), + failedRequestCounterWithThrottled: failedRequestCounter.WithLabelValues(oldName, name, throttledType), + requestRetryCounter: requestRetryCounter.WithLabelValues(oldName, name), + tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(oldName, name), + } +} + type tokenCounter struct { getTokenBucketFunc func() *rmpb.TokenBucket @@ -725,16 +745,13 @@ func newGroupCostController( default: return nil, errs.ErrClientResourceGroupConfigUnavailable.FastGenByArgs("not supports the resource type") } + ms := initMetrics(group.Name, group.Name) gc := &groupCostController{ - meta: group, - name: group.Name, - mainCfg: mainCfg, - mode: group.GetMode(), - successfulRequestDuration: successfulRequestDuration.WithLabelValues(group.Name, group.Name), - failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(group.Name, group.Name), - failedRequestCounter: failedRequestCounter.WithLabelValues(group.Name, group.Name), - requestRetryCounter: requestRetryCounter.WithLabelValues(group.Name, group.Name), - tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(group.Name, group.Name), + meta: group, + name: group.Name, + mainCfg: mainCfg, + mode: group.GetMode(), + metrics: ms, calculators: []ResourceCalculator{ newKVCalculator(mainCfg), newSQLCalculator(mainCfg), @@ -789,7 +806,7 @@ func (gc *groupCostController) initRunState() { case rmpb.GroupMode_RUMode: gc.run.requestUnitTokens = make(map[rmpb.RequestUnitType]*tokenCounter) for typ := range requestUnitLimitTypeList { - limiter := NewLimiterWithCfg(now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) + limiter := NewLimiterWithCfg(gc.name, now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) counter := &tokenCounter{ limiter: limiter, avgRUPerSec: 0, @@ -803,7 +820,7 @@ func (gc *groupCostController) initRunState() { case rmpb.GroupMode_RawMode: gc.run.resourceTokens = make(map[rmpb.RawResourceType]*tokenCounter) for typ := range requestResourceLimitTypeList { - limiter := NewLimiterWithCfg(now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) + limiter := NewLimiterWithCfg(gc.name, now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) counter := &tokenCounter{ limiter: limiter, avgRUPerSec: 0, @@ -1233,7 +1250,7 @@ func (gc *groupCostController) onRequestWait( res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } - if d, err = WaitReservations(ctx, now, res); err == nil { + if d, err = WaitReservations(ctx, now, res); err == nil || errs.ErrClientResourceGroupThrottled.NotEqual(err) { break retryLoop } case rmpb.GroupMode_RUMode: @@ -1243,18 +1260,20 @@ func (gc *groupCostController) onRequestWait( res = append(res, 
counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } - if d, err = WaitReservations(ctx, now, res); err == nil { + if d, err = WaitReservations(ctx, now, res); err == nil || errs.ErrClientResourceGroupThrottled.NotEqual(err) { break retryLoop } } - gc.requestRetryCounter.Inc() + gc.metrics.requestRetryCounter.Inc() time.Sleep(gc.mainCfg.WaitRetryInterval) waitDuration += gc.mainCfg.WaitRetryInterval } if err != nil { - gc.failedRequestCounter.Inc() - if d.Seconds() > 0 { - gc.failedLimitReserveDuration.Observe(d.Seconds()) + if errs.ErrClientResourceGroupThrottled.Equal(err) { + gc.metrics.failedRequestCounterWithThrottled.Inc() + gc.metrics.failedLimitReserveDuration.Observe(d.Seconds()) + } else { + gc.metrics.failedRequestCounterWithOthers.Inc() } gc.mu.Lock() sub(gc.mu.consumption, delta) @@ -1264,7 +1283,7 @@ func (gc *groupCostController) onRequestWait( }) return nil, nil, waitDuration, 0, err } - gc.successfulRequestDuration.Observe(d.Seconds()) + gc.metrics.successfulRequestDuration.Observe(d.Seconds()) waitDuration += d } diff --git a/client/resource_group/controller/controller_test.go b/client/resource_group/controller/controller_test.go index fea4a133ad0..4f4ec592793 100644 --- a/client/resource_group/controller/controller_test.go +++ b/client/resource_group/controller/controller_test.go @@ -26,6 +26,7 @@ import ( rmpb "github.com/pingcap/kvproto/pkg/resource_manager" "github.com/stretchr/testify/require" + "github.com/tikv/pd/client/errs" ) func createTestGroupCostController(re *require.Assertions) *groupCostController { @@ -117,3 +118,17 @@ func TestRequestAndResponseConsumption(t *testing.T) { re.Equal(expectedConsumption.TotalCpuTimeMs, consumption.TotalCpuTimeMs, caseNum) } } + +func TestResourceGroupThrottledError(t *testing.T) { + re := require.New(t) + gc := createTestGroupCostController(re) + gc.initRunState() + req := &TestRequestInfo{ + isWrite: true, + writeBytes: 10000000, + } + // The group is throttled + _, _, _, _, err := gc.onRequestWait(context.TODO(), req) + re.Error(err) + re.True(errs.ErrClientResourceGroupThrottled.Equal(err)) +} diff --git a/client/resource_group/controller/limiter.go b/client/resource_group/controller/limiter.go index a726b0e219a..2e42f591b8b 100644 --- a/client/resource_group/controller/limiter.go +++ b/client/resource_group/controller/limiter.go @@ -26,6 +26,7 @@ import ( "time" "github.com/pingcap/log" + "github.com/prometheus/client_golang/prometheus" "github.com/tikv/pd/client/errs" "go.uber.org/zap" ) @@ -81,6 +82,15 @@ type Limiter struct { isLowProcess bool // remainingNotifyTimes is used to limit notify when the speed limit is already set. remainingNotifyTimes int + name string + + // metrics + metrics *limiterMetricsCollection +} + +// limiterMetricsCollection is a collection of metrics for a limiter. +type limiterMetricsCollection struct { + lowTokenNotifyCounter prometheus.Counter } // Limit returns the maximum overall event rate. @@ -106,8 +116,9 @@ func NewLimiter(now time.Time, r Limit, b int64, tokens float64, lowTokensNotify // NewLimiterWithCfg returns a new Limiter that allows events up to rate r and permits // bursts of at most b tokens. 
-func NewLimiterWithCfg(now time.Time, cfg tokenBucketReconfigureArgs, lowTokensNotifyChan chan<- struct{}) *Limiter { +func NewLimiterWithCfg(name string, now time.Time, cfg tokenBucketReconfigureArgs, lowTokensNotifyChan chan<- struct{}) *Limiter { lim := &Limiter{ + name: name, limit: Limit(cfg.NewRate), last: now, tokens: cfg.NewTokens, @@ -115,6 +126,9 @@ func NewLimiterWithCfg(now time.Time, cfg tokenBucketReconfigureArgs, lowTokensN notifyThreshold: cfg.NotifyThreshold, lowTokensNotifyChan: lowTokensNotifyChan, } + lim.metrics = &limiterMetricsCollection{ + lowTokenNotifyCounter: lowTokenRequestNotifyCounter.WithLabelValues(lim.name), + } log.Debug("new limiter", zap.String("limiter", fmt.Sprintf("%+v", lim))) return lim } @@ -224,6 +238,14 @@ func (lim *Limiter) SetupNotificationThreshold(threshold float64) { lim.notifyThreshold = threshold } +// SetName sets the name of the limiter. +func (lim *Limiter) SetName(name string) *Limiter { + lim.mu.Lock() + defer lim.mu.Unlock() + lim.name = name + return lim +} + // notify tries to send a non-blocking notification on notifyCh and disables // further notifications (until the next Reconfigure or StartNotification). func (lim *Limiter) notify() { @@ -234,6 +256,9 @@ func (lim *Limiter) notify() { lim.isLowProcess = true select { case lim.lowTokensNotifyChan <- struct{}{}: + if lim.metrics != nil { + lim.metrics.lowTokenNotifyCounter.Inc() + } default: } } diff --git a/client/resource_group/controller/metrics.go b/client/resource_group/controller/metrics.go index 4261705a6f6..30a0b850c7d 100644 --- a/client/resource_group/controller/metrics.go +++ b/client/resource_group/controller/metrics.go @@ -24,6 +24,8 @@ const ( // TODO: remove old label in 8.x resourceGroupNameLabel = "name" newResourceGroupNameLabel = "resource_group" + + errType = "type" ) var ( @@ -40,7 +42,7 @@ var ( Namespace: namespace, Subsystem: requestSubsystem, Name: "success", - Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 + Buckets: []float64{0.0005, .005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600}, // 0.0005 ~ 1h Help: "Bucketed histogram of wait duration of successful request.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) @@ -49,7 +51,7 @@ var ( Namespace: namespace, Subsystem: requestSubsystem, Name: "limit_reserve_time_failed", - Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 + Buckets: []float64{0.0005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600, 86400}, // 0.0005 ~ 24h Help: "Bucketed histogram of wait duration of failed request.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) @@ -59,7 +61,7 @@ var ( Subsystem: requestSubsystem, Name: "fail", Help: "Counter of failed request.", - }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, errType}) requestRetryCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -73,6 +75,7 @@ var ( prometheus.HistogramOpts{ Namespace: namespace, Subsystem: tokenRequestSubsystem, + Buckets: prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 8s Name: "duration", Help: "Bucketed histogram of latency(s) of token request.", }, []string{"type"}) @@ -84,6 +87,14 @@ var ( Name: "resource_group", Help: "Counter of token request by every resource group.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) + + lowTokenRequestNotifyCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: 
namespace, + Subsystem: tokenRequestSubsystem, + Name: "low_token_notified", + Help: "Counter of low token request.", + }, []string{newResourceGroupNameLabel}) ) var ( @@ -100,4 +111,5 @@ func init() { prometheus.MustRegister(requestRetryCounter) prometheus.MustRegister(tokenRequestDuration) prometheus.MustRegister(resourceGroupTokenRequestCounter) + prometheus.MustRegister(lowTokenRequestNotifyCounter) } From 4820bc5f505824a0212bc54c8a2d92cb5d3251dc Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 29 May 2024 15:26:52 +0800 Subject: [PATCH 18/21] OWNERS: Auto Sync OWNERS files from community membership (#8163) Signed-off-by: Ti Chi Robot Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- OWNERS | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 OWNERS diff --git a/OWNERS b/OWNERS new file mode 100644 index 00000000000..5911dfd3b66 --- /dev/null +++ b/OWNERS @@ -0,0 +1,26 @@ +# See the OWNERS docs at https://go.k8s.io/owners +approvers: + - AndreMouche + - binshi-bing + - bufferflies + - CabinfeverB + - Connor1996 + - disksing + - huachaohuang + - HunDunDM + - HuSharp + - JmPotato + - lhy1024 + - nolouch + - overvenus + - qiuyesuifeng + - rleungx + - siddontang + - Yisaer + - zhouqiang-cl +reviewers: + - BusyJay + - howardlau1999 + - Luffbee + - shafreeck + - xhebox From c498063583dabfbc35a1bb3198fe9224f806d744 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 29 May 2024 17:49:22 +0800 Subject: [PATCH 19/21] tests: test HTTP client initialization with different suites (#8224) ref tikv/pd#7300 Use different suites to test HTTP client initialization instead of maintaining different modes manually. Signed-off-by: JmPotato --- tests/integrations/client/http_client_test.go | 465 ++++++++---------- 1 file changed, 196 insertions(+), 269 deletions(-) diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index 33652da9be0..fa109946e4b 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -41,190 +41,169 @@ import ( "github.com/tikv/pd/tests" ) -type mode int - -// We have two ways to create HTTP client. -// 1. using `NewClient` which created `DefaultPDServiceDiscovery` -// 2. using `NewClientWithServiceDiscovery` which pass a `PDServiceDiscovery` as parameter -// test cases should be run in both modes. -const ( - defaultServiceDiscovery mode = iota - specificServiceDiscovery -) - type httpClientTestSuite struct { suite.Suite - env map[mode]*httpClientTestEnv + // 1. Using `NewClient` will create a `DefaultPDServiceDiscovery` internal. + // 2. Using `NewClientWithServiceDiscovery` will need a `PDServiceDiscovery` to be passed in. 
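+	// withServiceDiscovery selects which of the two construction paths the suite exercises.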
+ withServiceDiscovery bool + ctx context.Context + cancelFunc context.CancelFunc + cluster *tests.TestCluster + endpoints []string + client pd.Client } -type httpClientTestEnv struct { - ctx context.Context - cancelFunc context.CancelFunc - cluster *tests.TestCluster - endpoints []string +func TestHTTPClientTestSuite(t *testing.T) { + suite.Run(t, &httpClientTestSuite{ + withServiceDiscovery: false, + }) } -func TestHTTPClientTestSuite(t *testing.T) { - suite.Run(t, new(httpClientTestSuite)) +func TestHTTPClientTestSuiteWithServiceDiscovery(t *testing.T) { + suite.Run(t, &httpClientTestSuite{ + withServiceDiscovery: true, + }) } func (suite *httpClientTestSuite) SetupSuite() { - suite.env = make(map[mode]*httpClientTestEnv) re := suite.Require() + suite.ctx, suite.cancelFunc = context.WithCancel(context.Background()) - for _, mode := range []mode{defaultServiceDiscovery, specificServiceDiscovery} { - env := &httpClientTestEnv{} - env.ctx, env.cancelFunc = context.WithCancel(context.Background()) + cluster, err := tests.NewTestCluster(suite.ctx, 2) + re.NoError(err) - cluster, err := tests.NewTestCluster(env.ctx, 2) - re.NoError(err) + err = cluster.RunInitialServers() + re.NoError(err) + leader := cluster.WaitLeader() + re.NotEmpty(leader) + leaderServer := cluster.GetLeaderServer() - err = cluster.RunInitialServers() + err = leaderServer.BootstrapCluster() + re.NoError(err) + for _, region := range []*core.RegionInfo{ + core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), + core.NewTestRegionInfo(11, 1, []byte("a2"), []byte("a3")), + } { + err := leaderServer.GetRaftCluster().HandleRegionHeartbeat(region) re.NoError(err) - leader := cluster.WaitLeader() - re.NotEmpty(leader) - leaderServer := cluster.GetLeaderServer() - - err = leaderServer.BootstrapCluster() + } + var ( + testServers = cluster.GetServers() + endpoints = make([]string, 0, len(testServers)) + ) + for _, s := range testServers { + addr := s.GetConfig().AdvertiseClientUrls + url, err := url.Parse(addr) re.NoError(err) - for _, region := range []*core.RegionInfo{ - core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), - core.NewTestRegionInfo(11, 1, []byte("a2"), []byte("a3")), - } { - err := leaderServer.GetRaftCluster().HandleRegionHeartbeat(region) - re.NoError(err) - } - var ( - testServers = cluster.GetServers() - endpoints = make([]string, 0, len(testServers)) - ) - for _, s := range testServers { - addr := s.GetConfig().AdvertiseClientUrls - url, err := url.Parse(addr) - re.NoError(err) - endpoints = append(endpoints, url.Host) - } - env.endpoints = endpoints - env.cluster = cluster - - suite.env[mode] = env + endpoints = append(endpoints, url.Host) } -} - -func (suite *httpClientTestSuite) TearDownSuite() { - for _, env := range suite.env { - env.cancelFunc() - env.cluster.Destroy() + suite.endpoints = endpoints + suite.cluster = cluster + + if suite.withServiceDiscovery { + // Run test with specific service discovery. + cli := setupCli(suite.ctx, re, suite.endpoints) + sd := cli.GetServiceDiscovery() + suite.client = pd.NewClientWithServiceDiscovery("pd-http-client-it-grpc", sd) + } else { + // Run test with default service discovery. + suite.client = pd.NewClient("pd-http-client-it-http", suite.endpoints) } } -// RunTestInTwoModes is to run test in two modes. -func (suite *httpClientTestSuite) RunTestInTwoModes(test func(mode mode, client pd.Client)) { - // Run test with specific service discovery. 
- cli := setupCli(suite.env[specificServiceDiscovery].ctx, suite.Require(), suite.env[specificServiceDiscovery].endpoints) - sd := cli.GetServiceDiscovery() - client := pd.NewClientWithServiceDiscovery("pd-http-client-it-grpc", sd) - test(specificServiceDiscovery, client) - client.Close() - - // Run test with default service discovery. - client = pd.NewClient("pd-http-client-it-http", suite.env[defaultServiceDiscovery].endpoints) - test(defaultServiceDiscovery, client) - client.Close() +func (suite *httpClientTestSuite) TearDownSuite() { + suite.cancelFunc() + suite.client.Close() + suite.cluster.Destroy() } func (suite *httpClientTestSuite) TestMeta() { - suite.RunTestInTwoModes(suite.checkMeta) -} - -func (suite *httpClientTestSuite) checkMeta(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - replicateConfig, err := client.GetReplicateConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + replicateConfig, err := client.GetReplicateConfig(ctx) re.NoError(err) re.Equal(3.0, replicateConfig["max-replicas"]) - region, err := client.GetRegionByID(env.ctx, 10) + region, err := client.GetRegionByID(ctx, 10) re.NoError(err) re.Equal(int64(10), region.ID) re.Equal(core.HexRegionKeyStr([]byte("a1")), region.StartKey) re.Equal(core.HexRegionKeyStr([]byte("a2")), region.EndKey) - region, err = client.GetRegionByKey(env.ctx, []byte("a2")) + region, err = client.GetRegionByKey(ctx, []byte("a2")) re.NoError(err) re.Equal(int64(11), region.ID) re.Equal(core.HexRegionKeyStr([]byte("a2")), region.StartKey) re.Equal(core.HexRegionKeyStr([]byte("a3")), region.EndKey) - regions, err := client.GetRegions(env.ctx) + regions, err := client.GetRegions(ctx) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), -1) + regions, err = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), -1) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetRegionsByStoreID(env.ctx, 1) + regions, err = client.GetRegionsByStoreID(ctx, 1) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetEmptyRegions(env.ctx) + regions, err = client.GetEmptyRegions(ctx) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - state, err := client.GetRegionsReplicatedStateByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3"))) + state, err := client.GetRegionsReplicatedStateByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3"))) re.NoError(err) re.Equal("INPROGRESS", state) - regionStats, err := client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) + regionStats, err := client.GetRegionStatusByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) re.NoError(err) re.Positive(regionStats.Count) re.NotEmpty(regionStats.StoreLeaderCount) - regionStats, err = client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), true) + regionStats, err = client.GetRegionStatusByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), true) re.NoError(err) re.Positive(regionStats.Count) re.Empty(regionStats.StoreLeaderCount) - hotReadRegions, err := client.GetHotReadRegions(env.ctx) + hotReadRegions, err := client.GetHotReadRegions(ctx) re.NoError(err) re.Len(hotReadRegions.AsPeer, 1) re.Len(hotReadRegions.AsLeader, 1) - 
hotWriteRegions, err := client.GetHotWriteRegions(env.ctx) + hotWriteRegions, err := client.GetHotWriteRegions(ctx) re.NoError(err) re.Len(hotWriteRegions.AsPeer, 1) re.Len(hotWriteRegions.AsLeader, 1) - historyHorRegions, err := client.GetHistoryHotRegions(env.ctx, &pd.HistoryHotRegionsRequest{ + historyHorRegions, err := client.GetHistoryHotRegions(ctx, &pd.HistoryHotRegionsRequest{ StartTime: 0, EndTime: time.Now().AddDate(0, 0, 1).UnixNano() / int64(time.Millisecond), }) re.NoError(err) re.Empty(historyHorRegions.HistoryHotRegion) - store, err := client.GetStores(env.ctx) + store, err := client.GetStores(ctx) re.NoError(err) re.Equal(1, store.Count) re.Len(store.Stores, 1) storeID := uint64(store.Stores[0].Store.ID) // TODO: why type is different? - store2, err := client.GetStore(env.ctx, storeID) + store2, err := client.GetStore(ctx, storeID) re.NoError(err) re.EqualValues(storeID, store2.Store.ID) - version, err := client.GetClusterVersion(env.ctx) + version, err := client.GetClusterVersion(ctx) re.NoError(err) re.Equal("0.0.0", version) - rgs, _ := client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) + rgs, _ := client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) re.Equal(int64(0), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) re.Equal(int64(2), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a2"), []byte("b")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a2"), []byte("b")), 100) re.Equal(int64(1), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) re.Equal(int64(2), rgs.Count) } func (suite *httpClientTestSuite) TestGetMinResolvedTSByStoresIDs() { - suite.RunTestInTwoModes(suite.checkGetMinResolvedTSByStoresIDs) -} - -func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() testMinResolvedTS := tsoutil.TimeToTS(time.Now()) - raftCluster := env.cluster.GetLeaderServer().GetRaftCluster() + raftCluster := suite.cluster.GetLeaderServer().GetRaftCluster() err := raftCluster.SetMinResolvedTS(1, testMinResolvedTS) re.NoError(err) // Make sure the min resolved TS is updated. @@ -233,18 +212,18 @@ func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, cl return minResolvedTS == testMinResolvedTS }) // Wait for the cluster-level min resolved TS to be initialized. - minResolvedTS, storeMinResolvedTSMap, err := client.GetMinResolvedTSByStoresIDs(env.ctx, nil) + minResolvedTS, storeMinResolvedTSMap, err := client.GetMinResolvedTSByStoresIDs(ctx, nil) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Empty(storeMinResolvedTSMap) // Get the store-level min resolved TS. - minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(env.ctx, []uint64{1}) + minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(ctx, []uint64{1}) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Len(storeMinResolvedTSMap, 1) re.Equal(minResolvedTS, storeMinResolvedTSMap[1]) // Get the store-level min resolved TS with an invalid store ID. 
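 	// Store 2 does not exist in the cluster, but the response still contains an entry for it.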
- minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(env.ctx, []uint64{1, 2}) + minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(ctx, []uint64{1, 2}) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Len(storeMinResolvedTSMap, 2) @@ -253,22 +232,19 @@ func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, cl } func (suite *httpClientTestSuite) TestRule() { - suite.RunTestInTwoModes(suite.checkRule) -} - -func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - bundles, err := client.GetAllPlacementRuleBundles(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + bundles, err := client.GetAllPlacementRuleBundles(ctx) re.NoError(err) re.Len(bundles, 1) re.Equal(placement.DefaultGroupID, bundles[0].ID) - bundle, err := client.GetPlacementRuleBundleByGroup(env.ctx, placement.DefaultGroupID) + bundle, err := client.GetPlacementRuleBundleByGroup(ctx, placement.DefaultGroupID) re.NoError(err) re.Equal(bundles[0], bundle) // Check if we have the default rule. - checkRuleResult(re, env, client, &pd.Rule{ + suite.checkRuleResult(ctx, re, &pd.Rule{ GroupID: placement.DefaultGroupID, ID: placement.DefaultRuleID, Role: pd.Voter, @@ -277,7 +253,7 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { EndKey: []byte{}, }, 1, true) // Should be the same as the rules in the bundle. - checkRuleResult(re, env, client, bundle.Rules[0], 1, true) + suite.checkRuleResult(ctx, re, bundle.Rules[0], 1, true) testRule := &pd.Rule{ GroupID: placement.DefaultGroupID, ID: "test", @@ -286,39 +262,39 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { StartKey: []byte{}, EndKey: []byte{}, } - err = client.SetPlacementRule(env.ctx, testRule) + err = client.SetPlacementRule(ctx, testRule) re.NoError(err) - checkRuleResult(re, env, client, testRule, 2, true) - err = client.DeletePlacementRule(env.ctx, placement.DefaultGroupID, "test") + suite.checkRuleResult(ctx, re, testRule, 2, true) + err = client.DeletePlacementRule(ctx, placement.DefaultGroupID, "test") re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, false) + suite.checkRuleResult(ctx, re, testRule, 1, false) testRuleOp := &pd.RuleOp{ Rule: testRule, Action: pd.RuleOpAdd, } - err = client.SetPlacementRuleInBatch(env.ctx, []*pd.RuleOp{testRuleOp}) + err = client.SetPlacementRuleInBatch(ctx, []*pd.RuleOp{testRuleOp}) re.NoError(err) - checkRuleResult(re, env, client, testRule, 2, true) + suite.checkRuleResult(ctx, re, testRule, 2, true) testRuleOp = &pd.RuleOp{ Rule: testRule, Action: pd.RuleOpDel, } - err = client.SetPlacementRuleInBatch(env.ctx, []*pd.RuleOp{testRuleOp}) + err = client.SetPlacementRuleInBatch(ctx, []*pd.RuleOp{testRuleOp}) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, false) - err = client.SetPlacementRuleBundles(env.ctx, []*pd.GroupBundle{ + suite.checkRuleResult(ctx, re, testRule, 1, false) + err = client.SetPlacementRuleBundles(ctx, []*pd.GroupBundle{ { ID: placement.DefaultGroupID, Rules: []*pd.Rule{testRule}, }, }, true) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, true) - ruleGroups, err := client.GetAllPlacementRuleGroups(env.ctx) + suite.checkRuleResult(ctx, re, testRule, 1, true) + ruleGroups, err := client.GetAllPlacementRuleGroups(ctx) re.NoError(err) re.Len(ruleGroups, 1) re.Equal(placement.DefaultGroupID, ruleGroups[0].ID) - ruleGroup, 
err := client.GetPlacementRuleGroupByID(env.ctx, placement.DefaultGroupID) + ruleGroup, err := client.GetPlacementRuleGroupByID(ctx, placement.DefaultGroupID) re.NoError(err) re.Equal(ruleGroups[0], ruleGroup) testRuleGroup := &pd.RuleGroup{ @@ -326,14 +302,14 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { Index: 1, Override: true, } - err = client.SetPlacementRuleGroup(env.ctx, testRuleGroup) + err = client.SetPlacementRuleGroup(ctx, testRuleGroup) re.NoError(err) - ruleGroup, err = client.GetPlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + ruleGroup, err = client.GetPlacementRuleGroupByID(ctx, testRuleGroup.ID) re.NoError(err) re.Equal(testRuleGroup, ruleGroup) - err = client.DeletePlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + err = client.DeletePlacementRuleGroupByID(ctx, testRuleGroup.ID) re.NoError(err) - ruleGroup, err = client.GetPlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + ruleGroup, err = client.GetPlacementRuleGroupByID(ctx, testRuleGroup.ID) re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Empty(ruleGroup) // Test the start key and end key. @@ -345,34 +321,33 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { StartKey: []byte("a1"), EndKey: []byte(""), } - err = client.SetPlacementRule(env.ctx, testRule) + err = client.SetPlacementRule(ctx, testRule) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, true) + suite.checkRuleResult(ctx, re, testRule, 1, true) } -func checkRuleResult( - re *require.Assertions, - env *httpClientTestEnv, - client pd.Client, +func (suite *httpClientTestSuite) checkRuleResult( + ctx context.Context, re *require.Assertions, rule *pd.Rule, totalRuleCount int, exist bool, ) { + client := suite.client if exist { - got, err := client.GetPlacementRule(env.ctx, rule.GroupID, rule.ID) + got, err := client.GetPlacementRule(ctx, rule.GroupID, rule.ID) re.NoError(err) // skip comparison of the generated field got.StartKeyHex = rule.StartKeyHex got.EndKeyHex = rule.EndKeyHex re.Equal(rule, got) } else { - _, err := client.GetPlacementRule(env.ctx, rule.GroupID, rule.ID) + _, err := client.GetPlacementRule(ctx, rule.GroupID, rule.ID) re.ErrorContains(err, http.StatusText(http.StatusNotFound)) } // Check through the `GetPlacementRulesByGroup` API. - rules, err := client.GetPlacementRulesByGroup(env.ctx, rule.GroupID) + rules, err := client.GetPlacementRulesByGroup(ctx, rule.GroupID) re.NoError(err) checkRuleFunc(re, rules, rule, totalRuleCount, exist) // Check through the `GetPlacementRuleBundleByGroup` API. 
- bundle, err := client.GetPlacementRuleBundleByGroup(env.ctx, rule.GroupID) + bundle, err := client.GetPlacementRuleBundleByGroup(ctx, rule.GroupID) re.NoError(err) checkRuleFunc(re, bundle.Rules, rule, totalRuleCount, exist) } @@ -400,14 +375,11 @@ func checkRuleFunc( } func (suite *httpClientTestSuite) TestRegionLabel() { - suite.RunTestInTwoModes(suite.checkRegionLabel) -} - -func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - labelRules, err := client.GetAllRegionLabelRules(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + labelRules, err := client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 1) re.Equal("keyspaces/0", labelRules[0].ID) @@ -418,9 +390,9 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) RuleType: "key-range", Data: labeler.MakeKeyRanges("1234", "5678"), } - err = client.SetRegionLabelRule(env.ctx, labelRule) + err = client.SetRegionLabelRule(ctx, labelRule) re.NoError(err) - labelRules, err = client.GetAllRegionLabelRules(env.ctx) + labelRules, err = client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 2) sort.Slice(labelRules, func(i, j int) bool { @@ -440,9 +412,9 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) SetRules: []*pd.LabelRule{labelRule}, DeleteRules: []string{"rule1"}, } - err = client.PatchRegionLabelRules(env.ctx, patch) + err = client.PatchRegionLabelRules(ctx, patch) re.NoError(err) - allLabelRules, err := client.GetAllRegionLabelRules(env.ctx) + allLabelRules, err := client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 2) sort.Slice(allLabelRules, func(i, j int) bool { @@ -451,7 +423,7 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) re.Equal(labelRule.ID, allLabelRules[1].ID) re.Equal(labelRule.Labels, allLabelRules[1].Labels) re.Equal(labelRule.RuleType, allLabelRules[1].RuleType) - labelRules, err = client.GetRegionLabelRulesByIDs(env.ctx, []string{"keyspaces/0", "rule2"}) + labelRules, err = client.GetRegionLabelRulesByIDs(ctx, []string{"keyspaces/0", "rule2"}) re.NoError(err) sort.Slice(labelRules, func(i, j int) bool { return labelRules[i].ID < labelRules[j].ID @@ -460,24 +432,21 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) } func (suite *httpClientTestSuite) TestAccelerateSchedule() { - suite.RunTestInTwoModes(suite.checkAccelerateSchedule) -} - -func (suite *httpClientTestSuite) checkAccelerateSchedule(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - raftCluster := env.cluster.GetLeaderServer().GetRaftCluster() + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + raftCluster := suite.cluster.GetLeaderServer().GetRaftCluster() suspectRegions := raftCluster.GetSuspectRegions() re.Empty(suspectRegions) - err := client.AccelerateSchedule(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a2"))) + err := client.AccelerateSchedule(ctx, pd.NewKeyRange([]byte("a1"), []byte("a2"))) re.NoError(err) suspectRegions = raftCluster.GetSuspectRegions() re.Len(suspectRegions, 1) raftCluster.ClearSuspectRegions() suspectRegions = raftCluster.GetSuspectRegions() re.Empty(suspectRegions) - err = client.AccelerateScheduleInBatch(env.ctx, []*pd.KeyRange{ + err = client.AccelerateScheduleInBatch(ctx, []*pd.KeyRange{ pd.NewKeyRange([]byte("a1"), []byte("a2")), 
pd.NewKeyRange([]byte("a2"), []byte("a3")), }) @@ -487,24 +456,21 @@ func (suite *httpClientTestSuite) checkAccelerateSchedule(mode mode, client pd.C } func (suite *httpClientTestSuite) TestConfig() { - suite.RunTestInTwoModes(suite.checkConfig) -} - -func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - config, err := client.GetConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + config, err := client.GetConfig(ctx) re.NoError(err) re.Equal(float64(4), config["schedule"].(map[string]any)["leader-schedule-limit"]) newConfig := map[string]any{ "schedule.leader-schedule-limit": float64(8), } - err = client.SetConfig(env.ctx, newConfig) + err = client.SetConfig(ctx, newConfig) re.NoError(err) - config, err = client.GetConfig(env.ctx) + config, err = client.GetConfig(ctx) re.NoError(err) re.Equal(float64(8), config["schedule"].(map[string]any)["leader-schedule-limit"]) @@ -512,15 +478,15 @@ func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { newConfig = map[string]any{ "schedule.leader-schedule-limit": float64(16), } - err = client.SetConfig(env.ctx, newConfig, 5) + err = client.SetConfig(ctx, newConfig, 5) re.NoError(err) - resp, err := env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") + resp, err := suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") re.NoError(err) re.Equal([]byte("16"), resp.Kvs[0].Value) // delete the config with TTL. - err = client.SetConfig(env.ctx, newConfig, 0) + err = client.SetConfig(ctx, newConfig, 0) re.NoError(err) - resp, err = env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") + resp, err = suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") re.NoError(err) re.Empty(resp.Kvs) @@ -528,81 +494,72 @@ func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { newConfig = map[string]any{ "schedule.max-pending-peer-count": uint64(math.MaxInt32), } - err = client.SetConfig(env.ctx, newConfig, 4) + err = client.SetConfig(ctx, newConfig, 4) re.NoError(err) - c := env.cluster.GetLeaderServer().GetRaftCluster().GetOpts().GetMaxPendingPeerCount() + c := suite.cluster.GetLeaderServer().GetRaftCluster().GetOpts().GetMaxPendingPeerCount() re.Equal(uint64(math.MaxInt32), c) - err = client.SetConfig(env.ctx, newConfig, 0) + err = client.SetConfig(ctx, newConfig, 0) re.NoError(err) - resp, err = env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.max-pending-peer-count") + resp, err = suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.max-pending-peer-count") re.NoError(err) re.Empty(resp.Kvs) } func (suite *httpClientTestSuite) TestScheduleConfig() { - suite.RunTestInTwoModes(suite.checkScheduleConfig) -} - -func (suite *httpClientTestSuite) checkScheduleConfig(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - config, err := client.GetScheduleConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + config, err := client.GetScheduleConfig(ctx) re.NoError(err) re.Equal(float64(4), config["hot-region-schedule-limit"]) re.Equal(float64(2048), config["region-schedule-limit"]) config["hot-region-schedule-limit"] = float64(8) - err = client.SetScheduleConfig(env.ctx, config) + err = client.SetScheduleConfig(ctx, config) 
re.NoError(err) - config, err = client.GetScheduleConfig(env.ctx) + config, err = client.GetScheduleConfig(ctx) re.NoError(err) re.Equal(float64(8), config["hot-region-schedule-limit"]) re.Equal(float64(2048), config["region-schedule-limit"]) } func (suite *httpClientTestSuite) TestSchedulers() { - suite.RunTestInTwoModes(suite.checkSchedulers) -} - -func (suite *httpClientTestSuite) checkSchedulers(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - schedulers, err := client.GetSchedulers(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + schedulers, err := client.GetSchedulers(ctx) re.NoError(err) re.Empty(schedulers) - err = client.CreateScheduler(env.ctx, "evict-leader-scheduler", 1) + err = client.CreateScheduler(ctx, "evict-leader-scheduler", 1) re.NoError(err) - schedulers, err = client.GetSchedulers(env.ctx) + schedulers, err = client.GetSchedulers(ctx) re.NoError(err) re.Len(schedulers, 1) - err = client.SetSchedulerDelay(env.ctx, "evict-leader-scheduler", 100) + err = client.SetSchedulerDelay(ctx, "evict-leader-scheduler", 100) re.NoError(err) - err = client.SetSchedulerDelay(env.ctx, "not-exist", 100) + err = client.SetSchedulerDelay(ctx, "not-exist", 100) re.ErrorContains(err, "500 Internal Server Error") // TODO: should return friendly error message } func (suite *httpClientTestSuite) TestSetStoreLabels() { - suite.RunTestInTwoModes(suite.checkSetStoreLabels) -} - -func (suite *httpClientTestSuite) checkSetStoreLabels(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - resp, err := client.GetStores(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + resp, err := client.GetStores(ctx) re.NoError(err) setStore := resp.Stores[0] re.Empty(setStore.Store.Labels, nil) storeLabels := map[string]string{ "zone": "zone1", } - err = client.SetStoreLabels(env.ctx, 1, storeLabels) + err = client.SetStoreLabels(ctx, 1, storeLabels) re.NoError(err) - resp, err = client.GetStores(env.ctx) + resp, err = client.GetStores(ctx) re.NoError(err) for _, store := range resp.Stores { if store.Store.ID == setStore.Store.ID { @@ -614,67 +571,52 @@ func (suite *httpClientTestSuite) checkSetStoreLabels(mode mode, client pd.Clien } func (suite *httpClientTestSuite) TestTransferLeader() { - suite.RunTestInTwoModes(suite.checkTransferLeader) -} - -func (suite *httpClientTestSuite) checkTransferLeader(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - members, err := client.GetMembers(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + members, err := client.GetMembers(ctx) re.NoError(err) re.Len(members.Members, 2) - leader, err := client.GetLeader(env.ctx) + leader, err := client.GetLeader(ctx) re.NoError(err) // Transfer leader to another pd for _, member := range members.Members { if member.GetName() != leader.GetName() { - err = client.TransferLeader(env.ctx, member.GetName()) + err = client.TransferLeader(ctx, member.GetName()) re.NoError(err) break } } - newLeader := env.cluster.WaitLeader() + newLeader := suite.cluster.WaitLeader() re.NotEmpty(newLeader) re.NoError(err) re.NotEqual(leader.GetName(), newLeader) // Force to update the members info. 
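 	// Poll until the client-side member info catches up and GetLeader reports the new leader.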
testutil.Eventually(re, func() bool { - leader, err = client.GetLeader(env.ctx) + leader, err = client.GetLeader(ctx) re.NoError(err) return newLeader == leader.GetName() }) - members, err = client.GetMembers(env.ctx) + members, err = client.GetMembers(ctx) re.NoError(err) re.Len(members.Members, 2) re.Equal(leader.GetName(), members.Leader.GetName()) } func (suite *httpClientTestSuite) TestVersion() { - suite.RunTestInTwoModes(suite.checkVersion) -} - -func (suite *httpClientTestSuite) checkVersion(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - ver, err := client.GetPDVersion(env.ctx) + ver, err := suite.client.GetPDVersion(suite.ctx) re.NoError(err) re.Equal(versioninfo.PDReleaseVersion, ver) } func (suite *httpClientTestSuite) TestStatus() { - suite.RunTestInTwoModes(suite.checkStatus) -} - -func (suite *httpClientTestSuite) checkStatus(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - status, err := client.GetStatus(env.ctx) + status, err := suite.client.GetStatus(suite.ctx) re.NoError(err) re.Equal(versioninfo.PDReleaseVersion, status.Version) re.Equal(versioninfo.PDGitHash, status.GitHash) @@ -683,48 +625,41 @@ func (suite *httpClientTestSuite) checkStatus(mode mode, client pd.Client) { } func (suite *httpClientTestSuite) TestAdmin() { - suite.RunTestInTwoModes(suite.checkAdmin) -} - -func (suite *httpClientTestSuite) checkAdmin(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - err := client.SetSnapshotRecoveringMark(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + err := client.SetSnapshotRecoveringMark(ctx) re.NoError(err) - err = client.ResetTS(env.ctx, 123, true) + err = client.ResetTS(ctx, 123, true) re.NoError(err) - err = client.ResetBaseAllocID(env.ctx, 456) + err = client.ResetBaseAllocID(ctx, 456) re.NoError(err) - err = client.DeleteSnapshotRecoveringMark(env.ctx) + err = client.DeleteSnapshotRecoveringMark(ctx) re.NoError(err) } func (suite *httpClientTestSuite) TestWithBackoffer() { - suite.RunTestInTwoModes(suite.checkWithBackoffer) -} - -func (suite *httpClientTestSuite) checkWithBackoffer(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() // Should return with 404 error without backoffer. - rule, err := client.GetPlacementRule(env.ctx, "non-exist-group", "non-exist-rule") + rule, err := client.GetPlacementRule(ctx, "non-exist-group", "non-exist-rule") re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Nil(rule) // Should return with 404 error even with an infinite backoffer. rule, err = client. WithBackoffer(retry.InitialBackoffer(100*time.Millisecond, time.Second, 0)). 
- GetPlacementRule(env.ctx, "non-exist-group", "non-exist-rule") + GetPlacementRule(ctx, "non-exist-group", "non-exist-rule") re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Nil(rule) } func (suite *httpClientTestSuite) TestRedirectWithMetrics() { re := suite.Require() - env := suite.env[defaultServiceDiscovery] - cli := setupCli(env.ctx, suite.Require(), env.endpoints) + cli := setupCli(suite.ctx, re, suite.endpoints) defer cli.Close() sd := cli.GetServiceDiscovery() @@ -785,12 +720,10 @@ func (suite *httpClientTestSuite) TestRedirectWithMetrics() { } func (suite *httpClientTestSuite) TestUpdateKeyspaceGCManagementType() { - suite.RunTestInTwoModes(suite.checkUpdateKeyspaceGCManagementType) -} - -func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() keyspaceName := "DEFAULT" expectGCManagementType := "keyspace_level_gc" @@ -800,10 +733,10 @@ func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, GCManagementType: expectGCManagementType, }, } - err := client.UpdateKeyspaceGCManagementType(env.ctx, keyspaceName, &keyspaceSafePointVersionConfig) + err := client.UpdateKeyspaceGCManagementType(ctx, keyspaceName, &keyspaceSafePointVersionConfig) re.NoError(err) - keyspaceMetaRes, err := client.GetKeyspaceMetaByName(env.ctx, keyspaceName) + keyspaceMetaRes, err := client.GetKeyspaceMetaByName(ctx, keyspaceName) re.NoError(err) val, ok := keyspaceMetaRes.Config["gc_management_type"] @@ -813,14 +746,8 @@ func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, } func (suite *httpClientTestSuite) TestGetHealthStatus() { - suite.RunTestInTwoModes(suite.checkGetHealthStatus) -} - -func (suite *httpClientTestSuite) checkGetHealthStatus(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - healths, err := client.GetHealthStatus(env.ctx) + healths, err := suite.client.GetHealthStatus(suite.ctx) re.NoError(err) re.Len(healths, 2) sort.Slice(healths, func(i, j int) bool { From 52389b04f21726b54117ee29acf62923480ccbde Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 30 May 2024 15:07:21 +0800 Subject: [PATCH 20/21] simulator: make store,region,replica configurable in cases (#8215) ref tikv/pd#8135 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/utils/configutil/configutil.go | 7 +++ tools/pd-simulator/main.go | 34 +++++------ .../pd-simulator/simulator/cases/add_nodes.go | 49 +++++++-------- .../simulator/cases/add_nodes_dynamic.go | 60 +++++++++---------- .../simulator/cases/balance_leader.go | 42 +++++++------ .../simulator/cases/balance_region.go | 45 +++++++------- tools/pd-simulator/simulator/cases/cases.go | 42 +++++-------- .../simulator/cases/delete_nodes.go | 55 +++++++++-------- .../cases/diagnose_label_isolation.go | 7 ++- .../simulator/cases/diagnose_rule.go | 5 +- .../pd-simulator/simulator/cases/hot_read.go | 32 +++++----- .../pd-simulator/simulator/cases/hot_write.go | 33 +++++----- .../simulator/cases/import_data.go | 33 +++++----- .../simulator/cases/makeup_down_replica.go | 55 +++++++---------- .../simulator/cases/region_merge.go | 41 ++++++------- .../simulator/cases/region_split.go | 25 ++++---- tools/pd-simulator/simulator/client.go | 5 +- .../simulator/{ => config}/config.go | 23 ++++--- tools/pd-simulator/simulator/conn.go | 3 +- 
tools/pd-simulator/simulator/drive.go | 11 ++-- tools/pd-simulator/simulator/node.go | 5 +- tools/pd-simulator/simulator/raft.go | 5 +- .../simulator/simutil/case_config.go | 34 ----------- tools/pd-simulator/simulator/task.go | 2 +- 24 files changed, 303 insertions(+), 350 deletions(-) rename tools/pd-simulator/simulator/{ => config}/config.go (85%) delete mode 100644 tools/pd-simulator/simulator/simutil/case_config.go diff --git a/pkg/utils/configutil/configutil.go b/pkg/utils/configutil/configutil.go index 2e7c74d9f8c..086f74ff842 100644 --- a/pkg/utils/configutil/configutil.go +++ b/pkg/utils/configutil/configutil.go @@ -171,3 +171,10 @@ func AdjustPath(p *string) { *p = absPath } } + +// AdjustBool adjusts the value of a bool variable. +func AdjustBool(v *bool, defValue bool) { + if !*v { + *v = defValue + } +} diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 73f4a0bba12..04de914f5f0 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -38,21 +38,19 @@ import ( "github.com/tikv/pd/tools/pd-analysis/analysis" "github.com/tikv/pd/tools/pd-simulator/simulator" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) var ( - pdAddr = flag.String("pd-endpoints", "", "pd address") - configFile = flag.String("config", "conf/simconfig.toml", "config file") - caseName = flag.String("case", "", "case name") - serverLogLevel = flag.String("serverLog", "info", "pd server log level") - simLogLevel = flag.String("simLog", "info", "simulator log level") - simLogFile = flag.String("log-file", "", "simulator log file") - regionNum = flag.Int("regionNum", 0, "regionNum of one store") - storeNum = flag.Int("storeNum", 0, "storeNum") - enableTransferRegionCounter = flag.Bool("enableTransferRegionCounter", false, "enableTransferRegionCounter") - statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") + pdAddr = flag.String("pd-endpoints", "", "pd address") + configFile = flag.String("config", "conf/simconfig.toml", "config file") + caseName = flag.String("case", "", "case name") + serverLogLevel = flag.String("serverLog", "info", "pd server log level") + simLogLevel = flag.String("simLog", "info", "simulator log level") + simLogFile = flag.String("log-file", "", "simulator log file") + statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") ) func main() { @@ -63,14 +61,12 @@ func main() { flag.Parse() simutil.InitLogger(*simLogLevel, *simLogFile) - simutil.InitCaseConfig(*storeNum, *regionNum, *enableTransferRegionCounter) statistics.Denoising = false - if simutil.CaseConfigure.EnableTransferRegionCounter { - analysis.GetTransferCounter().Init(simutil.CaseConfigure.StoreNum, simutil.CaseConfigure.RegionNum) - } - schedulers.Register() // register schedulers, which is needed by simConfig.Adjust - simConfig := simulator.NewSimConfig(*serverLogLevel) + simConfig := sc.NewSimConfig(*serverLogLevel) + if simConfig.EnableTransferRegionCounter { + analysis.GetTransferCounter().Init(simConfig.TotalStore, simConfig.TotalRegion) + } var meta toml.MetaData var err error if *configFile != "" { @@ -97,7 +93,7 @@ func main() { } } -func run(simCase string, simConfig *simulator.SimConfig) { +func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { go runHTTPServer() simStart(*pdAddr, simCase, simConfig) @@ -136,7 +132,7 @@ func runHTTPServer() { } // NewSingleServer creates a 
pd server for simulator. -func NewSingleServer(ctx context.Context, simConfig *simulator.SimConfig) (*server.Server, testutil.CleanupFunc) { +func NewSingleServer(ctx context.Context, simConfig *sc.SimConfig) (*server.Server, testutil.CleanupFunc) { err := logutil.SetupLogger(simConfig.ServerConfig.Log, &simConfig.ServerConfig.Logger, &simConfig.ServerConfig.LogProps) if err == nil { log.ReplaceGlobals(simConfig.ServerConfig.Logger, simConfig.ServerConfig.LogProps) @@ -161,7 +157,7 @@ func cleanServer(cfg *config.Config) { os.RemoveAll(cfg.DataDir) } -func simStart(pdAddr string, simCase string, simConfig *simulator.SimConfig, clean ...testutil.CleanupFunc) { +func simStart(pdAddr string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { start := time.Now() driver, err := simulator.NewDriver(pdAddr, simCase, simConfig) if err != nil { diff --git a/tools/pd-simulator/simulator/cases/add_nodes.go b/tools/pd-simulator/simulator/cases/add_nodes.go index 241b34a9473..5c73fe9764c 100644 --- a/tools/pd-simulator/simulator/cases/add_nodes.go +++ b/tools/pd-simulator/simulator/cases/add_nodes.go @@ -15,35 +15,35 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newAddNodes() *Case { +func newAddNodes(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%noEmptyStoreNum + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -54,21 +54,18 @@ func newAddNodes() *Case { }) } - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) + peerCount := regions.GetStoreRegionCount(uint64(i)) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } + if !isUniform(peerCount, totalRegion*replica/totalStore) { 
+ return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go index 59b0b54e1ca..aa585b48923 100644 --- a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go +++ b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go @@ -15,24 +15,22 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newAddNodesDynamic() *Case { +func newAddNodesDynamic(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - for i := 1; i <= int(noEmptyStoreNum); i++ { + for i := 0; i < noEmptyStoreNum; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, @@ -40,15 +38,17 @@ func newAddNodesDynamic() *Case { } var ids []uint64 - for i := 1; i <= storeNum-int(noEmptyStoreNum); i++ { + for i := 0; i < totalStore-noEmptyStoreNum; i++ { ids = append(ids, IDAllocator.nextID()) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%noEmptyStoreNum + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -59,11 +59,11 @@ func newAddNodesDynamic() *Case { }) } - numNodes := int(noEmptyStoreNum) + currentStoreCount := noEmptyStoreNum e := &AddNodesDescriptor{} e.Step = func(tick int64) uint64 { - if tick%100 == 0 && numNodes < storeNum { - numNodes++ + if tick%100 == 0 && currentStoreCount < totalStore { + currentStoreCount++ nodeID := ids[0] ids = append(ids[:0], ids[1:]...) 
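 			// The consumed ID has been dropped from the pending list; hand it back as the node to add.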
return nodeID @@ -72,21 +72,21 @@ func newAddNodesDynamic() *Case { } simCase.Events = []EventDescriptor{e} - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == storeNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) - for i := 1; i <= numNodes; i++ { + if currentStoreCount != totalStore { + return false + } + for i := 1; i <= currentStoreCount; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) + peerCount := regions.GetStoreRegionCount(uint64(i)) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } + if !isUniform(peerCount, totalRegion*replica/totalStore) { + return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/balance_leader.go b/tools/pd-simulator/simulator/cases/balance_leader.go index bbc7ce97f68..c5315f85d8e 100644 --- a/tools/pd-simulator/simulator/cases/balance_leader.go +++ b/tools/pd-simulator/simulator/cases/balance_leader.go @@ -18,28 +18,35 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newBalanceLeader() *Case { +func newBalanceLeader(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeNum)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%(storeNum-1)) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%(storeNum-1)) + 1}, + leaderStoreID := simCase.Stores[totalStore-1].ID + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: leaderStoreID, + }) + for j := 1; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%(totalStore-1) + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -50,17 +57,14 @@ func newBalanceLeader() *Case { }) } - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - res = res && isUniform(leaderCount, regionNum/3, threshold) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts)) - return res + return 
true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/balance_region.go b/tools/pd-simulator/simulator/cases/balance_region.go index 3b0c46f1670..a559a335c97 100644 --- a/tools/pd-simulator/simulator/cases/balance_region.go +++ b/tools/pd-simulator/simulator/cases/balance_region.go @@ -19,21 +19,18 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRedundantBalanceRegion() *Case { +func newRedundantBalanceRegion(config *sc.SimConfig) *Case { var simCase Case - storeNum := simutil.CaseConfigure.StoreNum - regionNum := simutil.CaseConfigure.RegionNum - if storeNum == 0 || regionNum == 0 { - storeNum, regionNum = 6, 4000 - } + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - for i := 0; i < storeNum; i++ { + for i := 0; i < totalStore; i++ { s := &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, @@ -44,11 +41,13 @@ func newRedundantBalanceRegion() *Case { simCase.Stores = append(simCase.Stores, s) } - for i := 0; i < regionNum; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -57,30 +56,26 @@ func newRedundantBalanceRegion() *Case { }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) + storesLastUpdateTime := make([]int64, totalStore+1) + storeLastAvailable := make([]uint64, totalStore+1) simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { - res := true curTime := time.Now().Unix() - storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { available := stats[i].GetAvailable() - storesAvailable = append(storesAvailable, available) if curTime-storesLastUpdateTime[i] > 60 { if storeLastAvailable[i] != available { - res = false + return false } if stats[i].ToCompactionSize != 0 { - res = false + return false } storesLastUpdateTime[i] = curTime storeLastAvailable[i] = available } else { - res = false + return false } } - simutil.Logger.Info("current counts", zap.Uint64s("storesAvailable", storesAvailable)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index 0a8967a8d86..f2e79a81924 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -15,12 +15,14 @@ package cases import ( + "math/rand" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) // Store is used to simulate tikv. 
@@ -86,7 +88,7 @@ func (a *idAllocator) GetID() uint64 { var IDAllocator idAllocator // CaseMap is a mapping of the cases to the their corresponding initialize functions. -var CaseMap = map[string]func() *Case{ +var CaseMap = map[string]func(*config.SimConfig) *Case{ "balance-leader": newBalanceLeader, "redundant-balance-region": newRedundantBalanceRegion, "add-nodes": newAddNodes, @@ -106,43 +108,27 @@ var CaseMap = map[string]func() *Case{ } // NewCase creates a new case. -func NewCase(name string) *Case { +func NewCase(name string, simConfig *config.SimConfig) *Case { if f, ok := CaseMap[name]; ok { - return f() + return f(simConfig) } return nil } -func leaderAndRegionIsUniform(leaderCount, regionCount, regionNum int, threshold float64) bool { - return isUniform(leaderCount, regionNum/3, threshold) && isUniform(regionCount, regionNum, threshold) -} - -func isUniform(count, meanCount int, threshold float64) bool { +func isUniform(count, meanCount int) bool { + threshold := 0.05 maxCount := int((1.0 + threshold) * float64(meanCount)) minCount := int((1.0 - threshold) * float64(meanCount)) return minCount <= count && count <= maxCount } -func getStoreNum() int { - storeNum := simutil.CaseConfigure.StoreNum - if storeNum < 3 { - simutil.Logger.Fatal("store num should be larger than or equal to 3") - } - return storeNum -} - -func getRegionNum() int { - regionNum := simutil.CaseConfigure.RegionNum - if regionNum <= 0 { - simutil.Logger.Fatal("region num should be larger than 0") +func getNoEmptyStoreNum(storeNum int, replica int) int { + noEmptyStoreNum := rand.Intn(storeNum) + if noEmptyStoreNum < replica { + return replica } - return regionNum -} - -func getNoEmptyStoreNum(storeNum int, noEmptyRatio float64) uint64 { - noEmptyStoreNum := uint64(float64(storeNum) * noEmptyRatio) - if noEmptyStoreNum < 3 || noEmptyStoreNum == uint64(storeNum) { - noEmptyStoreNum = 3 + if noEmptyStoreNum == storeNum { + return storeNum - 1 } return noEmptyStoreNum } diff --git a/tools/pd-simulator/simulator/cases/delete_nodes.go b/tools/pd-simulator/simulator/cases/delete_nodes.go index 4ba8e5064a4..80650cf109d 100644 --- a/tools/pd-simulator/simulator/cases/delete_nodes.go +++ b/tools/pd-simulator/simulator/cases/delete_nodes.go @@ -20,28 +20,31 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newDeleteNodes() *Case { +func newDeleteNodes(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyStoreNum := storeNum - 1 - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := totalStore - 1 + for i := 1; i <= totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: 
IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -57,12 +60,12 @@ func newDeleteNodes() *Case { ids = append(ids, store.ID) } - numNodes := storeNum + currentStoreCount := totalStore e := &DeleteNodesDescriptor{} e.Step = func(tick int64) uint64 { - if numNodes > noEmptyStoreNum && tick%100 == 0 { - idx := rand.Intn(numNodes) - numNodes-- + if currentStoreCount > noEmptyStoreNum && tick%100 == 0 { + idx := rand.Intn(currentStoreCount) + currentStoreCount-- nodeID := ids[idx] ids = append(ids[:idx], ids[idx+1:]...) return nodeID @@ -71,21 +74,21 @@ func newDeleteNodes() *Case { } simCase.Events = []EventDescriptor{e} - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == noEmptyStoreNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) + if currentStoreCount != noEmptyStoreNum { + return false + } for _, i := range ids { leaderCount := regions.GetStoreLeaderCount(i) - regionCount := regions.GetStoreRegionCount(i) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum*storeNum/noEmptyStoreNum, threshold) + peerCount := regions.GetStoreRegionCount(i) + if !isUniform(leaderCount, totalRegion/noEmptyStoreNum) { + return false + } + if !isUniform(peerCount, totalRegion*replica/noEmptyStoreNum) { + return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go index 7fa50e56197..09037136608 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go +++ b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go @@ -21,12 +21,13 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newLabelNotMatch1() *Case { +func newLabelNotMatch1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -88,7 +89,7 @@ func newLabelNotMatch1() *Case { return &simCase } -func newLabelIsolation1() *Case { +func newLabelIsolation1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -154,7 +155,7 @@ func newLabelIsolation1() *Case { return &simCase } -func newLabelIsolation2() *Case { +func newLabelIsolation2(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"dc", "zone", "host"} diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 15c5942d810..5d34e051071 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -21,12 +21,13 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newRule1() *Case { +func 
newRule1(_ *sc.SimConfig) *Case { var simCase Case simCase.Rules = make([]*placement.Rule, 0) @@ -126,7 +127,7 @@ func newRule1() *Case { return &simCase } -func newRule2() *Case { +func newRule2(_ *sc.SimConfig) *Case { var simCase Case simCase.Rules = make([]*placement.Rule, 0) diff --git a/tools/pd-simulator/simulator/cases/hot_read.go b/tools/pd-simulator/simulator/cases/hot_read.go index d4ec6831d95..50ad08d6011 100644 --- a/tools/pd-simulator/simulator/cases/hot_read.go +++ b/tools/pd-simulator/simulator/cases/hot_read.go @@ -15,35 +15,34 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotRead() *Case { +func newHotRead(config *sc.SimConfig) *Case { var simCase Case - - storeNum, regionNum := getStoreNum(), getRegionNum() + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -56,7 +55,7 @@ func newHotRead() *Case { // Events description // select regions on store 1 as hot read regions. - selectRegionNum := 4 * storeNum + selectRegionNum := 4 * totalStore readFlow := make(map[uint64]int64, selectRegionNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -73,12 +72,11 @@ func newHotRead() *Case { simCase.Events = []EventDescriptor{e} // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) for id := range readFlow { leaderStore := regions.GetRegion(id).GetLeader().GetStoreId() leaderCount[int(leaderStore-1)]++ } - simutil.Logger.Info("current hot region counts", zap.Reflect("hot-region", leaderCount)) // check count diff < 2. 
var min, max int diff --git a/tools/pd-simulator/simulator/cases/hot_write.go b/tools/pd-simulator/simulator/cases/hot_write.go index 8428afa75b5..a30afd1a8ec 100644 --- a/tools/pd-simulator/simulator/cases/hot_write.go +++ b/tools/pd-simulator/simulator/cases/hot_write.go @@ -15,34 +15,34 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotWrite() *Case { +func newHotWrite(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - storeNum, regionNum := getStoreNum(), getRegionNum() // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -55,7 +55,7 @@ func newHotWrite() *Case { // Events description // select regions on store 1 as hot write regions. - selectStoreNum := storeNum + selectStoreNum := totalStore writeFlow := make(map[uint64]int64, selectStoreNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -74,8 +74,8 @@ func newHotWrite() *Case { // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) - peerCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) + peerCount := make([]int, totalStore) for id := range writeFlow { region := regions.GetRegion(id) leaderCount[int(region.GetLeader().GetStoreId()-1)]++ @@ -83,7 +83,6 @@ func newHotWrite() *Case { peerCount[int(p.GetStoreId()-1)]++ } } - simutil.Logger.Info("current hot region counts", zap.Reflect("leader", leaderCount), zap.Reflect("peer", peerCount)) // check count diff <= 2. 
var minLeader, maxLeader, minPeer, maxPeer int diff --git a/tools/pd-simulator/simulator/cases/import_data.go b/tools/pd-simulator/simulator/cases/import_data.go index 6cf3b79a736..b9f448a6cf6 100644 --- a/tools/pd-simulator/simulator/cases/import_data.go +++ b/tools/pd-simulator/simulator/cases/import_data.go @@ -17,7 +17,6 @@ package cases import ( "bytes" "fmt" - "math/rand" "os" "github.com/docker/go-units" @@ -26,27 +25,33 @@ import ( "github.com/pingcap/log" "github.com/tikv/pd/pkg/codec" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newImportData() *Case { +func newImportData(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + // Initialize the cluster - for i := 1; i <= 10; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < getRegionNum(); i++ { - storeIDs := rand.Perm(10) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -65,7 +70,7 @@ func newImportData() *Case { table12 := string(codec.EncodeBytes(codec.GenerateTableKey(12))) table13 := string(codec.EncodeBytes(codec.GenerateTableKey(13))) e.Step = func(tick int64) map[string]int64 { - if tick > int64(getRegionNum())/10 { + if tick > int64(totalRegion)/10 { return nil } return map[string]int64{ @@ -141,14 +146,14 @@ func newImportData() *Case { if dev > 0.02 { simutil.Logger.Warn("Not balanced, change scheduler or store limit", zap.Float64("dev score", dev)) } - if checkCount > uint64(getRegionNum())/5 { + if checkCount > uint64(totalRegion)/5 { isEnd = true - } else if checkCount > uint64(getRegionNum())/10 { + } else if checkCount > uint64(totalRegion)/10 { isEnd = dev < 0.01 } if isEnd { - renderPlot("new_region.html", newRegionCount, int(checkCount), 0, getRegionNum()/10) - renderPlot("all_region.html", allRegionCount, int(checkCount), 28*getRegionNum()/100, getRegionNum()/3) + renderPlot("new_region.html", newRegionCount, int(checkCount), 0, totalRegion/10) + renderPlot("all_region.html", allRegionCount, int(checkCount), 28*totalRegion/100, totalRegion/3) } return isEnd } diff --git a/tools/pd-simulator/simulator/cases/makeup_down_replica.go b/tools/pd-simulator/simulator/cases/makeup_down_replica.go index 86c9b4cac1d..28de9577cfc 100644 --- a/tools/pd-simulator/simulator/cases/makeup_down_replica.go +++ b/tools/pd-simulator/simulator/cases/makeup_down_replica.go @@ -18,27 +18,31 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newMakeupDownReplicas() *Case { +func 
newMakeupDownReplicas(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyStoreNum := storeNum - 1 - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + + noEmptyStoreNum := totalStore - 1 + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64((i)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -49,7 +53,7 @@ func newMakeupDownReplicas() *Case { }) } - numNodes := storeNum + numNodes := totalStore down := false e := &DeleteNodesDescriptor{} e.Step = func(tick int64) uint64 { @@ -65,31 +69,16 @@ func newMakeupDownReplicas() *Case { simCase.Events = []EventDescriptor{e} simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - sum := 0 - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - sum += regionCount - } - simutil.Logger.Info("current region counts", zap.Ints("region", regionCounts)) - - if down && sum < storeNum*regionNum { - // only need to print once - down = false - simutil.Logger.Error("making up replicas don't start immediately") + if !down { return false } - - res := true - threshold := 0.05 - for index, regionCount := range regionCounts { - if index == 0 { // storeId == 1 - continue + for i := 1; i <= totalStore; i++ { + peerCount := regions.GetStoreRegionCount(uint64(i)) + if isUniform(peerCount, replica*totalRegion/noEmptyStoreNum) { + return false } - res = res && isUniform(regionCount, storeNum*regionNum/noEmptyStoreNum, threshold) } - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/region_merge.go b/tools/pd-simulator/simulator/cases/region_merge.go index 3d5d57f804f..953b0e309e1 100644 --- a/tools/pd-simulator/simulator/cases/region_merge.go +++ b/tools/pd-simulator/simulator/cases/region_merge.go @@ -15,33 +15,33 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRegionMerge() *Case { +func newRegionMerge(config *sc.SimConfig) *Case { var simCase Case - // Initialize the cluster - storeNum, regionNum := getStoreNum(), getRegionNum() - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := 
rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -52,18 +52,13 @@ func newRegionMerge() *Case { }) } // Checker description - threshold := 0.05 mergeRatio := 4 // when max-merge-region-size is 20, per region will reach 40MB simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - sum := 0 - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - sum += regionCount + currentPeerCount := 0 + for i := 1; i <= totalStore; i++ { + currentPeerCount += regions.GetStoreRegionCount(uint64(i)) } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts), zap.Int64("average region size", regions.GetAverageRegionSize())) - return isUniform(sum, storeNum*regionNum/mergeRatio, threshold) + return isUniform(currentPeerCount, totalRegion*replica/mergeRatio) } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/region_split.go b/tools/pd-simulator/simulator/cases/region_split.go index b85cd319494..7b712f4dc48 100644 --- a/tools/pd-simulator/simulator/cases/region_split.go +++ b/tools/pd-simulator/simulator/cases/region_split.go @@ -18,16 +18,15 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRegionSplit() *Case { +func newRegionSplit(config *sc.SimConfig) *Case { var simCase Case - // Initialize the cluster - storeNum := getStoreNum() - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: uint64(i), Status: metapb.StoreState_Up, @@ -57,15 +56,13 @@ func newRegionSplit() *Case { // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - res = res && regionCount > 5 + for i := 1; i <= totalStore; i++ { + peerCount := regions.GetStoreRegionCount(uint64(i)) + if peerCount < 5 { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 808c991e97f..50ed57995df 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -30,6 +30,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" "google.golang.org/grpc" @@ -45,7 
+46,7 @@ type Client interface { PutStore(ctx context.Context, store *metapb.Store) error StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error RegionHeartbeat(ctx context.Context, region *core.RegionInfo) error - PutPDConfig(*PDConfig) error + PutPDConfig(*sc.PDConfig) error Close() } @@ -316,7 +317,7 @@ func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { return nil } -func (c *client) PutPDConfig(config *PDConfig) error { +func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { path := fmt.Sprintf("%s/%s/config/rules/batch", c.url, httpPrefix) ruleOps := make([]*placement.RuleOp, 0) diff --git a/tools/pd-simulator/simulator/config.go b/tools/pd-simulator/simulator/config/config.go similarity index 85% rename from tools/pd-simulator/simulator/config.go rename to tools/pd-simulator/simulator/config/config.go index 4f197fb83c2..01bf8199ab4 100644 --- a/tools/pd-simulator/simulator/config.go +++ b/tools/pd-simulator/simulator/config/config.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package simulator +package config import ( "fmt" @@ -31,8 +31,11 @@ import ( ) const ( - // tick - defaultSimTickInterval = 100 * time.Millisecond + // simulator + defaultSimTickInterval = 100 * time.Millisecond + defaultTotalStore = 3 + defaultTotalRegion = 1000 + defaultEnableTransferRegionCounter = false // store defaultStoreIOMBPerSecond = 40 defaultStoreHeartbeat = 10 * time.Second @@ -53,9 +56,12 @@ const ( // SimConfig is the simulator configuration. type SimConfig struct { - // tick - CaseName string `toml:"case-name"` - SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` + // Simulator + CaseName string `toml:"case-name"` + TotalStore int `toml:"total-store"` + TotalRegion int `toml:"total-region"` + EnableTransferRegionCounter bool `toml:"enable-transfer-region-counter"` + SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` // store StoreIOMBPerSecond int64 `toml:"store-io-per-second"` StoreVersion string `toml:"store-version"` @@ -99,6 +105,9 @@ func NewSimConfig(serverLogLevel string) *SimConfig { // Adjust is used to adjust configurations func (sc *SimConfig) Adjust(meta *toml.MetaData) error { configutil.AdjustDuration(&sc.SimTickInterval, defaultSimTickInterval) + configutil.AdjustInt(&sc.TotalStore, defaultTotalStore) + configutil.AdjustInt(&sc.TotalRegion, defaultTotalRegion) + configutil.AdjustBool(&sc.EnableTransferRegionCounter, defaultEnableTransferRegionCounter) configutil.AdjustInt64(&sc.StoreIOMBPerSecond, defaultStoreIOMBPerSecond) configutil.AdjustString(&sc.StoreVersion, versioninfo.PDReleaseVersion) configutil.AdjustDuration(&sc.RaftStore.RegionHeartBeatInterval, defaultRegionHeartbeat) @@ -118,7 +127,7 @@ func (sc *SimConfig) Adjust(meta *toml.MetaData) error { return sc.ServerConfig.Adjust(meta, false) } -func (sc *SimConfig) speed() uint64 { +func (sc *SimConfig) Speed() uint64 { return uint64(time.Second / sc.SimTickInterval.Duration) } diff --git a/tools/pd-simulator/simulator/conn.go b/tools/pd-simulator/simulator/conn.go index 588fec246d4..b95b33ee63d 100644 --- a/tools/pd-simulator/simulator/conn.go +++ b/tools/pd-simulator/simulator/conn.go @@ -17,6 +17,7 @@ package simulator import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" ) // Connection records the information of connection among nodes. 
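With the simulator configuration moved into its own config package, the values the cases used to read from global flags (store count, region count, the transfer-region counter switch) become ordinary TOML fields whose defaults are filled in by Adjust, and NewCase receives the adjusted SimConfig directly. A rough sketch of how a caller might load such a config, assuming the usual BurntSushi decode step; loadSimConfig itself is illustrative, only NewSimConfig and Adjust come from the hunks above:

package main

import (
	"github.com/BurntSushi/toml"

	sc "github.com/tikv/pd/tools/pd-simulator/simulator/config"
)

func loadSimConfig(path string) (*sc.SimConfig, error) {
	// The log level argument is an arbitrary choice for this sketch.
	cfg := sc.NewSimConfig("info")
	// Keys missing from the file, such as total-store and total-region,
	// fall back to the defaults applied in Adjust (3 stores, 1000 regions).
	meta, err := toml.DecodeFile(path, cfg)
	if err != nil {
		return nil, err
	}
	if err := cfg.Adjust(&meta); err != nil {
		return nil, err
	}
	return cfg, nil
}

The unexported speed() helper is exported as Speed() in the same move, since node and task code now lives outside the new package but still needs the ticks-per-second value derived from sim-tick-interval.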
@@ -26,7 +27,7 @@ type Connection struct { } // NewConnection creates nodes according to the configuration and returns the connection among nodes. -func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *SimConfig) (*Connection, error) { +func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *config.SimConfig) (*Connection, error) { conn := &Connection{ pdAddr: pdAddr, Nodes: make(map[uint64]*Node), diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index c7f64324c19..3d2bce74675 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -26,6 +26,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.etcd.io/etcd/clientv3" @@ -42,17 +43,17 @@ type Driver struct { eventRunner *EventRunner raftEngine *RaftEngine conn *Connection - simConfig *SimConfig - pdConfig *PDConfig + simConfig *config.SimConfig + pdConfig *config.PDConfig } // NewDriver returns a driver. -func NewDriver(pdAddr string, caseName string, simConfig *SimConfig) (*Driver, error) { - simCase := cases.NewCase(caseName) +func NewDriver(pdAddr string, caseName string, simConfig *config.SimConfig) (*Driver, error) { + simCase := cases.NewCase(caseName, simConfig) if simCase == nil { return nil, errors.Errorf("failed to create case %s", caseName) } - pdConfig := &PDConfig{} + pdConfig := &config.PDConfig{} pdConfig.PlacementRules = simCase.Rules pdConfig.LocationLabels = simCase.Labels return &Driver{ diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index 68a10a8638e..883b5d4474b 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -27,6 +27,7 @@ import ( "github.com/tikv/pd/pkg/ratelimit" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" @@ -57,7 +58,7 @@ type Node struct { } // NewNode returns a Node. 
-func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { +func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) { ctx, cancel := context.WithCancel(context.Background()) store := &metapb.Store{ Id: s.ID, @@ -93,7 +94,7 @@ func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { cancel() return nil, err } - ratio := config.speed() + ratio := config.Speed() speed := config.StoreIOMBPerSecond * units.MiB * int64(ratio) return &Node{ Store: store, diff --git a/tools/pd-simulator/simulator/raft.go b/tools/pd-simulator/simulator/raft.go index fccf75781d3..d416f69ff80 100644 --- a/tools/pd-simulator/simulator/raft.go +++ b/tools/pd-simulator/simulator/raft.go @@ -22,6 +22,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) @@ -34,12 +35,12 @@ type RaftEngine struct { regionChange map[uint64][]uint64 regionSplitSize int64 regionSplitKeys int64 - storeConfig *SimConfig + storeConfig *config.SimConfig useTiDBEncodedKey bool } // NewRaftEngine creates the initialized raft with the configuration. -func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *SimConfig) *RaftEngine { +func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *config.SimConfig) *RaftEngine { r := &RaftEngine{ regionsInfo: core.NewRegionsInfo(), conn: conn, diff --git a/tools/pd-simulator/simulator/simutil/case_config.go b/tools/pd-simulator/simulator/simutil/case_config.go deleted file mode 100644 index a34035c15aa..00000000000 --- a/tools/pd-simulator/simulator/simutil/case_config.go +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2019 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package simutil - -// CaseConfig is to save flags -type CaseConfig struct { - StoreNum int - RegionNum int - EnableTransferRegionCounter bool -} - -// CaseConfigure is an global instance for CaseConfig -var CaseConfigure *CaseConfig - -// InitCaseConfig is to init caseConfigure -func InitCaseConfig(storeNum, regionNum int, enableTransferRegionCounter bool) { - CaseConfigure = &CaseConfig{ - StoreNum: storeNum, - RegionNum: regionNum, - EnableTransferRegionCounter: enableTransferRegionCounter, - } -} diff --git a/tools/pd-simulator/simulator/task.go b/tools/pd-simulator/simulator/task.go index a19854b53ba..c0bfa1e691b 100644 --- a/tools/pd-simulator/simulator/task.go +++ b/tools/pd-simulator/simulator/task.go @@ -415,7 +415,7 @@ func (a *addPeer) tick(engine *RaftEngine, region *core.RegionInfo) (newRegion * pendingPeers := append(region.GetPendingPeers(), a.peer) return region.Clone(core.WithAddPeer(a.peer), core.WithIncConfVer(), core.WithPendingPeers(pendingPeers)), false } - speed := engine.storeConfig.speed() + speed := engine.storeConfig.Speed() // Step 2: Process Snapshot if !processSnapshot(sendNode, a.sendingStat, speed) { return nil, false From 71490f72b4c57a70f4f5b4e3486018859f85189c Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 30 May 2024 16:46:21 +0800 Subject: [PATCH 21/21] pkg/member: Fixing residual counts in campaign times (#8226) close tikv/pd#8225 Signed-off-by: husharp --- pkg/election/leadership.go | 15 +++++++------- pkg/election/leadership_test.go | 33 ++++++++++++++++++++++++++++++ pkg/member/member.go | 3 ++- tests/server/member/member_test.go | 10 +++++++-- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/pkg/election/leadership.go b/pkg/election/leadership.go index 02f519dbc75..3ee413818a5 100644 --- a/pkg/election/leadership.go +++ b/pkg/election/leadership.go @@ -34,11 +34,12 @@ import ( ) const ( - defaultCampaignTimesSlot = 10 - watchLoopUnhealthyTimeout = 60 * time.Second - campaignTimesRecordTimeout = 5 * time.Minute + defaultCampaignTimesSlot = 10 + watchLoopUnhealthyTimeout = 60 * time.Second ) +var campaignTimesRecordTimeout = 5 * time.Minute + // GetLeader gets the corresponding leader from etcd by given leaderPath (as the key). func GetLeader(c *clientv3.Client, leaderPath string) (*pdpb.Member, int64, error) { leader := &pdpb.Member{} @@ -114,6 +115,7 @@ func (ls *Leadership) GetLeaderKey() string { } // GetCampaignTimesNum is used to get the campaign times of the leader within `campaignTimesRecordTimeout`. +// Need to make sure `AddCampaignTimes` is called before this function. func (ls *Leadership) GetCampaignTimesNum() int { if ls == nil { return 0 @@ -129,8 +131,8 @@ func (ls *Leadership) ResetCampaignTimes() { ls.campaignTimes = make([]time.Time, 0, defaultCampaignTimesSlot) } -// addCampaignTimes is used to add the campaign times of the leader. -func (ls *Leadership) addCampaignTimes() { +// AddCampaignTimes is used to add the campaign times of the leader. 
+func (ls *Leadership) AddCampaignTimes() { if ls == nil { return } @@ -138,7 +140,7 @@ func (ls *Leadership) addCampaignTimes() { if time.Since(ls.campaignTimes[i]) > campaignTimesRecordTimeout { // remove the time which is more than `campaignTimesRecordTimeout` // array is sorted by time - ls.campaignTimes = ls.campaignTimes[i:] + ls.campaignTimes = ls.campaignTimes[i+1:] break } } @@ -148,7 +150,6 @@ func (ls *Leadership) addCampaignTimes() { // Campaign is used to campaign the leader with given lease and returns a leadership func (ls *Leadership) Campaign(leaseTimeout int64, leaderData string, cmps ...clientv3.Cmp) error { - ls.addCampaignTimes() ls.leaderValue = leaderData // Create a new lease to campaign newLease := &lease{ diff --git a/pkg/election/leadership_test.go b/pkg/election/leadership_test.go index 1fde4ddeba7..40f0bcbee23 100644 --- a/pkg/election/leadership_test.go +++ b/pkg/election/leadership_test.go @@ -262,3 +262,36 @@ func TestRequestProgress(t *testing.T) { checkWatcherRequestProgress(false) checkWatcherRequestProgress(true) } + +func TestCampaignTimes(t *testing.T) { + re := require.New(t) + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() + leadership := NewLeadership(client, "test_leader", "test_leader") + + // all the campaign times are within the timeout. + campaignTimesRecordTimeout = 10 * time.Second + defer func() { + campaignTimesRecordTimeout = 5 * time.Minute + }() + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(3, leadership.GetCampaignTimesNum()) + + // only the last 2 records are valid. + campaignTimesRecordTimeout = 200 * time.Millisecond + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(2, leadership.GetCampaignTimesNum()) + + time.Sleep(200 * time.Millisecond) + // need to wait for the next addCampaignTimes to update the campaign time. + re.Equal(2, leadership.GetCampaignTimesNum()) + // check campaign leader frequency. + leadership.AddCampaignTimes() + re.Equal(1, leadership.GetCampaignTimesNum()) +} diff --git a/pkg/member/member.go b/pkg/member/member.go index af504d83963..bbf46d8f167 100644 --- a/pkg/member/member.go +++ b/pkg/member/member.go @@ -182,11 +182,12 @@ func (m *EmbeddedEtcdMember) GetLastLeaderUpdatedTime() time.Time { // and make it become a PD leader. // leader should be changed when campaign leader frequently. func (m *EmbeddedEtcdMember) CampaignLeader(ctx context.Context, leaseTimeout int64) error { + m.leadership.AddCampaignTimes() failpoint.Inject("skipCampaignLeaderCheck", func() { failpoint.Return(m.leadership.Campaign(leaseTimeout, m.MemberValue())) }) - if m.leadership.GetCampaignTimesNum() >= campaignLeaderFrequencyTimes { + if m.leadership.GetCampaignTimesNum() > campaignLeaderFrequencyTimes { if err := m.ResignEtcdLeader(ctx, m.Name(), ""); err != nil { return err } diff --git a/tests/server/member/member_test.go b/tests/server/member/member_test.go index c581eb39390..edff14a3b98 100644 --- a/tests/server/member/member_test.go +++ b/tests/server/member/member_test.go @@ -328,20 +328,26 @@ func TestCampaignLeaderFrequently(t *testing.T) { re := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - cluster, err := tests.NewTestCluster(ctx, 5) + cluster, err := tests.NewTestCluster(ctx, 3) defer cluster.Destroy() re.NoError(err) err = cluster.RunInitialServers() re.NoError(err) + // the 1st time campaign leader. 
cluster.WaitLeader() leader := cluster.GetLeader() re.NotEmpty(cluster.GetLeader()) - for i := 0; i < 3; i++ { + // need to prevent 3 times(including the above 1st time) campaign leader in 5 min. + for i := 0; i < 2; i++ { cluster.GetLeaderServer().ResetPDLeader() cluster.WaitLeader() + re.Equal(leader, cluster.GetLeader()) } + // check for the 4th time. + cluster.GetLeaderServer().ResetPDLeader() + cluster.WaitLeader() // PD leader should be different from before because etcd leader changed. re.NotEmpty(cluster.GetLeader()) re.NotEqual(leader, cluster.GetLeader())
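The leadership change behind #8226 is small but easy to misread: AddCampaignTimes is now called by CampaignLeader before the frequency check (so the comparison moves from >= to >, because the current attempt is already counted), campaignTimesRecordTimeout becomes a variable so the new test can shrink it, and the pruning slices from i+1 instead of i. A self-contained sketch of that pruning step, with pruneAndRecord as an illustrative stand-in for Leadership.AddCampaignTimes:

package main

import (
	"fmt"
	"time"
)

// pruneAndRecord keeps times ordered oldest first. When the oldest record has
// aged past window, slicing from i+1 drops it before the new record is
// appended; slicing from i (the pre-fix behaviour) kept that expired record,
// so the reported campaign count never drained back down.
func pruneAndRecord(times []time.Time, window time.Duration) []time.Time {
	for i := range times {
		if time.Since(times[i]) > window {
			times = times[i+1:] // skip the expired record itself
			break
		}
	}
	return append(times, time.Now())
}

func main() {
	now := time.Now()
	times := []time.Time{now.Add(-10 * time.Minute), now.Add(-30 * time.Second)}
	times = pruneAndRecord(times, 5*time.Minute)
	fmt.Println(len(times)) // 2: the ten-minute-old record was dropped, the new one appended
}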