Skip to content

Commit ce640b9

Browse files
authored
region cache: check if the pd returned regions covers the ranges (tikv#1377) (tikv#1380)
* cherry pick tikv#1377 to tidb-7.1 Signed-off-by: you06 <[email protected]> * lint Signed-off-by: you06 <[email protected]> * rename function Signed-off-by: you06 <[email protected]> * warn when no leader regions missing Signed-off-by: you06 <[email protected]> --------- Signed-off-by: you06 <[email protected]>
1 parent c6d7a48 commit ce640b9

File tree

3 files changed

+171
-2
lines changed

3 files changed

+171
-2
lines changed

internal/locate/region_cache.go

+55-2
Original file line numberDiff line numberDiff line change
@@ -1784,7 +1784,19 @@ func (c *RegionCache) scanRegions(bo *retry.Backoffer, startKey, endKey []byte,
17841784
metrics.RegionCacheCounterWithScanRegionsOK.Inc()
17851785

17861786
if len(regionsInfo) == 0 {
1787-
return nil, errors.New("PD returned no region")
1787+
return nil, errors.Errorf(
1788+
"PD returned no region, startKey: %q, endKey: %q, limit: %d, encode_start_key: %q, encode_end_key: %q",
1789+
util.HexRegionKeyStr(startKey), util.HexRegionKeyStr(endKey), limit,
1790+
util.HexRegionKeyStr(c.codec.EncodeRegionKey(startKey)), util.HexRegionKeyStr(c.codec.EncodeRegionKey(endKey)),
1791+
)
1792+
}
1793+
1794+
if regionsHaveGapInRange(startKey, endKey, regionsInfo, limit) {
1795+
backoffErr = errors.Errorf(
1796+
"PD returned regions have gaps, startKey: %q, endKey: %q, limit: %d",
1797+
startKey, endKey, limit,
1798+
)
1799+
continue
17881800
}
17891801
regions := make([]*Region, 0, len(regionsInfo))
17901802
for _, r := range regionsInfo {
@@ -1802,13 +1814,54 @@ func (c *RegionCache) scanRegions(bo *retry.Backoffer, startKey, endKey []byte,
18021814
return nil, errors.New("receive Regions with no peer")
18031815
}
18041816
if len(regions) < len(regionsInfo) {
1805-
logutil.Logger(context.Background()).Debug(
1817+
logutil.Logger(context.Background()).Warn(
18061818
"regionCache: scanRegion finished but some regions has no leader.")
18071819
}
18081820
return regions, nil
18091821
}
18101822
}
18111823

1824+
// regionsHaveGapInRange checks if the loaded regions can fully cover the key ranges.
1825+
// If there are any gaps between the regions, it returns true, then the requests might be retried.
1826+
// TODO: remove this function after PD client supports gap detection and handling it.
1827+
func regionsHaveGapInRange(start, end []byte, regionsInfo []*pd.Region, limit int) bool {
1828+
if len(regionsInfo) == 0 {
1829+
return true
1830+
}
1831+
var lastEndKey []byte
1832+
for i, r := range regionsInfo {
1833+
if r.Meta == nil {
1834+
return true
1835+
}
1836+
if i == 0 {
1837+
if bytes.Compare(r.Meta.StartKey, start) > 0 {
1838+
// there is a gap between first returned region's start_key and start key.
1839+
return true
1840+
}
1841+
}
1842+
if i > 0 && bytes.Compare(r.Meta.StartKey, lastEndKey) > 0 {
1843+
// there is a gap between two regions.
1844+
return true
1845+
}
1846+
if len(r.Meta.EndKey) == 0 {
1847+
// the current region contains all the rest ranges.
1848+
return false
1849+
}
1850+
// note lastEndKey never be empty.
1851+
lastEndKey = r.Meta.EndKey
1852+
}
1853+
if limit > 0 && len(regionsInfo) == limit {
1854+
// the regionsInfo is limited by the limit, so there may be some ranges not covered.
1855+
// The rest range will be loaded in the next scanRegions call.
1856+
return false
1857+
}
1858+
if len(end) == 0 {
1859+
// the end key of the range is empty, but we can't cover it.
1860+
return true
1861+
}
1862+
return bytes.Compare(lastEndKey, end) < 0
1863+
}
1864+
18121865
// GetCachedRegionWithRLock returns region with lock.
18131866
func (c *RegionCache) GetCachedRegionWithRLock(regionID RegionVerID) (r *Region) {
18141867
c.mu.RLock()

internal/locate/region_cache_test.go

+106
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import (
4646
"unsafe"
4747

4848
"github.com/gogo/protobuf/proto"
49+
"github.com/pingcap/failpoint"
4950
"github.com/pingcap/kvproto/pkg/errorpb"
5051
"github.com/pingcap/kvproto/pkg/metapb"
5152
"github.com/stretchr/testify/suite"
@@ -1782,3 +1783,108 @@ func (s *testRegionRequestToSingleStoreSuite) TestRefreshCacheConcurrency() {
17821783

17831784
cancel()
17841785
}
1786+
1787+
func (s *testRegionCacheSuite) TestRangesAreCoveredCheck() {
1788+
check := func(ranges []string, regions []string, limit int, expect bool) {
1789+
s.Len(ranges, 2)
1790+
rgs := make([]*pd.Region, 0, len(regions))
1791+
for i := 0; i < len(regions); i += 2 {
1792+
rgs = append(rgs, &pd.Region{Meta: &metapb.Region{
1793+
StartKey: []byte(regions[i]),
1794+
EndKey: []byte(regions[i+1]),
1795+
}})
1796+
}
1797+
s.Equal(expect, regionsHaveGapInRange([]byte(ranges[0]), []byte(ranges[1]), rgs, limit))
1798+
}
1799+
1800+
boundCase := []string{"a", "c"}
1801+
// positive
1802+
check(boundCase, []string{"a", "c"}, -1, false)
1803+
check(boundCase, []string{"a", ""}, -1, false)
1804+
check(boundCase, []string{"", "c"}, -1, false)
1805+
// negative
1806+
check(boundCase, []string{"a", "b"}, -1, true)
1807+
check(boundCase, []string{"b", "c"}, -1, true)
1808+
check(boundCase, []string{"b", ""}, -1, true)
1809+
check(boundCase, []string{"", "b"}, -1, true)
1810+
// positive
1811+
check(boundCase, []string{"a", "b", "b", "c"}, -1, false)
1812+
check(boundCase, []string{"", "b", "b", "c"}, -1, false)
1813+
check(boundCase, []string{"a", "b", "b", ""}, -1, false)
1814+
check(boundCase, []string{"", "b", "b", ""}, -1, false)
1815+
// negative
1816+
check(boundCase, []string{"a", "b", "b1", "c"}, -1, true)
1817+
check(boundCase, []string{"", "b", "b1", "c"}, -1, true)
1818+
check(boundCase, []string{"a", "b", "b1", ""}, -1, true)
1819+
check(boundCase, []string{"", "b", "b1", ""}, -1, true)
1820+
check(boundCase, []string{}, -1, true)
1821+
1822+
unboundCase := []string{"", ""}
1823+
// positive
1824+
check(unboundCase, []string{"", ""}, -1, false)
1825+
// negative
1826+
check(unboundCase, []string{"a", "c"}, -1, true)
1827+
check(unboundCase, []string{"a", ""}, -1, true)
1828+
check(unboundCase, []string{"", "c"}, -1, true)
1829+
// positive
1830+
check(unboundCase, []string{"", "b", "b", ""}, -1, false)
1831+
// negative
1832+
check(unboundCase, []string{"", "b", "b1", ""}, -1, true)
1833+
check(unboundCase, []string{"a", "b", "b", ""}, -1, true)
1834+
check(unboundCase, []string{"", "b", "b", "c"}, -1, true)
1835+
check(unboundCase, []string{}, -1, true)
1836+
1837+
// test half bounded ranges
1838+
check([]string{"", "b"}, []string{"", "a"}, -1, true)
1839+
check([]string{"", "b"}, []string{"", "a"}, 1, false) // it's just limitation reached
1840+
check([]string{"", "b"}, []string{"", "a"}, 2, true)
1841+
check([]string{"a", ""}, []string{"b", ""}, -1, true)
1842+
check([]string{"a", ""}, []string{"b", ""}, 1, true)
1843+
check([]string{"a", ""}, []string{"b", "c"}, 1, true)
1844+
check([]string{"a", ""}, []string{"a", ""}, -1, false)
1845+
}
1846+
1847+
func (s *testRegionCacheSuite) TestScanRegionsWithGaps() {
1848+
// Split at "a", "c", "e"
1849+
// nil --- 'a' --- 'c' --- 'e' --- nil
1850+
// <- 0 -> <- 1 -> <- 2 -> <- 3 -->
1851+
regions := s.cluster.AllocIDs(3)
1852+
regions = append([]uint64{s.region1}, regions...)
1853+
1854+
peers := [][]uint64{{s.peer1, s.peer2}}
1855+
for i := 0; i < 3; i++ {
1856+
peers = append(peers, s.cluster.AllocIDs(2))
1857+
}
1858+
1859+
for i := 0; i < 3; i++ {
1860+
s.cluster.Split(regions[i], regions[i+1], []byte{'a' + 2*byte(i)}, peers[i+1], peers[i+1][0])
1861+
}
1862+
1863+
// the last region is not reported to PD yet
1864+
getRegionIDsWithInject := func(fn func() ([]*Region, error)) []uint64 {
1865+
s.cache.clear()
1866+
err := failpoint.Enable("tikvclient/mockSplitRegionNotReportToPD", fmt.Sprintf(`return(%d)`, regions[2]))
1867+
s.Nil(err)
1868+
resCh := make(chan []*Region)
1869+
errCh := make(chan error)
1870+
go func() {
1871+
rs, err := fn()
1872+
errCh <- err
1873+
resCh <- rs
1874+
}()
1875+
time.Sleep(time.Second)
1876+
failpoint.Disable("tikvclient/mockSplitRegionNotReportToPD")
1877+
s.Nil(<-errCh)
1878+
rs := <-resCh
1879+
regionIDs := make([]uint64, 0, len(rs))
1880+
for _, r := range rs {
1881+
regionIDs = append(regionIDs, r.GetID())
1882+
}
1883+
return regionIDs
1884+
}
1885+
1886+
scanRegionRes := getRegionIDsWithInject(func() ([]*Region, error) {
1887+
return s.cache.BatchLoadRegionsWithKeyRange(s.bo, []byte(""), []byte(""), 10)
1888+
})
1889+
s.Equal(scanRegionRes, regions)
1890+
}

internal/mockstore/mocktikv/cluster.go

+10
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import (
4646
"github.com/pingcap/kvproto/pkg/kvrpcpb"
4747
"github.com/pingcap/kvproto/pkg/metapb"
4848
"github.com/tikv/client-go/v2/internal/mockstore/cluster"
49+
"github.com/tikv/client-go/v2/util"
4950
pd "github.com/tikv/pd/client"
5051
)
5152

@@ -348,6 +349,15 @@ func (c *Cluster) ScanRegions(startKey, endKey []byte, limit int) []*pd.Region {
348349
regions = regions[:endPos]
349350
}
350351
}
352+
if rid, err := util.EvalFailpoint("mockSplitRegionNotReportToPD"); err == nil {
353+
notReportRegionID := uint64(rid.(int))
354+
for i, r := range regions {
355+
if r.Meta.Id == notReportRegionID {
356+
regions = append(regions[:i], regions[i+1:]...)
357+
break
358+
}
359+
}
360+
}
351361
if limit > 0 && len(regions) > limit {
352362
regions = regions[:limit]
353363
}

0 commit comments

Comments
 (0)