Skip to content
This repository has been archived by the owner on Jul 24, 2024. It is now read-only.

restore: retry scatter error #1402

Merged
merged 6 commits into from
Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions pkg/restore/split.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
berrors "github.com/pingcap/br/pkg/errors"
"github.com/pingcap/br/pkg/logutil"
"github.com/pingcap/br/pkg/rtree"
"github.com/pingcap/br/pkg/utils"
)

// Constants for split retry machinery.
Expand Down Expand Up @@ -272,14 +273,26 @@ func (rs *RegionSplitter) splitAndScatterRegions(
if err != nil {
return nil, errors.Trace(err)
}
rs.ScatterRegions(ctx, newRegions)
return newRegions, nil
}

// ScatterRegions scatter the regions.
func (rs *RegionSplitter) ScatterRegions(ctx context.Context, newRegions []*RegionInfo) {
for _, region := range newRegions {
// Wait for a while until the regions successfully split.
rs.waitForSplit(ctx, region.Region.Id)
if err = rs.client.ScatterRegion(ctx, region); err != nil {
log.Warn("scatter region failed", logutil.Region(region.Region), zap.Error(err))
if err := utils.WithRetry(ctx,
func() error { return rs.client.ScatterRegion(ctx, region) },
// backoff about 6s, or we give up scattering this region.
&scatterBackoffer{
attempt: 7,
baseBackoff: 100 * time.Millisecond,
},
); err != nil {
log.Warn("scatter region failed, stop retry", logutil.Region(region.Region), zap.Error(err))
}
}
return newRegions, nil
}

// PaginateScanRegion scan regions with a limit pagination and
Expand Down
51 changes: 51 additions & 0 deletions pkg/restore/split_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"strconv"
"strings"
"sync"
"time"

"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
Expand All @@ -29,6 +30,7 @@ import (
"go.uber.org/zap"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/status"

berrors "github.com/pingcap/br/pkg/errors"
"github.com/pingcap/br/pkg/httputil"
Expand Down Expand Up @@ -498,3 +500,52 @@ func checkRegionEpoch(new, old *RegionInfo) bool {
new.Region.GetRegionEpoch().GetVersion() == old.Region.GetRegionEpoch().GetVersion() &&
new.Region.GetRegionEpoch().GetConfVer() == old.Region.GetRegionEpoch().GetConfVer()
}

type scatterBackoffer struct {
attempt int
baseBackoff time.Duration
}

func (b *scatterBackoffer) exponentialBackoff() time.Duration {
bo := b.baseBackoff
b.attempt--
if b.attempt == 0 {
return 0
}
b.baseBackoff *= 2
return bo
}

func (b *scatterBackoffer) giveUp() time.Duration {
b.attempt = 0
return 0
}

// NextBackoff returns a duration to wait before retrying again
func (b *scatterBackoffer) NextBackoff(err error) time.Duration {
// There are 3 type of reason that PD would reject a `scatter` request:
// (1) region %d has no leader
// (2) region %d is hot
// (3) region %d is not fully replicated
//
// (2) shouldn't happen in a recently splitted region.
// (1) and (3) might happen, and should be retried.
grpcErr := status.Convert(err)
if grpcErr == nil {
return b.giveUp()
}
if strings.Contains(grpcErr.Message(), "is not fully replicated") {
log.Info("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt))
return b.exponentialBackoff()
}
if strings.Contains(grpcErr.Message(), "has no leader") {
log.Info("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt))
return b.exponentialBackoff()
}
return b.giveUp()
}

// Attempt returns the remain attempt times
func (b *scatterBackoffer) Attempt() int {
return b.attempt
}
27 changes: 26 additions & 1 deletion pkg/restore/split_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import (
"github.com/pingcap/tidb/util/codec"
"github.com/tikv/pd/server/core"
"github.com/tikv/pd/server/schedule/placement"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"github.com/pingcap/br/pkg/restore"
"github.com/pingcap/br/pkg/rtree"
Expand All @@ -26,6 +28,8 @@ type TestClient struct {
regions map[uint64]*restore.RegionInfo
regionsInfo *core.RegionsInfo // For now it's only used in ScanRegions
nextRegionID uint64

scattered map[uint64]bool
}

func NewTestClient(
Expand All @@ -42,6 +46,7 @@ func NewTestClient(
regions: regions,
regionsInfo: regionsInfo,
nextRegionID: nextRegionID,
scattered: map[uint64]bool{},
}
}

Expand Down Expand Up @@ -160,6 +165,11 @@ func (c *TestClient) BatchSplitRegions(
}

func (c *TestClient) ScatterRegion(ctx context.Context, regionInfo *restore.RegionInfo) error {
if _, ok := c.scattered[regionInfo.Region.Id]; !ok {
c.scattered[regionInfo.Region.Id] = false
return status.Errorf(codes.Unknown, "region %d is not fully replicated", regionInfo.Region.Id)
}
c.scattered[regionInfo.Region.Id] = true
return nil
}

Expand Down Expand Up @@ -197,13 +207,22 @@ func (c *TestClient) SetStoresLabel(ctx context.Context, stores []uint64, labelK
return nil
}

func (c *TestClient) checkScatter(check *C) {
regions := c.GetAllRegions()
for key := range regions {
if !c.scattered[key] {
check.Fatalf("region %d has not been scattered: %#v", key, regions[key])
}
}
}

// region: [, aay), [aay, bba), [bba, bbh), [bbh, cca), [cca, )
// range: [aaa, aae), [aae, aaz), [ccd, ccf), [ccf, ccj)
// rewrite rules: aa -> xx, cc -> bb
// expected regions after split:
// [, aay), [aay, bb), [bb, bba), [bba, bbf), [bbf, bbh), [bbh, bbj),
// [bbj, cca), [cca, xx), [xx, xxe), [xxe, xxz), [xxz, )
func (s *testRangeSuite) TestSplit(c *C) {
func (s *testRangeSuite) TestSplitAndScatter(c *C) {
client := initTestClient()
ranges := initRanges()
rewriteRules := initRewriteRules()
Expand All @@ -222,6 +241,12 @@ func (s *testRangeSuite) TestSplit(c *C) {
c.Log("get wrong result")
c.Fail()
}
regionInfos := make([]*restore.RegionInfo, 0, len(regions))
for _, info := range regions {
regionInfos = append(regionInfos, info)
}
regionSplitter.ScatterRegions(ctx, regionInfos)
client.checkScatter(c)
}

// region: [, aay), [aay, bba), [bba, bbh), [bbh, cca), [cca, )
Expand Down