From eefaf7d4698c421662d099c6f770add889e7df8f Mon Sep 17 00:00:00 2001 From: hillium Date: Tue, 3 Aug 2021 13:59:16 +0800 Subject: [PATCH 1/3] restore: retry scatter error --- pkg/restore/split.go | 16 ++++++++++-- pkg/restore/split_client.go | 51 +++++++++++++++++++++++++++++++++++++ pkg/restore/split_test.go | 27 +++++++++++++++++++- 3 files changed, 91 insertions(+), 3 deletions(-) diff --git a/pkg/restore/split.go b/pkg/restore/split.go index 803701f6d..c725f2202 100644 --- a/pkg/restore/split.go +++ b/pkg/restore/split.go @@ -21,6 +21,7 @@ import ( berrors "github.com/pingcap/br/pkg/errors" "github.com/pingcap/br/pkg/logutil" "github.com/pingcap/br/pkg/rtree" + "github.com/pingcap/br/pkg/utils" ) // Constants for split retry machinery. @@ -272,14 +273,25 @@ func (rs *RegionSplitter) splitAndScatterRegions( if err != nil { return nil, errors.Trace(err) } + rs.ScatterRegions(ctx, newRegions) + return newRegions, nil +} + +func (rs *RegionSplitter) ScatterRegions(ctx context.Context, newRegions []*RegionInfo) { for _, region := range newRegions { // Wait for a while until the regions successfully split. rs.waitForSplit(ctx, region.Region.Id) - if err = rs.client.ScatterRegion(ctx, region); err != nil { + if err := utils.WithRetry(ctx, + func() error { return rs.client.ScatterRegion(ctx, region) }, + // backoff about ~2s, or we give up scattering this region. + &scatterBackoffer{ + attempt: 5, + baseBackoff: 100 * time.Millisecond, + }, + ); err != nil { log.Warn("scatter region failed", logutil.Region(region.Region), zap.Error(err)) } } - return newRegions, nil } // PaginateScanRegion scan regions with a limit pagination and diff --git a/pkg/restore/split_client.go b/pkg/restore/split_client.go index 71cb2886e..8e5edb94e 100755 --- a/pkg/restore/split_client.go +++ b/pkg/restore/split_client.go @@ -14,6 +14,7 @@ import ( "strconv" "strings" "sync" + "time" "github.com/pingcap/errors" "github.com/pingcap/failpoint" @@ -29,6 +30,7 @@ import ( "go.uber.org/zap" "google.golang.org/grpc" "google.golang.org/grpc/credentials" + "google.golang.org/grpc/status" berrors "github.com/pingcap/br/pkg/errors" "github.com/pingcap/br/pkg/httputil" @@ -498,3 +500,52 @@ func checkRegionEpoch(new, old *RegionInfo) bool { new.Region.GetRegionEpoch().GetVersion() == old.Region.GetRegionEpoch().GetVersion() && new.Region.GetRegionEpoch().GetConfVer() == old.Region.GetRegionEpoch().GetConfVer() } + +type scatterBackoffer struct { + attempt int + baseBackoff time.Duration +} + +func (b *scatterBackoffer) exponentialBackoff() time.Duration { + bo := b.baseBackoff + b.attempt-- + if b.attempt == 0 { + return 0 + } + b.baseBackoff *= 2 + return bo +} + +func (b *scatterBackoffer) giveUp() time.Duration { + b.attempt = 0 + return 0 +} + +// NextBackoff returns a duration to wait before retrying again +func (b *scatterBackoffer) NextBackoff(err error) time.Duration { + // There are 3 type of reason that BR would reject a `scatter` request: + // (1) region %d has no leader + // (2) region %d is hot + // (3) region %d is not fully replicated + // + // (2) shouldn't happen in a recently splitted region. + // (1) and (3) might happen, and should be retried. + grpcErr := status.Convert(err) + if grpcErr == nil { + return b.giveUp() + } + if strings.Contains(grpcErr.Message(), "is not fully replicated") { + log.Warn("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt)) + return b.exponentialBackoff() + } + if strings.Contains(grpcErr.Message(), "has no leader") { + log.Warn("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt)) + return b.exponentialBackoff() + } + return b.giveUp() +} + +// Attempt returns the remain attempt times +func (b *scatterBackoffer) Attempt() int { + return b.attempt +} diff --git a/pkg/restore/split_test.go b/pkg/restore/split_test.go index 9164f7b17..e5f19fb65 100644 --- a/pkg/restore/split_test.go +++ b/pkg/restore/split_test.go @@ -15,6 +15,8 @@ import ( "github.com/pingcap/tidb/util/codec" "github.com/tikv/pd/server/core" "github.com/tikv/pd/server/schedule/placement" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "github.com/pingcap/br/pkg/restore" "github.com/pingcap/br/pkg/rtree" @@ -26,6 +28,8 @@ type TestClient struct { regions map[uint64]*restore.RegionInfo regionsInfo *core.RegionsInfo // For now it's only used in ScanRegions nextRegionID uint64 + + scattered map[uint64]bool } func NewTestClient( @@ -42,6 +46,7 @@ func NewTestClient( regions: regions, regionsInfo: regionsInfo, nextRegionID: nextRegionID, + scattered: map[uint64]bool{}, } } @@ -160,6 +165,11 @@ func (c *TestClient) BatchSplitRegions( } func (c *TestClient) ScatterRegion(ctx context.Context, regionInfo *restore.RegionInfo) error { + if _, ok := c.scattered[regionInfo.Region.Id]; !ok { + c.scattered[regionInfo.Region.Id] = false + return status.Errorf(codes.Unknown, "region %d is not fully replicated", regionInfo.Region.Id) + } + c.scattered[regionInfo.Region.Id] = true return nil } @@ -197,13 +207,22 @@ func (c *TestClient) SetStoresLabel(ctx context.Context, stores []uint64, labelK return nil } +func (c *TestClient) checkScatter(check *C) { + regions := c.GetAllRegions() + for key := range regions { + if !c.scattered[key] { + check.Fatalf("region %d has not been scattered: %#v", key, regions[key]) + } + } +} + // region: [, aay), [aay, bba), [bba, bbh), [bbh, cca), [cca, ) // range: [aaa, aae), [aae, aaz), [ccd, ccf), [ccf, ccj) // rewrite rules: aa -> xx, cc -> bb // expected regions after split: // [, aay), [aay, bb), [bb, bba), [bba, bbf), [bbf, bbh), [bbh, bbj), // [bbj, cca), [cca, xx), [xx, xxe), [xxe, xxz), [xxz, ) -func (s *testRangeSuite) TestSplit(c *C) { +func (s *testRangeSuite) TestSplitAndScatter(c *C) { client := initTestClient() ranges := initRanges() rewriteRules := initRewriteRules() @@ -222,6 +241,12 @@ func (s *testRangeSuite) TestSplit(c *C) { c.Log("get wrong result") c.Fail() } + regionInfos := make([]*restore.RegionInfo, 0, len(regions)) + for _, info := range regions { + regionInfos = append(regionInfos, info) + } + regionSplitter.ScatterRegions(ctx, regionInfos) + client.checkScatter(c) } // region: [, aay), [aay, bba), [bba, bbh), [bbh, cca), [cca, ) From 3418442acf048e605487aebc292dc962be2d7e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 3 Aug 2021 14:20:13 +0800 Subject: [PATCH 2/3] Update pkg/restore/split_client.go --- pkg/restore/split_client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/restore/split_client.go b/pkg/restore/split_client.go index 8e5edb94e..05347e80f 100755 --- a/pkg/restore/split_client.go +++ b/pkg/restore/split_client.go @@ -523,7 +523,7 @@ func (b *scatterBackoffer) giveUp() time.Duration { // NextBackoff returns a duration to wait before retrying again func (b *scatterBackoffer) NextBackoff(err error) time.Duration { - // There are 3 type of reason that BR would reject a `scatter` request: + // There are 3 type of reason that PD would reject a `scatter` request: // (1) region %d has no leader // (2) region %d is hot // (3) region %d is not fully replicated From 01ee385576a490413c99436fd2f3b1ddb729d3c0 Mon Sep 17 00:00:00 2001 From: hillium Date: Tue, 3 Aug 2021 14:50:15 +0800 Subject: [PATCH 3/3] restore: fix lints --- pkg/restore/split.go | 7 ++++--- pkg/restore/split_client.go | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/restore/split.go b/pkg/restore/split.go index c725f2202..7d7f7d31e 100644 --- a/pkg/restore/split.go +++ b/pkg/restore/split.go @@ -277,19 +277,20 @@ func (rs *RegionSplitter) splitAndScatterRegions( return newRegions, nil } +// ScatterRegions scatter the regions. func (rs *RegionSplitter) ScatterRegions(ctx context.Context, newRegions []*RegionInfo) { for _, region := range newRegions { // Wait for a while until the regions successfully split. rs.waitForSplit(ctx, region.Region.Id) if err := utils.WithRetry(ctx, func() error { return rs.client.ScatterRegion(ctx, region) }, - // backoff about ~2s, or we give up scattering this region. + // backoff about 6s, or we give up scattering this region. &scatterBackoffer{ - attempt: 5, + attempt: 7, baseBackoff: 100 * time.Millisecond, }, ); err != nil { - log.Warn("scatter region failed", logutil.Region(region.Region), zap.Error(err)) + log.Warn("scatter region failed, stop retry", logutil.Region(region.Region), zap.Error(err)) } } } diff --git a/pkg/restore/split_client.go b/pkg/restore/split_client.go index 8e5edb94e..3dda210e5 100755 --- a/pkg/restore/split_client.go +++ b/pkg/restore/split_client.go @@ -535,11 +535,11 @@ func (b *scatterBackoffer) NextBackoff(err error) time.Duration { return b.giveUp() } if strings.Contains(grpcErr.Message(), "is not fully replicated") { - log.Warn("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt)) + log.Info("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt)) return b.exponentialBackoff() } if strings.Contains(grpcErr.Message(), "has no leader") { - log.Warn("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt)) + log.Info("scatter region failed, retring", logutil.ShortError(err), zap.Int("attempt-remain", b.attempt)) return b.exponentialBackoff() } return b.giveUp()