From b693b2a25e5f3c101a1d73cebcb4b01774bee832 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 24 Apr 2023 15:52:45 +0800 Subject: [PATCH] lightning: always retry from rescan when NotLeader (#43351) * lightning: always retry from rescan when NotLeader Signed-off-by: lance6716 * fix CI Signed-off-by: lance6716 * fix CI Signed-off-by: lance6716 --------- Signed-off-by: lance6716 --- br/pkg/lightning/backend/local/local.go | 19 +--- br/pkg/lightning/backend/local/local_test.go | 101 ++++++++++++++++++- 2 files changed, 100 insertions(+), 20 deletions(-) diff --git a/br/pkg/lightning/backend/local/local.go b/br/pkg/lightning/backend/local/local.go index eea3a734b0176..a7db0b8537314 100644 --- a/br/pkg/lightning/backend/local/local.go +++ b/br/pkg/lightning/backend/local/local.go @@ -1359,7 +1359,7 @@ WriteAndIngest: local.ingestConcurrency.Recycle(w) if err != nil { if !local.isRetryableImportTiKVError(err) { - return err + return errors.Trace(err) } _, regionStart, _ := codec.DecodeBytes(region.Region.StartKey, []byte{}) // if we have at least succeeded one region, retry without increasing the retry count @@ -1987,22 +1987,7 @@ func (local *local) isIngestRetryable( var err error switch errPb := resp.GetError(); { case errPb.NotLeader != nil: - if newLeader := errPb.GetNotLeader().GetLeader(); newLeader != nil { - newRegion = &split.RegionInfo{ - Leader: newLeader, - Region: region.Region, - } - } else { - newRegion, err = getRegion() - if err != nil { - return retryNone, nil, errors.Trace(err) - } - } - // TODO: because in some case, TiKV may return retryable error while the ingest is succeeded. - // Thus directly retry ingest may cause TiKV panic. So always return retryWrite here to avoid - // this issue. - // See: https://github.com/tikv/tikv/issues/9496 - return retryWrite, newRegion, common.ErrKVNotLeader.GenWithStack(errPb.GetMessage()) + return retryNone, nil, common.ErrKVNotLeader.GenWithStack(errPb.GetMessage()) case errPb.EpochNotMatch != nil: if currentRegions := errPb.GetEpochNotMatch().GetCurrentRegions(); currentRegions != nil { var currentRegion *metapb.Region diff --git a/br/pkg/lightning/backend/local/local_test.go b/br/pkg/lightning/backend/local/local_test.go index 0913aa9b86801..731ccd7563396 100644 --- a/br/pkg/lightning/backend/local/local_test.go +++ b/br/pkg/lightning/backend/local/local_test.go @@ -479,9 +479,8 @@ func TestIsIngestRetryable(t *testing.T) { }, }, } - retryType, newRegion, err := local.isIngestRetryable(ctx, resp, region, metas) - require.Equal(t, retryWrite, retryType) - require.Equal(t, uint64(2), newRegion.Leader.Id) + retryType, _, err := local.isIngestRetryable(ctx, resp, region, metas) + require.Equal(t, retryNone, retryType) require.Error(t, err) resp.Error = &errorpb.Error{ @@ -500,6 +499,7 @@ func TestIsIngestRetryable(t *testing.T) { }, }, } + var newRegion *split.RegionInfo retryType, newRegion, err = local.isIngestRetryable(ctx, resp, region, metas) require.Equal(t, retryWrite, retryType) require.Equal(t, uint64(2), newRegion.Region.RegionEpoch.Version) @@ -1327,3 +1327,98 @@ func TestCheckPeersBusy(t *testing.T) { // store 12 has a follower busy, so it will cause region peers (11, 12, 13) retry once require.Equal(t, []uint64{11, 12, 11, 12, 13, 11, 21, 22, 23, 21}, apiInvokeRecorder["MultiIngest"]) } + +type regionChangedHook struct { + noopHook + scanCount int +} + +func (r *regionChangedHook) AfterScanRegions(res []*split.RegionInfo, err error) ([]*split.RegionInfo, error) { + r.scanCount++ + if r.scanCount == 1 { + return res, err + } 
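+	// from the second scan onward, bump every peer id and store id by 10 so the
+	// region that was reported on stores (1,2,3) now appears on stores (11,12,13),
+	// which turns the region info obtained from the first scan into stale info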
+	for _, info := range res {
+		// a previous scan already rewrote the peers, return them unchanged
+		if info.Leader.Id > 3 {
+			return res, err
+		}
+		for _, p := range info.Region.Peers {
+			p.Id += 10
+			p.StoreId += 10
+		}
+	}
+
+	return res, err
+}
+
+func TestNotLeaderErrorNeedUpdatePeers(t *testing.T) {
+	ctx := context.Background()
+
+	// lightning starts with stale region info (peers 1,2,3); the region has since moved to peers (11,12,13)
+	apiInvokeRecorder := map[string][]uint64{}
+	notLeaderResp := &sst.IngestResponse{
+		Error: &errorpb.Error{
+			NotLeader: &errorpb.NotLeader{Leader: &metapb.Peer{StoreId: 11}},
+		}}
+
+	pdCli := &mockPdClient{}
+	pdCtl := &pdutil.PdController{}
+	pdCtl.SetPDClient(pdCli)
+
+	h := &regionChangedHook{}
+	splitCli := initTestSplitClient3Replica([][]byte{{}, {'b'}, {}}, h)
+
+	local := &local{
+		pdCtl:    pdCtl,
+		splitCli: splitCli,
+		importClientFactory: &mockImportClientFactory{
+			stores: []*metapb.Store{
+				{Id: 1}, {Id: 2}, {Id: 3},
+				{Id: 11}, {Id: 12}, {Id: 13},
+			},
+			createClientFn: func(store *metapb.Store) sst.ImportSSTClient {
+				importCli := newMockImportClient()
+				importCli.store = store
+				importCli.apiInvokeRecorder = apiInvokeRecorder
+				if store.Id == 1 {
+					importCli.retry = 1
+					importCli.resp = notLeaderResp
+				}
+				return importCli
+			},
+		},
+		logger:                log.L(),
+		ingestConcurrency:     worker.NewPool(ctx, 1, "ingest"),
+		writeLimiter:          noopStoreWriteLimiter{},
+		bufferPool:            membuf.NewPool(),
+		supportMultiIngest:    true,
+		shouldCheckWriteStall: true,
+	}
+
+	db, tmpPath := makePebbleDB(t, nil)
+	_, engineUUID := backend.MakeUUID("ww", 0)
+	engineCtx, cancel := context.WithCancel(context.Background())
+	f := &Engine{
+		db:           db,
+		UUID:         engineUUID,
+		sstDir:       tmpPath,
+		ctx:          engineCtx,
+		cancel:       cancel,
+		sstMetasChan: make(chan metaOrFlush, 64),
+		keyAdapter:   noopKeyAdapter{},
+		logger:       log.L(),
+	}
+	err := f.db.Set([]byte("a"), []byte("a"), nil)
+	require.NoError(t, err)
+	err = local.writeAndIngestByRange(ctx, f, []byte{}, []byte("b"), 0, 0)
+	require.NoError(t, err)
+
+	// "ingest" is first called on the stale region's peers to check whether they are busy: 1,2,3
+	// then "write" goes to the stale region: 1,2,3
+	// then "ingest" to the stale leader 1 hits the NotLeader error,
+	// so the region is rescanned and the new region (11,12,13) is found,
+	// after which the whole sequence repeats for 11,12,13
+	require.Equal(t, []uint64{1, 2, 3, 11, 12, 13}, apiInvokeRecorder["Write"])
+	require.Equal(t, []uint64{1, 2, 3, 1, 11, 12, 13, 11}, apiInvokeRecorder["MultiIngest"])
+}
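
A rough sketch of the control flow this change enforces, under a simplified model (the types and helpers below, regionInfo, scanRegion, writeAndIngest and errNotLeader, are invented for illustration and do not exist in br/pkg/lightning): isIngestRetryable now maps NotLeader to retryNone and surfaces ErrKVNotLeader, so the write-and-ingest loop treats it as a retryable import error and restarts the attempt from a fresh region scan, instead of patching the leader into the old region info and retrying ingest there.

package main

import (
	"errors"
	"fmt"
)

// regionInfo, scanRegion and writeAndIngest are stand-ins made up for this
// sketch; they are not part of the lightning code base.
type regionInfo struct {
	leaderStoreID uint64
	peerStoreIDs  []uint64
}

var errNotLeader = errors.New("not leader")

// writeAndIngest pretends to write and ingest KV pairs into the region.
// The stale region (leader on store 1) always answers NotLeader.
func writeAndIngest(r regionInfo) error {
	if r.leaderStoreID == 1 {
		return errNotLeader
	}
	return nil
}

// scanRegion simulates re-scanning region info from PD: the first scan returns
// the stale peers (1,2,3), later scans return the refreshed peers (11,12,13).
func scanRegion(attempt int) regionInfo {
	if attempt == 0 {
		return regionInfo{leaderStoreID: 1, peerStoreIDs: []uint64{1, 2, 3}}
	}
	return regionInfo{leaderStoreID: 11, peerStoreIDs: []uint64{11, 12, 13}}
}

func main() {
	const maxRetries = 3
	for attempt := 0; attempt < maxRetries; attempt++ {
		region := scanRegion(attempt) // every attempt starts from a fresh region scan
		err := writeAndIngest(region)
		if err == nil {
			fmt.Printf("ingest succeeded on leader store %d\n", region.leaderStoreID)
			return
		}
		// After this patch, NotLeader is no longer retried in place with a
		// leader patched from the error; the whole attempt restarts from the
		// region scan above.
		if errors.Is(err, errNotLeader) {
			fmt.Printf("NotLeader from store %d, rescanning region\n", region.leaderStoreID)
			continue
		}
		fmt.Println("non-retryable error:", err)
		return
	}
	fmt.Println("gave up after retries")
}

With the stale region info above, the first attempt fails with NotLeader and the second attempt scans the refreshed region (leader on store 11) and succeeds, which mirrors the Write and MultiIngest sequences asserted in TestNotLeaderErrorNeedUpdatePeers.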