lightning: always retry from rescan when NotLeader (#43351)
* lightning: always retry from rescan when NotLeader

Signed-off-by: lance6716 <lance6716@gmail.com>

* fix CI

Signed-off-by: lance6716 <lance6716@gmail.com>

* fix CI

Signed-off-by: lance6716 <lance6716@gmail.com>

---------

Signed-off-by: lance6716 <lance6716@gmail.com>
lance6716 authored Apr 24, 2023
1 parent ac8b256 · commit b693b2a
Showing 2 changed files with 100 additions and 20 deletions.
br/pkg/lightning/backend/local/local.go (19 changes: 2 additions & 17 deletions)
@@ -1359,7 +1359,7 @@ WriteAndIngest:
 			local.ingestConcurrency.Recycle(w)
 			if err != nil {
 				if !local.isRetryableImportTiKVError(err) {
-					return err
+					return errors.Trace(err)
 				}
 				_, regionStart, _ := codec.DecodeBytes(region.Region.StartKey, []byte{})
 				// if we have at least succeeded one region, retry without increasing the retry count
@@ -1987,22 +1987,7 @@ func (local *local) isIngestRetryable(
 	var err error
 	switch errPb := resp.GetError(); {
 	case errPb.NotLeader != nil:
-		if newLeader := errPb.GetNotLeader().GetLeader(); newLeader != nil {
-			newRegion = &split.RegionInfo{
-				Leader: newLeader,
-				Region: region.Region,
-			}
-		} else {
-			newRegion, err = getRegion()
-			if err != nil {
-				return retryNone, nil, errors.Trace(err)
-			}
-		}
-		// TODO: because in some case, TiKV may return retryable error while the ingest is succeeded.
-		// Thus directly retry ingest may cause TiKV panic. So always return retryWrite here to avoid
-		// this issue.
-		// See: https://github.com/tikv/tikv/issues/9496
-		return retryWrite, newRegion, common.ErrKVNotLeader.GenWithStack(errPb.GetMessage())
+		return retryNone, nil, common.ErrKVNotLeader.GenWithStack(errPb.GetMessage())
 	case errPb.EpochNotMatch != nil:
 		if currentRegions := errPb.GetEpochNotMatch().GetCurrentRegions(); currentRegions != nil {
 			var currentRegion *metapb.Region
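The behavioural change above can be read as: a NotLeader response from ingest is no longer patched up in place by switching to the leader reported by TiKV; it is surfaced as an error so the caller re-scans the region ranges and repeats the write and ingest with fresh region and peer information. Below is a minimal, self-contained sketch of that control flow, assuming a simplified retry loop; writeAndIngestOnce, errNotLeader, and the retry budget are illustrative stand-ins, not lightning APIs.

package main

import (
	"errors"
	"fmt"
)

// errNotLeader stands in for the NotLeader ingest response; in the real code
// this corresponds to common.ErrKVNotLeader.
var errNotLeader = errors.New("not leader")

// writeAndIngestOnce stands in for one write+ingest pass over a key range,
// using whatever region/peer information was scanned most recently.
func writeAndIngestOnce(regionsAreFresh bool) error {
	if !regionsAreFresh {
		return errNotLeader
	}
	return nil
}

func main() {
	regionsAreFresh := false // simulate stale region info on the first pass
	for attempt := 1; attempt <= 3; attempt++ {
		err := writeAndIngestOnce(regionsAreFresh)
		if err == nil {
			fmt.Printf("attempt %d: ingest succeeded\n", attempt)
			return
		}
		if errors.Is(err, errNotLeader) {
			// New behaviour: do not switch to the reported leader in place;
			// re-scan the regions so the next pass uses up-to-date peers.
			fmt.Printf("attempt %d: NotLeader, rescanning regions before retrying\n", attempt)
			regionsAreFresh = true
			continue
		}
		fmt.Printf("attempt %d: unrecoverable error: %v\n", attempt, err)
		return
	}
}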
br/pkg/lightning/backend/local/local_test.go (101 changes: 98 additions & 3 deletions)
@@ -479,9 +479,8 @@ func TestIsIngestRetryable(t *testing.T) {
 			},
 		},
 	}
-	retryType, newRegion, err := local.isIngestRetryable(ctx, resp, region, metas)
-	require.Equal(t, retryWrite, retryType)
-	require.Equal(t, uint64(2), newRegion.Leader.Id)
+	retryType, _, err := local.isIngestRetryable(ctx, resp, region, metas)
+	require.Equal(t, retryNone, retryType)
 	require.Error(t, err)
 
 	resp.Error = &errorpb.Error{
@@ -500,6 +499,7 @@ func TestIsIngestRetryable(t *testing.T) {
 			},
 		},
 	}
+	var newRegion *split.RegionInfo
 	retryType, newRegion, err = local.isIngestRetryable(ctx, resp, region, metas)
 	require.Equal(t, retryWrite, retryType)
 	require.Equal(t, uint64(2), newRegion.Region.RegionEpoch.Version)
@@ -1327,3 +1327,98 @@ func TestCheckPeersBusy(t *testing.T) {
 	// store 12 has a follower busy, so it will cause region peers (11, 12, 13) retry once
 	require.Equal(t, []uint64{11, 12, 11, 12, 13, 11, 21, 22, 23, 21}, apiInvokeRecorder["MultiIngest"])
 }
+
+type regionChangedHook struct {
+	noopHook
+	scanCount int
+}
+
+func (r *regionChangedHook) AfterScanRegions(res []*split.RegionInfo, err error) ([]*split.RegionInfo, error) {
+	r.scanCount++
+	if r.scanCount == 1 {
+		return res, err
+	}
+	for _, info := range res {
+		// skip modified region
+		if info.Leader.Id > 3 {
+			return res, err
+		}
+		for _, p := range info.Region.Peers {
+			p.Id += 10
+			p.StoreId += 10
+		}
+	}
+
+	return res, err
+}
+
+func TestNotLeaderErrorNeedUpdatePeers(t *testing.T) {
+	ctx := context.Background()
+
+	// test lightning using stale region info (1,2,3), now the region is (11,12,13)
+	apiInvokeRecorder := map[string][]uint64{}
+	notLeaderResp := &sst.IngestResponse{
+		Error: &errorpb.Error{
+			NotLeader: &errorpb.NotLeader{Leader: &metapb.Peer{StoreId: 11}},
+		}}
+
+	pdCli := &mockPdClient{}
+	pdCtl := &pdutil.PdController{}
+	pdCtl.SetPDClient(pdCli)
+
+	h := &regionChangedHook{}
+	splitCli := initTestSplitClient3Replica([][]byte{{}, {'b'}, {}}, h)
+
+	local := &local{
+		pdCtl:    pdCtl,
+		splitCli: splitCli,
+		importClientFactory: &mockImportClientFactory{
+			stores: []*metapb.Store{
+				{Id: 1}, {Id: 2}, {Id: 3},
+				{Id: 11}, {Id: 12}, {Id: 13},
+			},
+			createClientFn: func(store *metapb.Store) sst.ImportSSTClient {
+				importCli := newMockImportClient()
+				importCli.store = store
+				importCli.apiInvokeRecorder = apiInvokeRecorder
+				if store.Id == 1 {
+					importCli.retry = 1
+					importCli.resp = notLeaderResp
+				}
+				return importCli
+			},
+		},
+		logger:                log.L(),
+		ingestConcurrency:     worker.NewPool(ctx, 1, "ingest"),
+		writeLimiter:          noopStoreWriteLimiter{},
+		bufferPool:            membuf.NewPool(),
+		supportMultiIngest:    true,
+		shouldCheckWriteStall: true,
+	}
+
+	db, tmpPath := makePebbleDB(t, nil)
+	_, engineUUID := backend.MakeUUID("ww", 0)
+	engineCtx, cancel := context.WithCancel(context.Background())
+	f := &Engine{
+		db:           db,
+		UUID:         engineUUID,
+		sstDir:       tmpPath,
+		ctx:          engineCtx,
+		cancel:       cancel,
+		sstMetasChan: make(chan metaOrFlush, 64),
+		keyAdapter:   noopKeyAdapter{},
+		logger:       log.L(),
+	}
+	err := f.db.Set([]byte("a"), []byte("a"), nil)
+	require.NoError(t, err)
+	err = local.writeAndIngestByRange(ctx, f, []byte{}, []byte("b"), 0, 0)
+	require.NoError(t, err)
+
+	// "ingest" to test peers busy of stale region: 1,2,3
+	// then "write" to stale region: 1,2,3
+	// then "ingest" to stale leader: 1
+	// then meet NotLeader error, scanned new region (11,12,13)
+	// repeat above for 11,12,13
+	require.Equal(t, []uint64{1, 2, 3, 11, 12, 13}, apiInvokeRecorder["Write"])
+	require.Equal(t, []uint64{1, 2, 3, 1, 11, 12, 13, 11}, apiInvokeRecorder["MultiIngest"])
+}
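The new test drives this path by letting a scan hook rewrite peer IDs after the first scan, so lightning first works against the stale peers (1, 2, 3) and only sees the moved region (11, 12, 13) once the NotLeader error forces a rescan. Here is a stripped-down, runnable illustration of that hook pattern; all names are hypothetical and not part of the lightning codebase.

package main

import "fmt"

type peer struct{ id, storeID uint64 }

// regionMovedHook mimics the shape of the test hook above: the first scan
// returns the membership unchanged, later scans return the "moved" region.
type regionMovedHook struct{ scans int }

func (h *regionMovedHook) afterScan(peers []peer) []peer {
	h.scans++
	if h.scans == 1 {
		return peers // first scan: stale membership (1, 2, 3)
	}
	moved := make([]peer, len(peers))
	for i, p := range peers {
		moved[i] = peer{id: p.id + 10, storeID: p.storeID + 10} // region has moved to (11, 12, 13)
	}
	return moved
}

func main() {
	h := &regionMovedHook{}
	stale := []peer{{1, 1}, {2, 2}, {3, 3}}
	fmt.Println(h.afterScan(stale)) // [{1 1} {2 2} {3 3}]
	fmt.Println(h.afterScan(stale)) // [{11 11} {12 12} {13 13}]
}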
