mpp: check the tiflash availabilities before launching mpp queries. #26130

Merged: 18 commits, Jul 13, 2021
4 changes: 2 additions & 2 deletions go.mod
@@ -42,7 +42,7 @@ require (
github.com/pingcap/errors v0.11.5-0.20210425183316-da1aaba5fb63
github.com/pingcap/failpoint v0.0.0-20210316064728-7acb0f0a3dfd
github.com/pingcap/fn v0.0.0-20200306044125-d5540d389059
github.com/pingcap/kvproto v0.0.0-20210611081648-a215b4e61d2f
github.com/pingcap/kvproto v0.0.0-20210712050333-b66fdbd6bfd5
github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7
github.com/pingcap/parser v0.0.0-20210707071004-31c87e37af5c
github.com/pingcap/sysutil v0.0.0-20210315073920-cc0985d983a3
@@ -56,7 +56,7 @@ require (
github.com/soheilhy/cmux v0.1.4
github.com/stretchr/testify v1.7.0
github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2
github.com/tikv/client-go/v2 v2.0.0-alpha.0.20210709052506-aadf3cf62721
github.com/tikv/client-go/v2 v2.0.0-alpha.0.20210712082038-2c7970b2b7e8
github.com/tikv/pd v1.1.0-beta.0.20210323121136-78679e5e209d
github.com/twmb/murmur3 v1.1.3
github.com/uber-go/atomic v1.4.0
9 changes: 4 additions & 5 deletions go.sum
@@ -437,9 +437,8 @@ github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989/go.mod h1:O17Xtb
github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w=
github.com/pingcap/kvproto v0.0.0-20200411081810-b85805c9476c/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/kvproto v0.0.0-20210219064844-c1844a4775d6/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/kvproto v0.0.0-20210531063847-f42e582bf0bb/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/kvproto v0.0.0-20210611081648-a215b4e61d2f h1:6K+5nbl1I3el9mYt/mlWorBR95qqmQdwrXYaNQrWWE8=
github.com/pingcap/kvproto v0.0.0-20210611081648-a215b4e61d2f/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/kvproto v0.0.0-20210712050333-b66fdbd6bfd5 h1:LN/ml4lm5+AYdn+N/CJ102wFUph2OIpo8hHqi8QxKiQ=
github.com/pingcap/kvproto v0.0.0-20210712050333-b66fdbd6bfd5/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/log v0.0.0-20191012051959-b742a5d432e9/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
github.com/pingcap/log v0.0.0-20200511115504-543df19646ad/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
github.com/pingcap/log v0.0.0-20201112100606-8f1e84a3abc8/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
@@ -562,8 +561,8 @@ github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2/go.mod h1:2PfK
github.com/tidwall/gjson v1.3.5/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
github.com/tikv/client-go/v2 v2.0.0-alpha.0.20210709052506-aadf3cf62721 h1:TxzBXVMbGWm5SuLpWgWoj4HgPbeCNdq1t9zrO6/UXTA=
github.com/tikv/client-go/v2 v2.0.0-alpha.0.20210709052506-aadf3cf62721/go.mod h1:4odrPuyxU7g2kgFK9V4i1ZBOg+EDIeHCGWPygqA0ym4=
github.com/tikv/client-go/v2 v2.0.0-alpha.0.20210712082038-2c7970b2b7e8 h1:xMbxiAVbJPzzrITV8yqUBgltm4GNdYebkuwUBB5RX9Y=
github.com/tikv/client-go/v2 v2.0.0-alpha.0.20210712082038-2c7970b2b7e8/go.mod h1:+bOiuuZZUqIq19EqyhTWQFaB0PeXLOh/il1vnVZx3Tk=
github.com/tikv/pd v1.1.0-beta.0.20210323121136-78679e5e209d h1:K0XnvsnT6ofLDuM8Rt3PuFQO4p8bNraeHYstspD316g=
github.com/tikv/pd v1.1.0-beta.0.20210323121136-78679e5e209d/go.mod h1:Jw9KG11C/23Rr7DW4XWQ7H5xOgGZo6DFL1OKAF4+Igw=
github.com/tklauser/go-sysconf v0.3.4 h1:HT8SVixZd3IzLdfs/xlpq0jeSfTX57g1v6wB1EuzV7M=
98 changes: 73 additions & 25 deletions store/copr/batch_coprocessor.go
@@ -27,6 +27,7 @@ import (
"github.com/pingcap/kvproto/pkg/coprocessor"
"github.com/pingcap/kvproto/pkg/kvrpcpb"
"github.com/pingcap/kvproto/pkg/metapb"
"github.com/pingcap/kvproto/pkg/mpp"
"github.com/pingcap/log"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/store/driver/backoff"
@@ -102,46 +103,90 @@ func (rs *batchCopResponse) RespTime() time.Duration {
// 2. for the remaining regions:
// if there is only 1 available store, then put the region to the related store
// otherwise, use a greedy algorithm to put it into the store with highest weight
func balanceBatchCopTask(originalTasks []*batchCopTask) []*batchCopTask {
func balanceBatchCopTask(ctx context.Context, kvStore *kvStore, originalTasks []*batchCopTask, isMPP bool) []*batchCopTask {
if len(originalTasks) <= 1 {
return originalTasks
}
cache := kvStore.GetRegionCache()
storeTaskMap := make(map[uint64]*batchCopTask)
// storeCandidateRegionMap stores all the possible store->region map. Its content is
// store id -> region signature -> region info. We can see it as store id -> region lists.
storeCandidateRegionMap := make(map[uint64]map[string]RegionInfo)
totalRegionCandidateNum := 0
totalRemainingRegionNum := 0

for _, task := range originalTasks {
taskStoreID := task.regionInfos[0].AllStores[0]
batchTask := &batchCopTask{
storeAddr: task.storeAddr,
cmdType: task.cmdType,
ctx: task.ctx,
regionInfos: []RegionInfo{task.regionInfos[0]},
if !isMPP {
for _, task := range originalTasks {
taskStoreID := task.regionInfos[0].AllStores[0]
batchTask := &batchCopTask{
storeAddr: task.storeAddr,
cmdType: task.cmdType,
ctx: task.ctx,
regionInfos: []RegionInfo{task.regionInfos[0]},
}
storeTaskMap[taskStoreID] = batchTask
}
} else {
// decide the available stores
stores := cache.RegionCache.GetTiFlashStores()
var wg sync.WaitGroup
var mu sync.Mutex
wg.Add(len(stores))
for i := range stores {
go func(idx int) {
defer wg.Done()
s := stores[idx]
aliveReq := tikvrpc.NewRequest(tikvrpc.CmdMPPAlive, &mpp.IsAliveRequest{}, kvrpcpb.Context{})
aliveReq.StoreTp = tikvrpc.TiFlash
alive := false
resp, err := kvStore.GetTiKVClient().SendRequest(ctx, s.GetAddr(), aliveReq, tikv.ReadTimeoutMedium)
if err != nil {
logutil.BgLogger().Warn("Cannot detect store's availablity", zap.String("store address", s.GetAddr()), zap.String("err message", err.Error()))
} else {
rpcResp := resp.Resp.(*mpp.IsAliveResponse)
if rpcResp.Available {
alive = true
} else {
logutil.BgLogger().Warn("Cannot detect store's availablity", zap.String("store address", s.GetAddr()))
}
}
if !alive {
return
}

mu.Lock()
defer mu.Unlock()
storeTaskMap[s.StoreID()] = &batchCopTask{
storeAddr: s.GetAddr(),
cmdType: originalTasks[0].cmdType,
ctx: &tikv.RPCContext{Addr: s.GetAddr(), Store: s},
}
}(i)
}
storeTaskMap[taskStoreID] = batchTask
wg.Wait()
}

for _, task := range originalTasks {
taskStoreID := task.regionInfos[0].AllStores[0]
for index, ri := range task.regionInfos {
// for each region, figure out the valid store num
validStoreNum := 0
if index == 0 {
if index == 0 && !isMPP {
continue
}
if len(ri.AllStores) <= 1 {
validStoreNum = 1
} else {
for _, storeID := range ri.AllStores {
if _, ok := storeTaskMap[storeID]; ok {
validStoreNum++
}
var validStoreID uint64
for _, storeID := range ri.AllStores {
if _, ok := storeTaskMap[storeID]; ok {
validStoreNum++
// original store id might be invalid, so we have to set it again.
validStoreID = storeID
}
}
if validStoreNum == 1 {
if validStoreNum == 0 {
logutil.BgLogger().Warn("Meet regions that don't have an available store. Give up balancing")
return originalTasks
} else if validStoreNum == 1 {
// if only one store is valid, just put it to storeTaskMap
storeTaskMap[taskStoreID].regionInfos = append(storeTaskMap[taskStoreID].regionInfos, ri)
storeTaskMap[validStoreID].regionInfos = append(storeTaskMap[validStoreID].regionInfos, ri)
} else {
// if more than one store is valid, put the region
// to store candidate map
@@ -239,12 +284,15 @@ func balanceBatchCopTask(originalTasks []*batchCopTask) []*batchCopTask {

var ret []*batchCopTask
for _, task := range storeTaskMap {
ret = append(ret, task)
if len(task.regionInfos) > 0 {
ret = append(ret, task)
}
}
return ret
}
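
Note: the greedy half of the balancing described in the function comment (each remaining multi-candidate region goes to "the store with highest weight") lives in a hunk that is collapsed in this view. Purely as an illustration, and with the weight definition below being an assumption rather than the PR's actual scoring, a minimal greedy assignment could look like this:

// Hypothetical sketch of the greedy step. Assumption: "highest weight" means the
// candidate store that has been assigned the fewest regions so far, so load
// spreads evenly; the real scoring inside balanceBatchCopTask may differ.
package balancesketch

// assignGreedy maps each region signature that has several candidate stores to
// the candidate currently holding the fewest regions.
func assignGreedy(candidates map[string][]uint64, assigned map[uint64]int) map[string]uint64 {
	result := make(map[string]uint64, len(candidates))
	for region, stores := range candidates {
		best := stores[0]
		for _, storeID := range stores[1:] {
			if assigned[storeID] < assigned[best] {
				best = storeID // least-loaded candidate wins
			}
		}
		assigned[best]++
		result[region] = best
	}
	return result
}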

func buildBatchCopTasks(bo *Backoffer, cache *RegionCache, ranges *KeyRanges, storeType kv.StoreType) ([]*batchCopTask, error) {
func buildBatchCopTasks(bo *Backoffer, store *kvStore, ranges *KeyRanges, storeType kv.StoreType, isMPP bool) ([]*batchCopTask, error) {
cache := store.GetRegionCache()
start := time.Now()
const cmdType = tikvrpc.CmdBatchCop
rangesLen := ranges.Len()
@@ -318,7 +366,7 @@ func buildBatchCopTasks(bo *Backoffer, cache *RegionCache, ranges *KeyRanges, st
}
logutil.BgLogger().Debug(msg)
}
batchTasks = balanceBatchCopTask(batchTasks)
batchTasks = balanceBatchCopTask(bo.GetCtx(), store, batchTasks, isMPP)
if log.GetLevel() <= zap.DebugLevel {
msg := "After region balance:"
for _, task := range batchTasks {
@@ -345,7 +393,7 @@ func (c *CopClient) sendBatch(ctx context.Context, req *kv.Request, vars *tikv.V
ctx = context.WithValue(ctx, tikv.TxnStartKey(), req.StartTs)
bo := backoff.NewBackofferWithVars(ctx, copBuildTaskMaxBackoff, vars)
ranges := NewKeyRanges(req.KeyRanges)
tasks, err := buildBatchCopTasks(bo, c.store.GetRegionCache(), ranges, req.StoreType)
tasks, err := buildBatchCopTasks(bo, c.store.kvStore, ranges, req.StoreType, false)
if err != nil {
return copErrorResponse{err}
}
Expand Down Expand Up @@ -486,7 +534,7 @@ func (b *batchCopIterator) retryBatchCopTask(ctx context.Context, bo *Backoffer,
ranges = append(ranges, *ran)
})
}
return buildBatchCopTasks(bo, b.store.GetRegionCache(), NewKeyRanges(ranges), b.req.StoreType)
return buildBatchCopTasks(bo, b.store, NewKeyRanges(ranges), b.req.StoreType, false)
}

const readTimeoutUltraLong = 3600 * time.Second // For requests that may scan many regions for tiflash.
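
Note: the new isMPP branch above amounts to a parallel liveness probe of every TiFlash store before any region is handed out. Below is a condensed sketch of that pattern reduced to plain Go; the aliveChecker interface is a hypothetical stand-in for kvStore.GetTiKVClient() sending tikvrpc.CmdMPPAlive with an mpp.IsAliveRequest, not a TiDB API.

package mppprobe

import (
	"context"
	"sync"
	"time"
)

// aliveChecker stands in for the real RPC client used in balanceBatchCopTask.
type aliveChecker interface {
	// IsAlive reports whether the TiFlash store at addr answered the probe.
	IsAlive(ctx context.Context, addr string, timeout time.Duration) (bool, error)
}

// filterAliveStores probes all stores concurrently and keeps only those that
// respond and report themselves available; RPC errors count as "not available",
// mirroring how the PR skips unreachable TiFlash nodes.
func filterAliveStores(ctx context.Context, c aliveChecker, addrs []string, timeout time.Duration) []string {
	var (
		wg    sync.WaitGroup
		mu    sync.Mutex
		alive []string
	)
	wg.Add(len(addrs))
	for _, addr := range addrs {
		go func(addr string) {
			defer wg.Done()
			ok, err := c.IsAlive(ctx, addr, timeout)
			if err != nil || !ok {
				return
			}
			mu.Lock()
			alive = append(alive, addr)
			mu.Unlock()
		}(addr)
	}
	wg.Wait()
	return alive
}

A caller would then build one batchCopTask per surviving address, exactly as the isMPP branch does when it fills storeTaskMap.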
6 changes: 3 additions & 3 deletions store/copr/mpp.go
@@ -47,8 +47,8 @@ func (c *batchCopTask) GetAddress() string {

func (c *MPPClient) selectAllTiFlashStore() []kv.MPPTaskMeta {
resultTasks := make([]kv.MPPTaskMeta, 0)
for _, addr := range c.store.GetRegionCache().GetTiFlashStoreAddrs() {
task := &batchCopTask{storeAddr: addr, cmdType: tikvrpc.CmdMPPTask}
for _, s := range c.store.GetRegionCache().GetTiFlashStores() {
task := &batchCopTask{storeAddr: s.GetAddr(), cmdType: tikvrpc.CmdMPPTask}
resultTasks = append(resultTasks, task)
}
return resultTasks
@@ -62,7 +62,7 @@ func (c *MPPClient) ConstructMPPTasks(ctx context.Context, req *kv.MPPBuildTasks
return c.selectAllTiFlashStore(), nil
}
ranges := NewKeyRanges(req.KeyRanges)
tasks, err := buildBatchCopTasks(bo, c.store.GetRegionCache(), ranges, kv.TiFlash)
tasks, err := buildBatchCopTasks(bo, c.store, ranges, kv.TiFlash, true)
if err != nil {
return nil, errors.Trace(err)
}
7 changes: 6 additions & 1 deletion store/mockstore/unistore/tikv/server.go
@@ -636,7 +636,12 @@ func (mrm *MockRegionManager) removeMPPTaskHandler(taskID int64, storeID uint64)
return errors.New("cannot find mpp task")
}

// DispatchMPPTask implements implements the tikvpb.TikvServer interface.
// IsAlive implements the tikvpb.TikvServer interface.
func (svr *Server) IsAlive(_ context.Context, _ *mpp.IsAliveRequest) (*mpp.IsAliveResponse, error) {
panic("todo")
}

// DispatchMPPTask implements the tikvpb.TikvServer interface.
func (svr *Server) DispatchMPPTask(_ context.Context, _ *mpp.DispatchTaskRequest) (*mpp.DispatchTaskResponse, error) {
panic("todo")
}
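
Note: the unistore mock only stubs IsAlive with panic("todo"). A hypothetical completion (not part of this PR) that would let MPP availability checks pass against the mock store is to simply report it as available:

// Hypothetical body for the stub above; unistore runs no real TiFlash process,
// so the mock store just declares itself alive.
func (svr *Server) IsAlive(_ context.Context, _ *mpp.IsAliveRequest) (*mpp.IsAliveResponse, error) {
	return &mpp.IsAliveResponse{Available: true}, nil
}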