-
Notifications
You must be signed in to change notification settings - Fork 5.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
store/tikv:Ignore error and do gc anyway #5797
Changes from 6 commits
d1c6181
774980e
3a09833
1f724ac
68c8f4b
65ef1bf
971928b
f54618f
68b5a60
6702d83
4112b67
30516f1
9e0c930
e02f351
8360650
afd9c75
d24eb68
3d0155a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import ( | |
"time" | ||
|
||
"github.com/juju/errors" | ||
"github.com/pingcap/kvproto/pkg/errorpb" | ||
"github.com/pingcap/kvproto/pkg/kvrpcpb" | ||
"github.com/pingcap/tidb" | ||
"github.com/pingcap/tidb/ddl/util" | ||
|
@@ -90,6 +91,7 @@ func (w *GCWorker) Close() { | |
const ( | ||
gcTimeFormat = "20060102-15:04:05 -0700 MST" | ||
gcWorkerTickInterval = time.Minute | ||
gcJobLogTickInterval = time.Minute * 10 | ||
gcWorkerLease = time.Minute * 2 | ||
gcLeaderUUIDKey = "tikv_gc_leader_uuid" | ||
gcLeaderDescKey = "tikv_gc_leader_desc" | ||
|
@@ -327,7 +329,6 @@ func (w *GCWorker) runGCJob(ctx goctx.Context, safePoint uint64) { | |
} | ||
err = doGC(ctx, w.store, safePoint, w.uuid) | ||
if err != nil { | ||
gcFailureCounter.WithLabelValues("gc").Inc() | ||
log.Error("do GC returns an error", err) | ||
w.gcIsRunning = false | ||
w.done <- errors.Trace(err) | ||
|
@@ -346,7 +347,7 @@ func (w *GCWorker) deleteRanges(ctx goctx.Context, safePoint uint64) error { | |
return errors.Trace(err) | ||
} | ||
|
||
bo := tikv.NewBackoffer(tikv.GcDeleteRangeMaxBackoff, goctx.Background()) | ||
bo := tikv.NewBackoffer(tikv.GcDeleteRangeMaxBackoff, ctx) | ||
log.Infof("[gc worker] %s start delete %v ranges", w.uuid, len(ranges)) | ||
startTime := time.Now() | ||
regions := 0 | ||
|
@@ -430,7 +431,7 @@ func resolveLocks(ctx goctx.Context, store tikv.Storage, safePoint uint64, ident | |
Limit: gcScanLockLimit, | ||
}, | ||
} | ||
bo := tikv.NewBackoffer(tikv.GcResolveLockMaxBackoff, goctx.Background()) | ||
bo := tikv.NewBackoffer(tikv.GcResolveLockMaxBackoff, ctx) | ||
|
||
log.Infof("[gc worker] %s start resolve locks, safePoint: %v.", identifier, safePoint) | ||
startTime := time.Now() | ||
|
@@ -515,63 +516,94 @@ func doGC(ctx goctx.Context, store tikv.Storage, safePoint uint64, identifier st | |
// Sleep to wait for all other tidb instances update their safepoint cache. | ||
time.Sleep(gcSafePointCacheInterval) | ||
|
||
log.Infof("[gc worker] %s start gc, safePoint: %v.", identifier, safePoint) | ||
startTime := time.Now() | ||
regions := 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/regions/successRegions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
errRegions := 0 | ||
|
||
ticker := time.NewTicker(gcJobLogTickInterval) | ||
defer ticker.Stop() | ||
|
||
req := &tikvrpc.Request{ | ||
Type: tikvrpc.CmdGC, | ||
GC: &kvrpcpb.GCRequest{ | ||
SafePoint: safePoint, | ||
}, | ||
} | ||
bo := tikv.NewBackoffer(tikv.GcMaxBackoff, goctx.Background()) | ||
|
||
log.Infof("[gc worker] %s start gc, safePoint: %v.", identifier, safePoint) | ||
startTime := time.Now() | ||
regions := 0 | ||
|
||
bo := tikv.NewBackoffer(tikv.GcOneRegionMaxBackoff, ctx) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Initialize it in following for loop seems better. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is hard to do because It should not be initialized when we meet region error. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. |
||
var key []byte | ||
for { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove the loop. |
||
select { | ||
case <-ctx.Done(): | ||
return errors.New("[gc worker] gc job canceled") | ||
case <-ticker.C: | ||
log.Infof("[gc worker] %s gc in process, safePoint: %v, regions: %v, ignored regions: %v, cost time: %s", | ||
identifier, safePoint, regions, errRegions, time.Since(startTime)) | ||
default: | ||
} | ||
|
||
loc, err := store.GetRegionCache().LocateKey(bo, key) | ||
if err != nil { | ||
return errors.Trace(err) | ||
} | ||
resp, err := store.SendReq(bo, req, loc.Region, tikv.ReadTimeoutLong) | ||
if err != nil { | ||
return errors.Trace(err) | ||
} | ||
regionErr, err := resp.GetRegionError() | ||
|
||
var regionErr *errorpb.Error | ||
regionErr, err = doGCForOneRegion(bo, store, req, loc.Region) | ||
|
||
if err != nil { | ||
return errors.Trace(err) | ||
} | ||
if regionErr != nil { | ||
errRegions++ | ||
log.Warnf("[gc worker] %s failed to do gc on region(%s, %s), ignore it", identifier, string(loc.StartKey), string(loc.EndKey)) | ||
gcFailureCounter.WithLabelValues("gc").Inc() | ||
} else if regionErr != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if regionErr != nil { There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is better to check err first, because if |
||
err = bo.Backoff(tikv.BoRegionMiss, errors.New(regionErr.String())) | ||
if err != nil { | ||
return errors.Trace(err) | ||
errRegions++ | ||
log.Warnf("[gc worker] %s failed to do gc on region(%s, %s), ignore it", identifier, string(loc.StartKey), string(loc.EndKey)) | ||
gcFailureCounter.WithLabelValues("gc").Inc() | ||
} else { | ||
continue | ||
} | ||
continue | ||
} | ||
gcResp := resp.GC | ||
if gcResp == nil { | ||
return errors.Trace(tikv.ErrBodyMissing) | ||
} | ||
if gcResp.GetError() != nil { | ||
return errors.Errorf("unexpected gc error: %s", gcResp.GetError()) | ||
} else { | ||
regions++ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if err != nil { |
||
} | ||
regions++ | ||
|
||
key = loc.EndKey | ||
if len(key) == 0 { | ||
break | ||
} | ||
bo = tikv.NewBackoffer(tikv.GcOneRegionMaxBackoff, ctx) | ||
} | ||
log.Infof("[gc worker] %s finish gc, safePoint: %v, regions: %v, cost time: %s", identifier, safePoint, regions, time.Since(startTime)) | ||
log.Infof("[gc worker] %s finish gc, safePoint: %v, regions: %v, ignored regions: %v, cost time: %s", | ||
identifier, safePoint, regions, errRegions, time.Since(startTime)) | ||
gcHistogram.WithLabelValues("do_gc").Observe(time.Since(startTime).Seconds()) | ||
return nil | ||
} | ||
|
||
func doGCForOneRegion(bo *tikv.Backoffer, store tikv.Storage, req *tikvrpc.Request, region tikv.RegionVerID) (*errorpb.Error, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems we should move the construction of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think for the completion, it should be move in here, but if we have 1M regions, this cost very large extra memory. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The cost is acceptable. If i pass an request not gc, the behavior is not consistent with the function name. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, I will change it |
||
resp, err := store.SendReq(bo, req, region, tikv.GCTimeout) | ||
if err != nil { | ||
return nil, errors.Trace(err) | ||
} | ||
regionErr, err := resp.GetRegionError() | ||
if err != nil { | ||
return nil, errors.Trace(err) | ||
} | ||
if regionErr != nil { | ||
return regionErr, nil | ||
} | ||
|
||
gcResp := resp.GC | ||
if gcResp == nil { | ||
return nil, errors.Trace(tikv.ErrBodyMissing) | ||
} | ||
if gcResp.GetError() != nil { | ||
return nil, errors.Errorf("unexpected gc error: %s", gcResp.GetError()) | ||
} | ||
|
||
return nil, nil | ||
} | ||
|
||
func (w *GCWorker) checkLeader() (bool, error) { | ||
gcWorkerCounter.WithLabelValues("check_leader").Inc() | ||
session := createSession(w.store) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why change this value?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think gc command can definitely returns in 20 secs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, I will change it for one request