Skip to content

Commit b100812

Browse files
tangenta authored and hawkingrei committed
ddl: record get owner TS and compare it before runReorgJob quit (pingcap#55049)
close pingcap#54897
1 parent 75fc104 commit b100812

File tree

3 files changed

+36
-4
lines changed

3 files changed

+36
-4
lines changed

pkg/ddl/ddl.go

+14-1
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,19 @@ type reorgContexts struct {
519519
sync.RWMutex
520520
// reorgCtxMap maps job ID to reorg context.
521521
reorgCtxMap map[int64]*reorgCtx
522+
beOwnerTS int64
523+
}
524+
525+
// getOwnerTS returns the current value of beOwnerTS, reading it while holding the read lock.
func (r *reorgContexts) getOwnerTS() int64 {
526+
r.RLock()
527+
defer r.RUnlock()
528+
return r.beOwnerTS
529+
}
530+
531+
// setOwnerTS stores ts in beOwnerTS while holding the write lock.
func (r *reorgContexts) setOwnerTS(ts int64) {
532+
r.Lock()
533+
r.beOwnerTS = ts
534+
r.Unlock()
522535
}
523536

524537
func (dc *ddlCtx) getReorgCtx(jobID int64) *reorgCtx {
@@ -536,7 +549,7 @@ func (dc *ddlCtx) newReorgCtx(jobID int64, rowCount int64) *reorgCtx {
536549
return existedRC
537550
}
538551
rc := &reorgCtx{}
539-
rc.doneCh = make(chan error, 1)
552+
rc.doneCh = make(chan reorgFnResult, 1)
540553
// initial reorgCtx
541554
rc.setRowCount(rowCount)
542555
rc.mu.warnings = make(map[errors.ErrorID]*terror.Error)

pkg/ddl/job_scheduler.go

+1
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ func (l *ownerListener) OnBecomeOwner() {
112112
sessPool: l.ddl.sessPool,
113113
delRangeMgr: l.ddl.delRangeMgr,
114114
}
115+
l.ddl.reorgCtx.setOwnerTS(time.Now().Unix())
115116
l.scheduler.start()
116117
}
117118

pkg/ddl/reorg.go

+21-3
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ type reorgCtx struct {
6565
// If the reorganization job is done, we will use this channel to notify outer.
6666
// TODO: Now we use goroutine to simulate reorganization jobs, later we may
6767
// use a persistent job list.
68-
doneCh chan error
68+
doneCh chan reorgFnResult
6969
// rowCount is used to simulate a job's row count.
7070
rowCount int64
7171
jobState model.JobState
@@ -80,6 +80,13 @@ type reorgCtx struct {
8080
references atomicutil.Int32
8181
}
8282

83+
// reorgFnResult records the DDL owner TS before executing the reorg function, in order to help
84+
// the receiver determine whether the result is from a reorg function of a previous DDL owner in this instance.
85+
type reorgFnResult struct {
86+
ownerTS int64
87+
err error
88+
}
89+
8390
func newReorgExprCtx() exprctx.ExprContext {
8491
evalCtx := contextstatic.NewStaticEvalContext(
8592
contextstatic.WithSQLMode(mysql.ModeNone),
@@ -251,11 +258,13 @@ func (w *worker) runReorgJob(
251258
return dbterror.ErrCancelledDDLJob
252259
}
253260

261+
beOwnerTS := w.ddlCtx.reorgCtx.getOwnerTS()
254262
rc = w.newReorgCtx(reorgInfo.Job.ID, reorgInfo.Job.GetRowCount())
255263
w.wg.Add(1)
256264
go func() {
257265
defer w.wg.Done()
258-
rc.doneCh <- reorgFn()
266+
err := reorgFn()
267+
rc.doneCh <- reorgFnResult{ownerTS: beOwnerTS, err: err}
259268
}()
260269
}
261270

@@ -271,7 +280,16 @@ func (w *worker) runReorgJob(
271280

272281
// wait reorganization job done or timeout
273282
select {
274-
case err := <-rc.doneCh:
283+
case res := <-rc.doneCh:
284+
err := res.err
285+
curTS := w.ddlCtx.reorgCtx.getOwnerTS()
286+
if res.ownerTS != curTS {
287+
d.removeReorgCtx(job.ID)
288+
logutil.DDLLogger().Warn("owner ts mismatch, return timeout error and retry",
289+
zap.Int64("prevTS", res.ownerTS),
290+
zap.Int64("curTS", curTS))
291+
return dbterror.ErrWaitReorgTimeout
292+
}
275293
// Since the job is cancelled, we don't care about its partial counts.
276294
if rc.isReorgCanceled() || terror.ErrorEqual(err, dbterror.ErrCancelledDDLJob) {
277295
d.removeReorgCtx(job.ID)

0 commit comments

Comments
 (0)