Commit baf1600

ddl: part 1 of add-index integrating dxf dynamic modify param (pingcap#59046)
ref pingcap#57229, ref pingcap#57497
1 parent 9971301 commit baf1600
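
In short, this is the first step of letting a running distributed (DXF) add-index backfill task pick up reorg parameter changes while it executes: a new goroutine, modifyTaskParamLoop, periodically re-reads the job's ReorgMeta, compares the concurrency, batch size, and max write speed against the values last applied to the task, and calls ModifyTaskByID to push only the changed ones. The parameters are presumably changed by the ADMIN ALTER DDL JOB statement referenced in the new code comments (see pingcap#57229); that side of the flow is not part of this diff.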

19 files changed: +512, -64 lines

Makefile (+1)

@@ -501,6 +501,7 @@ gen_mock: mockgen
 	tools/bin/mockgen -embed -package mockexecute github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/execute StepExecutor > pkg/disttask/framework/mock/execute/execute_mock.go
 	tools/bin/mockgen -package mock github.com/pingcap/tidb/pkg/disttask/importinto MiniTaskExecutor > pkg/disttask/importinto/mock/import_mock.go
 	tools/bin/mockgen -package mock github.com/pingcap/tidb/pkg/disttask/framework/planner LogicalPlan,PipelineSpec > pkg/disttask/framework/mock/plan_mock.go
+	tools/bin/mockgen -package mock github.com/pingcap/tidb/pkg/disttask/framework/storage Manager > pkg/disttask/framework/mock/storage_manager_mock.go
 	tools/bin/mockgen -package mock github.com/pingcap/tidb/pkg/util/sqlexec RestrictedSQLExecutor > pkg/util/sqlexec/mock/restricted_sql_executor_mock.go
 	tools/bin/mockgen -package mockstorage github.com/pingcap/tidb/br/pkg/storage ExternalStorage > br/pkg/mock/storage/storage.go
 	tools/bin/mockgen -package mock github.com/pingcap/tidb/pkg/ddl SchemaLoader > pkg/ddl/mock/schema_loader_mock.go

pkg/ddl/BUILD.bazel (+3)

@@ -246,6 +246,7 @@ go_test(
         "index_change_test.go",
         "index_cop_test.go",
         "index_modify_test.go",
+        "index_nokit_test.go",
         "integration_test.go",
         "job_scheduler_test.go",
         "job_scheduler_testkit_test.go",
@@ -290,10 +291,12 @@ go_test(
         "//pkg/ddl/schematracker",
         "//pkg/ddl/serverstate",
         "//pkg/ddl/session",
+        "//pkg/ddl/systable",
         "//pkg/ddl/testargsv1",
         "//pkg/ddl/testutil",
         "//pkg/ddl/util",
         "//pkg/distsql/context",
+        "//pkg/disttask/framework/mock",
         "//pkg/disttask/framework/proto",
         "//pkg/disttask/framework/scheduler",
         "//pkg/disttask/framework/storage",

pkg/ddl/backfilling.go (+1 -1)

@@ -913,7 +913,7 @@ func (s *localRowCntListener) SetTotal(total int) {
 }
 
 // UpdateDDLJobReorgCfgInterval is the interval to check and update reorg configuration.
-const UpdateDDLJobReorgCfgInterval = 2 * time.Second
+var UpdateDDLJobReorgCfgInterval = 2 * time.Second
 
 // writePhysicalTableRecord handles the "add index" or "modify/change column" reorganization state for a non-partitioned table or a partition.
 // For a partitioned table, it should be handled partition by partition.
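
UpdateDDLJobReorgCfgInterval switches from const to var so the 2-second polling cadence of the new parameter loop can be overridden at runtime, typically by tests that should not wait for a full tick. A hypothetical override, not part of this diff:

package ddl_test

import (
	"testing"
	"time"

	"github.com/pingcap/tidb/pkg/ddl"
)

// withFastReorgCfgPolling is a hypothetical test helper: it shortens the
// polling interval so a change made via ADMIN ALTER DDL JOB is picked up
// almost immediately, and restores the default afterwards.
func withFastReorgCfgPolling(t *testing.T) {
	old := ddl.UpdateDDLJobReorgCfgInterval
	ddl.UpdateDDLJobReorgCfgInterval = 50 * time.Millisecond
	t.Cleanup(func() { ddl.UpdateDDLJobReorgCfgInterval = old })
}

Since modifyTaskParamLoop creates its ticker from this variable once, such an override has to take effect before the backfill starts.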

pkg/ddl/column.go (+3 -2)

@@ -520,10 +520,11 @@ var TestReorgGoroutineRunning = make(chan struct{})
 
 // updateCurrentElement update the current element for reorgInfo.
 func (w *worker) updateCurrentElement(
-	ctx context.Context,
+	jobCtx *jobContext,
 	t table.Table,
 	reorgInfo *reorgInfo,
 ) error {
+	ctx := jobCtx.stepCtx
 	failpoint.Inject("mockInfiniteReorgLogic", func() {
 		TestReorgGoroutineRunning <- struct{}{}
 		<-ctx.Done()
@@ -587,7 +588,7 @@ func (w *worker) updateCurrentElement(
 		if err != nil {
 			return errors.Trace(err)
 		}
-		err = w.addTableIndex(ctx, t, reorgInfo)
+		err = w.addTableIndex(jobCtx, t, reorgInfo)
 		if err != nil {
 			return errors.Trace(err)
 		}

pkg/ddl/index.go (+151 -9)

@@ -20,13 +20,15 @@ import (
 	"context"
 	"encoding/hex"
 	"encoding/json"
+	goerrors "errors"
 	"fmt"
 	"os"
 	"slices"
 	"strings"
 	"sync/atomic"
 	"time"
 
+	"github.com/docker/go-units"
 	"github.com/pingcap/errors"
 	"github.com/pingcap/failpoint"
 	"github.com/pingcap/kvproto/pkg/kvrpcpb"
@@ -36,6 +38,7 @@ import (
 	"github.com/pingcap/tidb/pkg/ddl/logutil"
 	"github.com/pingcap/tidb/pkg/ddl/notifier"
 	sess "github.com/pingcap/tidb/pkg/ddl/session"
+	"github.com/pingcap/tidb/pkg/ddl/systable"
 	ddlutil "github.com/pingcap/tidb/pkg/ddl/util"
 	"github.com/pingcap/tidb/pkg/disttask/framework/handle"
 	"github.com/pingcap/tidb/pkg/disttask/framework/proto"
@@ -1523,7 +1526,7 @@ func runReorgJobAndHandleErr(
 		func() {
 			addIndexErr = dbterror.ErrCancelledDDLJob.GenWithStack("add table `%v` index `%v` panic", tbl.Meta().Name, allIndexInfos[0].Name)
 		}, false)
-		return w.addTableIndex(jobCtx.stepCtx, tbl, reorgInfo)
+		return w.addTableIndex(jobCtx, tbl, reorgInfo)
 	})
 	if err != nil {
 		if dbterror.ErrPausedDDLJob.Equal(err) {
@@ -2386,14 +2389,15 @@ func (w *worker) addPhysicalTableIndex(
 
 // addTableIndex handles the add index reorganization state for a table.
 func (w *worker) addTableIndex(
-	ctx context.Context,
+	jobCtx *jobContext,
 	t table.Table,
 	reorgInfo *reorgInfo,
 ) error {
+	ctx := jobCtx.stepCtx
 	// TODO: Support typeAddIndexMergeTmpWorker.
 	if reorgInfo.ReorgMeta.IsDistReorg && !reorgInfo.mergingTmpIdx {
 		if reorgInfo.ReorgMeta.ReorgTp == model.ReorgTypeLitMerge {
-			err := w.executeDistTask(ctx, t, reorgInfo)
+			err := w.executeDistTask(jobCtx, t, reorgInfo)
 			if err != nil {
 				return err
 			}
@@ -2484,11 +2488,12 @@ func checkDuplicateForUniqueIndex(ctx context.Context, t table.Table, reorgInfo
 	return nil
 }
 
-func (w *worker) executeDistTask(stepCtx context.Context, t table.Table, reorgInfo *reorgInfo) error {
+func (w *worker) executeDistTask(jobCtx *jobContext, t table.Table, reorgInfo *reorgInfo) error {
 	if reorgInfo.mergingTmpIdx {
 		return errors.New("do not support merge index")
 	}
 
+	stepCtx := jobCtx.stepCtx
 	taskType := proto.Backfill
 	taskKey := fmt.Sprintf("ddl/%s/%d", taskType, reorgInfo.Job.ID)
 	g, ctx := errgroup.WithContext(w.workCtx)
@@ -2511,19 +2516,32 @@ func (w *worker) executeDistTask(stepCtx context.Context, t table.Table, reorgIn
 		return err
 	}
 	task, err := taskManager.GetTaskByKeyWithHistory(w.workCtx, taskKey)
-	if err != nil && err != storage.ErrTaskNotFound {
+	if err != nil && !goerrors.Is(err, storage.ErrTaskNotFound) {
 		return err
 	}
+
+	var (
+		taskID                                            int64
+		lastConcurrency, lastBatchSize, lastMaxWriteSpeed int
+	)
 	if task != nil {
 		// It's possible that the task state is succeed but the ddl job is paused.
-		// When task in succeed state, we can skip the dist task execution/scheduing process.
+		// When task in succeed state, we can skip the dist task execution/scheduling process.
 		if task.State == proto.TaskStateSucceed {
 			w.updateDistTaskRowCount(taskKey, reorgInfo.Job.ID)
 			logutil.DDLLogger().Info(
 				"task succeed, start to resume the ddl job",
 				zap.String("task-key", taskKey))
 			return nil
 		}
+		taskMeta := &BackfillTaskMeta{}
+		if err := json.Unmarshal(task.Meta, taskMeta); err != nil {
+			return errors.Trace(err)
+		}
+		taskID = task.ID
+		lastConcurrency = task.Concurrency
+		lastBatchSize = taskMeta.Job.ReorgMeta.GetBatchSize()
+		lastMaxWriteSpeed = taskMeta.Job.ReorgMeta.GetMaxWriteSpeed()
 		g.Go(func() error {
 			defer close(done)
 			backoffer := backoff.NewExponential(scheduler.RetrySQLInterval, 2, scheduler.RetrySQLMaxInterval)
@@ -2547,11 +2565,10 @@ func (w *worker) executeDistTask(stepCtx context.Context, t table.Table, reorgIn
 	} else {
 		job := reorgInfo.Job
 		workerCntLimit := job.ReorgMeta.GetConcurrency()
-		cpuCount, err := handle.GetCPUCountOfNode(ctx)
+		concurrency, err := adjustConcurrency(ctx, taskManager, workerCntLimit)
 		if err != nil {
 			return err
 		}
-		concurrency := min(workerCntLimit, cpuCount)
 		logutil.DDLLogger().Info("adjusted add-index task concurrency",
 			zap.Int("worker-cnt", workerCntLimit), zap.Int("task-concurrency", concurrency),
 			zap.String("task-key", taskKey))
@@ -2569,9 +2586,19 @@ func (w *worker) executeDistTask(stepCtx context.Context, t table.Table, reorgIn
 			return err
 		}
 
+		task, err := handle.SubmitTask(ctx, taskKey, taskType, concurrency, reorgInfo.ReorgMeta.TargetScope, metaData)
+		if err != nil {
+			return err
+		}
+
+		taskID = task.ID
+		lastConcurrency = concurrency
+		lastBatchSize = taskMeta.Job.ReorgMeta.GetBatchSize()
+		lastMaxWriteSpeed = taskMeta.Job.ReorgMeta.GetMaxWriteSpeed()
+
 		g.Go(func() error {
 			defer close(done)
-			err := submitAndWaitTask(ctx, taskKey, taskType, concurrency, reorgInfo.ReorgMeta.TargetScope, metaData)
+			err := handle.WaitTaskDoneOrPaused(ctx, task.ID)
 			failpoint.InjectCall("pauseAfterDistTaskFinished")
 			if err := w.isReorgRunnable(stepCtx, true); err != nil {
 				if dbterror.ErrPausedDDLJob.Equal(err) {
@@ -2616,10 +2643,125 @@ func (w *worker) executeDistTask(stepCtx context.Context, t table.Table, reorgIn
 			}
 		}
 	})
+
+	g.Go(func() error {
+		modifyTaskParamLoop(ctx, jobCtx.sysTblMgr, taskManager, done,
+			reorgInfo.Job.ID, taskID, lastConcurrency, lastBatchSize, lastMaxWriteSpeed)
+		return nil
+	})
+
 	err = g.Wait()
 	return err
 }
 
+// Note: we can achieve the same effect by calling ModifyTaskByID directly inside
+// the process of 'ADMIN ALTER DDL JOB xxx', so we can eliminate the goroutine,
+// but if the task hasn't been created we need to make sure the task is created
+// with the config in effect after ALTER DDL JOB is executed. A possible solution
+// is to make the DXF task submission and the 'ADMIN ALTER DDL JOB xxx' txn
+// conflict with each other when they overlap in time, by modifying the job at
+// the same time as we submit the task, since we are using optimistic txns. But
+// this would cause a WRITE CONFLICT with the outer txn in transitOneJobStep.
+func modifyTaskParamLoop(
+	ctx context.Context,
+	sysTblMgr systable.Manager,
+	taskManager storage.Manager,
+	done chan struct{},
+	jobID, taskID int64,
+	lastConcurrency, lastBatchSize, lastMaxWriteSpeed int,
+) {
+	logger := logutil.DDLLogger().With(zap.Int64("jobId", jobID), zap.Int64("taskId", taskID))
+	ticker := time.NewTicker(UpdateDDLJobReorgCfgInterval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-done:
+			return
+		case <-ticker.C:
+		}
+
+		latestJob, err := sysTblMgr.GetJobByID(ctx, jobID)
+		if err != nil {
+			if goerrors.Is(err, systable.ErrNotFound) {
+				logger.Info("job not found, might already finished")
+				return
+			}
+			logger.Error("get job failed, will retry later", zap.Error(err))
+			continue
+		}
+
+		modifies := make([]proto.Modification, 0, 3)
+		workerCntLimit := latestJob.ReorgMeta.GetConcurrency()
+		concurrency, err := adjustConcurrency(ctx, taskManager, workerCntLimit)
+		if err != nil {
+			logger.Error("adjust concurrency failed", zap.Error(err))
+			continue
+		}
+		if concurrency != lastConcurrency {
+			modifies = append(modifies, proto.Modification{
+				Type: proto.ModifyConcurrency,
+				To:   int64(concurrency),
+			})
+		}
+		batchSize := latestJob.ReorgMeta.GetBatchSize()
+		if batchSize != lastBatchSize {
+			modifies = append(modifies, proto.Modification{
+				Type: proto.ModifyBatchSize,
+				To:   int64(batchSize),
+			})
+		}
+		maxWriteSpeed := latestJob.ReorgMeta.GetMaxWriteSpeed()
+		if maxWriteSpeed != lastMaxWriteSpeed {
+			modifies = append(modifies, proto.Modification{
+				Type: proto.ModifyMaxWriteSpeed,
+				To:   int64(maxWriteSpeed),
+			})
+		}
+		if len(modifies) == 0 {
+			continue
+		}
+		currTask, err := taskManager.GetTaskByID(ctx, taskID)
+		if err != nil {
+			if goerrors.Is(err, storage.ErrTaskNotFound) {
+				logger.Info("task not found, might already finished")
+				return
+			}
+			logger.Error("get task failed, will retry later", zap.Error(err))
+			continue
+		}
+		if !currTask.State.CanMoveToModifying() {
+			// The user might modify the params again while another modification is ongoing.
+			logger.Info("task state is not suitable for modifying, will retry later",
+				zap.String("state", currTask.State.String()))
+			continue
+		}
+		if err = taskManager.ModifyTaskByID(ctx, taskID, &proto.ModifyParam{
+			PrevState:     currTask.State,
+			Modifications: modifies,
+		}); err != nil {
+			logger.Error("modify task failed", zap.Error(err))
+			continue
+		}
+		logger.Info("modify task success",
+			zap.Int("oldConcurrency", lastConcurrency), zap.Int("newConcurrency", concurrency),
+			zap.Int("oldBatchSize", lastBatchSize), zap.Int("newBatchSize", batchSize),
+			zap.String("oldMaxWriteSpeed", units.HumanSize(float64(lastMaxWriteSpeed))),
+			zap.String("newMaxWriteSpeed", units.HumanSize(float64(maxWriteSpeed))),
+		)
+		lastConcurrency = concurrency
+		lastBatchSize = batchSize
+		lastMaxWriteSpeed = maxWriteSpeed
+	}
+}
+
+func adjustConcurrency(ctx context.Context, taskMgr storage.Manager, workerCnt int) (int, error) {
+	cpuCount, err := taskMgr.GetCPUCountOfNode(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return min(workerCnt, cpuCount), nil
+}
+
 // EstimateTableRowSizeForTest is used for test.
 var EstimateTableRowSizeForTest = estimateTableRowSize
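
Taken together, modifyTaskParamLoop is a small reconcile loop: wake on a ticker, read the desired parameters, diff them against the last applied values, push only the deltas, and advance the baseline only when the apply succeeds. The standalone sketch below distills that shape; desiredParams, getLatest, and apply are illustrative stand-ins for the ReorgMeta read and the ModifyTaskByID call, not TiDB APIs.

package main

import (
	"fmt"
	"time"
)

// desiredParams mirrors the three knobs the commit's loop watches.
type desiredParams struct {
	concurrency, batchSize, maxWriteSpeed int
}

// pollAndModify wakes on a ticker, diffs the latest desired parameters
// against the last applied ones, and applies only the changes. It exits
// when done is closed, mirroring how modifyTaskParamLoop stops once the
// backfill task finishes.
func pollAndModify(
	done <-chan struct{},
	interval time.Duration,
	getLatest func() desiredParams, // stand-in for reading the job's ReorgMeta
	apply func(map[string]int) error, // stand-in for ModifyTaskByID
	last desiredParams,
) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-done:
			return
		case <-ticker.C:
		}
		latest := getLatest()
		changes := map[string]int{}
		if latest.concurrency != last.concurrency {
			changes["concurrency"] = latest.concurrency
		}
		if latest.batchSize != last.batchSize {
			changes["batch_size"] = latest.batchSize
		}
		if latest.maxWriteSpeed != last.maxWriteSpeed {
			changes["max_write_speed"] = latest.maxWriteSpeed
		}
		if len(changes) == 0 {
			continue
		}
		if err := apply(changes); err != nil {
			continue // like the real loop: log and retry on the next tick
		}
		last = latest // advance the baseline only after a successful apply
	}
}

func main() {
	done := make(chan struct{})
	start := time.Now()
	// After 120ms the "user" bumps the batch size, as ADMIN ALTER DDL JOB would.
	getLatest := func() desiredParams {
		if time.Since(start) > 120*time.Millisecond {
			return desiredParams{concurrency: 4, batchSize: 1024}
		}
		return desiredParams{concurrency: 4, batchSize: 256}
	}
	apply := func(changes map[string]int) error {
		fmt.Println("applying", changes)
		return nil
	}
	go func() { time.Sleep(300 * time.Millisecond); close(done) }()
	pollAndModify(done, 40*time.Millisecond, getLatest, apply, getLatest())
}

The polling design trades up to one UpdateDDLJobReorgCfgInterval of latency for the simpler transactional story described in the Note comment above: pushing modifications from ADMIN ALTER DDL JOB itself would have to contend with the optimistic transaction that submits the task.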
