
Commit a2fae33

Merge branch 'master' into rename-explaintest
2 parents 36b257c + ae442ad

12 files changed, +452 -48 lines changed

disttask/framework/dispatcher/dispatcher.go (+12 -8)
@@ -44,8 +44,12 @@ const (
 var (
     checkTaskFinishedInterval = 500 * time.Millisecond
     nonRetrySQLTime           = 1
-    retrySQLTimes             = 30
-    retrySQLInterval          = 3 * time.Second
+    // RetrySQLTimes is the max retry times when executing SQL.
+    RetrySQLTimes = 30
+    // RetrySQLInterval is the initial interval between two SQL retries.
+    RetrySQLInterval = 3 * time.Second
+    // RetrySQLMaxInterval is the max interval between two SQL retries.
+    RetrySQLMaxInterval = 30 * time.Second
 )
 
 // TaskHandle provides the interface for operations needed by Dispatcher.
@@ -191,7 +195,7 @@ func (d *BaseDispatcher) onReverting() error {
     if prevStageFinished {
         // Finish the rollback step.
         logutil.Logger(d.logCtx).Info("update the task to reverted state")
-        return d.updateTask(proto.TaskStateReverted, nil, retrySQLTimes)
+        return d.updateTask(proto.TaskStateReverted, nil, RetrySQLTimes)
     }
     // Wait all subtasks in this stage finished.
     d.OnTick(d.ctx, d.task)
@@ -321,7 +325,7 @@ func (d *BaseDispatcher) updateTask(taskState string, newSubTasks []*proto.Subta
             logutil.Logger(d.logCtx).Warn("updateTask first failed", zap.String("from", prevState), zap.String("to", d.task.State),
                 zap.Int("retry times", retryTimes), zap.Error(err))
         }
-        time.Sleep(retrySQLInterval)
+        time.Sleep(RetrySQLInterval)
     }
     if err != nil && retryTimes != nonRetrySQLTime {
         logutil.Logger(d.logCtx).Warn("updateTask failed",
@@ -354,7 +358,7 @@ func (d *BaseDispatcher) dispatchSubTask4Revert(task *proto.Task, meta []byte) e
     for _, id := range instanceIDs {
         subTasks = append(subTasks, proto.NewSubtask(task.ID, task.Type, id, meta))
     }
-    return d.updateTask(proto.TaskStateReverting, subTasks, retrySQLTimes)
+    return d.updateTask(proto.TaskStateReverting, subTasks, RetrySQLTimes)
 }
 
 func (d *BaseDispatcher) onNextStage() error {
@@ -377,7 +381,7 @@ func (d *BaseDispatcher) dispatchSubTask(task *proto.Task, metas [][]byte) error
         task.Concurrency = MaxSubtaskConcurrency
     }
 
-    retryTimes := retrySQLTimes
+    retryTimes := RetrySQLTimes
     // 2. Special handling for the new tasks.
     if task.State == proto.TaskStatePending {
         // TODO: Consider using TS.
@@ -428,7 +432,7 @@ func (d *BaseDispatcher) dispatchSubTask(task *proto.Task, metas [][]byte) error
         logutil.Logger(d.logCtx).Debug("create subtasks", zap.String("instanceID", instanceID))
         subTasks = append(subTasks, proto.NewSubtask(task.ID, task.Type, instanceID, meta))
     }
-    return d.updateTask(proto.TaskStateRunning, subTasks, retrySQLTimes)
+    return d.updateTask(proto.TaskStateRunning, subTasks, RetrySQLTimes)
 }
 
 func (d *BaseDispatcher) handlePlanErr(err error) error {
@@ -438,7 +442,7 @@ func (d *BaseDispatcher) handlePlanErr(err error) error {
     }
     d.task.Error = err
     // state transform: pending -> failed.
-    return d.updateTask(proto.TaskStateFailed, nil, retrySQLTimes)
+    return d.updateTask(proto.TaskStateFailed, nil, RetrySQLTimes)
 }
 
 // GenerateSchedulerNodes generate a eligible TiDB nodes.
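
Note that this hunk exports the retry knobs and adds a RetrySQLMaxInterval cap, while the dispatcher itself still sleeps a flat RetrySQLInterval in updateTask. A minimal sketch of how a caller elsewhere might pair these constants with the util/backoff package touched later in this commit; the call site, the growth factor of 2, and the assumption that backoff.NewExponential takes (initial, multiplier, max) are illustrative only and not shown in this diff:

package main

import (
    "fmt"

    "github.com/pingcap/tidb/disttask/framework/dispatcher"
    "github.com/pingcap/tidb/util/backoff"
)

func main() {
    // Hypothetical wiring: start at RetrySQLInterval (3s), grow each attempt,
    // and never wait longer than RetrySQLMaxInterval (30s).
    backoffer := backoff.NewExponential(dispatcher.RetrySQLInterval, 2, dispatcher.RetrySQLMaxInterval)
    for i := 0; i < dispatcher.RetrySQLTimes; i++ {
        // Roughly 3s, 6s, 12s, 24s, then capped at 30s for later attempts.
        fmt.Println(backoffer.Backoff(i))
    }
}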

disttask/framework/dispatcher/main_test.go (+1 -1)
@@ -43,7 +43,7 @@ func TestMain(m *testing.M) {
     // Make test more fast.
     checkTaskRunningInterval = checkTaskRunningInterval / 10
     checkTaskFinishedInterval = checkTaskFinishedInterval / 10
-    retrySQLInterval = retrySQLInterval / 20
+    RetrySQLInterval = RetrySQLInterval / 20
 
     opts := []goleak.Option{
         goleak.IgnoreTopFunction("github.com/golang/glog.(*fileSink).flushDaemon"),

disttask/framework/handle/BUILD.bazel (+4)
@@ -8,6 +8,7 @@ go_library(
     deps = [
         "//disttask/framework/proto",
         "//disttask/framework/storage",
+        "//util/backoff",
         "//util/logutil",
         "@com_github_pingcap_errors//:errors",
         "@org_uber_go_zap//:zap",
@@ -24,7 +25,10 @@ go_test(
         "//disttask/framework/proto",
         "//disttask/framework/storage",
         "//testkit",
+        "//util/backoff",
         "@com_github_ngaut_pools//:pools",
+        "@com_github_pingcap_errors//:errors",
+        "@com_github_pingcap_log//:log",
         "@com_github_stretchr_testify//require",
         "@com_github_tikv_client_go_v2//util",
     ],

disttask/framework/handle/handle.go (+31)
@@ -21,6 +21,7 @@ import (
     "github.com/pingcap/errors"
     "github.com/pingcap/tidb/disttask/framework/proto"
     "github.com/pingcap/tidb/disttask/framework/storage"
+    "github.com/pingcap/tidb/util/backoff"
     "github.com/pingcap/tidb/util/logutil"
     "go.uber.org/zap"
 )
@@ -118,3 +119,33 @@ func CancelGlobalTask(taskKey string) error {
     }
     return globalTaskManager.CancelGlobalTask(globalTask.ID)
 }
+
+// RunWithRetry runs a function with retry, when retry exceed max retry time, it
+// returns the last error met.
+// if the function fails with err, it should return a bool to indicate whether
+// the error is retryable.
+// if context done, it will stop early and return ctx.Err().
+func RunWithRetry(
+    ctx context.Context,
+    maxRetry int,
+    backoffer backoff.Backoffer,
+    logger *zap.Logger,
+    f func(context.Context) (bool, error),
+) error {
+    var lastErr error
+    for i := 0; i < maxRetry; i++ {
+        retryable, err := f(ctx)
+        if err == nil || !retryable {
+            return err
+        }
+        lastErr = err
+        logger.Warn("met retryable error", zap.Int("retry-count", i),
+            zap.Int("max-retry", maxRetry), zap.Error(err))
+        select {
+        case <-ctx.Done():
+            return ctx.Err()
+        case <-time.After(backoffer.Backoff(i)):
+        }
+    }
+    return lastErr
+}
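
RunWithRetry is the general-purpose retry loop this commit adds: the callback reports whether its error is retryable, and the backoffer decides how long to wait between attempts; on success or a non-retryable error it returns immediately. A minimal usage sketch based only on the signature above; doSomething and the chosen backoff parameters are hypothetical:

package main

import (
    "context"
    "time"

    "github.com/pingcap/log"
    "github.com/pingcap/tidb/disttask/framework/handle"
    "github.com/pingcap/tidb/util/backoff"
)

// doSomething is a hypothetical operation standing in for a real SQL call.
func doSomething(ctx context.Context) error { return nil }

func main() {
    ctx := context.Background()
    // Wait 500ms after the first failure, growing toward a 5s cap (illustrative values).
    backoffer := backoff.NewExponential(500*time.Millisecond, 2, 5*time.Second)
    err := handle.RunWithRetry(ctx, 10, backoffer, log.L(),
        func(ctx context.Context) (bool, error) {
            if err := doSomething(ctx); err != nil {
                return true, err // treat the failure as transient and retry
            }
            return false, nil // done, no retry needed
        },
    )
    if err != nil {
        log.L().Warn("operation failed after retries")
    }
}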

disttask/framework/handle/handle_test.go (+66)
@@ -16,14 +16,19 @@ package handle_test
 
 import (
     "context"
+    "math"
+    "sync/atomic"
     "testing"
     "time"
 
     "github.com/ngaut/pools"
+    "github.com/pingcap/errors"
+    "github.com/pingcap/log"
     "github.com/pingcap/tidb/disttask/framework/handle"
     "github.com/pingcap/tidb/disttask/framework/proto"
     "github.com/pingcap/tidb/disttask/framework/storage"
     "github.com/pingcap/tidb/testkit"
+    "github.com/pingcap/tidb/util/backoff"
     "github.com/stretchr/testify/require"
     "github.com/tikv/client-go/v2/util"
 )
@@ -56,3 +61,64 @@ func TestHandle(t *testing.T) {
 
     require.NoError(t, handle.CancelGlobalTask("1"))
 }
+
+func TestRunWithRetry(t *testing.T) {
+    ctx := context.Background()
+
+    // retry count exceed
+    backoffer := backoff.NewExponential(100*time.Millisecond, 1, time.Second)
+    err := handle.RunWithRetry(ctx, 3, backoffer, log.L(),
+        func(ctx context.Context) (bool, error) {
+            return true, errors.New("mock error")
+        },
+    )
+    require.ErrorContains(t, err, "mock error")
+
+    // non-retryable error
+    var end atomic.Bool
+    go func() {
+        defer end.Store(true)
+        backoffer = backoff.NewExponential(100*time.Millisecond, 1, time.Second)
+        err = handle.RunWithRetry(ctx, math.MaxInt, backoffer, log.L(),
+            func(ctx context.Context) (bool, error) {
+                return false, errors.New("mock error")
+            },
+        )
+        require.Error(t, err)
+    }()
+    require.Eventually(t, func() bool {
+        return end.Load()
+    }, 5*time.Second, 100*time.Millisecond)
+
+    // fail with retryable error once, then success
+    end.Store(false)
+    go func() {
+        defer end.Store(true)
+        backoffer = backoff.NewExponential(100*time.Millisecond, 1, time.Second)
+        var i int
+        err = handle.RunWithRetry(ctx, math.MaxInt, backoffer, log.L(),
+            func(ctx context.Context) (bool, error) {
+                if i == 0 {
+                    i++
+                    return true, errors.New("mock error")
+                }
+                return false, nil
+            },
+        )
+        require.NoError(t, err)
+    }()
+    require.Eventually(t, func() bool {
+        return end.Load()
+    }, 5*time.Second, 100*time.Millisecond)
+
+    // context done
+    subctx, cancel := context.WithCancel(ctx)
+    cancel()
+    backoffer = backoff.NewExponential(100*time.Millisecond, 1, time.Second)
+    err = handle.RunWithRetry(subctx, math.MaxInt, backoffer, log.L(),
+        func(ctx context.Context) (bool, error) {
+            return true, errors.New("mock error")
+        },
+    )
+    require.ErrorIs(t, err, context.Canceled)
+}

disttask/importinto/BUILD.bazel (+5 -1)
@@ -48,6 +48,7 @@ go_library(
         "//sessionctx/variable",
         "//table/tables",
         "//util",
+        "//util/backoff",
         "//util/dbterror/exeerrors",
         "//util/etcd",
         "//util/logutil",
@@ -68,6 +69,7 @@ go_test(
     timeout = "short",
     srcs = [
         "dispatcher_test.go",
+        "dispatcher_testkit_test.go",
         "encode_and_sort_operator_test.go",
         "planner_test.go",
         "subtask_executor_test.go",
@@ -76,11 +78,12 @@ go_test(
     embed = [":importinto"],
     flaky = True,
     race = "on",
-    shard_count = 6,
+    shard_count = 7,
     deps = [
         "//br/pkg/lightning/checkpoints",
         "//br/pkg/lightning/mydump",
         "//br/pkg/lightning/verification",
+        "//disttask/framework/dispatcher",
         "//disttask/framework/mock/execute",
         "//disttask/framework/planner",
         "//disttask/framework/proto",
@@ -94,6 +97,7 @@ go_test(
         "//parser/mysql",
         "//testkit",
         "//util/logutil",
+        "//util/sqlexec",
         "@com_github_ngaut_pools//:pools",
         "@com_github_pingcap_errors//:errors",
         "@com_github_pingcap_failpoint//:failpoint",
