pingcap · ti-chi-bot · Dec 3, 2024 · Nov 27, 2024 · Nov 28, 2024 · Nov 28, 2024
diff --git a/pkg/disttask/framework/integrationtests/BUILD.bazel b/pkg/disttask/framework/integrationtests/BUILD.bazel
@@ -17,7 +17,7 @@ go_test(
     ],
     flaky = True,
     race = "off",
-    shard_count = 23,
+    shard_count = 22,
     deps = [
         "//pkg/config",
         "//pkg/ddl",

diff --git a/pkg/disttask/framework/integrationtests/framework_ha_test.go b/pkg/disttask/framework/integrationtests/framework_ha_test.go
@@ -36,26 +36,25 @@ func submitTaskAndCheckSuccessForHA(ctx context.Context, t *testing.T, taskKey s
 }
 
 func TestHANodeRandomShutdown(t *testing.T) {
-	testfailpoint.Enable(t, "github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/mockTiDBShutdown", "return()")
 	c := testutil.NewDXFContextWithRandomNodes(t, 4, 15)
 	registerExampleTask(t, c.MockCtrl, testutil.GetMockHATestSchedulerExt(c.MockCtrl), c.TestContext, nil)
 
 	// we keep [1, 10] nodes running, as we only have 10 subtask at stepOne
 	keepCount := int(math.Min(float64(c.NodeCount()-1), float64(c.Rand.Intn(10)+1)))
 	nodeNeedDown := c.GetRandNodeIDs(c.NodeCount() - keepCount)
 	t.Logf("started %d nodes, and we keep %d nodes, nodes that need shutdown: %v", c.NodeCount(), keepCount, nodeNeedDown)
-	taskexecutor.MockTiDBDown = func(execID string, _ *proto.TaskBase) bool {
-		if _, ok := nodeNeedDown[execID]; ok {
-			c.AsyncShutdown(execID)
-			return true
-		}
-		return false
-	}
+	testfailpoint.EnableCall(t, "github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/mockTiDBShutdown",
+		func(e taskexecutor.TaskExecutor, execID string, _ *proto.TaskBase) {
+			if _, ok := nodeNeedDown[execID]; ok {
+				c.AsyncShutdown(execID)
+				e.Cancel()
+			}
+		},
+	)
 	submitTaskAndCheckSuccessForHA(c.Ctx, t, "😊", c.TestContext)
 }
 
 func TestHARandomShutdownInDifferentStep(t *testing.T) {
-	testfailpoint.Enable(t, "github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/mockTiDBShutdown", "return()")
 	c := testutil.NewDXFContextWithRandomNodes(t, 6, 15)
 
 	registerExampleTask(t, c.MockCtrl, testutil.GetMockHATestSchedulerExt(c.MockCtrl), c.TestContext, nil)
@@ -64,22 +63,23 @@ func TestHARandomShutdownInDifferentStep(t *testing.T) {
 	nodeNeedDownAtStepTwo := c.GetRandNodeIDs(c.NodeCount()/2 - 1)
 	t.Logf("started %d nodes, shutdown nodes at step 1: %v, shutdown nodes at step 2: %v",
 		c.NodeCount(), nodeNeedDownAtStepOne, nodeNeedDownAtStepTwo)
-	taskexecutor.MockTiDBDown = func(execID string, task *proto.TaskBase) bool {
-		var targetNodes map[string]struct{}
-		switch task.Step {
-		case proto.StepOne:
-			targetNodes = nodeNeedDownAtStepOne
-		case proto.StepTwo:
-			targetNodes = nodeNeedDownAtStepTwo
-		default:
-			return false
-		}
-		if _, ok := targetNodes[execID]; ok {
-			c.AsyncShutdown(execID)
-			return true
-		}
-		return false
-	}
+	testfailpoint.EnableCall(t, "github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/mockTiDBShutdown",
+		func(e taskexecutor.TaskExecutor, execID string, task *proto.TaskBase) {
+			var targetNodes map[string]struct{}
+			switch task.Step {
+			case proto.StepOne:
+				targetNodes = nodeNeedDownAtStepOne
+			case proto.StepTwo:
+				targetNodes = nodeNeedDownAtStepTwo
+			default:
+				return
+			}
+			if _, ok := targetNodes[execID]; ok {
+				c.AsyncShutdown(execID)
+				e.Cancel()
+			}
+		},
+	)
 	submitTaskAndCheckSuccessForHA(c.Ctx, t, "😊", c.TestContext)
 }
 

diff --git a/pkg/disttask/framework/integrationtests/framework_test.go b/pkg/disttask/framework/integrationtests/framework_test.go
@@ -234,31 +234,16 @@ func TestGC(t *testing.T) {
 	}, 10*time.Second, 500*time.Millisecond)
 }
 
-func TestFrameworkSubtaskFinishedCancel(t *testing.T) {
-	c := testutil.NewTestDXFContext(t, 3, 16, true)
-
-	registerExampleTask(t, c.MockCtrl, testutil.GetMockBasicSchedulerExt(c.MockCtrl), c.TestContext, nil)
-	var counter atomic.Int32
-	testfailpoint.EnableCall(t, "github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/afterOnFinishedCalled",
-		func(e *taskexecutor.BaseTaskExecutor) {
-			if counter.Add(1) == 1 {
-				e.CancelRunningSubtask()
-			}
-		},
-	)
-	task := testutil.SubmitAndWaitTask(c.Ctx, t, "key1", "", 1)
-	require.Equal(t, proto.TaskStateReverted, task.State)
-}
-
 func TestFrameworkRunSubtaskCancelOrFailed(t *testing.T) {
 	c := testutil.NewTestDXFContext(t, 3, 16, true)
 
 	registerExampleTask(t, c.MockCtrl, testutil.GetMockBasicSchedulerExt(c.MockCtrl), c.TestContext, nil)
 	t.Run("meet cancel on run subtask", func(t *testing.T) {
 		var counter atomic.Int32
 		testfailpoint.EnableCall(t, "github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/changeRunSubtaskError",
-			func(errP *error) {
+			func(e taskexecutor.TaskExecutor, errP *error) {
 				if counter.Add(1) == 1 {
+					e.CancelRunningSubtask()
 					*errP = taskexecutor.ErrCancelSubtask
 				}
 			},
@@ -270,7 +255,7 @@ func TestFrameworkRunSubtaskCancelOrFailed(t *testing.T) {
 	t.Run("meet some error on run subtask", func(t *testing.T) {
 		var counter atomic.Int32
 		testfailpoint.EnableCall(t, "github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/changeRunSubtaskError",
-			func(errP *error) {
+			func(_ taskexecutor.TaskExecutor, errP *error) {
 				if counter.Add(1) == 1 {
 					*errP = errors.New("MockExecutorRunErr")
 				}

diff --git a/pkg/disttask/framework/taskexecutor/BUILD.bazel b/pkg/disttask/framework/taskexecutor/BUILD.bazel
@@ -19,7 +19,6 @@ go_library(
         "//pkg/disttask/framework/scheduler",
         "//pkg/disttask/framework/storage",
         "//pkg/disttask/framework/taskexecutor/execute",
-        "//pkg/lightning/common",
         "//pkg/lightning/log",
         "//pkg/metrics",
         "//pkg/sessionctx/variable",
@@ -51,7 +50,7 @@ go_test(
     ],
     embed = [":taskexecutor"],
     flaky = True,
-    shard_count = 17,
+    shard_count = 16,
     deps = [
         "//pkg/disttask/framework/mock",
         "//pkg/disttask/framework/mock/execute",
@@ -74,7 +73,5 @@ go_test(
         "@org_golang_google_grpc//status",
         "@org_uber_go_goleak//:goleak",
         "@org_uber_go_mock//gomock",
-        "@org_uber_go_zap//:zap",
-        "@org_uber_go_zap//zaptest/observer",
     ],
 )
diff --git a/pkg/disttask/framework/taskexecutor/execute/interface.go b/pkg/disttask/framework/taskexecutor/execute/interface.go
@@ -33,7 +33,9 @@ type StepExecutor interface {
 	StepExecFrameworkInfo
 
 	// Init is used to initialize the environment.
-	// if failed, task executor will retry later.
+	// The task executor will retry if the returned error is retryable (see
+	// IsRetryableError in TaskExecutor.Extension); otherwise the framework marks
+	// a random subtask as failed to trigger task failure.
 	Init(context.Context) error
 	// RunSubtask is used to run the subtask.
 	RunSubtask(ctx context.Context, subtask *proto.Subtask) error
@@ -42,9 +44,13 @@ type StepExecutor interface {
 	RealtimeSummary() *SubtaskSummary
 
 	// OnFinished is used to handle the subtask when it is finished.
-	// The subtask meta can be updated in place.
+	// The subtask meta can be updated in place. A subtask can be marked as
+	// 'success' only when OnFinished returns no error; if it returns an error, the
+	// subtask might be completely rerun, so don't put error-prone code in it.
 	OnFinished(ctx context.Context, subtask *proto.Subtask) error
-	// Cleanup is used to clean up the environment.
+	// Cleanup is used to clean up the environment for this step.
+	// The returned error does not affect the task/subtask state; it is only
+	// logged, so don't put error-prone code in it.
 	Cleanup(context.Context) error
 }
 

diff --git a/pkg/disttask/framework/taskexecutor/interface.go b/pkg/disttask/framework/taskexecutor/interface.go
@@ -115,9 +115,7 @@ type Extension interface {
 	// the Executor will mark the subtask as failed.
 	IsIdempotent(subtask *proto.Subtask) bool
 	// GetStepExecutor returns the subtask executor for the subtask.
-	// Note:
-	// 1. summary is the summary manager of all subtask of the same type now.
-	// 2. should not retry the error from it.
+	// Note: the returned error is fatal; the framework will fail the task directly.
 	GetStepExecutor(task *proto.Task) (execute.StepExecutor, error)
 	// IsRetryableError returns whether the error is transient.
 	// When error is transient, the framework won't mark subtasks as failed,