-
Notifications
You must be signed in to change notification settings - Fork 102
lightning: check and restore pd scheduler even if our task failed #1336
Changes from 5 commits
4f5e712
09ba51d
9c240b9
a1d7b1c
49aaf55
504b9af
8e6e89c
c85a70b
a2f959c
799f4e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -105,6 +105,7 @@ const ( | |
task_id BIGINT(20) UNSIGNED NOT NULL, | ||
pd_cfgs VARCHAR(2048) NOT NULL DEFAULT '', | ||
status VARCHAR(32) NOT NULL, | ||
state TINYINT(1) NOT NULL DEFAULT 0 COMMENT '0: normal, 1: exited before finish', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should the otherwise if we used Lightning before, and this CREATE TABLE IF NOT EXISTS means the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure. But this feature is not GA yet, and if Lightning exits before finishing, the current logic may still be unable to recover unless the meta schema is dropped manually. We also don't recommend changing the Lightning binary during a single import task. |
||
PRIMARY KEY (task_id) | ||
);` | ||
|
||
|
@@ -1195,6 +1196,7 @@ func (rc *Controller) restoreTables(ctx context.Context) error { | |
// we do not do switch back automatically | ||
cleanupFunc := func() {} | ||
switchBack := false | ||
taskFinished := false | ||
if rc.cfg.TikvImporter.Backend == config.BackendLocal { | ||
// disable some pd schedulers | ||
pdController, err := pdutil.NewPdController(ctx, rc.cfg.TiDB.PdAddr, | ||
|
@@ -1215,7 +1217,7 @@ func (rc *Controller) restoreTables(ctx context.Context) error { | |
if restoreFn != nil { | ||
// use context.Background to make sure this restore function can still be executed even if ctx is canceled | ||
restoreCtx := context.Background() | ||
needSwitchBack, err := mgr.CheckAndFinishRestore(restoreCtx) | ||
needSwitchBack, needCleanup, err := mgr.CheckAndFinishRestore(restoreCtx, taskFinished) | ||
if err != nil { | ||
logTask.Warn("check restore pd schedulers failed", zap.Error(err)) | ||
return | ||
|
@@ -1225,19 +1227,22 @@ func (rc *Controller) restoreTables(ctx context.Context) error { | |
if restoreE := restoreFn(restoreCtx); restoreE != nil { | ||
logTask.Warn("failed to restore removed schedulers, you may need to restore them manually", zap.Error(restoreE)) | ||
} | ||
|
||
logTask.Info("add back PD leader&region schedulers")
// clean up task metas | ||
if cleanupErr := mgr.Cleanup(restoreCtx); cleanupErr != nil { | ||
logTask.Warn("failed to clean task metas, you may need to restore them manually", zap.Error(cleanupErr)) | ||
} | ||
// cleanup table meta and schema db if needed. | ||
cleanupFunc = func() { | ||
if e := mgr.CleanupAllMetas(restoreCtx); err != nil { | ||
logTask.Warn("failed to clean table task metas, you may need to restore them manually", zap.Error(e)) | ||
if needCleanup { | ||
logTask.Info("cleanup task metas") | ||
if cleanupErr := mgr.Cleanup(restoreCtx); cleanupErr != nil { | ||
logTask.Warn("failed to clean task metas, you may need to restore them manually", zap.Error(cleanupErr)) | ||
} | ||
// cleanup table meta and schema db if needed. | ||
cleanupFunc = func() { | ||
if e := mgr.CleanupAllMetas(restoreCtx); err != nil { | ||
logTask.Warn("failed to clean table task metas, you may need to restore them manually", zap.Error(e)) | ||
} | ||
} | ||
} | ||
} | ||
|
||
logTask.Info("add back PD leader&region schedulers")
} | ||
|
||
pdController.Close() | ||
|
@@ -1435,6 +1440,7 @@ func (rc *Controller) restoreTables(ctx context.Context) error { | |
// finishSchedulers() | ||
// cancelFunc(switchBack) | ||
// finishFuncCalled = true | ||
Comment on lines
1437
to
1439
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need clean up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We originally wanted to restore the PD schedulers and switch TiKV back to normal mode after the data import finished, so that the cluster could rebalance during checksum and analyze. But in our tests, this rebalancing had a non-trivial impact on checksum and analyze, so we need to investigate further to determine whether we can still do this. I think we can keep these commented-out lines until we reach a clear conclusion. |
||
taskFinished = true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is it used for? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the current task exits before Lightning finishes (for example, because it hit an error or was terminated by the user), we should not clean up the task/table meta tables, even if all other Lightning instances have finished. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I see. |
||
|
||
close(postProcessTaskChan) | ||
// otherwise, we should run all tasks in the post-process task chan | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the comment feels outdated.