From 16cd08ef47177196c11e662579a211cb4ec44253 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 11 Dec 2024 12:03:04 +0800 Subject: [PATCH 1/7] try fix test case with failpoint --- br/tests/br_restore_checkpoint/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index da45692cdcb62..aa9c144fc32ce 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -69,8 +69,8 @@ if [ $restore_fail -ne 1 ]; then fi # PITR with checkpoint but failed in the log restore metakv stage -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"only-last-table-files\");\ -github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"only-last-table-files\")" +export GO_FAILPOINTS= $GO_FAILPOINTS";github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 export GO_FAILPOINTS="" From f98d9e4700b0a95a6f9bc1872f8c78daa5267e52 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 11 Dec 2024 13:35:29 +0800 Subject: [PATCH 2/7] fix line --- br/tests/br_restore_checkpoint/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index aa9c144fc32ce..f8df5d6664c44 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -70,7 +70,7 @@ fi # PITR with checkpoint but failed in the log restore metakv stage export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"only-last-table-files\")" -export GO_FAILPOINTS= $GO_FAILPOINTS";github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" +export GO_FAILPOINTS=$GO_FAILPOINTS";github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 export GO_FAILPOINTS="" From 0599ea237e707c235f81f355bc05b04777b6d08a Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 11 Dec 2024 16:33:01 +0800 Subject: [PATCH 3/7] fix case --- br/tests/br_restore_checkpoint/run.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index f8df5d6664c44..ccf2e9e533c1a 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -70,12 +70,20 @@ fi # PITR with checkpoint but failed in the log restore metakv stage export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"only-last-table-files\")" -export GO_FAILPOINTS=$GO_FAILPOINTS";github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 export GO_FAILPOINTS="" if [ $restore_fail -ne 1 ]; then - echo 'PITR success' + echo 'PITR success, but should fail due to the not start with last-table-files only' + exit 1 +fi + +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" +restore_fail=0 +run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 +export GO_FAILPOINTS="" +if [ $restore_fail -ne 1 ]; then + echo 'PITR success, but should fail" exit 1 fi From 07332e8541452bd178e513bf0dc04c117dd5e033 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 12 Dec 2024 11:36:11 +0800 Subject: [PATCH 4/7] fix the nil checkpointrunner --- br/pkg/restore/snap_client/client.go | 24 +++++++++++++++++------ br/pkg/restore/snap_client/tikv_sender.go | 5 +++-- br/pkg/task/restore_raw.go | 4 ++-- br/pkg/task/restore_txn.go | 4 ++-- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index 3b5e29c5a1927..b08d4a891953a 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -75,7 +75,8 @@ const ( const minBatchDdlSize = 1 type SnapClient struct { - restorer restore.SstRestorer + restorer restore.SstRestorer + getRestorerFn func(*checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer // Tool clients used by SnapClient pdClient pd.Client pdHTTPClient pdhttp.Client @@ -148,7 +149,8 @@ type SnapClient struct { rewriteMode RewriteMode // checkpoint information for snapshot restore - checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType] + checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType] + checkpointChecksum map[int64]*checkpoint.ChecksumItem } @@ -168,7 +170,10 @@ func NewRestoreClient( } } -func (rc *SnapClient) GetRestorer() restore.SstRestorer { +func (rc *SnapClient) GetRestorer(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { + if rc.restorer == nil { + rc.restorer = rc.getRestorerFn(checkpointRunner) + } return rc.restorer } @@ -389,7 +394,10 @@ func (rc *SnapClient) InitCheckpoint( return checkpointSetWithTableID, nil, errors.Trace(err) } rc.checkpointRunner, err = checkpoint.StartCheckpointRunnerForRestore(ctx, se, checkpoint.SnapshotRestoreCheckpointDatabaseName) - return checkpointSetWithTableID, checkpointClusterConfig, errors.Trace(err) + if err != nil { + return checkpointSetWithTableID, nil, errors.Trace(err) + } + return checkpointSetWithTableID, checkpointClusterConfig, nil } func (rc *SnapClient) WaitForFinishCheckpoint(ctx context.Context, flush bool) { @@ -539,7 +547,9 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage return errors.Trace(err) } // Raw/Txn restore are not support checkpoint for now - rc.restorer = restore.NewSimpleSstRestorer(ctx, fileImporter, rc.workerPool, nil) + rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { + return restore.NewSimpleSstRestorer(ctx, fileImporter, rc.workerPool, nil) + } } else { // or create a fileImporter with the cluster API version fileImporter, err = NewSnapFileImporter( @@ -547,7 +557,9 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage if err != nil { return errors.Trace(err) } - rc.restorer = restore.NewMultiTablesRestorer(ctx, fileImporter, rc.workerPool, rc.checkpointRunner) + rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { + return restore.NewMultiTablesRestorer(ctx, fileImporter, rc.workerPool, checkpointRunner) + } } return nil } diff --git a/br/pkg/restore/snap_client/tikv_sender.go b/br/pkg/restore/snap_client/tikv_sender.go index 57f73835beda7..66909d8ac5a7b 100644 --- a/br/pkg/restore/snap_client/tikv_sender.go +++ b/br/pkg/restore/snap_client/tikv_sender.go @@ -385,9 +385,10 @@ func (rc *SnapClient) RestoreSSTFiles( } }) - retErr = rc.restorer.GoRestore(onProgress, tableIDWithFilesGroup...) + r := rc.GetRestorer(rc.checkpointRunner) + retErr = r.GoRestore(onProgress, tableIDWithFilesGroup...) if retErr != nil { return retErr } - return rc.restorer.WaitUntilFinish() + return r.WaitUntilFinish() } diff --git a/br/pkg/task/restore_raw.go b/br/pkg/task/restore_raw.go index acb2e48041e64..1680e60472f7b 100644 --- a/br/pkg/task/restore_raw.go +++ b/br/pkg/task/restore_raw.go @@ -163,11 +163,11 @@ func RunRestoreRaw(c context.Context, g glue.Glue, cmdName string, cfg *RestoreR defer restore.RestorePostWork(ctx, importModeSwitcher, restoreSchedulers, cfg.Online) start := time.Now() - err = client.GetRestorer().GoRestore(onProgress, restore.CreateUniqueFileSets(files)) + err = client.GetRestorer(nil).GoRestore(onProgress, restore.CreateUniqueFileSets(files)) if err != nil { return errors.Trace(err) } - err = client.GetRestorer().WaitUntilFinish() + err = client.GetRestorer(nil).WaitUntilFinish() if err != nil { return errors.Trace(err) } diff --git a/br/pkg/task/restore_txn.go b/br/pkg/task/restore_txn.go index 2af64a59602cc..16d8e099f659d 100644 --- a/br/pkg/task/restore_txn.go +++ b/br/pkg/task/restore_txn.go @@ -102,11 +102,11 @@ func RunRestoreTxn(c context.Context, g glue.Glue, cmdName string, cfg *Config) } defer restore.RestorePostWork(ctx, importModeSwitcher, restoreSchedulers, false) - err = client.GetRestorer().GoRestore(onProgress, restore.CreateUniqueFileSets(files)) + err = client.GetRestorer(nil).GoRestore(onProgress, restore.CreateUniqueFileSets(files)) if err != nil { return errors.Trace(err) } - err = client.GetRestorer().WaitUntilFinish() + err = client.GetRestorer(nil).WaitUntilFinish() if err != nil { return errors.Trace(err) } From 02f6a300832f61d962493220ebc351037ffb0950 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 12 Dec 2024 18:49:51 +0800 Subject: [PATCH 5/7] fix case --- br/tests/br_restore_checkpoint/run.sh | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index ccf2e9e533c1a..b7a9808bfd487 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -70,22 +70,18 @@ fi # PITR with checkpoint but failed in the log restore metakv stage export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"only-last-table-files\")" +export GO_FAILPOINTS=$GO_FAILPOINTS";github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 export GO_FAILPOINTS="" if [ $restore_fail -ne 1 ]; then - echo 'PITR success, but should fail due to the not start with last-table-files only' + echo 'PITR success, but should fail' exit 1 fi -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" -restore_fail=0 -run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 -export GO_FAILPOINTS="" -if [ $restore_fail -ne 1 ]; then - echo 'PITR success, but should fail" - exit 1 -fi +# check the snapshot restore has checkpoint data +run_sql 'select count(*) from "__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint.cpt_data";' +check_contains "count(*): 1" # PITR with checkpoint but failed in the log restore datakv stage # skip the snapshot restore stage From c159e534544222af3224b744866240fc58f21a67 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 16 Dec 2024 14:10:24 +0800 Subject: [PATCH 6/7] fix test --- br/tests/br_restore_checkpoint/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index b7a9808bfd487..7fe1a0e678773 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -80,7 +80,7 @@ if [ $restore_fail -ne 1 ]; then fi # check the snapshot restore has checkpoint data -run_sql 'select count(*) from "__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint.cpt_data";' +run_sql 'select count(*) from '"__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint"'.`cpt_data`;' check_contains "count(*): 1" # PITR with checkpoint but failed in the log restore datakv stage From 7df6ea6a0633e84e75f4775c599beac2a5a37429 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 19 Dec 2024 08:29:20 +0800 Subject: [PATCH 7/7] add comment --- br/pkg/restore/snap_client/client.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index b08d4a891953a..ae878b0e9e0ca 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -75,7 +75,8 @@ const ( const minBatchDdlSize = 1 type SnapClient struct { - restorer restore.SstRestorer + restorer restore.SstRestorer + // Use a closure to lazy load checkpoint runner getRestorerFn func(*checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer // Tool clients used by SnapClient pdClient pd.Client