From ab7cd3e6c7def556e7e0ce328ab395623e28495d Mon Sep 17 00:00:00 2001 From: hillium Date: Mon, 15 Apr 2024 16:39:16 +0800 Subject: [PATCH 1/4] fix adapt env for snapshot backup stuck when encountered error Signed-off-by: hillium --- br/pkg/task/operator/cmd.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/br/pkg/task/operator/cmd.go b/br/pkg/task/operator/cmd.go index 2c944203c457f..fe456e223c2f1 100644 --- a/br/pkg/task/operator/cmd.go +++ b/br/pkg/task/operator/cmd.go @@ -138,7 +138,11 @@ func AdaptEnvForSnapshotBackup(ctx context.Context, cfg *PauseGcConfig) error { cx.run(func() error { return pauseGCKeeper(cx) }) cx.run(func() error { log.Info("Pause scheduler waiting all connections established.") - <-initChan + select { + case <-initChan: + case <-cx.Done(): + return cx.Err() + } log.Info("Pause scheduler noticed connections established.") return pauseSchedulerKeeper(cx) }) From 921cbe1622c031c8bb4f6f6e5d3d8a338d3480bd Mon Sep 17 00:00:00 2001 From: hillium Date: Wed, 24 Apr 2024 16:09:47 +0800 Subject: [PATCH 2/4] added a test csae Signed-off-by: hillium --- br/pkg/backup/prepare_snap/prepare.go | 4 +++ br/pkg/task/operator/cmd.go | 15 +++------- tests/realtikvtest/brietest/operator_test.go | 31 ++++++++++++++++++++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/br/pkg/backup/prepare_snap/prepare.go b/br/pkg/backup/prepare_snap/prepare.go index 405fc3a1502c2..227f36704b298 100644 --- a/br/pkg/backup/prepare_snap/prepare.go +++ b/br/pkg/backup/prepare_snap/prepare.go @@ -22,6 +22,7 @@ import ( "github.com/google/btree" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" brpb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/log" @@ -453,6 +454,9 @@ func (p *Preparer) pushWaitApply(reqs pendingRequests, region Region) { // PrepareConnections prepares the connections for each store. // This will pause the admin commands for each store. func (p *Preparer) PrepareConnections(ctx context.Context) error { + failpoint.Inject("PrepareConnectionsErr", func() { + failpoint.Return(errors.New("meow meow meow")) + }) log.Info("Preparing connections to stores.") stores, err := p.env.GetAllLiveStores(ctx) if err != nil { diff --git a/br/pkg/task/operator/cmd.go b/br/pkg/task/operator/cmd.go index fe456e223c2f1..cbe5c3ac2442b 100644 --- a/br/pkg/task/operator/cmd.go +++ b/br/pkg/task/operator/cmd.go @@ -5,14 +5,12 @@ package operator import ( "context" "crypto/tls" - "fmt" - "math/rand" - "os" "runtime/debug" "sync" "time" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" "github.com/pingcap/log" preparesnap "github.com/pingcap/tidb/br/pkg/backup/prepare_snap" berrors "github.com/pingcap/tidb/br/pkg/errors" @@ -148,6 +146,9 @@ func AdaptEnvForSnapshotBackup(ctx context.Context, cfg *PauseGcConfig) error { }) cx.run(func() error { return pauseAdminAndWaitApply(cx, initChan) }) go func() { + failpoint.Inject("SkipReadyHint", func() { + failpoint.Return() + }) cx.rdGrp.Wait() if cfg.OnAllReady != nil { cfg.OnAllReady() @@ -196,14 +197,6 @@ func pauseAdminAndWaitApply(cx *AdaptEnvForSnapshotBackupContext, afterConnectio return nil } -func getCallerName() string { - name, err := os.Hostname() - if err != nil { - name = fmt.Sprintf("UNKNOWN-%d", rand.Int63()) - } - return fmt.Sprintf("operator@%sT%d#%d", name, time.Now().Unix(), os.Getpid()) -} - func pauseGCKeeper(cx *AdaptEnvForSnapshotBackupContext) (err error) { // Note: should we remove the service safepoint as soon as this exits? sp := utils.BRServiceSafePoint{ diff --git a/tests/realtikvtest/brietest/operator_test.go b/tests/realtikvtest/brietest/operator_test.go index 863ae10f12ade..37eae760a96d8 100644 --- a/tests/realtikvtest/brietest/operator_test.go +++ b/tests/realtikvtest/brietest/operator_test.go @@ -23,6 +23,7 @@ import ( "time" "github.com/google/uuid" + "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/import_sstpb" "github.com/pingcap/kvproto/pkg/kvrpcpb" "github.com/pingcap/tidb/br/pkg/task" @@ -224,3 +225,33 @@ func TestOperator(t *testing.T) { verifySchedulerNotStopped(req, cfg) verifyGCNotStopped(req, cfg) } + +func TestFailure(t *testing.T) { + req := require.New(t) + req.NoError(failpoint.Enable("github.com/pingcap/tidb/br/pkg/backup/prepare_snap/PrepareConnectionsErr", "return()")) + // Make goleak happy. + req.NoError(failpoint.Enable("github.com/pingcap/tidb/br/pkg/task/operator/SkipReadyHint", "return()")) + defer func() { + req.NoError(failpoint.Disable("github.com/pingcap/tidb/br/pkg/backup/prepare_snap/PrepareConnectionsErr")) + req.NoError(failpoint.Disable("github.com/pingcap/tidb/br/pkg/task/operator/SkipReadyHint")) + }() + + cfg := operator.PauseGcConfig{ + Config: task.Config{ + PD: []string{"127.0.0.1:2379"}, + }, + TTL: 5 * time.Minute, + SafePoint: oracle.GoTimeToTS(time.Now()), + } + + verifyGCNotStopped(req, cfg) + verifySchedulerNotStopped(req, cfg) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + err := operator.AdaptEnvForSnapshotBackup(ctx, &cfg) + require.Error(t, err) + + verifyGCNotStopped(req, cfg) + verifySchedulerNotStopped(req, cfg) +} From dc191122dd1b8d89206f78b61f553bcff88ae448 Mon Sep 17 00:00:00 2001 From: hillium Date: Thu, 25 Apr 2024 16:12:32 +0800 Subject: [PATCH 3/4] make bazel_prepare Signed-off-by: hillium --- br/pkg/backup/prepare_snap/BUILD.bazel | 1 + br/pkg/task/operator/BUILD.bazel | 1 + 2 files changed, 2 insertions(+) diff --git a/br/pkg/backup/prepare_snap/BUILD.bazel b/br/pkg/backup/prepare_snap/BUILD.bazel index ec2f8bbaf1f56..2fffcacd5e088 100644 --- a/br/pkg/backup/prepare_snap/BUILD.bazel +++ b/br/pkg/backup/prepare_snap/BUILD.bazel @@ -17,6 +17,7 @@ go_library( "@com_github_docker_go_units//:go-units", "@com_github_google_btree//:btree", "@com_github_pingcap_errors//:errors", + "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", "@com_github_pingcap_kvproto//pkg/errorpb", "@com_github_pingcap_kvproto//pkg/metapb", diff --git a/br/pkg/task/operator/BUILD.bazel b/br/pkg/task/operator/BUILD.bazel index 52c99c845b57b..c7b8bbeb4ea23 100644 --- a/br/pkg/task/operator/BUILD.bazel +++ b/br/pkg/task/operator/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//br/pkg/task", "//br/pkg/utils", "@com_github_pingcap_errors//:errors", + "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_log//:log", "@com_github_spf13_pflag//:pflag", "@com_github_tikv_client_go_v2//tikv", From af4cab2a815d95cf1ee9f4f100a899e3cd851df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 25 Apr 2024 17:49:59 +0800 Subject: [PATCH 4/4] Update br/pkg/backup/prepare_snap/prepare.go Co-authored-by: BornChanger <97348524+BornChanger@users.noreply.github.com> --- br/pkg/backup/prepare_snap/prepare.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/br/pkg/backup/prepare_snap/prepare.go b/br/pkg/backup/prepare_snap/prepare.go index 227f36704b298..2435fb4986bdc 100644 --- a/br/pkg/backup/prepare_snap/prepare.go +++ b/br/pkg/backup/prepare_snap/prepare.go @@ -455,7 +455,7 @@ func (p *Preparer) pushWaitApply(reqs pendingRequests, region Region) { // This will pause the admin commands for each store. func (p *Preparer) PrepareConnections(ctx context.Context) error { failpoint.Inject("PrepareConnectionsErr", func() { - failpoint.Return(errors.New("meow meow meow")) + failpoint.Return(errors.New("mock PrepareConnectionsErr")) }) log.Info("Preparing connections to stores.") stores, err := p.env.GetAllLiveStores(ctx)