From f9881cc6d0421457a1a06ad352622d1abe74352e Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 10 Mar 2022 11:59:32 +0530 Subject: [PATCH 1/5] test: improve test to check that replication manager is stopped before we make any changes in MySQL Signed-off-by: Manan Gupta --- .../fakemysqldaemon/fakemysqldaemon.go | 6 +++ .../tabletmanager/rpc_replication_test.go | 42 +++++++++++++++++-- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go b/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go index 4d49dc35cc8..7bea746f607 100644 --- a/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go +++ b/go/vt/mysqlctl/fakemysqldaemon/fakemysqldaemon.go @@ -84,6 +84,9 @@ type FakeMysqlDaemon struct { // StartReplicationError is used by StartReplication StartReplicationError error + // PromoteLag is the time for which Promote will stall + PromoteLag time.Duration + // PrimaryStatusError is used by PrimaryStatus PrimaryStatusError error @@ -424,6 +427,9 @@ func (fmd *FakeMysqlDaemon) WaitSourcePos(_ context.Context, pos mysql.Position) // Promote is part of the MysqlDaemon interface func (fmd *FakeMysqlDaemon) Promote(hookExtraEnv map[string]string) (mysql.Position, error) { + if fmd.PromoteLag > 0 { + time.Sleep(fmd.PromoteLag) + } if fmd.PromoteError != nil { return mysql.Position{}, fmd.PromoteError } diff --git a/go/vt/vttablet/tabletmanager/rpc_replication_test.go b/go/vt/vttablet/tabletmanager/rpc_replication_test.go index a6cf48a6a61..b37d88518bf 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication_test.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication_test.go @@ -18,23 +18,59 @@ package tabletmanager import ( "context" + "fmt" "testing" + "time" "github.com/stretchr/testify/require" + "vitess.io/vitess/go/vt/mysqlctl/fakemysqldaemon" "vitess.io/vitess/go/vt/topo/memorytopo" ) -// TestPromoteReplicaHealthTicksStopped checks that the health ticks are not running on the -// replication manager after running PromoteReplica -func TestPromoteReplicaHealthTicksStopped(t *testing.T) { +// TestPromoteReplicaReplicationManagerSuccess checks that the replication manager is not running after running PromoteReplica +// We also assert that replication manager is stopped before we make any changes to MySQL. +func TestPromoteReplicaReplicationManagerSuccess(t *testing.T) { ctx := context.Background() ts := memorytopo.NewServer("cell1") statsTabletTypeCount.ResetAll() tm := newTestTM(t, ts, 100, keyspace, shard) defer tm.Stop() + // Stop the replication manager and set the interval to 100 milliseconds + tm.replManager.ticks.Stop() + tm.replManager.ticks.SetInterval(100 * time.Millisecond) + // Change the ticks function of the replication manager so that we can keep the count of how many times it is called + numTicksRan := 0 + tm.replManager.ticks.Start(func() { + numTicksRan++ + }) + // Set the promotion lag to a second and then run PromoteReplica + tm.MysqlDaemon.(*fakemysqldaemon.FakeMysqlDaemon).PromoteLag = time.Second _, err := tm.PromoteReplica(ctx, false) require.NoError(t, err) + // At the end we expect the replication manager to be stopped. require.False(t, tm.replManager.ticks.Running()) + // We want the replication manager to be stopped before we call Promote on the MySQL instance. + // Since that call will take over a second to complete, if the replication manager is running, then it will have ticked + // 9 to 10 times. If we had stopped it before, then the replication manager would have only ticked 1-2 times. + // So we assert that the numTicksRan is less than 5 to check that the replication manager was closed before we call Promote. + require.Less(t, numTicksRan, 5) +} + +// TestPromoteReplicaReplicationManagerFailure checks that the replication manager is running after running PromoteReplica fails. +func TestPromoteReplicaReplicationManagerFailure(t *testing.T) { + ctx := context.Background() + ts := memorytopo.NewServer("cell1") + statsTabletTypeCount.ResetAll() + tm := newTestTM(t, ts, 100, keyspace, shard) + defer tm.Stop() + + require.True(t, tm.replManager.ticks.Running()) + // Set the promotion lag to a second and then run PromoteReplica + tm.MysqlDaemon.(*fakemysqldaemon.FakeMysqlDaemon).PromoteError = fmt.Errorf("promote error") + _, err := tm.PromoteReplica(ctx, false) + require.Error(t, err) + // At the end we expect the replication manager to be stopped. + require.True(t, tm.replManager.ticks.Running()) } From 3aaa897c493e7a3e1dc5612f37b28cbc0d33cb80 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 10 Mar 2022 12:04:30 +0530 Subject: [PATCH 2/5] feat: stop the replication manager first in PromoteReplica call Signed-off-by: Manan Gupta --- go/vt/vttablet/tabletmanager/rpc_replication.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 37462c3a2e4..5306cef0154 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -837,6 +837,11 @@ func (tm *TabletManager) PromoteReplica(ctx context.Context, semiSync bool) (str } defer tm.unlock() + tm.replManager.SetTabletType(topodatapb.TabletType_PRIMARY) + defer func() { + tm.replManager.SetTabletType(tm.Tablet().Type) + }() + // If Orchestrator is configured then also tell it we're promoting a tablet so it needs to be in maintenance mode // Do this in the background, as it's best-effort. go func() { From a9be7db708bfe08c4aec04bbc18a36efddcab9c1 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 16 Mar 2022 12:12:57 +0530 Subject: [PATCH 3/5] docs: add comments to replication manager setTabletType Signed-off-by: Manan Gupta --- go/vt/vttablet/tabletmanager/replmanager.go | 2 ++ go/vt/vttablet/tabletmanager/rpc_replication.go | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index c64bb5ebf05..58962faa933 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -65,6 +65,8 @@ func newReplManager(ctx context.Context, tm *TabletManager, interval time.Durati } } +// SetTabletType starts/stops the replication manager ticks based on the tablet type provided. +// It stops the ticks if the tablet type is not a replica type, starts the ticks otherwise. func (rm *replManager) SetTabletType(tabletType topodatapb.TabletType) { if *mysqlctl.DisableActiveReparents { return diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 5306cef0154..98d375ecf5c 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -837,8 +837,14 @@ func (tm *TabletManager) PromoteReplica(ctx context.Context, semiSync bool) (str } defer tm.unlock() + // SetTabletType only stops the replication manager ticks since we are going to promote this tablet + // to primary. We should do this before making any changes to MySQL, otherwise any replication manager + // tick would change the replication source, etc settings back. tm.replManager.SetTabletType(topodatapb.TabletType_PRIMARY) defer func() { + // If PromoteReplica was successful, then this would be a no-op, since we have already stopped + // the replication manager ticks. But, in case we are unsuccessful, we must restart the ticks + // so, we call SetTabletType as a deferred call. tm.replManager.SetTabletType(tm.Tablet().Type) }() From 3349b7750eebb6dcb5ed52d52e849b084420dc0c Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 16 Mar 2022 12:16:20 +0530 Subject: [PATCH 4/5] docs: add comment for SetReplicationSource in wrangler Signed-off-by: Manan Gupta --- go/vt/wrangler/tablet.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/vt/wrangler/tablet.go b/go/vt/wrangler/tablet.go index 8dbe6e9541f..9d311e07f1f 100644 --- a/go/vt/wrangler/tablet.go +++ b/go/vt/wrangler/tablet.go @@ -129,10 +129,12 @@ func (wr *Wrangler) StartReplication(ctx context.Context, tablet *topodatapb.Tab // SetReplicationSource is used to set the replication source on the specified tablet to the current shard primary (if available). // It also figures out if the tablet should be sending semi-sync ACKs or not and passes that to the tabletmanager RPC. -// It does not start the replication forcefully +// It does not start the replication forcefully. If we are unable to find the shard primary of the tablet from the topo server +// we exit out without any error. func (wr *Wrangler) SetReplicationSource(ctx context.Context, tablet *topodatapb.Tablet) error { shardPrimary, err := wr.getShardPrimaryForTablet(ctx, tablet) if err != nil { + // If we didn't find the shard primary, we return without any error return nil } semiSync := reparentutil.IsReplicaSemiSync(shardPrimary.Tablet, tablet) From 9c46153ba748a7cc133941f339f882031fd2ac8f Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 16 Mar 2022 12:21:40 +0530 Subject: [PATCH 5/5] docs: add comments to SetReplicationSource in reparentutil Signed-off-by: Manan Gupta --- go/vt/vtctl/reparentutil/replication.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/go/vt/vtctl/reparentutil/replication.go b/go/vt/vtctl/reparentutil/replication.go index b0bfcba125e..a98053f829e 100644 --- a/go/vt/vtctl/reparentutil/replication.go +++ b/go/vt/vtctl/reparentutil/replication.go @@ -159,9 +159,12 @@ func ReplicaWasRunning(stopStatus *replicationdatapb.StopReplicationStatus) (boo // tabletmanager RPC. // // It does not start the replication forcefully. +// If we are unable to find the shard primary of the tablet from the topo server +// we exit out without any error. func SetReplicationSource(ctx context.Context, ts *topo.Server, tmc tmclient.TabletManagerClient, tablet *topodatapb.Tablet) error { shardPrimary, err := topotools.GetShardPrimaryForTablet(ctx, ts, tablet) if err != nil { + // If we didn't find the shard primary, we return without any error return nil }