From 0e76ecec9eb2afd2f775dfa6692009660cbe16e5 Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Tue, 15 Sep 2020 22:45:16 +0200 Subject: [PATCH 1/5] Reparent test: refactored tests to setup/teardown everytime, improve code quality and hopefully fix or isolate current flakiness Signed-off-by: Rohit Nayak --- go/test/endtoend/reparent/main_test.go | 180 ---- .../reparent/reparent_range_based_test.go | 65 +- go/test/endtoend/reparent/reparent_test.go | 944 +++--------------- go/test/endtoend/reparent/utils_test.go | 538 ++++++++++ 4 files changed, 677 insertions(+), 1050 deletions(-) delete mode 100644 go/test/endtoend/reparent/main_test.go create mode 100644 go/test/endtoend/reparent/utils_test.go diff --git a/go/test/endtoend/reparent/main_test.go b/go/test/endtoend/reparent/main_test.go deleted file mode 100644 index 87a8a95abae..00000000000 --- a/go/test/endtoend/reparent/main_test.go +++ /dev/null @@ -1,180 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package reparent - -import ( - "context" - "flag" - "fmt" - "os" - "os/exec" - "path" - "testing" - - "vitess.io/vitess/go/vt/log" - - "github.com/stretchr/testify/require" - "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/sqltypes" - "vitess.io/vitess/go/test/endtoend/cluster" - tmc "vitess.io/vitess/go/vt/vttablet/grpctmclient" -) - -var ( - // ClusterInstance instance to be used for test with different params - clusterInstance *cluster.LocalProcessCluster - tmClient *tmc.Client - keyspaceName = "ks" - shardName = "0" - shard1Name = "0000000000000000-ffffffffffffffff" - keyspaceShard = keyspaceName + "/" + shardName - dbName = "vt_" + keyspaceName - username = "vt_dba" - hostname = "localhost" - cell1 = "zone1" - cell2 = "zone2" - insertSQL = "insert into vt_insert_test(id, msg) values (%d, 'test %d')" - sqlSchema = ` - create table vt_insert_test ( - id bigint, - msg varchar(64), - primary key (id) - ) Engine=InnoDB - ` - // Tablets for shard0 - tablet62344 *cluster.Vttablet - tablet62044 *cluster.Vttablet - tablet41983 *cluster.Vttablet - tablet31981 *cluster.Vttablet - - // Tablets for shard1 - masterTablet *cluster.Vttablet - replicaTablet *cluster.Vttablet -) - -func TestMain(m *testing.M) { - defer cluster.PanicHandler(nil) - flag.Parse() - - exitCode := func() int { - clusterInstance = cluster.NewCluster(cell1, hostname) - defer clusterInstance.Teardown() - - // Launch keyspace - keyspace := &cluster.Keyspace{Name: keyspaceName} - - // Start topo server - err := clusterInstance.StartTopo() - if err != nil { - return 1 - } - - // Adding another cell in the same cluster - err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+cell2) - if err != nil { - return 1 - } - err = clusterInstance.VtctlProcess.AddCellInfo(cell2) - if err != nil { - return 1 - } - - tablet62344 = clusterInstance.NewVttabletInstance("replica", 62344, "") - tablet62044 = clusterInstance.NewVttabletInstance("replica", 62044, "") - tablet41983 = clusterInstance.NewVttabletInstance("replica", 41983, "") - tablet31981 
= clusterInstance.NewVttabletInstance("replica", 31981, cell2) - - shard0 := &cluster.Shard{Name: shardName} - shard0.Vttablets = []*cluster.Vttablet{tablet62344, tablet62044, tablet41983, tablet31981} - - // Initiate shard1 - required for ranged based reparenting - masterTablet = clusterInstance.NewVttabletInstance("replica", 0, "") - replicaTablet = clusterInstance.NewVttabletInstance("replica", 0, "") - - shard1 := &cluster.Shard{Name: shard1Name} - shard1.Vttablets = []*cluster.Vttablet{masterTablet, replicaTablet} - - clusterInstance.VtTabletExtraArgs = []string{ - "-lock_tables_timeout", "5s", - "-enable_semi_sync", - "-track_schema_versions=false", // remove this line once https://github.com/vitessio/vitess/issues/6474 is fixed - } - - // Initialize Cluster - err = clusterInstance.LaunchCluster(keyspace, []cluster.Shard{*shard0, *shard1}) - if err != nil { - return 1 - } - - //Start MySql - var mysqlCtlProcessList []*exec.Cmd - for _, shard := range clusterInstance.Keyspaces[0].Shards { - for _, tablet := range shard.Vttablets { - log.Infof("Starting MySql for tablet %v", tablet.Alias) - if proc, err := tablet.MysqlctlProcess.StartProcess(); err != nil { - return 1 - } else { - // ignore golint warning, we need the else block to use proc - mysqlCtlProcessList = append(mysqlCtlProcessList, proc) - } - } - } - - // Wait for mysql processes to start - for _, proc := range mysqlCtlProcessList { - if err := proc.Wait(); err != nil { - return 1 - } - } - - // We do not need semiSync for this test case. - clusterInstance.EnableSemiSync = false - - // create tablet manager client - tmClient = tmc.NewClient() - - return m.Run() - }() - os.Exit(exitCode) -} - -func getMysqlConnParam(tablet *cluster.Vttablet) mysql.ConnParams { - connParams := mysql.ConnParams{ - Uname: username, - DbName: dbName, - UnixSocket: path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("/vt_%010d/mysql.sock", tablet.TabletUID)), - } - return connParams -} - -func runSQL(ctx context.Context, t *testing.T, sql string, tablet *cluster.Vttablet) *sqltypes.Result { - // Get Connection - tabletParams := getMysqlConnParam(tablet) - conn, err := mysql.Connect(ctx, &tabletParams) - require.Nil(t, err) - defer conn.Close() - - // runSQL - return execute(t, conn, sql) -} - -func execute(t *testing.T, conn *mysql.Conn, query string) *sqltypes.Result { - t.Helper() - qr, err := conn.ExecuteFetch(query, 1000, true) - require.Nil(t, err) - return qr -} diff --git a/go/test/endtoend/reparent/reparent_range_based_test.go b/go/test/endtoend/reparent/reparent_range_based_test.go index e124ba0fcaf..ebc7bde124a 100644 --- a/go/test/endtoend/reparent/reparent_range_based_test.go +++ b/go/test/endtoend/reparent/reparent_range_based_test.go @@ -18,74 +18,31 @@ package reparent import ( "context" - "fmt" - "strings" "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "vitess.io/vitess/go/test/endtoend/cluster" ) +var ( + masterTablet *cluster.Vttablet + replicaTablet *cluster.Vttablet +) + func TestReparentGracefulRangeBased(t *testing.T) { defer cluster.PanicHandler(t) ctx := context.Background() - for _, tablet := range []cluster.Vttablet{*masterTablet, *replicaTablet} { - // create database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*masterTablet, *replicaTablet} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", 
"NOT_SERVING"}) - require.NoError(t, err) - } - - // Force the replica to reparent assuming that all the datasets are identical. - err := clusterInstance.VtctlclientProcess.ExecuteCommand("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shard1Name), masterTablet.Alias) - require.NoError(t, err) - - // Validate topology - validateTopology(t, true) + shardName = "0000000000000000-ffffffffffffffff" + defer func() { shardName = "0" }() - // create Tables - runSQL(ctx, t, sqlSchema, masterTablet) - - checkMasterTablet(t, masterTablet) - - validateTopology(t, false) - - // Run this to make sure it succeeds. - output, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "ShardReplicationPositions", fmt.Sprintf("%s/%s", keyspaceName, shard1Name)) - require.NoError(t, err) - strArray := strings.Split(output, "\n") - if strArray[len(strArray)-1] == "" { - strArray = strArray[:len(strArray)-1] // Truncate slice, remove empty line - } - assert.Equal(t, 2, len(strArray)) // one master, one replica - assert.Contains(t, strArray[0], "master") // master first + setupRangeBasedCluster(ctx, t) + defer teardownCluster() // Perform a graceful reparent operation - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "PlannedReparentShard", - "-keyspace_shard", fmt.Sprintf("%s/%s", keyspaceName, shard1Name), - "-new_master", replicaTablet.Alias) + _, err := prs(t, replicaTablet) require.NoError(t, err) - - // Validate topology validateTopology(t, false) - checkMasterTablet(t, replicaTablet) - - // insert data into the new master, check the connected replica work - insertSQL := fmt.Sprintf(insertSQL, 1, 1) - runSQL(ctx, t, insertSQL, replicaTablet) - err = checkInsertedValues(ctx, t, masterTablet, 1) - require.NoError(t, err) + confirmReplication(t, replicaTablet, []*cluster.Vttablet{masterTablet}) } diff --git a/go/test/endtoend/reparent/reparent_test.go b/go/test/endtoend/reparent/reparent_test.go index a2a52fbf0c1..6a703caa845 100644 --- a/go/test/endtoend/reparent/reparent_test.go +++ b/go/test/endtoend/reparent/reparent_test.go @@ -18,644 +18,243 @@ package reparent import ( "context" - "encoding/json" "fmt" - "reflect" - "strings" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "vitess.io/vitess/go/json2" - "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/vt/log" - querypb "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) -func TestMasterToSpareStateChangeImpossible(t *testing.T) { - defer cluster.PanicHandler(t) - - // need at least one replica because of semi-sync - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044} { +var ( + cell1 = "zone1" + cell2 = "zone2" + shardName = "0" + keyspaceShard = keyspaceName + "/" + shardName - // Start the tablet - err := tablet.VttabletProcess.Setup() - require.NoError(t, err) + tab1, tab2, tab3, tab4 *cluster.Vttablet +) - // Create Database - err = tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - } - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } +func TestMasterToSpareStateChangeImpossible(t *testing.T) { + defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() - // Init Shard Master - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", 
fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) // We cannot change a master to spare - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("ChangeTabletType", tablet62344.Alias, "spare") + out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("ChangeTabletType", tab1.Alias, "spare") require.Error(t, err, out) require.Contains(t, out, "type change MASTER -> SPARE is not an allowed transition for ChangeTabletType") - - //kill Tablets - err = tablet62344.VttabletProcess.TearDown() - require.NoError(t, err) - err = tablet62044.VttabletProcess.TearDown() - require.NoError(t, err) } func TestReparentDownMaster(t *testing.T) { defer cluster.PanicHandler(t) - ctx := context.Background() + setupReparentCluster(t) + defer teardownCluster() - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // Create Database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Reset status, don't wait for the tablet status. We will check it later - tablet.VttabletProcess.ServingStatus = "" - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Init Shard Master - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - // create Tables - runSQL(ctx, t, sqlSchema, tablet62344) + ctx := context.Background() // Make the current master agent and database unavailable. - err = tablet62344.VttabletProcess.TearDown() - require.NoError(t, err) - err = tablet62344.MysqlctlProcess.Stop() - require.NoError(t, err) + stopTablet(t, tab1, true) // Perform a planned reparent operation, will try to contact // the current master and fail somewhat quickly - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "-action_timeout", "1s", - "PlannedReparentShard", - "-wait_replicas_timeout", "5s", - "-keyspace_shard", keyspaceShard, - "-new_master", tablet62044.Alias) + _, err := prsWithTimeout(t, tab2, false, "1s", "5s") require.Error(t, err) + validateTopology(t, false) + // Run forced reparent operation, this should now proceed unimpeded. - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "EmergencyReparentShard", - "-keyspace_shard", keyspaceShard, - "-new_master", tablet62044.Alias, - "-wait_replicas_timeout", "30s") + out, err := ers(t, tab2, "30s") log.Infof("EmergencyReparentShard Output: %v", out) - require.Nil(t, err) require.NoError(t, err) // Check that old master tablet is left around for human intervention. - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate") - require.Error(t, err) - require.Contains(t, out, "already has master") + confirmOldMasterIsHangingAround(t) // Now we'll manually remove it, simulating a human cleaning up a dead master. - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "DeleteTablet", - "-allow_master", - tablet62344.Alias) - require.NoError(t, err) + deleteTablet(t, tab1) // Now validate topo is correct. 
validateTopology(t, false) - - checkMasterTablet(t, tablet62044) - - // insert data into the new master, check the connected replica work - insertSQL := fmt.Sprintf(insertSQL, 2, 2) - runSQL(ctx, t, insertSQL, tablet62044) - err = checkInsertedValues(ctx, t, tablet41983, 2) - require.NoError(t, err) - err = checkInsertedValues(ctx, t, tablet31981, 2) - require.NoError(t, err) - - // bring back the old master as a replica, check that it catches up - tablet62344.MysqlctlProcess.InitMysql = false - err = tablet62344.MysqlctlProcess.Start() - require.NoError(t, err) - err = clusterInstance.VtctlclientProcess.InitTablet(tablet62344, tablet62344.Cell, keyspaceName, hostname, shardName) - require.NoError(t, err) - - // As there is already a master the new replica will come directly in SERVING state - tablet62344.VttabletProcess.ServingStatus = "SERVING" - // Start the tablet - err = tablet62344.VttabletProcess.Setup() - require.NoError(t, err) - - err = checkInsertedValues(ctx, t, tablet62344, 2) - require.NoError(t, err) - - // Kill tablets - killTablets(t) + checkMasterTablet(t, tab2) + confirmReplication(t, tab2, []*cluster.Vttablet{tab3, tab4}) + resurrectTablet(ctx, t, tab1) } func TestReparentNoChoiceDownMaster(t *testing.T) { defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() + var err error ctx := context.Background() - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // Create Database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Reset status, don't wait for the tablet status. We will check it later - tablet.VttabletProcess.ServingStatus = "" - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Init Shard Master - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - // create Tables - runSQL(ctx, t, sqlSchema, tablet62344) - - // insert data into the old master, check the connected replica work - insertSQL1 := fmt.Sprintf(insertSQL, 2, 2) - runSQL(ctx, t, insertSQL1, tablet62344) - err = checkInsertedValues(ctx, t, tablet62044, 2) - require.NoError(t, err) - err = checkInsertedValues(ctx, t, tablet41983, 2) - require.NoError(t, err) - err = checkInsertedValues(ctx, t, tablet31981, 2) - require.NoError(t, err) + confirmReplication(t, tab1, []*cluster.Vttablet{tab2, tab3, tab4}) // Make the current master agent and database unavailable. - err = tablet62344.VttabletProcess.TearDown() - require.NoError(t, err) - err = tablet62344.MysqlctlProcess.Stop() - require.NoError(t, err) + stopTablet(t, tab1, true) // Run forced reparent operation, this should now proceed unimpeded. - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "EmergencyReparentShard", - "-keyspace_shard", keyspaceShard, - "-wait_replicas_timeout", "30s") + out, err := ers(t, nil, "61s") require.NoError(t, err, out) // Check that old master tablet is left around for human intervention. 
- out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate") - require.Error(t, err) - require.Contains(t, out, "already has master") - + confirmOldMasterIsHangingAround(t) // Now we'll manually remove the old master, simulating a human cleaning up a dead master. - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "DeleteTablet", - "-allow_master", - tablet62344.Alias) - require.NoError(t, err) - - // Now validate topo is correct. + deleteTablet(t, tab1) validateTopology(t, false) - - var newMasterTablet *cluster.Vttablet - for _, tablet := range []*cluster.Vttablet{tablet62044, tablet41983, tablet31981} { - if isHealthyMasterTablet(t, tablet) { - newMasterTablet = tablet - break - } - } - require.NotNil(t, newMasterTablet) + newMaster := getNewMaster(t) // Validate new master is not old master. - require.NotEqual(t, newMasterTablet.Alias, tablet62344.Alias) + require.NotEqual(t, newMaster.Alias, tab1.Alias) // Check new master has latest transaction. - err = checkInsertedValues(ctx, t, newMasterTablet, 2) + err = checkInsertedValues(ctx, t, newMaster, 2) require.NoError(t, err) // bring back the old master as a replica, check that it catches up - tablet62344.MysqlctlProcess.InitMysql = false - err = tablet62344.MysqlctlProcess.Start() - require.NoError(t, err) - err = clusterInstance.VtctlclientProcess.InitTablet(tablet62344, tablet62344.Cell, keyspaceName, hostname, shardName) - require.NoError(t, err) - - // As there is already a master the new replica will come directly in SERVING state - tablet62344.VttabletProcess.ServingStatus = "SERVING" - // Start the tablet - err = tablet62344.VttabletProcess.Setup() - require.NoError(t, err) - - err = checkInsertedValues(ctx, t, tablet62344, 2) - require.NoError(t, err) - - // Kill tablets - killTablets(t) + resurrectTablet(ctx, t, tab1) } func TestReparentIgnoreReplicas(t *testing.T) { defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() + var err error ctx := context.Background() - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // Create Database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.Nil(t, err) - - // Reset status, don't wait for the tablet status. We will check it later - tablet.VttabletProcess.ServingStatus = "" - // Init Tablet - err = clusterInstance.VtctlclientProcess.InitTablet(&tablet, tablet.Cell, keyspaceName, hostname, shardName) - require.Nil(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.Nil(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.Nil(t, err) - } - - // Init Shard Master. - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.Nil(t, err, out) - - validateTopology(t, true) - - // Create Tables. 
- runSQL(ctx, t, sqlSchema, tablet62344) - - // insert data into the old master, check the connected replica work - insertSQL1 := fmt.Sprintf(insertSQL, 2, 2) - runSQL(ctx, t, insertSQL1, tablet62344) - err = checkInsertedValues(ctx, t, tablet62044, 2) - require.Nil(t, err) - err = checkInsertedValues(ctx, t, tablet41983, 2) - require.Nil(t, err) - err = checkInsertedValues(ctx, t, tablet31981, 2) - require.Nil(t, err) + confirmReplication(t, tab1, []*cluster.Vttablet{tab2, tab3, tab4}) // Make the current master agent and database unavailable. - err = tablet62344.VttabletProcess.TearDown() - require.Nil(t, err) - err = tablet62344.MysqlctlProcess.Stop() - require.Nil(t, err) + stopTablet(t, tab1, true) // Take down a replica - this should cause the emergency reparent to fail. - err = tablet41983.VttabletProcess.TearDown() - require.Nil(t, err) - err = tablet41983.MysqlctlProcess.Stop() - require.Nil(t, err) + stopTablet(t, tab3, true) // We expect this one to fail because we have an unreachable replica - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "EmergencyReparentShard", - "-keyspace_shard", keyspaceShard, - "-wait_replicas_timeout", "30s") + out, err := ers(t, nil, "30s") require.NotNil(t, err, out) // Now let's run it again, but set the command to ignore the unreachable replica. - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "EmergencyReparentShard", - "-keyspace_shard", keyspaceShard, - "-ignore_replicas", tablet41983.Alias, - "-wait_replicas_timeout", "30s") + out, err = ersIgnoreTablet(t, nil, "30s", tab3) require.Nil(t, err, out) // We'll bring back the replica we took down. - tablet41983.MysqlctlProcess.InitMysql = false - err = tablet41983.MysqlctlProcess.Start() - require.Nil(t, err) - err = clusterInstance.VtctlclientProcess.InitTablet(tablet41983, tablet41983.Cell, keyspaceName, hostname, shardName) - require.Nil(t, err) + restartTablet(t, tab3) // Check that old master tablet is left around for human intervention. - err = clusterInstance.VtctlclientProcess.ExecuteCommand("Validate") - require.Error(t, err) - - // Now we'll manually remove the old master, simulating a human cleaning up a dead master. - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "DeleteTablet", - "-allow_master", - tablet62344.Alias) - require.Nil(t, err) - - // Now validate topo is correct. + confirmOldMasterIsHangingAround(t) + deleteTablet(t, tab1) validateTopology(t, false) - var newMasterTablet *cluster.Vttablet - for _, tablet := range []*cluster.Vttablet{tablet62044, tablet41983, tablet31981} { - if isHealthyMasterTablet(t, tablet) { - newMasterTablet = tablet - break - } - } - require.NotNil(t, newMasterTablet) - + newMaster := getNewMaster(t) // Check new master has latest transaction. 
- err = checkInsertedValues(ctx, t, newMasterTablet, 2) + err = checkInsertedValues(ctx, t, newMaster, 2) require.Nil(t, err) // bring back the old master as a replica, check that it catches up - tablet62344.MysqlctlProcess.InitMysql = false - err = tablet62344.MysqlctlProcess.Start() - require.Nil(t, err) - err = clusterInstance.VtctlclientProcess.InitTablet(tablet62344, tablet62344.Cell, keyspaceName, hostname, shardName) - require.Nil(t, err) - - // As there is already a master the new replica will come directly in SERVING state - tablet62344.VttabletProcess.ServingStatus = "SERVING" - // Start the tablet - err = tablet62344.VttabletProcess.Setup() - require.Nil(t, err) - - err = checkInsertedValues(ctx, t, tablet62344, 2) - require.Nil(t, err) - - // Kill tablets - killTablets(t) + resurrectTablet(ctx, t, tab1) } func TestReparentCrossCell(t *testing.T) { - defer cluster.PanicHandler(t) - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // create database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Force the replica to reparent assuming that all the datasets are identical. - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - checkMasterTablet(t, tablet62344) + setupReparentCluster(t) + defer teardownCluster() // Perform a graceful reparent operation to another cell. - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-new_master", tablet31981.Alias) - require.NoError(t, err, out) + _, err := prs(t, tab4) + require.NoError(t, err) validateTopology(t, false) - - checkMasterTablet(t, tablet31981) - - // Kill tablets - killTablets(t) - + checkMasterTablet(t, tab4) } func TestReparentGraceful(t *testing.T) { defer cluster.PanicHandler(t) - ctx := context.Background() - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // create database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Force the replica to reparent assuming that all the datasets are identical. - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - // create Tables - runSQL(ctx, t, sqlSchema, tablet62344) - - checkMasterTablet(t, tablet62344) - - validateTopology(t, false) + setupReparentCluster(t) + defer teardownCluster() // Run this to make sure it succeeds. 
- output, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "ShardReplicationPositions", fmt.Sprintf("%s/%s", keyspaceName, shardName)) - require.NoError(t, err) - strArray := strings.Split(output, "\n") - if strArray[len(strArray)-1] == "" { - strArray = strArray[:len(strArray)-1] // Truncate slice, remove empty line - } + strArray := getShardReplicationPositions(t, keyspaceName, shardName, false) assert.Equal(t, 4, len(strArray)) // one master, three replicas assert.Contains(t, strArray[0], "master") // master first // Perform a graceful reparent operation - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "PlannedReparentShard", - "-keyspace_shard", fmt.Sprintf("%s/%s", keyspaceName, shardName), - "-new_master", tablet62044.Alias) - require.NoError(t, err) - + prs(t, tab2) validateTopology(t, false) - - checkMasterTablet(t, tablet62044) + checkMasterTablet(t, tab2) // A graceful reparent to the same master should be idempotent. - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "PlannedReparentShard", - "-keyspace_shard", fmt.Sprintf("%s/%s", keyspaceName, shardName), - "-new_master", tablet62044.Alias) - require.NoError(t, err) - + prs(t, tab2) validateTopology(t, false) + checkMasterTablet(t, tab2) - checkMasterTablet(t, tablet62044) - - // insert data into the new master, check the connected replica work - insertSQL := fmt.Sprintf(insertSQL, 1, 1) - runSQL(ctx, t, insertSQL, tablet62044) - err = checkInsertedValues(ctx, t, tablet41983, 1) - require.NoError(t, err) - err = checkInsertedValues(ctx, t, tablet62344, 1) - require.NoError(t, err) - - // Kill tablets - killTablets(t) + confirmReplication(t, tab2, []*cluster.Vttablet{tab1, tab3, tab4}) } func TestReparentReplicaOffline(t *testing.T) { defer cluster.PanicHandler(t) - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // create database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Force the replica to reparent assuming that all the datasets are identical. - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", keyspaceShard, tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - checkMasterTablet(t, tablet62344) + setupReparentCluster(t) + defer teardownCluster() // Kill one tablet so we seem offline - err = tablet31981.VttabletProcess.TearDown() - require.NoError(t, err) + stopTablet(t, tab4, true) // Perform a graceful reparent operation. 
- out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-new_master", tablet62044.Alias, - "-wait_replicas_timeout", "31s") - + out, err := prsWithTimeout(t, tab2, false, "", "31s") require.Error(t, err) - assert.Contains(t, out, "tablet zone2-0000031981 SetMaster failed") - - checkMasterTablet(t, tablet62044) - - killTablets(t) + assert.Contains(t, out, fmt.Sprintf("tablet %s SetMaster failed", tab4.Alias)) + checkMasterTablet(t, tab2) } func TestReparentAvoid(t *testing.T) { defer cluster.PanicHandler(t) - - // Remove tablet41983 from topology as that tablet is not required for this test - // Ignore error. Depending on previous tests this topo entry may or may not exist - // TODO: fix inter-test dependencies - _ = clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", tablet41983.Alias) - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet31981} { - // create database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Force the replica to reparent assuming that all the dataset's are identical. - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", keyspaceShard, tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - checkMasterTablet(t, tablet62344) + setupReparentCluster(t) + defer teardownCluster() + deleteTablet(t, tab3) // Perform a reparent operation with avoid_master pointing to non-master. It // should succeed without doing anything. - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-avoid_master", tablet62044.Alias) + _, err := prsAvoid(t, tab2) require.NoError(t, err) validateTopology(t, false) - - checkMasterTablet(t, tablet62344) + checkMasterTablet(t, tab1) // Perform a reparent operation with avoid_master pointing to master. - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-avoid_master", tablet62344.Alias) + _, err = prsAvoid(t, tab1) require.NoError(t, err) - validateTopology(t, false) - // 62044 is in the same cell and 31981 is in a different cell, so we must land on 62044 - checkMasterTablet(t, tablet62044) + // tab2 is in the same cell and tab4 is in a different cell, so we must land on tab2 + checkMasterTablet(t, tab2) // If we kill the tablet in the same cell as master then reparent -avoid_master will fail. 
- err = tablet62344.VttabletProcess.TearDown() - require.NoError(t, err) - - output, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-avoid_master", tablet62044.Alias) + stopTablet(t, tab1, true) + out, err := prsAvoid(t, tab2) require.Error(t, err) - assert.Contains(t, output, "cannot find a tablet to reparent to") - + assert.Contains(t, out, "cannot find a tablet to reparent to") validateTopology(t, false) - - checkMasterTablet(t, tablet62044) - - killTablets(t) + checkMasterTablet(t, tab2) } func TestReparentFromOutside(t *testing.T) { + defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() reparentFromOutside(t, false) } func TestReparentFromOutsideWithNoMaster(t *testing.T) { defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() reparentFromOutside(t, true) + // FIXME: @Deepthi: is this needed, since we teardown the cluster, does this achieve any additional test coverage? // We will have to restart mysql to avoid hanging/locks due to external Reparent - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { + for _, tablet := range []cluster.Vttablet{*tab1, *tab2, *tab3, *tab4} { log.Infof("Restarting MySql for tablet %v", tablet.Alias) err := tablet.MysqlctlProcess.Stop() require.NoError(t, err) @@ -674,194 +273,113 @@ func reparentFromOutside(t *testing.T, downMaster bool) { //Args: //downMaster: kills the old master first defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() ctx := context.Background() - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // create database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Reparent as a starting point - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - checkMasterTablet(t, tablet62344) - // now manually reparent 1 out of 2 tablets - // 62044 will be the new master - // 31981 won't be re-parented, so it will be busted + // tab2 will be the new master + // tab3 won't be re-parented, so it will be busted if !downMaster { // commands to stop the current master demoteMasterCommands := "SET GLOBAL read_only = ON; FLUSH TABLES WITH READ LOCK; UNLOCK TABLES" - runSQL(ctx, t, demoteMasterCommands, tablet62344) + runSQL(ctx, t, demoteMasterCommands, tab1) //Get the position of the old master and wait for the new one to catch up. 
- err = waitForReplicationPosition(t, tablet62344, tablet62044) + err := waitForReplicationPosition(t, tab1, tab2) require.NoError(t, err) } // commands to convert a replica to a master promoteReplicaCommands := "STOP SLAVE; RESET SLAVE ALL; SET GLOBAL read_only = OFF;" - runSQL(ctx, t, promoteReplicaCommands, tablet62044) + runSQL(ctx, t, promoteReplicaCommands, tab2) // Get master position - _, gtID := cluster.GetMasterPosition(t, *tablet62044, hostname) + _, gtID := cluster.GetMasterPosition(t, *tab2, hostname) - // 62344 will now be a replica of 62044 + // tab1 will now be a replica of tab2 changeMasterCommands := fmt.Sprintf("RESET MASTER; RESET SLAVE; SET GLOBAL gtid_purged = '%s';"+ "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ - "START SLAVE;", gtID, hostname, tablet62044.MySQLPort) - runSQL(ctx, t, changeMasterCommands, tablet62344) + "START SLAVE;", gtID, hostname, tab2.MySQLPort) + runSQL(ctx, t, changeMasterCommands, tab1) - // Capture time when we made tablet62044 master + // Capture time when we made tab2 master baseTime := time.Now().UnixNano() / 1000000000 - // 41983 will be a replica of 62044 + // tab3 will be a replica of tab2 changeMasterCommands = fmt.Sprintf("STOP SLAVE; RESET MASTER; SET GLOBAL gtid_purged = '%s';"+ "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ - "START SLAVE;", gtID, hostname, tablet62044.MySQLPort) - runSQL(ctx, t, changeMasterCommands, tablet41983) + "START SLAVE;", gtID, hostname, tab2.MySQLPort) + runSQL(ctx, t, changeMasterCommands, tab3) // To test the downMaster, we kill the old master first and delete its tablet record if downMaster { - err := tablet62344.VttabletProcess.TearDown() + err := tab1.VttabletProcess.TearDown() require.NoError(t, err) err = clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", - "-allow_master", tablet62344.Alias) + "-allow_master", tab1.Alias) require.NoError(t, err) } // update topology with the new server - err = clusterInstance.VtctlclientProcess.ExecuteCommand("TabletExternallyReparented", - tablet62044.Alias) + err := clusterInstance.VtctlclientProcess.ExecuteCommand("TabletExternallyReparented", + tab2.Alias) require.NoError(t, err) - checkReparentFromOutside(t, tablet62044, downMaster, baseTime) + checkReparentFromOutside(t, tab2, downMaster, baseTime) if !downMaster { - err := tablet62344.VttabletProcess.TearDown() + err := tab1.VttabletProcess.TearDown() require.NoError(t, err) } - - killTablets(t) } func TestReparentWithDownReplica(t *testing.T) { defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() ctx := context.Background() - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - // Create Database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Init Shard Master - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) - - // create Tables - runSQL(ctx, t, sqlSchema, tablet62344) - // 
Stop replica mysql Process - err = tablet41983.MysqlctlProcess.Stop() + err := tab3.MysqlctlProcess.Stop() require.NoError(t, err) // Perform a graceful reparent operation. It will fail as one tablet is down. - output, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-new_master", tablet62044.Alias) + out, err := prs(t, tab2) require.Error(t, err) - assert.Contains(t, output, "TabletManager.SetMaster on zone1-0000041983 error") + assert.Contains(t, out, fmt.Sprintf("tablet %s SetMaster failed", tab3.Alias)) // insert data into the new master, check the connected replica work - insertSQL := fmt.Sprintf(insertSQL, 3, 3) - runSQL(ctx, t, insertSQL, tablet62044) - err = checkInsertedValues(ctx, t, tablet31981, 3) - require.NoError(t, err) - err = checkInsertedValues(ctx, t, tablet62344, 3) - require.NoError(t, err) + confirmReplication(t, tab2, []*cluster.Vttablet{tab1, tab4}) // restart mysql on the old replica, should still be connecting to the old master - tablet41983.MysqlctlProcess.InitMysql = false - err = tablet41983.MysqlctlProcess.Start() + tab3.MysqlctlProcess.InitMysql = false + err = tab3.MysqlctlProcess.Start() require.NoError(t, err) // Use the same PlannedReparentShard command to fix up the tablet. - err = clusterInstance.VtctlclientProcess.ExecuteCommand( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-new_master", tablet62044.Alias) + _, err = prs(t, tab2) require.NoError(t, err) // wait until it gets the data - err = checkInsertedValues(ctx, t, tablet41983, 3) + err = checkInsertedValues(ctx, t, tab3, 2) require.NoError(t, err) - - killTablets(t) } func TestChangeTypeSemiSync(t *testing.T) { defer cluster.PanicHandler(t) + setupReparentCluster(t) + defer teardownCluster() ctx := context.Background() // Create new names for tablets, so this test is less confusing. 
- master := tablet62344 - replica := tablet62044 - rdonly1 := tablet41983 - rdonly2 := tablet31981 - - for _, tablet := range []cluster.Vttablet{*master, *replica, *rdonly1, *rdonly2} { - // Create Database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range []cluster.Vttablet{*master, *replica, *rdonly1, *rdonly2} { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Init Shard Master - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), master.Alias) - require.NoError(t, err, out) + master, replica, rdonly1, rdonly2 := tab1, tab2, tab3, tab4 // Updated rdonly tablet and set tablet type to rdonly // TODO: replace with ChangeTabletType once ChangeSlaveType is removed - err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeSlaveType", rdonly1.Alias, "rdonly") + err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeSlaveType", rdonly1.Alias, "rdonly") require.NoError(t, err) err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", rdonly2.Alias, "rdonly") require.NoError(t, err) @@ -911,228 +429,22 @@ func TestChangeTypeSemiSync(t *testing.T) { require.NoError(t, err) checkDBvar(ctx, t, rdonly2, "rpl_semi_sync_slave_enabled", "ON") checkDBstatus(ctx, t, rdonly2, "Rpl_semi_sync_slave_status", "ON") - - killTablets(t) } func TestReparentDoesntHangIfMasterFails(t *testing.T) { defer cluster.PanicHandler(t) - tablets := []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} - for _, tablet := range tablets { - // Create Database - err := tablet.VttabletProcess.CreateDB(keyspaceName) - require.NoError(t, err) - - // Start the tablet - err = tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - for _, tablet := range tablets { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - - // Init Shard Master - out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("InitShardMaster", - "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablet62344.Alias) - require.NoError(t, err, out) - - validateTopology(t, true) + setupReparentCluster(t) + defer teardownCluster() // Change the schema of the _vt.reparent_journal table, so that // inserts into it will fail. That will make the master fail. - _, err = tablet62344.VttabletProcess.QueryTabletWithDB( + _, err := tab1.VttabletProcess.QueryTabletWithDB( "ALTER TABLE reparent_journal DROP COLUMN replication_position", "_vt") require.NoError(t, err) // Perform a planned reparent operation, the master will fail the // insert. The replicas should then abort right away. - out, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "PlannedReparentShard", - "-keyspace_shard", keyspaceShard, - "-new_master", tablet62044.Alias) + out, err := prs(t, tab2) require.Error(t, err) assert.Contains(t, out, "master failed to PopulateReparentJournal") - - killTablets(t) -} - -// Waits for tablet B to catch up to the replication position of tablet A. 
-func waitForReplicationPosition(t *testing.T, tabletA *cluster.Vttablet, tabletB *cluster.Vttablet) error { - posA, _ := cluster.GetMasterPosition(t, *tabletA, hostname) - timeout := time.Now().Add(5 * time.Second) - for time.Now().Before(timeout) { - posB, _ := cluster.GetMasterPosition(t, *tabletB, hostname) - if positionAtLeast(t, tabletB, posA, posB) { - return nil - } - time.Sleep(100 * time.Millisecond) - } - return fmt.Errorf("failed to catch up on replication position") -} - -func positionAtLeast(t *testing.T, tablet *cluster.Vttablet, a string, b string) bool { - isAtleast := false - val, err := tablet.MysqlctlProcess.ExecuteCommandWithOutput("position", "at_least", a, b) - require.NoError(t, err) - if strings.Contains(val, "true") { - isAtleast = true - } - return isAtleast -} - -func checkReparentFromOutside(t *testing.T, tablet *cluster.Vttablet, downMaster bool, baseTime int64) { - result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetShardReplication", cell1, keyspaceShard) - require.Nil(t, err, "error should be Nil") - if !downMaster { - assertNodeCount(t, result, int(3)) - } else { - assertNodeCount(t, result, int(2)) - } - - // make sure the master status page says it's the master - status := tablet.VttabletProcess.GetStatus() - assert.Contains(t, status, "Tablet Type: MASTER") - - // make sure the master health stream says it's the master too - // (health check is disabled on these servers, force it first) - err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", tablet.Alias) - require.NoError(t, err) - - streamHealth, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput( - "VtTabletStreamHealth", - "-count", "1", tablet.Alias) - require.NoError(t, err) - - var streamHealthResponse querypb.StreamHealthResponse - err = json.Unmarshal([]byte(streamHealth), &streamHealthResponse) - require.NoError(t, err) - assert.Equal(t, streamHealthResponse.Target.TabletType, topodatapb.TabletType_MASTER) - assert.True(t, streamHealthResponse.TabletExternallyReparentedTimestamp >= baseTime) - -} - -func assertNodeCount(t *testing.T, result string, want int) { - resultMap := make(map[string]interface{}) - err := json.Unmarshal([]byte(result), &resultMap) - require.NoError(t, err) - - nodes := reflect.ValueOf(resultMap["nodes"]) - got := nodes.Len() - assert.Equal(t, want, got) -} - -func checkDBvar(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) { - tabletParams := getMysqlConnParam(tablet) - conn, err := mysql.Connect(ctx, &tabletParams) - require.NoError(t, err) - defer conn.Close() - - qr := execute(t, conn, fmt.Sprintf("show variables like '%s'", variable)) - got := fmt.Sprintf("%v", qr.Rows) - want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status) - assert.Equal(t, want, got) -} - -func checkDBstatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) { - tabletParams := getMysqlConnParam(tablet) - conn, err := mysql.Connect(ctx, &tabletParams) - require.NoError(t, err) - defer conn.Close() - - qr := execute(t, conn, fmt.Sprintf("show status like '%s'", variable)) - got := fmt.Sprintf("%v", qr.Rows) - want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status) - assert.Equal(t, want, got) -} - -func checkReplicaStatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet) { - qr := runSQL(ctx, t, "show slave status", tablet) - IOThreadRunning := fmt.Sprintf("%v", qr.Rows[0][10]) // Slave_IO_Running 
- SQLThreadRunning := fmt.Sprintf("%v", qr.Rows[0][10]) // Slave_SQL_Running - assert.Equal(t, IOThreadRunning, "VARCHAR(\"No\")") - assert.Equal(t, SQLThreadRunning, "VARCHAR(\"No\")") -} - -// Makes sure the tablet type is master, and its health check agrees. -func checkMasterTablet(t *testing.T, tablet *cluster.Vttablet) { - result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias) - require.NoError(t, err) - var tabletInfo topodatapb.Tablet - err = json2.Unmarshal([]byte(result), &tabletInfo) - require.NoError(t, err) - assert.Equal(t, topodatapb.TabletType_MASTER, tabletInfo.GetType()) - - // make sure the health stream is updated - result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", tablet.Alias) - require.NoError(t, err) - var streamHealthResponse querypb.StreamHealthResponse - - err = json2.Unmarshal([]byte(result), &streamHealthResponse) - require.NoError(t, err) - - assert.True(t, streamHealthResponse.GetServing()) - tabletType := streamHealthResponse.GetTarget().GetTabletType() - assert.Equal(t, topodatapb.TabletType_MASTER, tabletType) -} - -// isHealthyMasterTablet will return if tablet is master AND healthy. -func isHealthyMasterTablet(t *testing.T, tablet *cluster.Vttablet) bool { - result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias) - require.Nil(t, err) - var tabletInfo topodatapb.Tablet - err = json2.Unmarshal([]byte(result), &tabletInfo) - require.Nil(t, err) - if tabletInfo.GetType() != topodatapb.TabletType_MASTER { - return false - } - - // make sure the health stream is updated - result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", tablet.Alias) - require.Nil(t, err) - var streamHealthResponse querypb.StreamHealthResponse - - err = json2.Unmarshal([]byte(result), &streamHealthResponse) - require.Nil(t, err) - - assert.True(t, streamHealthResponse.GetServing()) - tabletType := streamHealthResponse.GetTarget().GetTabletType() - return tabletType == topodatapb.TabletType_MASTER -} - -func checkInsertedValues(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, index int) error { - // wait until it gets the data - timeout := time.Now().Add(10 * time.Second) - for time.Now().Before(timeout) { - selectSQL := fmt.Sprintf("select msg from vt_insert_test where id=%d", index) - qr := runSQL(ctx, t, selectSQL, tablet) - if len(qr.Rows) == 1 { - return nil - } - time.Sleep(300 * time.Millisecond) - } - return fmt.Errorf("data is not yet replicated") -} - -func validateTopology(t *testing.T, pingTablets bool) { - if pingTablets { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("Validate", "-ping-tablets=true") - require.NoError(t, err) - } else { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("Validate") - require.NoError(t, err) - } -} - -func killTablets(t *testing.T) { - for _, tablet := range []cluster.Vttablet{*tablet62344, *tablet62044, *tablet41983, *tablet31981} { - log.Infof("Calling TearDown on tablet %v", tablet.Alias) - err := tablet.VttabletProcess.TearDown() - require.NoError(t, err) - - // Reset status and type - tablet.VttabletProcess.ServingStatus = "" - tablet.Type = "replica" - } } diff --git a/go/test/endtoend/reparent/utils_test.go b/go/test/endtoend/reparent/utils_test.go new file mode 100644 index 00000000000..d6fde53c0ee --- /dev/null +++ b/go/test/endtoend/reparent/utils_test.go @@ -0,0 +1,538 @@ +/* +Copyright 
2019 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package reparent + +import ( + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path" + "reflect" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "vitess.io/vitess/go/json2" + "vitess.io/vitess/go/vt/log" + querypb "vitess.io/vitess/go/vt/proto/query" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + + "github.com/stretchr/testify/require" + "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/test/endtoend/cluster" + tmc "vitess.io/vitess/go/vt/vttablet/grpctmclient" +) + +var ( + // ClusterInstance instance to be used for test with different params + clusterInstance *cluster.LocalProcessCluster + tmClient *tmc.Client + keyspaceName = "ks" + dbName = "vt_" + keyspaceName + username = "vt_dba" + hostname = "localhost" + insertSQL = "insert into vt_insert_test(id, msg) values (%d, 'test %d')" + sqlSchema = ` + create table vt_insert_test ( + id bigint, + msg varchar(64), + primary key (id) + ) Engine=InnoDB +` +) + +//region cluster setup/teardown +func setupRangeBasedCluster(ctx context.Context, t *testing.T) { + tablets := setupCluster(ctx, t, shardName, []string{cell1}, []int{2}) + masterTablet, replicaTablet = tablets[0], tablets[1] +} + +func setupReparentCluster(t *testing.T) { + tablets := setupCluster(context.Background(), t, shardName, []string{cell1, cell2}, []int{3, 1}) + tab1, tab2, tab3, tab4 = tablets[0], tablets[1], tablets[2], tablets[3] +} + +func teardownCluster() { + clusterInstance.Teardown() +} + +func setupCluster(ctx context.Context, t *testing.T, shardName string, cells []string, numTablets []int) []*cluster.Vttablet { + var tablets []*cluster.Vttablet + clusterInstance = cluster.NewCluster(cells[0], hostname) + keyspace := &cluster.Keyspace{Name: keyspaceName} + // Start topo server + err := clusterInstance.StartTopo() + if err != nil { + t.Fatalf("Error starting topo: %s", err.Error()) + } + err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+cells[0]) + if err != nil { + t.Fatalf("Error managing topo: %s", err.Error()) + } + numCell := 1 + for numCell < len(cells) { + err = clusterInstance.VtctlProcess.AddCellInfo(cells[numCell]) + if err != nil { + t.Fatalf("Error managing topo: %s", err.Error()) + } + numCell++ + } + + // Adding another cell in the same cluster + numCell = 0 + for numCell < len(cells) { + i := 0 + for i < numTablets[numCell] { + i++ + tablet := clusterInstance.NewVttabletInstance("replica", 100*(numCell+1)+i, cells[numCell]) + tablets = append(tablets, tablet) + } + numCell++ + } + + shard := &cluster.Shard{Name: shardName} + shard.Vttablets = tablets + + clusterInstance.VtTabletExtraArgs = []string{ + "-lock_tables_timeout", "5s", + "-enable_semi_sync", + "-track_schema_versions=true", + } + + // Initialize Cluster + err = clusterInstance.LaunchCluster(keyspace, []cluster.Shard{*shard}) + if err != nil { + t.Fatalf("Cannot launch cluster: %s", err.Error()) + } + + //Start MySql + var 
mysqlCtlProcessList []*exec.Cmd + for _, shard := range clusterInstance.Keyspaces[0].Shards { + for _, tablet := range shard.Vttablets { + log.Infof("Starting MySql for tablet %v", tablet.Alias) + proc, err := tablet.MysqlctlProcess.StartProcess() + if err != nil { + t.Fatalf("Error starting start mysql: %s", err.Error()) + } + mysqlCtlProcessList = append(mysqlCtlProcessList, proc) + } + } + + // Wait for mysql processes to start + for _, proc := range mysqlCtlProcessList { + if err := proc.Wait(); err != nil { + t.Fatalf("Error starting mysql: %s", err.Error()) + } + } + + // We do not need semiSync for this test case. + clusterInstance.EnableSemiSync = false + + // create tablet manager client + tmClient = tmc.NewClient() + setupShard(ctx, t, shardName, tablets) + return tablets +} + +func setupShard(ctx context.Context, t *testing.T, shardName string, tablets []*cluster.Vttablet) { + for _, tablet := range tablets { + // create database + err := tablet.VttabletProcess.CreateDB(keyspaceName) + require.NoError(t, err) + // Start the tablet + err = tablet.VttabletProcess.Setup() + require.NoError(t, err) + } + + for _, tablet := range tablets { + err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) + require.NoError(t, err) + } + + // Force the replica to reparent assuming that all the datasets are identical. + err := clusterInstance.VtctlclientProcess.ExecuteCommand("InitShardMaster", + "-force", fmt.Sprintf("%s/%s", keyspaceName, shardName), tablets[0].Alias) + require.NoError(t, err) + + validateTopology(t, true) + + // create Tables + runSQL(ctx, t, sqlSchema, tablets[0]) + + checkMasterTablet(t, tablets[0]) + + validateTopology(t, false) + time.Sleep(100 * time.Millisecond) // wait for replication to catchup + strArray := getShardReplicationPositions(t, keyspaceName, shardName, true) + assert.Equal(t, len(tablets), len(strArray)) + assert.Contains(t, strArray[0], "master") // master first +} + +//endregion + +//region database queries +func getMysqlConnParam(tablet *cluster.Vttablet) mysql.ConnParams { + connParams := mysql.ConnParams{ + Uname: username, + DbName: dbName, + UnixSocket: path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("/vt_%010d/mysql.sock", tablet.TabletUID)), + } + return connParams +} + +func runSQL(ctx context.Context, t *testing.T, sql string, tablet *cluster.Vttablet) *sqltypes.Result { + tabletParams := getMysqlConnParam(tablet) + conn, err := mysql.Connect(ctx, &tabletParams) + require.Nil(t, err) + defer conn.Close() + return execute(t, conn, sql) +} + +func execute(t *testing.T, conn *mysql.Conn, query string) *sqltypes.Result { + t.Helper() + qr, err := conn.ExecuteFetch(query, 1000, true) + require.Nil(t, err) + return qr +} + +//endregion + +// region prs/ers + +func prs(t *testing.T, tab *cluster.Vttablet) (string, error) { + return prsWithTimeout(t, tab, false, "", "") +} + +func prsAvoid(t *testing.T, tab *cluster.Vttablet) (string, error) { + return prsWithTimeout(t, tab, true, "", "") +} + +func prsWithTimeout(t *testing.T, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string) (string, error) { + args := []string{ + "PlannedReparentShard", + "-keyspace_shard", fmt.Sprintf("%s/%s", keyspaceName, shardName)} + if actionTimeout != "" { + args = append(args, "-action_timeout", actionTimeout) + } + if waitTimeout != "" { + args = append(args, "-wait_replicas_timeout", waitTimeout) + } + if avoid { + args = append(args, "-avoid_master") + } else { + args = append(args, "-new_master") + } + args = append(args, 
+func prsWithTimeout(t *testing.T, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string) (string, error) {
+	args := []string{
+		"PlannedReparentShard",
+		"-keyspace_shard", fmt.Sprintf("%s/%s", keyspaceName, shardName)}
+	if actionTimeout != "" {
+		args = append(args, "-action_timeout", actionTimeout)
+	}
+	if waitTimeout != "" {
+		args = append(args, "-wait_replicas_timeout", waitTimeout)
+	}
+	if avoid {
+		args = append(args, "-avoid_master")
+	} else {
+		args = append(args, "-new_master")
+	}
+	args = append(args, tab.Alias)
+	out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...)
+	return out, err
+}
+
+func ers(t *testing.T, tab *cluster.Vttablet, timeout string) (string, error) {
+	return ersIgnoreTablet(t, tab, timeout, nil)
+}
+
+func ersIgnoreTablet(t *testing.T, tab *cluster.Vttablet, timeout string, tabToIgnore *cluster.Vttablet) (string, error) {
+	args := []string{"EmergencyReparentShard", "-keyspace_shard", fmt.Sprintf("%s/%s", keyspaceName, shardName)}
+	if tab != nil {
+		args = append(args, "-new_master", tab.Alias)
+	}
+	if timeout != "" {
+		args = append(args, "-wait_replicas_timeout", timeout)
+	}
+	if tabToIgnore != nil {
+		args = append(args, "-ignore_replicas", tabToIgnore.Alias)
+	}
+	return clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...)
+}
+
+func checkReparentFromOutside(t *testing.T, tablet *cluster.Vttablet, downMaster bool, baseTime int64) {
+	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetShardReplication", cell1, keyspaceShard)
+	require.Nil(t, err, "error should be Nil")
+	if !downMaster {
+		assertNodeCount(t, result, 3)
+	} else {
+		assertNodeCount(t, result, 2)
+	}
+
+	// make sure the master status page says it's the master
+	status := tablet.VttabletProcess.GetStatus()
+	assert.Contains(t, status, "Tablet Type: MASTER")
+
+	// make sure the master health stream says it's the master too
+	// (health check is disabled on these servers, force it first)
+	err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", tablet.Alias)
+	require.NoError(t, err)
+
+	streamHealth, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(
+		"VtTabletStreamHealth",
+		"-count", "1", tablet.Alias)
+	require.NoError(t, err)
+
+	var streamHealthResponse querypb.StreamHealthResponse
+	err = json.Unmarshal([]byte(streamHealth), &streamHealthResponse)
+	require.NoError(t, err)
+	assert.Equal(t, streamHealthResponse.Target.TabletType, topodatapb.TabletType_MASTER)
+	assert.True(t, streamHealthResponse.TabletExternallyReparentedTimestamp >= baseTime)
+}
+
+// endregion
+
+// region validations
+
+func validateTopology(t *testing.T, pingTablets bool) {
+	args := []string{"Validate"}
+
+	if pingTablets {
+		args = append(args, "-ping-tablets=true")
+	}
+	out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(args...)
+	require.Empty(t, out)
+	require.NoError(t, err)
+}
+
+func confirmReplication(t *testing.T, master *cluster.Vttablet, replicas []*cluster.Vttablet) {
+	ctx := context.Background()
+	n := 2 // an arbitrary value to insert and verify
+	// insert data into the new master and check that it reaches the connected replicas
+	insertSQL := fmt.Sprintf(insertSQL, n, n)
+	runSQL(ctx, t, insertSQL, master)
+	time.Sleep(100 * time.Millisecond)
+	for _, tab := range replicas {
+		err := checkInsertedValues(ctx, t, tab, n)
+		require.NoError(t, err)
+	}
+}
+
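+// confirmOldMasterIsHangingAround expects topology validation to fail because
+// the demoted master is still registered as a master for the shard.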
+func confirmOldMasterIsHangingAround(t *testing.T) {
+	out, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("Validate")
+	require.Error(t, err)
+	require.Contains(t, out, "already has master")
+}
+
+// waitForReplicationPosition waits for tablet B to catch up to the replication position of tablet A.
+func waitForReplicationPosition(t *testing.T, tabletA *cluster.Vttablet, tabletB *cluster.Vttablet) error {
+	posA, _ := cluster.GetMasterPosition(t, *tabletA, hostname)
+	timeout := time.Now().Add(5 * time.Second)
+	for time.Now().Before(timeout) {
+		posB, _ := cluster.GetMasterPosition(t, *tabletB, hostname)
+		if positionAtLeast(t, tabletB, posA, posB) {
+			return nil
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+	return fmt.Errorf("failed to catch up on replication position")
+}
+
+func positionAtLeast(t *testing.T, tablet *cluster.Vttablet, a string, b string) bool {
+	isAtleast := false
+	val, err := tablet.MysqlctlProcess.ExecuteCommandWithOutput("position", "at_least", a, b)
+	require.NoError(t, err)
+	if strings.Contains(val, "true") {
+		isAtleast = true
+	}
+	return isAtleast
+}
+
+func assertNodeCount(t *testing.T, result string, want int) {
+	resultMap := make(map[string]interface{})
+	err := json.Unmarshal([]byte(result), &resultMap)
+	require.NoError(t, err)
+
+	nodes := reflect.ValueOf(resultMap["nodes"])
+	got := nodes.Len()
+	assert.Equal(t, want, got)
+}
+
+func checkDBvar(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) {
+	tabletParams := getMysqlConnParam(tablet)
+	conn, err := mysql.Connect(ctx, &tabletParams)
+	require.NoError(t, err)
+	defer conn.Close()
+
+	qr := execute(t, conn, fmt.Sprintf("show variables like '%s'", variable))
+	got := fmt.Sprintf("%v", qr.Rows)
+	want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status)
+	assert.Equal(t, want, got)
+}
+
+func checkDBstatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, variable string, status string) {
+	tabletParams := getMysqlConnParam(tablet)
+	conn, err := mysql.Connect(ctx, &tabletParams)
+	require.NoError(t, err)
+	defer conn.Close()
+
+	qr := execute(t, conn, fmt.Sprintf("show status like '%s'", variable))
+	got := fmt.Sprintf("%v", qr.Rows)
+	want := fmt.Sprintf("[[VARCHAR(\"%s\") VARCHAR(\"%s\")]]", variable, status)
+	assert.Equal(t, want, got)
+}
+
+func checkReplicaStatus(ctx context.Context, t *testing.T, tablet *cluster.Vttablet) {
+	qr := runSQL(ctx, t, "show slave status", tablet)
+	IOThreadRunning := fmt.Sprintf("%v", qr.Rows[0][10])  // Slave_IO_Running
+	SQLThreadRunning := fmt.Sprintf("%v", qr.Rows[0][11]) // Slave_SQL_Running
+	assert.Equal(t, IOThreadRunning, "VARCHAR(\"No\")")
+	assert.Equal(t, SQLThreadRunning, "VARCHAR(\"No\")")
+}
+
+// checkMasterTablet makes sure the tablet type is master, and its health check agrees.
+func checkMasterTablet(t *testing.T, tablet *cluster.Vttablet) {
+	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias)
+	require.NoError(t, err)
+	var tabletInfo topodatapb.Tablet
+	err = json2.Unmarshal([]byte(result), &tabletInfo)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_MASTER, tabletInfo.GetType())
+
+	// make sure the health stream is updated
+	result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", tablet.Alias)
+	require.NoError(t, err)
+	var streamHealthResponse querypb.StreamHealthResponse
+
+	err = json2.Unmarshal([]byte(result), &streamHealthResponse)
+	require.NoError(t, err)
+
+	assert.True(t, streamHealthResponse.GetServing())
+	tabletType := streamHealthResponse.GetTarget().GetTabletType()
+	assert.Equal(t, topodatapb.TabletType_MASTER, tabletType)
+}
+
+// isHealthyMasterTablet reports whether the tablet is the master AND is healthy.
+func isHealthyMasterTablet(t *testing.T, tablet *cluster.Vttablet) bool {
+	result, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", tablet.Alias)
+	require.Nil(t, err)
+	var tabletInfo topodatapb.Tablet
+	err = json2.Unmarshal([]byte(result), &tabletInfo)
+	require.Nil(t, err)
+	if tabletInfo.GetType() != topodatapb.TabletType_MASTER {
+		return false
+	}
+
+	// make sure the health stream is updated
+	result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", tablet.Alias)
+	require.Nil(t, err)
+	var streamHealthResponse querypb.StreamHealthResponse
+
+	err = json2.Unmarshal([]byte(result), &streamHealthResponse)
+	require.Nil(t, err)
+
+	assert.True(t, streamHealthResponse.GetServing())
+	tabletType := streamHealthResponse.GetTarget().GetTabletType()
+	return tabletType == topodatapb.TabletType_MASTER
+}
+
+func checkInsertedValues(ctx context.Context, t *testing.T, tablet *cluster.Vttablet, index int) error {
+	// wait until the tablet gets the data, backing off a little more on each retry
+	timeout := time.Now().Add(5 * time.Second)
+	i := 0
+	for time.Now().Before(timeout) {
+		selectSQL := fmt.Sprintf("select msg from vt_insert_test where id=%d", index)
+		qr := runSQL(ctx, t, selectSQL, tablet)
+		if len(qr.Rows) == 1 {
+			return nil
+		}
+		time.Sleep(time.Duration(300*i) * time.Millisecond)
+		i++
+	}
+	return fmt.Errorf("data is not yet replicated on tablet %s", tablet.Alias)
+}
+
+// endregion
+
+// region tablet operations
+
+func stopTablet(t *testing.T, tab *cluster.Vttablet, stopDatabase bool) {
+	err := tab.VttabletProcess.TearDown()
+	require.NoError(t, err)
+	if stopDatabase {
+		err = tab.MysqlctlProcess.Stop()
+		require.NoError(t, err)
+	}
+}
+
+func restartTablet(t *testing.T, tab *cluster.Vttablet) {
+	tab.MysqlctlProcess.InitMysql = false
+	err := tab.MysqlctlProcess.Start()
+	require.NoError(t, err)
+	err = clusterInstance.VtctlclientProcess.InitTablet(tab, tab.Cell, keyspaceName, hostname, shardName)
+	require.NoError(t, err)
+}
+
+func resurrectTablet(ctx context.Context, t *testing.T, tab *cluster.Vttablet) {
+	tab.MysqlctlProcess.InitMysql = false
+	err := tab.MysqlctlProcess.Start()
+	require.NoError(t, err)
+	err = clusterInstance.VtctlclientProcess.InitTablet(tab, tab.Cell, keyspaceName, hostname, shardName)
+	require.NoError(t, err)
+
+	// As there is already a master, the new replica will come up directly in SERVING state
+	tab.VttabletProcess.ServingStatus = "SERVING"
+	// Start the tablet
+	err = tab.VttabletProcess.Setup()
+	require.NoError(t, err)
+
+	err = checkInsertedValues(ctx, t, tab, 2)
+	require.NoError(t, err)
+}
+
+func deleteTablet(t *testing.T, tab *cluster.Vttablet) {
+	err := clusterInstance.VtctlclientProcess.ExecuteCommand(
+		"DeleteTablet",
+		"-allow_master",
+		tab.Alias)
+	require.NoError(t, err)
+}
+
+// endregion
+
+// region get info
+
+func getNewMaster(t *testing.T) *cluster.Vttablet {
+	var newMaster *cluster.Vttablet
+	for _, tablet := range []*cluster.Vttablet{tab2, tab3, tab4} {
+		if isHealthyMasterTablet(t, tablet) {
+			newMaster = tablet
+			break
+		}
+	}
+	require.NotNil(t, newMaster)
+	return newMaster
+}
+
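+// getShardReplicationPositions returns the output of ShardReplicationPositions
+// split into one entry per line, optionally logging the positions.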
+func getShardReplicationPositions(t *testing.T, keyspaceName, shardName string, doPrint bool) []string {
+	output, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput(
+		"ShardReplicationPositions", fmt.Sprintf("%s/%s", keyspaceName, shardName))
+	require.NoError(t, err)
+	strArray := strings.Split(output, "\n")
+	if strArray[len(strArray)-1] == "" {
+		strArray = strArray[:len(strArray)-1] // Truncate slice, remove empty line
+	}
+	if doPrint {
+		log.Infof("Positions:")
+		for _, pos := range strArray {
+			log.Infof("\t%s", pos)
+		}
+	}
+	return strArray
+}
+
+// endregion

From c12e632ad2e0145b7a7b4e98a4bc2fcab5148ce7 Mon Sep 17 00:00:00 2001
From: Rohit Nayak
Date: Wed, 16 Sep 2020 08:25:15 +0200
Subject: [PATCH 2/5] Reparent tests: remove extra setup

Signed-off-by: Rohit Nayak
---
 go/test/endtoend/reparent/reparent_test.go | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/go/test/endtoend/reparent/reparent_test.go b/go/test/endtoend/reparent/reparent_test.go
index 6a703caa845..e7dc254ca7c 100644
--- a/go/test/endtoend/reparent/reparent_test.go
+++ b/go/test/endtoend/reparent/reparent_test.go
@@ -272,10 +272,6 @@ func reparentFromOutside(t *testing.T, downMaster bool) {
 	//- one replica will be busted and dead in the water and we'll call TabletExternallyReparented.
 	//Args:
 	//downMaster: kills the old master first
-	defer cluster.PanicHandler(t)
-	setupReparentCluster(t)
-	defer teardownCluster()
-
 	ctx := context.Background()
 
 	// now manually reparent 1 out of 2 tablets

From a1e810e3407a806e65e245bb12c73659a375696d Mon Sep 17 00:00:00 2001
From: Rohit Nayak
Date: Thu, 17 Sep 2020 01:01:54 +0200
Subject: [PATCH 3/5] address review comment

Signed-off-by: Rohit Nayak
---
 go/test/endtoend/reparent/utils_test.go | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/go/test/endtoend/reparent/utils_test.go b/go/test/endtoend/reparent/utils_test.go
index d6fde53c0ee..5204654dffa 100644
--- a/go/test/endtoend/reparent/utils_test.go
+++ b/go/test/endtoend/reparent/utils_test.go
@@ -143,9 +143,6 @@ func setupCluster(ctx context.Context, t *testing.T, shardName string, cells []s
 		}
 	}
 
-	// We do not need semiSync for this test case.
-	clusterInstance.EnableSemiSync = false
-
 	// create tablet manager client
 	tmClient = tmc.NewClient()
 	setupShard(ctx, t, shardName, tablets)

From c290009378b447cdc35d909c719833e1f020e28d Mon Sep 17 00:00:00 2001
From: Rohit Nayak
Date: Thu, 17 Sep 2020 21:17:34 +0200
Subject: [PATCH 4/5] Testing if conflict can be fixed by adding main_test.go back

Signed-off-by: Rohit Nayak
---
 go/test/endtoend/reparent/main_test.go | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 go/test/endtoend/reparent/main_test.go

diff --git a/go/test/endtoend/reparent/main_test.go b/go/test/endtoend/reparent/main_test.go
new file mode 100644
index 00000000000..f423d4a546f
--- /dev/null
+++ b/go/test/endtoend/reparent/main_test.go
@@ -0,0 +1 @@
+package reparent

From ebec8e4a284eb4a2fe39e3af97c929a95a735372 Mon Sep 17 00:00:00 2001
From: Rohit Nayak
Date: Thu, 17 Sep 2020 21:36:55 +0200
Subject: [PATCH 5/5] Delete main_test.go again

Signed-off-by: Rohit Nayak
---
 go/test/endtoend/reparent/main_test.go | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 go/test/endtoend/reparent/main_test.go

diff --git a/go/test/endtoend/reparent/main_test.go b/go/test/endtoend/reparent/main_test.go
deleted file mode 100644
index f423d4a546f..00000000000
--- a/go/test/endtoend/reparent/main_test.go
+++ /dev/null
@@ -1 +0,0 @@
-package reparent