Skip to content

Commit

Permalink
Merge pull request #5254 from planetscale/ds-fix-restore-crashloop
Browse files Browse the repository at this point in the history
Don't abort restore if master is unreachable
  • Loading branch information
deepthi authored Oct 1, 2019
2 parents d245883 + 1e1ef87 commit 169331a
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 2 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ require (
github.com/golang/mock v1.3.1
github.com/golang/protobuf v1.3.2
github.com/golang/snappy v0.0.0-20170215233205-553a64147049
github.com/google/btree v1.0.0 // indirect
github.com/gorilla/websocket v0.0.0-20160912153041-2d1e4548da23
github.com/grpc-ecosystem/go-grpc-middleware v1.1.0
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
Expand Down
4 changes: 4 additions & 0 deletions go/vt/mysqlctl/builtinbackupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,9 @@ func (be *BuiltinBackupEngine) ExecuteBackup(ctx context.Context, params BackupP
}
if !replicationPosition.Equal(masterPos) {
for {
if err := ctx.Err(); err != nil {
return usable, err
}
status, err := mysqld.SlaveStatus()
if err != nil {
return usable, err
Expand All @@ -375,6 +378,7 @@ func (be *BuiltinBackupEngine) ExecuteBackup(ctx context.Context, params BackupP
if !newPos.Equal(replicationPosition) {
break
}
time.Sleep(1 * time.Second)
}
}
}
Expand Down
11 changes: 10 additions & 1 deletion go/vt/vttablet/tabletmanager/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,12 @@ func (agent *ActionAgent) startReplication(ctx context.Context, pos mysql.Positi
defer remoteCancel()
posStr, err := tmc.MasterPosition(remoteCtx, ti.Tablet)
if err != nil {
return vterrors.Wrap(err, "can't get master replication position")
// It is possible that though MasterAlias is set, the master tablet is unreachable
// Log a warning and let tablet restore in that case
// If we had instead considered this fatal, all tablets would crash-loop
// until a master appears, which would make it impossible to elect a master.
log.Warningf("Can't get master replication position after restore: %v", err)
return nil
}
masterPos, err := mysql.DecodePosition(posStr)
if err != nil {
Expand All @@ -241,6 +246,9 @@ func (agent *ActionAgent) startReplication(ctx context.Context, pos mysql.Positi

if !pos.Equal(masterPos) {
for {
if err := ctx.Err(); err != nil {
return err
}
status, err := agent.MysqlDaemon.SlaveStatus()
if err != nil {
return vterrors.Wrap(err, "can't get slave status")
Expand All @@ -249,6 +257,7 @@ func (agent *ActionAgent) startReplication(ctx context.Context, pos mysql.Positi
if !newPos.Equal(pos) {
break
}
time.Sleep(1 * time.Second)
}
}

Expand Down
166 changes: 165 additions & 1 deletion go/vt/wrangler/testlib/backup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func TestBackupRestore(t *testing.T) {
},
}

// start master so that slave can fetch master position from it
// start master so that replica can fetch master position from it
master.StartActionLoop(t, wr)
defer master.StopActionLoop(t)

Expand Down Expand Up @@ -210,3 +210,167 @@ func TestBackupRestore(t *testing.T) {
}

}

// TestRestoreUnreachableMaster verifies that restoring a replica from backup
// still succeeds when the master tablet cannot be reached.
//
// The test takes a backup from a source replica while the master's action loop
// is running, then stops the master's action loop and calls RestoreData on a
// fresh destination replica. RestoreData is expected to return no error, and
// the destination's fake mysqld is expected to have executed the full list of
// replication setup queries and to end up running and replicating.
func TestRestoreUnreachableMaster(t *testing.T) {
	// Initialize our environment
	ctx := context.Background()
	db := fakesqldb.New(t)
	defer db.Close()
	ts := memorytopo.NewServer("cell1")
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient())
	vp := NewVtctlPipe(t, ts)
	defer vp.Close()

	// Set up mock query results.
	db.AddQuery("CREATE DATABASE IF NOT EXISTS _vt", &sqltypes.Result{})
	db.AddQuery("BEGIN", &sqltypes.Result{})
	db.AddQuery("COMMIT", &sqltypes.Result{})
	db.AddQueryPattern(`SET @@session\.sql_log_bin = .*`, &sqltypes.Result{})
	db.AddQueryPattern(`CREATE TABLE IF NOT EXISTS _vt\.shard_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`CREATE TABLE IF NOT EXISTS _vt\.local_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`ALTER TABLE _vt\.local_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`ALTER TABLE _vt\.shard_metadata .*`, &sqltypes.Result{})
	db.AddQueryPattern(`UPDATE _vt\.local_metadata SET db_name=.*`, &sqltypes.Result{})
	db.AddQueryPattern(`UPDATE _vt\.shard_metadata SET db_name=.*`, &sqltypes.Result{})
	db.AddQueryPattern(`INSERT INTO _vt\.local_metadata .*`, &sqltypes.Result{})

	// Initialize our temp dirs
	root, err := ioutil.TempDir("", "backuptest")
	if err != nil {
		t.Fatalf("ioutil.TempDir failed: %v", err)
	}
	defer os.RemoveAll(root)

	// Initialize BackupStorage
	fbsRoot := path.Join(root, "fbs")
	*filebackupstorage.FileBackupStorageRoot = fbsRoot
	*backupstorage.BackupStorageImplementation = "file"

	// Initialize the fake mysql root directories
	sourceInnodbDataDir := path.Join(root, "source_innodb_data")
	sourceInnodbLogDir := path.Join(root, "source_innodb_log")
	sourceDataDir := path.Join(root, "source_data")
	sourceDataDbDir := path.Join(sourceDataDir, "vt_db")
	for _, s := range []string{sourceInnodbDataDir, sourceInnodbLogDir, sourceDataDbDir} {
		if err := os.MkdirAll(s, os.ModePerm); err != nil {
			t.Fatalf("failed to create directory %v: %v", s, err)
		}
	}
	if err := ioutil.WriteFile(path.Join(sourceInnodbDataDir, "innodb_data_1"), []byte("innodb data 1 contents"), os.ModePerm); err != nil {
		t.Fatalf("failed to write file innodb_data_1: %v", err)
	}
	if err := ioutil.WriteFile(path.Join(sourceInnodbLogDir, "innodb_log_1"), []byte("innodb log 1 contents"), os.ModePerm); err != nil {
		t.Fatalf("failed to write file innodb_log_1: %v", err)
	}
	if err := ioutil.WriteFile(path.Join(sourceDataDbDir, "db.opt"), []byte("db opt file"), os.ModePerm); err != nil {
		t.Fatalf("failed to write file db.opt: %v", err)
	}

	// create a master tablet, set its master position
	master := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_MASTER, db)
	master.FakeMysqlDaemon.ReadOnly = false
	master.FakeMysqlDaemon.Replicating = false
	master.FakeMysqlDaemon.CurrentMasterPosition = mysql.Position{
		GTIDSet: mysql.MariadbGTIDSet{
			mysql.MariadbGTID{
				Domain:   2,
				Server:   123,
				Sequence: 457,
			},
		},
	}

	// start master so that replica can fetch master position from it
	master.StartActionLoop(t, wr)
	// The master is stopped explicitly further down, right before the restore.
	// If the test aborts before reaching that point (t.Fatalf still runs
	// deferred calls), make sure the action loop is not left running. The flag
	// prevents stopping it a second time on the normal path.
	masterRunning := true
	defer func() {
		if masterRunning {
			master.StopActionLoop(t)
		}
	}()

	// create a single tablet, set it up so we can do backups
	// set its position same as that of master so that backup doesn't wait for catchup
	sourceTablet := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_REPLICA, db)
	sourceTablet.FakeMysqlDaemon.ReadOnly = true
	sourceTablet.FakeMysqlDaemon.Replicating = true
	sourceTablet.FakeMysqlDaemon.CurrentMasterPosition = mysql.Position{
		GTIDSet: mysql.MariadbGTIDSet{
			mysql.MariadbGTID{
				Domain:   2,
				Server:   123,
				Sequence: 457,
			},
		},
	}
	sourceTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"STOP SLAVE",
		"START SLAVE",
	}
	sourceTablet.StartActionLoop(t, wr)
	defer sourceTablet.StopActionLoop(t)

	sourceTablet.Agent.Cnf = &mysqlctl.Mycnf{
		DataDir:               sourceDataDir,
		InnodbDataHomeDir:     sourceInnodbDataDir,
		InnodbLogGroupHomeDir: sourceInnodbLogDir,
	}

	// run the backup
	if err := vp.Run([]string{"Backup", topoproto.TabletAliasString(sourceTablet.Tablet.Alias)}); err != nil {
		t.Fatalf("Backup failed: %v", err)
	}

	// create a destination tablet, set it up so we can do restores
	destTablet := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_REPLICA, db)
	destTablet.FakeMysqlDaemon.ReadOnly = true
	destTablet.FakeMysqlDaemon.Replicating = true
	destTablet.FakeMysqlDaemon.CurrentMasterPosition = mysql.Position{
		GTIDSet: mysql.MariadbGTIDSet{
			mysql.MariadbGTID{
				Domain:   2,
				Server:   123,
				Sequence: 457,
			},
		},
	}
	destTablet.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"STOP SLAVE",
		"RESET SLAVE ALL",
		"FAKE SET SLAVE POSITION",
		"FAKE SET MASTER",
		"START SLAVE",
	}
	destTablet.FakeMysqlDaemon.FetchSuperQueryMap = map[string]*sqltypes.Result{
		"SHOW DATABASES": {},
	}
	destTablet.FakeMysqlDaemon.SetSlavePositionPos = sourceTablet.FakeMysqlDaemon.CurrentMasterPosition
	destTablet.FakeMysqlDaemon.SetMasterInput = topoproto.MysqlAddr(master.Tablet)

	destTablet.StartActionLoop(t, wr)
	defer destTablet.StopActionLoop(t)

	// The destination reuses the source directories and additionally sets the
	// binlog and relay log paths under the temp root.
	destTablet.Agent.Cnf = &mysqlctl.Mycnf{
		DataDir:               sourceDataDir,
		InnodbDataHomeDir:     sourceInnodbDataDir,
		InnodbLogGroupHomeDir: sourceInnodbLogDir,
		BinLogPath:            path.Join(root, "bin-logs/filename_prefix"),
		RelayLogPath:          path.Join(root, "relay-logs/filename_prefix"),
		RelayLogIndexPath:     path.Join(root, "relay-log.index"),
		RelayLogInfoPath:      path.Join(root, "relay-log.info"),
	}

	// stop master so that it is unreachable
	master.StopActionLoop(t)
	masterRunning = false

	// Restore should still succeed
	if err := destTablet.Agent.RestoreData(ctx, logutil.NewConsoleLogger(), 0 /* waitForBackupInterval */, false /* deleteBeforeRestore */); err != nil {
		t.Fatalf("RestoreData failed: %v", err)
	}

	// verify the full status
	if err := destTablet.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
		t.Errorf("destTablet.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
	}
	if !destTablet.FakeMysqlDaemon.Replicating {
		t.Errorf("destTablet.FakeMysqlDaemon.Replicating not set")
	}
	if !destTablet.FakeMysqlDaemon.Running {
		t.Errorf("destTablet.FakeMysqlDaemon.Running not set")
	}
}

0 comments on commit 169331a

Please sign in to comment.