-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add flag to wait for backup instead of starting up empty. #4929
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -19,6 +19,7 @@ package tabletmanager | |||||
import ( | ||||||
"flag" | ||||||
"fmt" | ||||||
"time" | ||||||
|
||||||
"vitess.io/vitess/go/vt/vterrors" | ||||||
|
||||||
|
@@ -37,26 +38,27 @@ import ( | |||||
// It is only enabled if restore_from_backup is set. | ||||||
|
||||||
var ( | ||||||
restoreFromBackup = flag.Bool("restore_from_backup", false, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there") | ||||||
restoreConcurrency = flag.Int("restore_concurrency", 4, "(init restore parameter) how many concurrent files to restore at once") | ||||||
restoreFromBackup = flag.Bool("restore_from_backup", false, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there") | ||||||
restoreConcurrency = flag.Int("restore_concurrency", 4, "(init restore parameter) how many concurrent files to restore at once") | ||||||
waitForBackupInterval = flag.Duration("wait_for_backup_interval", 0, "(init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
if we want to keep with the bad name |
||||||
) | ||||||
|
||||||
// RestoreData is the main entry point for backup restore. | ||||||
// It will either work, fail gracefully, or return | ||||||
// an error in case of a non-recoverable error. | ||||||
// It takes the action lock so no RPC interferes. | ||||||
func (agent *ActionAgent) RestoreData(ctx context.Context, logger logutil.Logger, deleteBeforeRestore bool) error { | ||||||
func (agent *ActionAgent) RestoreData(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error { | ||||||
if err := agent.lock(ctx); err != nil { | ||||||
return err | ||||||
} | ||||||
defer agent.unlock() | ||||||
if agent.Cnf == nil { | ||||||
return fmt.Errorf("cannot perform restore without my.cnf, please restart vttablet with a my.cnf file specified") | ||||||
} | ||||||
return agent.restoreDataLocked(ctx, logger, deleteBeforeRestore) | ||||||
return agent.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore) | ||||||
} | ||||||
|
||||||
func (agent *ActionAgent) restoreDataLocked(ctx context.Context, logger logutil.Logger, deleteBeforeRestore bool) error { | ||||||
func (agent *ActionAgent) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error { | ||||||
// change type to RESTORE (using UpdateTabletFields so it's | ||||||
// always authorized) | ||||||
var originalType topodatapb.TabletType | ||||||
|
@@ -80,7 +82,28 @@ func (agent *ActionAgent) restoreDataLocked(ctx context.Context, logger logutil. | |||||
localMetadata := agent.getLocalMetadataValues(originalType) | ||||||
tablet := agent.Tablet() | ||||||
dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard) | ||||||
pos, err := mysqlctl.Restore(ctx, agent.Cnf, agent.MysqlDaemon, dir, *restoreConcurrency, agent.hookExtraEnv(), localMetadata, logger, deleteBeforeRestore, topoproto.TabletDbName(tablet)) | ||||||
|
||||||
// Loop until a backup exists, unless we were told to give up immediately. | ||||||
var pos mysql.Position | ||||||
var err error | ||||||
for { | ||||||
pos, err = mysqlctl.Restore(ctx, agent.Cnf, agent.MysqlDaemon, dir, *restoreConcurrency, agent.hookExtraEnv(), localMetadata, logger, deleteBeforeRestore, topoproto.TabletDbName(tablet)) | ||||||
if waitForBackupInterval == 0 { | ||||||
break | ||||||
} | ||||||
// We only retry a specific set of errors. The rest we return immediately. | ||||||
if err != mysqlctl.ErrNoBackup && err != mysqlctl.ErrNoCompleteBackup { | ||||||
break | ||||||
} | ||||||
|
||||||
log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
select { | ||||||
case <-ctx.Done(): | ||||||
return ctx.Err() | ||||||
case <-time.After(waitForBackupInterval): | ||||||
} | ||||||
} | ||||||
|
||||||
switch err { | ||||||
case nil: | ||||||
// Starting from here we won't be able to recover if we get stopped by a cancelled | ||||||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -131,7 +131,7 @@ func (agent *ActionAgent) RestoreFromBackup(ctx context.Context, logger logutil. | |||||||
l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) | ||||||||
|
||||||||
// now we can run restore | ||||||||
err = agent.restoreDataLocked(ctx, l, true /* deleteBeforeRestore */) | ||||||||
err = agent.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */) | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
// re-run health check to be sure to capture any replication delay | ||||||||
agent.runHealthCheckLocked() | ||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The name "retry" is not accurate here. The current behavior of
-restore_from_backup
is that "no backup exists; starting up empty" is considered a normal, non-error result. It doesn't make sense that adding a retry interval will change this semantic, because retries only apply to errors. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
but you're retrying until success. that's generally what retries are for. this is changing the behavior of restore_from_backup, which only is a single time special case, and happens to startup from empty.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The behavior I'm adding is not "retrying until success". If the backup doesn't exist, "success" is currently defined as "start up empty". There is no need nor expectation to retry anything. The thing you tried (
-restore_from_backup
, which implicitly means "restore from backup if there is one, or else start up empty") succeeded. I need a name that indicates I'm redefining what success means, not just adding a retry until the existing definition of success is met.
I don't follow what this means. Can you reword it if I've missed the point?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i must misunderstand the code, but it seems to look for backups until it finds one. what else does init_from_backup_retry_interval mean? what other success is there? if it can't find a backup, it keeps retrying right? it doesn't start up empty anymore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you look at the new behavior in isolation, the word "retry" makes sense. However, if you take into account the context of how the previous behavior worked -- what users have had to understand since
-restore_from_backup
was first introduced -- then the word "retry" is inaccurate and misleading. In other words, we can't call this "retry" because of legacy. I'm open to other names that are better than
wait_for_backup
, but not if the name actively misleads users.