Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

slack-19.0: add flag to control vtorc recoveries #538

Draft
wants to merge 12 commits into
base: slack-19.0
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go/flags/endtoend/vtorc.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ vtorc \

Flags:
--allow-emergency-reparent Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true)
--allow-recovery Allow recovery actions (default true)
--alsologtostderr log to standard error as well as files
--audit-file-location string File location where the audit logs are to be stored
--audit-purge-duration duration Duration for which audit logs are held before being purged. Should be in multiples of days (default 168h0m0s)
Expand Down
1 change: 1 addition & 0 deletions go/test/endtoend/vtorc/readtopologyinstance/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
"--topo_global_root", clusterInfo.ClusterInstance.VtctlProcess.TopoGlobalRoot,
}
servenv.ParseFlags("vtorc")
config.Config.AllowRecovery = true
config.Config.RecoveryPeriodBlockSeconds = 1
config.Config.InstancePollSeconds = 1
config.MarkConfigurationLoaded()
Expand Down
7 changes: 6 additions & 1 deletion go/vt/vtorc/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ var (
auditToBackend = false
auditToSyslog = false
auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days
allowRecovery = true
recoveryPeriodBlockDuration = 30 * time.Second
preventCrossCellFailover = false
waitReplicasTimeout = 30 * time.Second
Expand All @@ -76,6 +77,7 @@ func RegisterFlags(fs *pflag.FlagSet) {
fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database")
fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog")
fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days")
fs.BoolVar(&allowRecovery, "allow-recovery", allowRecovery, "Allow recovery actions")
fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery")
fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover")
fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs")
Expand Down Expand Up @@ -104,6 +106,7 @@ type Configuration struct {
WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS.
TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS.
TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server.
AllowRecovery bool // Allow recoveries.
RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs
}

Expand Down Expand Up @@ -134,12 +137,13 @@ func UpdateConfigValuesFromFlags() {
Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second)
Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second)
Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second)
Config.AllowRecovery = allowRecovery
Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second)
}

// ERSEnabled reports whether VTOrc is allowed to run ERS or not.
func ERSEnabled() bool {
return ersEnabled
return allowRecovery && ersEnabled
}

// SetERSEnabled sets the value for the ersEnabled variable. This should only be used from tests.
Expand Down Expand Up @@ -172,6 +176,7 @@ func newConfiguration() *Configuration {
AuditLogFile: "",
AuditToSyslog: false,
AuditToBackendDB: false,
AllowRecovery: true,
AuditPurgeDays: 7,
RecoveryPeriodBlockSeconds: 30,
PreventCrossDataCenterPrimaryFailover: false,
Expand Down
9 changes: 7 additions & 2 deletions go/vt/vtorc/logic/vtorc.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,14 +335,19 @@ func ContinuousDiscovery() {

healthTick := time.Tick(config.HealthPollSeconds * time.Second)
caretakingTick := time.Tick(time.Minute)
recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second)
tabletTopoTick := OpenTabletDiscovery()
var recoveryEntrance int64
var snapshotTopologiesTick <-chan time.Time
if config.Config.SnapshotTopologiesIntervalHours > 0 {
snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour)
}

recoveryTicker := time.NewTicker(time.Duration(config.Config.RecoveryPollSeconds) * time.Second)
defer recoveryTicker.Stop()
if !config.Config.AllowRecovery {
recoveryTicker.Stop()
}

runCheckAndRecoverOperationsTimeRipe := func() bool {
return time.Since(continuousDiscoveryStartTime) >= checkAndRecoverWaitPeriod
}
Expand Down Expand Up @@ -376,7 +381,7 @@ func ContinuousDiscovery() {
go ExpireTopologyRecoveryStepsHistory()
}
}()
case <-recoveryTick:
case <-recoveryTicker.C:
go func() {
if IsLeaderOrActive() {
go ClearActiveFailureDetections()
Expand Down
Loading