From 200454971a8904f8f57d12ff26cb46c4c86f1674 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Sat, 19 Oct 2024 00:02:50 +0200 Subject: [PATCH 1/6] `slack-19.0`: add flag to control `vtorc` recoveries Signed-off-by: Tim Vaillancourt --- go/flags/endtoend/vtorc.txt | 1 + go/vt/vtorc/config/config.go | 4 ++++ go/vt/vtorc/logic/vtorc.go | 3 +++ 3 files changed, 8 insertions(+) diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index 2bc5916455c..8b2646d27da 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -17,6 +17,7 @@ vtorc \ Flags: --allow-emergency-reparent Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true) + --allow-recovery Allow recovery actions (default true) --alsologtostderr log to standard error as well as files --audit-file-location string File location where the audit logs are to be stored --audit-purge-duration duration Duration for which audit logs are held before being purged. Should be in multiples of days (default 168h0m0s) diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index ba3c41ddc61..e8286c1a40a 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -56,6 +56,7 @@ var ( auditToBackend = false auditToSyslog = false auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days + allowRecovery = true recoveryPeriodBlockDuration = 30 * time.Second preventCrossCellFailover = false waitReplicasTimeout = 30 * time.Second @@ -76,6 +77,7 @@ func RegisterFlags(fs *pflag.FlagSet) { fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") + fs.BoolVar(&allowRecovery, "allow-recovery", allowRecovery, "Allow recovery actions") fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery") fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") @@ -104,6 +106,7 @@ type Configuration struct { WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. + AllowRecovery bool // Allow recoveries. RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs } @@ -134,6 +137,7 @@ func UpdateConfigValuesFromFlags() { Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) + Config.AllowRecovery = allowRecovery Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) } diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index f637956fbfd..abaeebd2afd 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -385,6 +385,9 @@ func ContinuousDiscovery() { } }() case <-recoveryTick: + if !config.Config.AllowRecovery { + continue + } go func() { if IsLeaderOrActive() { go ClearActiveFailureDetections() From 7794425aa6fd94fcaa0cc84b6112cbd5d58e80ae Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 22 Oct 2024 21:41:06 +0200 Subject: [PATCH 2/6] test fixes Signed-off-by: Tim Vaillancourt --- go/test/endtoend/vtorc/readtopologyinstance/main_test.go | 1 + go/vt/vtorc/config/config.go | 1 + 2 files changed, 2 insertions(+) diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index e3b55d64c6b..f79caeae08a 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -57,6 +57,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { "--topo_global_root", clusterInfo.ClusterInstance.VtctlProcess.TopoGlobalRoot, } servenv.ParseFlags("vtorc") + config.Config.AllowRecovery = true config.Config.RecoveryPeriodBlockSeconds = 1 config.Config.InstancePollSeconds = 1 config.MarkConfigurationLoaded() diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index e8286c1a40a..0fec8e8f521 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -176,6 +176,7 @@ func newConfiguration() *Configuration { AuditLogFile: "", AuditToSyslog: false, AuditToBackendDB: false, + AllowRecovery: true, AuditPurgeDays: 7, RecoveryPeriodBlockSeconds: 30, PreventCrossDataCenterPrimaryFailover: false, From 925f82e77706a3f0eeb1ccf35db371b7f571d3d2 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 22 Oct 2024 22:57:43 +0200 Subject: [PATCH 3/6] consider allowRecovery in ERSEnabled Signed-off-by: Tim Vaillancourt --- go/vt/vtorc/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 0fec8e8f521..8b36341d318 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -143,7 +143,7 @@ func UpdateConfigValuesFromFlags() { // ERSEnabled reports whether VTOrc is allowed to run ERS or not. func ERSEnabled() bool { - return ersEnabled + return allowRecovery && ersEnabled } // SetERSEnabled sets the value for the ersEnabled variable. This should only be used from tests. From 977d1b9e27d34a0a0c35ff24a2b397f37e4a4d24 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Wed, 23 Oct 2024 00:21:10 +0200 Subject: [PATCH 4/6] make recover ticker `*time.Ticker` and stop it Signed-off-by: Tim Vaillancourt --- go/vt/vtorc/logic/vtorc.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index abaeebd2afd..b661730be14 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -343,7 +343,6 @@ func ContinuousDiscovery() { healthTick := time.Tick(config.HealthPollSeconds * time.Second) caretakingTick := time.Tick(time.Minute) - recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) tabletTopoTick := OpenTabletDiscovery() var recoveryEntrance int64 var snapshotTopologiesTick <-chan time.Time @@ -351,6 +350,11 @@ func ContinuousDiscovery() { snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) } + recoveryTicker := time.NewTicker(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) + if config.Config.AllowRecovery { + recoveryTicker.Stop() + } + runCheckAndRecoverOperationsTimeRipe := func() bool { return time.Since(continuousDiscoveryStartTime) >= checkAndRecoverWaitPeriod } @@ -384,10 +388,7 @@ func ContinuousDiscovery() { go ExpireTopologyRecoveryStepsHistory() } }() - case <-recoveryTick: - if !config.Config.AllowRecovery { - continue - } + case <-recoveryTicker.C: go func() { if IsLeaderOrActive() { go ClearActiveFailureDetections() From 502b4aa69dd70b517190a4d9ad6ce372881377d8 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Wed, 23 Oct 2024 00:22:31 +0200 Subject: [PATCH 5/6] make recover ticker `*time.Ticker` and stop it, pt 2 Signed-off-by: Tim Vaillancourt --- go/vt/vtorc/logic/vtorc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index b661730be14..5715ada4fca 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -351,7 +351,7 @@ func ContinuousDiscovery() { } recoveryTicker := time.NewTicker(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) - if config.Config.AllowRecovery { + if !config.Config.AllowRecovery { recoveryTicker.Stop() } From 7d65980d74965b4c844e4d70a5643cabd00881d2 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Wed, 23 Oct 2024 17:58:33 +0200 Subject: [PATCH 6/6] use ticker Signed-off-by: Tim Vaillancourt --- go/vt/vtorc/logic/vtorc.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index 5715ada4fca..9cbc46eac68 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -351,6 +351,7 @@ func ContinuousDiscovery() { } recoveryTicker := time.NewTicker(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) + defer recoveryTicker.Stop() if !config.Config.AllowRecovery { recoveryTicker.Stop() }