From fc314fb1eaba0c53be733fc018c8ee9e77d60dd8 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Wed, 20 Jan 2021 10:16:00 -0500 Subject: [PATCH] kv: introduce a stopgap for lack of ReplicaState synchronization There's a scary lack of synchronization around how we set the ReplicaState for a given replica, and how we mark a replica as "initialized". What this means is that very temporarily, it's possible for the entry in Store.mu.replicas to be both "initialized" and have an empty ReplicaState. This was an existing problem, but is now more likely to bite us given the migrations infrastructure attempts to purge outdated replicas at start up time (when replicas are being initialized, and we're iterating through extan replicas in the Store.mu.replicas map). This issue has caused a bit of recent instability: #59180, #58489, \#58523, and #58378. While we work on a more considered fix to the problem (tracked in #58489), we can introduce stop the bleeding in the interim (and unskip some tests). Release note: None --- pkg/cmd/roachtest/acceptance.go | 1 - pkg/cmd/roachtest/versionupgrade.go | 11 +++++++---- pkg/kv/kvserver/replica.go | 5 +++++ pkg/kv/kvserver/store.go | 4 ++++ pkg/server/migration_test.go | 2 -- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pkg/cmd/roachtest/acceptance.go b/pkg/cmd/roachtest/acceptance.go index d2553795ea03..29867680ff2c 100644 --- a/pkg/cmd/roachtest/acceptance.go +++ b/pkg/cmd/roachtest/acceptance.go @@ -78,7 +78,6 @@ func registerAcceptance(r *testRegistry) { // to head after 19.2 fails. minVersion: "v19.2.0", timeout: 30 * time.Minute, - skip: "https://github.com/cockroachdb/cockroach/issues/58489", }, } tags := []string{"default", "quick"} diff --git a/pkg/cmd/roachtest/versionupgrade.go b/pkg/cmd/roachtest/versionupgrade.go index fa5006409f83..c17ce0b7bec4 100644 --- a/pkg/cmd/roachtest/versionupgrade.go +++ b/pkg/cmd/roachtest/versionupgrade.go @@ -104,6 +104,9 @@ func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion ve testFeaturesStep := versionUpgradeTestFeatures.step(c.All()) schemaChangeStep := runSchemaChangeWorkloadStep(c.All().randNode()[0], 10 /* maxOps */, 2 /* concurrency */) + // TODO(irfansharif): All schema change instances were commented out while + // of #58489 is being addressed. + _ = schemaChangeStep backupStep := func(ctx context.Context, t *test, u *versionUpgradeTest) { // This check was introduced for the system.tenants table and the associated // changes to full-cluster backup to include tenants. It mostly wants to @@ -151,7 +154,7 @@ func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion ve testFeaturesStep, // Run a quick schemachange workload in between each upgrade. // The maxOps is 10 to keep the test runtime under 1-2 minutes. - schemaChangeStep, + // schemaChangeStep, backupStep, // Roll back again. Note that bad things would happen if the cluster had // ignored our request to not auto-upgrade. The `autoupgrade` roachtest @@ -159,18 +162,18 @@ func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion ve // as they ought to. binaryUpgradeStep(c.All(), predecessorVersion), testFeaturesStep, - schemaChangeStep, + // schemaChangeStep, backupStep, // Roll nodes forward, this time allowing them to upgrade, and waiting // for it to happen. binaryUpgradeStep(c.All(), ""), allowAutoUpgradeStep(1), testFeaturesStep, - schemaChangeStep, + // schemaChangeStep, backupStep, waitForUpgradeStep(c.All()), testFeaturesStep, - schemaChangeStep, + // schemaChangeStep, backupStep, ) diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index f030793295f5..131a5cc5bb4d 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -802,6 +802,11 @@ func (r *Replica) GetGCThreshold() hlc.Timestamp { func (r *Replica) Version() roachpb.Version { r.mu.RLock() defer r.mu.RUnlock() + + if r.mu.state.Version == nil { + // TODO(irfansharif): This is a stop-gap for #58523. + return roachpb.Version{} + } return *r.mu.state.Version } diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index c083288c1fbd..64134932bb44 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -2810,6 +2810,10 @@ func (s *Store) PurgeOutdatedReplicas(ctx context.Context, version roachpb.Versi qp := quotapool.NewIntPool("purge-outdated-replicas", 50) g := ctxgroup.WithContext(ctx) s.VisitReplicas(func(repl *Replica) (wantMore bool) { + if (repl.Version() == roachpb.Version{}) { + // TODO(irfansharif): This is a stop gap for #58523. + return true + } if !repl.Version().Less(version) { // Nothing to do here. return true diff --git a/pkg/server/migration_test.go b/pkg/server/migration_test.go index 76e8a08c1e70..42b79fad3703 100644 --- a/pkg/server/migration_test.go +++ b/pkg/server/migration_test.go @@ -23,7 +23,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" - "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/util/leaktest" ) @@ -201,7 +200,6 @@ func TestBumpClusterVersion(t *testing.T) { func TestMigrationPurgeOutdatedReplicas(t *testing.T) { defer leaktest.AfterTest(t)() - skip.WithIssue(t, 59180) const numStores = 3 var storeSpecs []base.StoreSpec