From bb3ff7ac64ce3b1d94b39fd393f318251904dba3 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 5 May 2021 17:48:52 +0200 Subject: [PATCH 1/7] kvserver: silence spammy logging This would fire at least a few times when a cluster boots up, and is fairly natural, so it shouldn't be a warning and also should not be as frequent. Release note: None --- pkg/kv/kvserver/replica_range_lease.go | 8 ++++++-- pkg/kv/kvserver/store.go | 7 ++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 7c6d2d5dbc6a..650c7445e4d7 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -63,7 +63,11 @@ import ( "go.etcd.io/etcd/raft/v3" ) -var leaseStatusLogLimiter = log.Every(5 * time.Second) +var leaseStatusLogLimiter = func() *log.EveryN { + e := log.Every(15 * time.Second) + e.ShouldLog() // waste the first shot + return &e +}() // leaseRequestHandle is a handle to an asynchronous lease request. type leaseRequestHandle struct { @@ -609,7 +613,7 @@ func (r *Replica) leaseStatus( // use the lease nor do we want to attempt to acquire it. var msg redact.StringBuilder if !ok { - msg.Printf("can't determine lease status of %s due to node liveness error: %+v", + msg.Printf("can't determine lease status of %s due to node liveness error: %v", lease.Replica, liveness.ErrRecordCacheMiss) } else { msg.Printf("can't determine lease status of %s because node liveness info for n%d is stale. lease: %s, liveness: %s", diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 142274140840..a885959acd64 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -1661,6 +1661,9 @@ func (s *Store) startGossip() { }, } + cannotGossipEvery := log.Every(time.Minute) + cannotGossipEvery.ShouldLog() // only log next time after waiting out the delay + // Periodic updates run in a goroutine and signal a WaitGroup upon completion // of their first iteration. s.initComplete.Add(len(gossipFns)) @@ -1680,7 +1683,9 @@ func (s *Store) startGossip() { if repl := s.LookupReplica(roachpb.RKey(gossipFn.key)); repl != nil { annotatedCtx := repl.AnnotateCtx(ctx) if err := gossipFn.fn(annotatedCtx, repl); err != nil { - log.Warningf(annotatedCtx, "could not gossip %s: %+v", gossipFn.description, err) + if cannotGossipEvery.ShouldLog() { + log.Infof(annotatedCtx, "could not gossip %s: %v", gossipFn.description, err) + } if !errors.Is(err, errPeriodicGossipsDisabled) { continue } From a46b7e180d5dc305028c4fb278c39c5ec4ce5b0a Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Thu, 6 May 2021 11:47:35 +0200 Subject: [PATCH 2/7] roachtest: update election-after-restart The recently added in-process test TestElectionAfterRestart was not able to reproduce the problems in this test. It is possible that including the time for the cluster to boot up after restarting (and connecting to each other, exchanging initial gossip, etc) was the reason for its spurious failures. In its updated form, this test measures the duration in isolation, and it has been made much more aggressive (restarting the cluster 100 times, etc). It passes, with the majority duration being in the 2s range, rarely ever reaching 4s. I looked at a >3s trace and found nothing suspicious, it simply takes a little while to go through all of the ranges with continuous progress being made. 
Release note: None --- pkg/cmd/roachtest/election.go | 74 ++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/pkg/cmd/roachtest/election.go b/pkg/cmd/roachtest/election.go index 28f629c83b59..33acd7bc9d5d 100644 --- a/pkg/cmd/roachtest/election.go +++ b/pkg/cmd/roachtest/election.go @@ -15,13 +15,13 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/stretchr/testify/require" ) func registerElectionAfterRestart(r *testRegistry) { r.Add(testSpec{ Name: "election-after-restart", Owner: OwnerKV, - Skip: "https://github.com/cockroachdb/cockroach/issues/54246", Cluster: makeClusterSpec(3), Run: func(ctx context.Context, t *test, c *cluster) { t.Status("starting up") @@ -36,6 +36,8 @@ func registerElectionAfterRestart(r *testRegistry) { t.Status("creating table and splits") c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e " + CREATE TABLE timing(ts TIMESTAMP); + INSERT INTO timing VALUES(now()); CREATE DATABASE IF NOT EXISTS test; CREATE TABLE test.kv (k INT PRIMARY KEY, v INT); -- Prevent the merge queue from immediately discarding our splits. @@ -55,42 +57,50 @@ func registerElectionAfterRestart(r *testRegistry) { // settle before restarting. time.Sleep(3 * time.Second) - t.Status("restarting") - c.Stop(ctx) - c.Start(ctx, t) + for i := 0; i < 100; i++ { + t.Status("restarting") + c.Stop(ctx) + c.Start(ctx, t) - // Each of the 100 ranges in this table must elect a leader for - // this query to complete. In naive raft, each of these - // elections would require waiting for a 3-second timeout, one - // at a time. This test verifies that our mechanisms to speed - // this up are working (we trigger elections eagerly, but not so - // eagerly that multiple elections conflict with each other). - start = timeutil.Now() - // Use a large CONNECT_TIMEOUT so that if the initial connection - // takes ages (perhaps due to some cli-internal query taking a - // very long time), we fail with the duration check below and - // not an opaque error from the cli. - buf, err := c.RunWithBuffer(ctx, t.l, c.Node(1), `COCKROACH_CONNECT_TIMEOUT=240 ./cockroach sql --insecure -e " + // Each of the 100 ranges in this table must elect a leader for + // this query to complete. In naive raft, each of these + // elections would require waiting for a 3-second timeout, one + // at a time. This test verifies that our mechanisms to speed + // this up are working (we trigger elections eagerly, but not so + // eagerly that multiple elections conflict with each other). + // + // Use a large CONNECT_TIMEOUT so that if the initial connection + // takes ages (perhaps due to some cli-internal query taking a + // very long time), we fail with the duration check below and + // not an opaque error from the cli. + buf, err := c.RunWithBuffer(ctx, t.l, c.Node(1), `COCKROACH_CONNECT_TIMEOUT=240 ./cockroach sql --insecure -e " SET TRACING = on; -SELECT * FROM test.kv; +UPDATE timing SET ts = now() WHERE true; +UPDATE timing SET ts = now() WHERE true; +SELECT count(1) FROM test.kv; SET TRACING = off; SHOW TRACE FOR SESSION; "`) - if err != nil { - t.Fatalf("%s\n\n%s", buf, err) - } - duration = timeutil.Since(start) - t.l.Printf("post-restart, query took %s\n", duration) - if expected := 15 * time.Second; duration > expected { - // In the happy case, this query runs in around 250ms. Prior - // to the introduction of this test, a bug caused most - // elections to fail and the query would take over 100 - // seconds. 
There are still issues that can cause a few - // elections to fail (the biggest one as I write this is - // #26448), so we must use a generous timeout here. We may be - // able to tighten the bounds as we make more improvements. - t.l.Printf("%s\n", buf) - t.Fatalf("expected query to succeed in less than %s, took %s", expected, duration) + if err != nil { + t.Fatalf("%s\n\n%s", buf, err) + } + + var duration time.Duration + require.NoError(t, c.Conn(ctx, 1).QueryRow( + `SELECT 1000*1000*extract(milliseconds FROM (now()-ts))::int FROM timing`, + ).Scan(&duration)) + t.l.Printf("post-restart, query took %s\n", duration) + if expected := 10 * time.Second; duration > expected { + // In the happy case, this query runs in around 250ms. Prior + // to the introduction of this test, a bug caused most + // elections to fail and the query would take over 100 + // seconds. There are still issues that can cause a few + // elections to fail (the biggest one as I write this is + // #26448), so we must use a generous timeout here. We may be + // able to tighten the bounds as we make more improvements. + t.l.Printf("%s\n", buf) + t.Fatalf("expected query to succeed in less than %s, took %s", expected, duration) + } } }, }) From 5197551f02f7d9489fddcd63fd7a89db276e809c Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Thu, 6 May 2021 14:43:28 +0200 Subject: [PATCH 3/7] roachtest: remove election-after-restart This test is superseded by kvserver.TestElectionAfterRestart. Closes #54246. Release note: None --- pkg/cmd/roachtest/BUILD.bazel | 1 - pkg/cmd/roachtest/election.go | 107 ---------------------------------- pkg/cmd/roachtest/registry.go | 1 - 3 files changed, 109 deletions(-) delete mode 100644 pkg/cmd/roachtest/election.go diff --git a/pkg/cmd/roachtest/BUILD.bazel b/pkg/cmd/roachtest/BUILD.bazel index c62bea45248a..84fc9dce56b1 100644 --- a/pkg/cmd/roachtest/BUILD.bazel +++ b/pkg/cmd/roachtest/BUILD.bazel @@ -31,7 +31,6 @@ go_library( "django.go", "django_blocklist.go", "drop.go", - "election.go", "encryption.go", "engine_switch.go", "event_log.go", diff --git a/pkg/cmd/roachtest/election.go b/pkg/cmd/roachtest/election.go deleted file mode 100644 index 33acd7bc9d5d..000000000000 --- a/pkg/cmd/roachtest/election.go +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright 2018 The Cockroach Authors. -// -// Use of this software is governed by the Business Source License -// included in the file licenses/BSL.txt. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0, included in the file -// licenses/APL.txt. - -package main - -import ( - "context" - "time" - - "github.com/cockroachdb/cockroach/pkg/util/timeutil" - "github.com/stretchr/testify/require" -) - -func registerElectionAfterRestart(r *testRegistry) { - r.Add(testSpec{ - Name: "election-after-restart", - Owner: OwnerKV, - Cluster: makeClusterSpec(3), - Run: func(ctx context.Context, t *test, c *cluster) { - t.Status("starting up") - c.Put(ctx, cockroach, "./cockroach") - c.Start(ctx, t) - - // If the initial ranges aren't fully replicated by the time we - // run our splits, replicating them after the splits will take - // longer, so wait for the initial replication before - // proceeding. 
- time.Sleep(3 * time.Second) - - t.Status("creating table and splits") - c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e " - CREATE TABLE timing(ts TIMESTAMP); - INSERT INTO timing VALUES(now()); - CREATE DATABASE IF NOT EXISTS test; - CREATE TABLE test.kv (k INT PRIMARY KEY, v INT); - -- Prevent the merge queue from immediately discarding our splits. - SET CLUSTER SETTING kv.range_merge.queue_enabled = false; - ALTER TABLE test.kv SPLIT AT SELECT generate_series(0, 10000, 100)"`) - - start := timeutil.Now() - c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e " - SELECT * FROM test.kv"`) - duration := timeutil.Since(start) - t.l.Printf("pre-restart, query took %s\n", duration) - - // If we restart before all the nodes have applied the splits, - // there will be a lot of snapshot attempts (which may fail) - // after the restart. This appears to slow down startup enough - // to fail the condition below, so wait a bit for the dust to - // settle before restarting. - time.Sleep(3 * time.Second) - - for i := 0; i < 100; i++ { - t.Status("restarting") - c.Stop(ctx) - c.Start(ctx, t) - - // Each of the 100 ranges in this table must elect a leader for - // this query to complete. In naive raft, each of these - // elections would require waiting for a 3-second timeout, one - // at a time. This test verifies that our mechanisms to speed - // this up are working (we trigger elections eagerly, but not so - // eagerly that multiple elections conflict with each other). - // - // Use a large CONNECT_TIMEOUT so that if the initial connection - // takes ages (perhaps due to some cli-internal query taking a - // very long time), we fail with the duration check below and - // not an opaque error from the cli. - buf, err := c.RunWithBuffer(ctx, t.l, c.Node(1), `COCKROACH_CONNECT_TIMEOUT=240 ./cockroach sql --insecure -e " -SET TRACING = on; -UPDATE timing SET ts = now() WHERE true; -UPDATE timing SET ts = now() WHERE true; -SELECT count(1) FROM test.kv; -SET TRACING = off; -SHOW TRACE FOR SESSION; -"`) - if err != nil { - t.Fatalf("%s\n\n%s", buf, err) - } - - var duration time.Duration - require.NoError(t, c.Conn(ctx, 1).QueryRow( - `SELECT 1000*1000*extract(milliseconds FROM (now()-ts))::int FROM timing`, - ).Scan(&duration)) - t.l.Printf("post-restart, query took %s\n", duration) - if expected := 10 * time.Second; duration > expected { - // In the happy case, this query runs in around 250ms. Prior - // to the introduction of this test, a bug caused most - // elections to fail and the query would take over 100 - // seconds. There are still issues that can cause a few - // elections to fail (the biggest one as I write this is - // #26448), so we must use a generous timeout here. We may be - // able to tighten the bounds as we make more improvements. 
- t.l.Printf("%s\n", buf) - t.Fatalf("expected query to succeed in less than %s, took %s", expected, duration) - } - } - }, - }) -} diff --git a/pkg/cmd/roachtest/registry.go b/pkg/cmd/roachtest/registry.go index 671dfaafecfc..af081fe1b442 100644 --- a/pkg/cmd/roachtest/registry.go +++ b/pkg/cmd/roachtest/registry.go @@ -34,7 +34,6 @@ func registerTests(r *testRegistry) { registerDiskStalledDetection(r) registerDjango(r) registerDrop(r) - registerElectionAfterRestart(r) registerEncryption(r) registerEngineSwitch(r) registerFlowable(r) From 8d740d951b1d4090cee64992d2e21c891a5f94fe Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 26 May 2021 10:48:33 +0200 Subject: [PATCH 4/7] kvserver: make wrapper around rawnode.Campaign Release note: None --- pkg/kv/kvserver/replica_raft.go | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index d486fa43b471..69f5bb3adc71 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -889,7 +889,9 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // // NB: this must be called after Advance() above since campaigning is // a no-op in the presence of unapplied conf changes. - maybeCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) + if maybeCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) { + r.campaignLocked(ctx) + } } // If the Raft group still has more to process then we immediately @@ -1605,12 +1607,16 @@ func (r *Replica) maybeCampaignOnWakeLocked(ctx context.Context) { raftStatus := r.mu.internalRaftGroup.BasicStatus() livenessMap, _ := r.store.livenessMap.Load().(liveness.IsLiveMap) if shouldCampaignOnWake(leaseStatus, r.store.StoreID(), raftStatus, livenessMap, r.descRLocked(), r.requiresExpiringLeaseRLocked()) { - log.VEventf(ctx, 3, "campaigning") - if err := r.mu.internalRaftGroup.Campaign(); err != nil { - log.VEventf(ctx, 1, "failed to campaign: %s", err) - } - r.store.enqueueRaftUpdateCheck(r.RangeID) + r.campaignLocked(ctx) + } +} + +func (r *Replica) campaignLocked(ctx context.Context) { + log.VEventf(ctx, 3, "campaigning") + if err := r.mu.internalRaftGroup.Campaign(); err != nil { + log.VEventf(ctx, 1, "failed to campaign: %s", err) } + r.store.enqueueRaftUpdateCheck(r.RangeID) } // a lastUpdateTimesMap is maintained on the Raft leader to keep track of the @@ -1908,7 +1914,7 @@ func maybeCampaignAfterConfChange( storeID roachpb.StoreID, desc *roachpb.RangeDescriptor, raftGroup *raft.RawNode, -) { +) bool { // If a config change was carried out, it's possible that the Raft // leader was removed. Verify that, and if so, campaign if we are // the first remaining voter replica. Without this, the range will @@ -1921,21 +1927,22 @@ func maybeCampaignAfterConfChange( if st.Lead == 0 { // Leader unknown. This isn't what we expect in steady state, so we // don't do anything. - return + return false } if !desc.IsInitialized() { // We don't have an initialized, so we can't figure out who is supposed // to campaign. It's possible that it's us and we're waiting for the // initial snapshot, but it's hard to tell. Don't do anything. - return + return false } // If the leader is no longer in the descriptor but we are the first voter, // campaign. 
_, leaderStillThere := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(st.Lead)) if !leaderStillThere && storeID == desc.Replicas().VoterDescriptors()[0].StoreID { - log.VEventf(ctx, 3, "leader got removed by conf change; campaigning") - _ = raftGroup.Campaign() + log.VEventf(ctx, 3, "leader got removed by conf change") + return true } + return false } func getNonDeterministicFailureExplanation(err error) string { From bc7609b224ce6b5762191a716ed448e80a2e2d02 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 26 May 2021 10:49:36 +0200 Subject: [PATCH 5/7] kvserver: add `raft.timeoutcampaign` metric This is incremented whenever a follower transitions to candidate, which indicates that it hasn't heard from a leader in a while. These should be rare, so it makes sense to be able to back that up with numbers. This commit also adds a testing knob that is triggered in the same situation. Release note: None --- pkg/kv/kvserver/metrics.go | 8 ++++++++ pkg/kv/kvserver/replica_raft.go | 10 ++++++++++ pkg/kv/kvserver/testing_knobs.go | 5 +++++ pkg/ts/catalog/chart_catalog.go | 4 ++++ 4 files changed, 27 insertions(+) diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 691dfef3a58a..dca192837b4e 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -532,6 +532,12 @@ var ( Measurement: "Latency", Unit: metric.Unit_NANOSECONDS, } + metaRaftTimeoutCampaign = metric.Metadata{ + Name: "raft.timeoutcampaign", + Help: "Number of Raft replicas campaigning after missed heartbeats from leader", + Measurement: "Elections called after timeout", + Unit: metric.Unit_COUNT, + } // Raft message metrics. metaRaftRcvdProp = metric.Metadata{ @@ -1145,6 +1151,7 @@ type StoreMetrics struct { RaftHandleReadyLatency *metric.Histogram RaftApplyCommittedLatency *metric.Histogram RaftSchedulerLatency *metric.Histogram + RaftTimeoutCampaign *metric.Counter // Raft message metrics. // @@ -1514,6 +1521,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { RaftHandleReadyLatency: metric.NewLatency(metaRaftHandleReadyLatency, histogramWindow), RaftApplyCommittedLatency: metric.NewLatency(metaRaftApplyCommittedLatency, histogramWindow), RaftSchedulerLatency: metric.NewLatency(metaRaftSchedulerLatency, histogramWindow), + RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), // Raft message metrics. 
RaftRcvdMessages: [...]*metric.Counter{ diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 69f5bb3adc71..190ab9d9cd3a 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -991,7 +991,17 @@ func (r *Replica) tick(ctx context.Context, livenessMap liveness.IsLiveMap) (boo } r.mu.ticks++ + preTickState := r.mu.internalRaftGroup.BasicStatus().RaftState r.mu.internalRaftGroup.Tick() + postTickState := r.mu.internalRaftGroup.BasicStatus().RaftState + if preTickState != postTickState { + if postTickState == raft.StatePreCandidate { + r.store.Metrics().RaftTimeoutCampaign.Inc(1) + if k := r.store.TestingKnobs(); k != nil && k.OnRaftTimeoutCampaign != nil { + k.OnRaftTimeoutCampaign(r.RangeID) + } + } + } refreshAtDelta := r.store.cfg.RaftElectionTimeoutTicks if knob := r.store.TestingKnobs().RefreshReasonTicksPeriod; knob > 0 { diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index c110d497b537..967f3c828dac 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -334,6 +334,11 @@ type StoreTestingKnobs struct { // TimeSeriesDataStore is an interface used by the store's time series // maintenance queue to dispatch individual maintenance tasks. TimeSeriesDataStore TimeSeriesDataStore + + // Called whenever a range campaigns as a result of a tick. This + // is called under the replica lock and raftMu, so basically don't + // acquire any locks in this method. + OnRaftTimeoutCampaign func(roachpb.RangeID) } // ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface. diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go index caa14ce2aafe..73eb44987f62 100644 --- a/pkg/ts/catalog/chart_catalog.go +++ b/pkg/ts/catalog/chart_catalog.go @@ -1317,6 +1317,10 @@ var charts = []sectionDescription{ Title: "Stuck Request Count", Metrics: []string{"requests.slow.raft"}, }, + { + Title: "Heartbeat Timeouts", + Metrics: []string{"raft.timeoutcampaign"}, + }, { Title: "Ticks Queued", Metrics: []string{"raft.ticks"}, From 1a7c8e75417ca590198ff733d8531603080454fd Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 5 May 2021 17:43:53 +0200 Subject: [PATCH 6/7] kvserver: add TestElectionAfterRestart This replaces the election-after-restart roachtest. Release note: None --- pkg/kv/kvserver/client_raft_test.go | 169 +++++++++++++++++++++++ pkg/server/testing_knobs.go | 3 + pkg/testutils/testcluster/testcluster.go | 11 ++ 3 files changed, 183 insertions(+) diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 8a5858f4fd4a..4a4f75247d77 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -5444,3 +5444,172 @@ func (n noopRaftMessageResponseSteam) Send(*kvserver.RaftMessageResponse) error } var _ kvserver.RaftMessageResponseStream = noopRaftMessageResponseSteam{} + +// TestElectionAfterRestart is an end-to-end test for shouldCampaignOnWakeLocked +// (see TestReplicaShouldCampaignOnWake for the corresponding unit test). It sets +// up a cluster, makes 100 ranges, restarts the cluster, and verifies that the +// cluster serves a full table scan over these ranges without incurring any raft +// elections that are triggered by a timeout. +// +// For technical reasons, this uses a single-node cluster. 
The test can also be +// run on multi-node clusters, though it is very difficult to deflake it there +// as there can be rare but hard to avoid election stalemates if requests arrive +// on multiple nodes at once. +func TestElectionAfterRestart(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + ctx := context.Background() + + // We use a single node to avoid rare flakes due to dueling elections. + // The code is set up to support multiple nodes, though the test will + // be flaky as we have no way to control that each range receives only + // one request at a single replica. + const numNodes = 1 + + // Hard-code the election timeouts here for a 6s timeout. We want to make sure + // that election timeouts never happen spuriously in this test as a result of + // running it with very little headroom (such as stress). We avoid an infinite + // timeout because that will make for a poor experience (hanging requests) + // when something does flake. + const electionTimeoutTicks = 30 + const raftTickInterval = 200 * time.Millisecond + + r := server.NewStickyInMemEnginesRegistry() + defer r.CloseAllStickyInMemEngines() + + newTCArgs := func(parallel bool, replMode base.TestClusterReplicationMode, onTimeoutCampaign func(roachpb.RangeID)) base.TestClusterArgs { + return base.TestClusterArgs{ + ReplicationMode: replMode, + ParallelStart: parallel, + ServerArgs: base.TestServerArgs{ + RaftConfig: base.RaftConfig{ + RaftElectionTimeoutTicks: electionTimeoutTicks, + RaftTickInterval: raftTickInterval, + }, + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + StickyEngineRegistry: r, + }, + Store: &kvserver.StoreTestingKnobs{ + OnRaftTimeoutCampaign: onTimeoutCampaign, + }, + }, + }, + } + } + + const numRanges = 100 + + rangeIDs := map[roachpb.RangeID]int{} // ranges in our table -> election timeouts seen + func() { + tc := testcluster.NewTestCluster( + t, numNodes, newTCArgs(false /* parallel */, base.ReplicationAuto, nil /* onTimeoutCampaign */)) + tc.Start(t) + defer t.Log("stopped cluster") + defer tc.Stopper().Stop(ctx) + _, err := tc.Conns[0].Exec(`CREATE TABLE t(x, PRIMARY KEY(x)) AS TABLE generate_series(1, $1)`, numRanges-1) + require.NoError(t, err) + // Splitting in reverse order is faster (splitDelayHelper doesn't have to add any delays). + _, err = tc.Conns[0].Exec(`ALTER TABLE t SPLIT AT TABLE generate_series($1, 1, -1)`, numRanges-1) + require.NoError(t, err) + require.NoError(t, tc.WaitForFullReplication()) + + for _, row := range sqlutils.MakeSQLRunner(tc.Conns[0]).QueryStr( + t, `SELECT range_id FROM crdb_internal.ranges_no_leases WHERE table_name = 't';`, + ) { + n, err := strconv.Atoi(row[0]) + require.NoError(t, err) + rangeIDs[roachpb.RangeID(n)] = 0 + } + require.Len(t, rangeIDs, numRanges) + t.Logf("created %d ranges", numRanges) + + // Make sure that the ranges have all followers fully caught up. Otherwise, + // as we stop the server and restart it later, the follower that + // auto-campaigns may not be fully caught up and so will fail to win the + // election, so the raft group will have for someone else to campaign after + // a timeout and this is what we want to make sure doesn't happen in this + // test. + // + // Note that none of this is needed for numNodes=1, but we want to make sure + // that the test is not more flaky than it needs to be when run with + // numNodes>1. 
+ testutils.SucceedsSoon(t, func() error { + for rangeID := range rangeIDs { + var err error + var lastIndex uint64 + for _, srv := range tc.Servers { + _ = srv.Stores().VisitStores(func(s *kvserver.Store) error { + s.VisitReplicas(func(replica *kvserver.Replica) (more bool) { + if replica.RangeID != rangeID { + return + } + + cur := replica.State(ctx).LastIndex + if lastIndex == 0 { + lastIndex = cur + } + if lastIndex > cur { + err = errors.Errorf("last indexes not equal: %d != %d", lastIndex, cur) + } + return err == nil // more + }) + return nil + }) + } + if err != nil { + return err + } + } + return nil + }) + for _, srv := range tc.Servers { + require.NoError(t, srv.Stores().VisitStores(func(s *kvserver.Store) error { + return s.Engine().Flush() + })) + } + t.Log("waited for all followers to be caught up") + }() + + // Annoyingly, with the increased tick interval in this test, this cluster takes + // a long time to re-start. The reason is that the "campaign" heuristic on the + // node liveness range (and maybe some other system ranges) typically fires before + // all nodes are available, meaning that the vote request messages are silently + // dropped. The group will then be in StatePreCandidate and will have to sit out + // an election timeout. One way to fix this could be to start n2 and n3 + // before n1. Or we could selectively change the tick interval only for the + // ranges we care about, or allow changing it on a running cluster (but that + // last option is going to be flaky, since we only approximately control when + // the raft instances we care about are initialized after the restart). + tc := testcluster.NewTestCluster( + t, + numNodes, + newTCArgs(true /* parallel */, base.ReplicationManual, func(rangeID roachpb.RangeID) { + if _, ok := rangeIDs[rangeID]; ok { + rangeIDs[rangeID]++ + } else { + if numNodes == 1 { + t.Errorf("saw election from untracked range r%d", rangeID) + } else { + // We don't want this to happen, but it can and it shouldn't fail + // the test. The duel is much more frequent on some of the system + // ranges. + t.Logf("ignoring election from untracked range r%d", rangeID) + } + } + }), + ) + tc.Start(t) + t.Log("started cluster") + defer tc.Stopper().Stop(ctx) + + runner := sqlutils.MakeSQLRunner(tc.Conns[0]) + tBegin := timeutil.Now() + require.Equal(t, fmt.Sprint(numRanges-1), runner.QueryStr(t, `SELECT count(1) FROM t`)[0][0]) + dur := timeutil.Since(tBegin) + t.Logf("scanned full table in %.2fs (%s/range)", dur.Seconds(), dur/time.Duration(numRanges)) + + for rangeID, n := range rangeIDs { + assert.Zero(t, n, "unexpected election after timeout on r%d", rangeID) + } +} diff --git a/pkg/server/testing_knobs.go b/pkg/server/testing_knobs.go index 4202523dc4c1..ff092a4856ae 100644 --- a/pkg/server/testing_knobs.go +++ b/pkg/server/testing_knobs.go @@ -80,6 +80,9 @@ type TestingKnobs struct { OnDecommissionedCallback func(livenesspb.Liveness) // StickyEngineRegistry manages the lifecycle of sticky in memory engines, // which can be enabled via base.StoreSpec.StickyInMemoryEngineID. + // + // When supplied to a TestCluster, StickyEngineIDs will be associated auto- + // matically to the StoreSpecs used. StickyEngineRegistry StickyInMemEnginesRegistry // Clock Source used to an inject a custom clock for testing the server. It is // typically either an hlc.HybridManualClock or hlc.ManualClock. 
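(Not part of the patch: a usage sketch.) With the auto-assignment added to NewTestCluster below, a test that wants its data to survive a restart only has to hand a sticky registry to the server knobs and recreate the cluster with the same args; the StickyInMemoryEngineID on each StoreSpec is then filled in automatically. A minimal sketch, assuming the packages this patch already touches and a surrounding test with `t` and `ctx` in scope:

    // The registry keeps the in-memory engines alive across cluster
    // restarts; NewTestCluster assigns sticky engine IDs to the store
    // specs whenever the registry knob is present.
    reg := server.NewStickyInMemEnginesRegistry()
    defer reg.CloseAllStickyInMemEngines()

    args := base.TestClusterArgs{
        ServerArgs: base.TestServerArgs{
            Knobs: base.TestingKnobs{
                Server: &server.TestingKnobs{StickyEngineRegistry: reg},
            },
        },
    }
    tc := testcluster.NewTestCluster(t, 1, args)
    tc.Start(t)
    // ... populate some data, then restart onto the same engines:
    tc.Stopper().Stop(ctx)
    tc = testcluster.NewTestCluster(t, 1, args)
    tc.Start(t)
    defer tc.Stopper().Stop(ctx)
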
diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go index b4432a038bd2..933c02a93ffd 100644 --- a/pkg/testutils/testcluster/testcluster.go +++ b/pkg/testutils/testcluster/testcluster.go @@ -223,6 +223,17 @@ func NewTestCluster(t testing.TB, nodes int, clusterArgs base.TestClusterArgs) * serverArgs = tc.clusterArgs.ServerArgs } + if len(serverArgs.StoreSpecs) == 0 { + serverArgs.StoreSpecs = []base.StoreSpec{base.DefaultTestStoreSpec} + } + if knobs, ok := serverArgs.Knobs.Server.(*server.TestingKnobs); ok && knobs.StickyEngineRegistry != nil { + for j := range serverArgs.StoreSpecs { + if serverArgs.StoreSpecs[j].StickyInMemoryEngineID == "" { + serverArgs.StoreSpecs[j].StickyInMemoryEngineID = fmt.Sprintf("auto-node%d-store%d", i+1, j+1) + } + } + } + // If no localities are specified in the args, we'll generate some // automatically. if noLocalities { From b7acf2fe4b125e086b3b701b1629867d745d3cec Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 31 May 2021 17:24:34 +0200 Subject: [PATCH 7/7] kvserver: rename maybeCampaignOnConfChange Release note: None --- pkg/kv/kvserver/replica_raft.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 190ab9d9cd3a..e3630b20444c 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -889,7 +889,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // // NB: this must be called after Advance() above since campaigning is // a no-op in the presence of unapplied conf changes. - if maybeCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) { + if shouldCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) { r.campaignLocked(ctx) } } @@ -1919,7 +1919,7 @@ func ComputeRaftLogSize( return ms.SysBytes + totalSideloaded, nil } -func maybeCampaignAfterConfChange( +func shouldCampaignAfterConfChange( ctx context.Context, storeID roachpb.StoreID, desc *roachpb.RangeDescriptor,
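(Not part of the patch: a closing usage sketch under the same assumptions as above.) The `OnRaftTimeoutCampaign` knob added alongside the `raft.timeoutcampaign` metric is the hook the new test relies on: whenever a tick moves a replica into StatePreCandidate, the callback fires with the RangeID, so a test can assert that none of its ranges had to fall back to a timeout election. Roughly:

    // The callback runs under the replica's raftMu and mu, so it must
    // stay cheap and must not acquire locks; here it only bumps an
    // atomic counter that the test checks at the end.
    var timeoutElections int64
    args := base.TestClusterArgs{
        ServerArgs: base.TestServerArgs{
            Knobs: base.TestingKnobs{
                Store: &kvserver.StoreTestingKnobs{
                    OnRaftTimeoutCampaign: func(rangeID roachpb.RangeID) {
                        atomic.AddInt64(&timeoutElections, 1)
                    },
                },
            },
        },
    }
    tc := testcluster.NewTestCluster(t, 1, args)
    tc.Start(t)
    defer tc.Stopper().Stop(ctx)
    // ... restart or otherwise exercise the cluster, then:
    require.Zero(t, atomic.LoadInt64(&timeoutElections))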