Skip to content

Commit beeebb1

Browse files
craig[bot]spencerkimball
andcommitted
Merge #26911
26911: storage: quiesce ranges which have non-live replicas r=spencerkimball a=spencerkimball Previously all replicas had to be completely up to date in order to quiesce ranges. This made the loss of a node in a cluster with many ranges an expensive proposition, as a significant number of ranges could be kept unquiesced for as long as the node was down. This change refreshes a liveness map from the `NodeLiveness` object on every Raft ticker loop and then passes that to `Replica.tick()` to allow the leader to disregard non-live nodes when making its should-quiesce determination. Release note (performance improvement): prevent dead nodes in clusters with many ranges from causing unnecessarily high CPU usage. Note that this PR requires #26908 to function properly Fixes #9446 Co-authored-by: Spencer Kimball <spencer.kimball@gmail.com>
2 parents 03268b4 + c47ace3 commit beeebb1

File tree

3 files changed

+133
-21
lines changed

3 files changed

+133
-21
lines changed

pkg/storage/replica.go

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,6 +1556,10 @@ func (r *Replica) Desc() *roachpb.RangeDescriptor {
15561556
return r.mu.state.Desc
15571557
}
15581558

1559+
func (r *Replica) descRLocked() *roachpb.RangeDescriptor {
1560+
return r.mu.state.Desc
1561+
}
1562+
15591563
// NodeID returns the ID of the node this replica belongs to.
15601564
func (r *Replica) NodeID() roachpb.NodeID {
15611565
return r.store.nodeDesc.NodeID
@@ -3633,6 +3637,12 @@ func (r *Replica) quiesceLocked() bool {
36333637
return true
36343638
}
36353639

3640+
func (r *Replica) unquiesce() {
3641+
r.mu.Lock()
3642+
defer r.mu.Unlock()
3643+
r.unquiesceLocked()
3644+
}
3645+
36363646
func (r *Replica) unquiesceLocked() {
36373647
if r.mu.quiescent {
36383648
ctx := r.AnnotateCtx(context.TODO())
@@ -4111,7 +4121,7 @@ func fatalOnRaftReadyErr(ctx context.Context, expl string, err error) {
41114121

41124122
// tick the Raft group, returning any error and true if the raft group exists
41134123
// and false otherwise.
4114-
func (r *Replica) tick() (bool, error) {
4124+
func (r *Replica) tick(livenessMap map[roachpb.NodeID]bool) (bool, error) {
41154125
r.unreachablesMu.Lock()
41164126
remotes := r.unreachablesMu.remotes
41174127
r.unreachablesMu.remotes = nil
@@ -4135,7 +4145,7 @@ func (r *Replica) tick() (bool, error) {
41354145
if r.mu.quiescent {
41364146
return false, nil
41374147
}
4138-
if r.maybeQuiesceLocked() {
4148+
if r.maybeQuiesceLocked(livenessMap) {
41394149
return false, nil
41404150
}
41414151

@@ -4194,31 +4204,36 @@ func (r *Replica) tick() (bool, error) {
41944204
// it will either still apply to the recipient or the recipient will have moved
41954205
// forward and the quiesce message will fall back to being a heartbeat.
41964206
//
4207+
// The supplied livenessMap maps from node ID to a boolean indicating
4208+
// liveness. A range may be quiesced in the presence of non-live
4209+
// replicas if the remaining live replicas all meet the quiesce
4210+
// requirements. When a node considered non-live becomes live, the
4211+
// node liveness instance invokes a callback which causes all nodes to
4212+
// wakes up any ranges containing replicas owned by the newly-live
4213+
// node, allowing the out-of-date replicas to be brought back up to date.
4214+
// If livenessMap is nil, liveness data will not be used, meaning no range
4215+
// will quiesce if any replicas are behind, whether or not they are live.
4216+
// If any entry in the livenessMap is nil, then the missing node ID is
4217+
// treated as not live.
4218+
//
41974219
// TODO(peter): There remains a scenario in which a follower is left unquiesced
41984220
// while the leader is quiesced: the follower's receive queue is full and the
41994221
// "quiesce" message is dropped. This seems very very unlikely because if the
42004222
// follower isn't keeping up with raft messages it is unlikely that the leader
42014223
// would quiesce. The fallout from this situation are undesirable raft
42024224
// elections which will cause throughput hiccups to the range, but not
42034225
// correctness issues.
4204-
//
4205-
// TODO(peter): When a node goes down, any range which has a replica on the
4206-
// down node will not quiesce. This could be a significant performance
4207-
// impact. Additionally, when the node comes back up we want to bring any
4208-
// replicas it contains back up to date. Right now this will be handled because
4209-
// those ranges never quiesce. One thought for handling both these scenarios is
4210-
// to hook into the StorePool and its notion of "down" nodes. But that might
4211-
// not be sensitive enough.
4212-
func (r *Replica) maybeQuiesceLocked() bool {
4226+
func (r *Replica) maybeQuiesceLocked(livenessMap map[roachpb.NodeID]bool) bool {
42134227
ctx := r.AnnotateCtx(context.TODO())
4214-
status, ok := shouldReplicaQuiesce(ctx, r, r.store.Clock().Now(), len(r.mu.proposals))
4228+
status, ok := shouldReplicaQuiesce(ctx, r, r.store.Clock().Now(), len(r.mu.proposals), livenessMap)
42154229
if !ok {
42164230
return false
42174231
}
42184232
return r.quiesceAndNotifyLocked(ctx, status)
42194233
}
42204234

42214235
type quiescer interface {
4236+
descRLocked() *roachpb.RangeDescriptor
42224237
raftStatusRLocked() *raft.Status
42234238
raftLastIndexLocked() (uint64, error)
42244239
hasRaftReadyRLocked() bool
@@ -4250,7 +4265,11 @@ func (r *Replica) maybeTransferRaftLeader(
42504265
// facilitate testing. Returns the raft.Status and true on success, and (nil,
42514266
// false) on failure.
42524267
func shouldReplicaQuiesce(
4253-
ctx context.Context, q quiescer, now hlc.Timestamp, numProposals int,
4268+
ctx context.Context,
4269+
q quiescer,
4270+
now hlc.Timestamp,
4271+
numProposals int,
4272+
livenessMap map[roachpb.NodeID]bool,
42544273
) (*raft.Status, bool) {
42554274
if numProposals != 0 {
42564275
if log.V(4) {
@@ -4315,15 +4334,30 @@ func shouldReplicaQuiesce(
43154334
}
43164335
return nil, false
43174336
}
4337+
43184338
var foundSelf bool
4319-
for id, progress := range status.Progress {
4320-
if id == status.ID {
4339+
for _, rep := range q.descRLocked().Replicas {
4340+
if uint64(rep.ReplicaID) == status.ID {
43214341
foundSelf = true
43224342
}
4323-
if progress.Match != status.Applied {
4343+
if progress, ok := status.Progress[uint64(rep.ReplicaID)]; !ok {
4344+
if log.V(4) {
4345+
log.Infof(ctx, "not quiescing: could not locate replica %d in progress: %+v",
4346+
rep.ReplicaID, progress)
4347+
}
4348+
return nil, false
4349+
} else if progress.Match != status.Applied {
4350+
// Skip any node in the descriptor which is not live.
4351+
if livenessMap != nil && !livenessMap[rep.NodeID] {
4352+
if log.V(4) {
4353+
log.Infof(ctx, "skipping node %d because not live. Progress=%+v",
4354+
rep.NodeID, progress)
4355+
}
4356+
continue
4357+
}
43244358
if log.V(4) {
43254359
log.Infof(ctx, "not quiescing: replica %d match (%d) != applied (%d)",
4326-
id, progress.Match, status.Applied)
4360+
rep.ReplicaID, progress.Match, status.Applied)
43274361
}
43284362
return nil, false
43294363
}

pkg/storage/replica_test.go

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8152,7 +8152,7 @@ func TestReplicaRefreshPendingCommandsTicks(t *testing.T) {
81528152
ticks := r.mu.ticks
81538153
r.mu.Unlock()
81548154
for ; (ticks % electionTicks) != 0; ticks++ {
8155-
if _, err := r.tick(); err != nil {
8155+
if _, err := r.tick(nil); err != nil {
81568156
t.Fatal(err)
81578157
}
81588158
}
@@ -8206,7 +8206,7 @@ func TestReplicaRefreshPendingCommandsTicks(t *testing.T) {
82068206
r.mu.Unlock()
82078207

82088208
// Tick raft.
8209-
if _, err := r.tick(); err != nil {
8209+
if _, err := r.tick(nil); err != nil {
82108210
t.Fatal(err)
82118211
}
82128212

@@ -9349,11 +9349,17 @@ func TestSplitMsgApps(t *testing.T) {
93499349
}
93509350

93519351
type testQuiescer struct {
9352+
desc roachpb.RangeDescriptor
93529353
numProposals int
93539354
status *raft.Status
93549355
lastIndex uint64
93559356
raftReady bool
93569357
ownsValidLease bool
9358+
livenessMap map[roachpb.NodeID]bool
9359+
}
9360+
9361+
func (q *testQuiescer) descRLocked() *roachpb.RangeDescriptor {
9362+
return &q.desc
93579363
}
93589364

93599365
func (q *testQuiescer) raftStatusRLocked() *raft.Status {
@@ -9390,6 +9396,13 @@ func TestShouldReplicaQuiesce(t *testing.T) {
93909396
// true. The transform function is intended to perform one mutation to
93919397
// this quiescer so that shouldReplicaQuiesce will return false.
93929398
q := &testQuiescer{
9399+
desc: roachpb.RangeDescriptor{
9400+
Replicas: []roachpb.ReplicaDescriptor{
9401+
{NodeID: 1, ReplicaID: 1},
9402+
{NodeID: 2, ReplicaID: 2},
9403+
{NodeID: 3, ReplicaID: 3},
9404+
},
9405+
},
93939406
status: &raft.Status{
93949407
ID: 1,
93959408
HardState: raftpb.HardState{
@@ -9409,9 +9422,14 @@ func TestShouldReplicaQuiesce(t *testing.T) {
94099422
lastIndex: logIndex,
94109423
raftReady: false,
94119424
ownsValidLease: true,
9425+
livenessMap: map[roachpb.NodeID]bool{
9426+
1: true,
9427+
2: true,
9428+
3: true,
9429+
},
94129430
}
94139431
q = transform(q)
9414-
_, ok := shouldReplicaQuiesce(context.Background(), q, hlc.Timestamp{}, q.numProposals)
9432+
_, ok := shouldReplicaQuiesce(context.Background(), q, hlc.Timestamp{}, q.numProposals, q.livenessMap)
94159433
if expected != ok {
94169434
t.Fatalf("expected %v, but found %v", expected, ok)
94179435
}
@@ -9471,6 +9489,28 @@ func TestShouldReplicaQuiesce(t *testing.T) {
94719489
q.raftReady = true
94729490
return q
94739491
})
9492+
// Create a mismatch between the raft progress replica IDs and the
9493+
// replica IDs in the range descriptor.
9494+
for i := 0; i < 3; i++ {
9495+
test(false, func(q *testQuiescer) *testQuiescer {
9496+
q.desc.Replicas[i].ReplicaID = roachpb.ReplicaID(4 + i)
9497+
return q
9498+
})
9499+
}
9500+
// Pass a nil liveness map.
9501+
test(true, func(q *testQuiescer) *testQuiescer {
9502+
q.livenessMap = nil
9503+
return q
9504+
})
9505+
// Verify quiesce even when replica progress doesn't match, if
9506+
// the replica is on a non-live node.
9507+
for _, i := range []uint64{1, 2, 3} {
9508+
test(true, func(q *testQuiescer) *testQuiescer {
9509+
q.livenessMap[roachpb.NodeID(i)] = false
9510+
q.status.Progress[i] = raft.Progress{Match: invalidIndex}
9511+
return q
9512+
})
9513+
}
94749514
}
94759515

94769516
func TestReplicaRecomputeStats(t *testing.T) {

pkg/storage/store.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,10 @@ type Store struct {
529529

530530
scheduler *raftScheduler
531531

532+
// livenessMap is a map from nodeID to a bool indicating
533+
// liveness. It is updated periodically in raftTickLoop().
534+
livenessMap atomic.Value
535+
532536
// cachedCapacity caches information on store capacity to prevent
533537
// expensive recomputations in case leases or replicas are rapidly
534538
// rebalancing.
@@ -1428,6 +1432,12 @@ func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error {
14281432
s.cfg.Transport.Listen(s.StoreID(), s)
14291433
s.processRaft(ctx)
14301434

1435+
// Register a callback to unquiesce any ranges with replicas on a
1436+
// node transitioning from non-live to live.
1437+
if s.cfg.NodeLiveness != nil {
1438+
s.cfg.NodeLiveness.RegisterCallback(s.nodeIsLiveCallback)
1439+
}
1440+
14311441
// Gossip is only ever nil while bootstrapping a cluster and
14321442
// in unittests.
14331443
if s.cfg.Gossip != nil {
@@ -3704,17 +3714,41 @@ func (s *Store) processTick(ctx context.Context, rangeID roachpb.RangeID) bool {
37043714
if !ok {
37053715
return false
37063716
}
3717+
livenessMap, _ := s.livenessMap.Load().(map[roachpb.NodeID]bool)
37073718

37083719
start := timeutil.Now()
37093720
r := (*Replica)(value)
3710-
exists, err := r.tick()
3721+
exists, err := r.tick(livenessMap)
37113722
if err != nil {
37123723
log.Error(ctx, err)
37133724
}
37143725
s.metrics.RaftTickingDurationNanos.Inc(timeutil.Since(start).Nanoseconds())
37153726
return exists // ready
37163727
}
37173728

3729+
// nodeIsLiveCallback is invoked when a node transitions from non-live
3730+
// to live. Iterate through all replicas and find any which belong to
3731+
// ranges containing the implicated node. Unquiesce if currently
3732+
// quiesced. Note that this mechanism can race with concurrent
3733+
// invocations of processTick, which may have a copy of the previous
3734+
// livenessMap where the now-live node is down. Those instances should
3735+
// be rare, however, and we expect the newly live node to eventually
3736+
// unquiesce the range.
3737+
func (s *Store) nodeIsLiveCallback(nodeID roachpb.NodeID) {
3738+
// Update the liveness map.
3739+
s.livenessMap.Store(s.cfg.NodeLiveness.GetIsLiveMap())
3740+
3741+
s.mu.replicas.Range(func(k int64, v unsafe.Pointer) bool {
3742+
r := (*Replica)(v)
3743+
for _, rep := range r.Desc().Replicas {
3744+
if rep.NodeID == nodeID {
3745+
r.unquiesce()
3746+
}
3747+
}
3748+
return true
3749+
})
3750+
}
3751+
37183752
func (s *Store) processRaft(ctx context.Context) {
37193753
if s.cfg.TestingKnobs.DisableProcessRaft {
37203754
return
@@ -3741,6 +3775,10 @@ func (s *Store) raftTickLoop(ctx context.Context) {
37413775
select {
37423776
case <-ticker.C:
37433777
rangeIDs = rangeIDs[:0]
3778+
// Update the liveness map.
3779+
if s.cfg.NodeLiveness != nil {
3780+
s.livenessMap.Store(s.cfg.NodeLiveness.GetIsLiveMap())
3781+
}
37443782

37453783
s.unquiescedReplicas.Lock()
37463784
// Why do we bother to ever queue a Replica on the Raft scheduler for

0 commit comments

Comments
 (0)