Commit a134caa

Author: Andrew Werner
storage: use RWMutex in NodeLiveness
When looking at mutex profiles for heavily loaded TPC-C clusters, we noticed that a lot of time was being spent blocked on a Mutex held by Replica.leaseGoodToGo, which underneath was reading NodeLiveness state in a read-only way. This PR changes the NodeLiveness mutex to a RWMutex so that those read-only accesses no longer contend with one another. Prior to this change we observed nearly 60% of lock contention on leaseGoodToGo; after it, we observe closer to 20%.

Release note: None
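The change itself is mechanical: the Mutex embedded in NodeLiveness's mu struct becomes a RWMutex, and every accessor that only reads the protected state takes the read lock (RLock/RUnlock) instead of the exclusive lock, so concurrent readers stop serializing behind one another. Below is a minimal, self-contained sketch of that pattern, not CockroachDB code: the nodeLiveness and liveness types are hypothetical stand-ins, and plain sync.RWMutex is used in place of syncutil.RWMutex.

package main

import (
	"fmt"
	"sync"
)

// liveness is a hypothetical stand-in for storagepb.Liveness.
type liveness struct {
	NodeID int
	Live   bool
}

// nodeLiveness sketches the locking pattern from this commit: an embedded
// RWMutex guarding state that is read far more often than it is written.
type nodeLiveness struct {
	mu struct {
		sync.RWMutex // was sync.Mutex; readers now proceed concurrently
		nodes map[int]liveness
	}
}

// getLiveness only reads nl.mu.nodes, so it takes the shared (read) lock;
// any number of goroutines can hold RLock at the same time.
func (nl *nodeLiveness) getLiveness(id int) (liveness, bool) {
	nl.mu.RLock()
	defer nl.mu.RUnlock()
	l, ok := nl.mu.nodes[id]
	return l, ok
}

// setLiveness mutates nl.mu.nodes, so it still takes the exclusive lock.
func (nl *nodeLiveness) setLiveness(l liveness) {
	nl.mu.Lock()
	defer nl.mu.Unlock()
	if nl.mu.nodes == nil {
		nl.mu.nodes = make(map[int]liveness)
	}
	nl.mu.nodes[l.NodeID] = l
}

func main() {
	var nl nodeLiveness
	nl.setLiveness(liveness{NodeID: 1, Live: true})

	// With a plain Mutex these readers would run one at a time; with a
	// RWMutex they can all hold the read lock simultaneously.
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			if l, ok := nl.getLiveness(1); ok {
				fmt.Println("node 1 live:", l.Live)
			}
		}()
	}
	wg.Wait()
}

The usual trade-off applies: a RWMutex generally costs somewhat more per acquisition than a plain Mutex, so the swap pays off where the read side is both hot and contended, which is what the mutex profiles showed here.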
1 parent: a17d619

1 file changed: +15 -15 lines

pkg/storage/node_liveness.go

Lines changed: 15 additions & 15 deletions
@@ -145,7 +145,7 @@ type NodeLiveness struct {
 	metrics   LivenessMetrics

 	mu struct {
-		syncutil.Mutex
+		syncutil.RWMutex
 		callbacks         []IsLiveCallback
 		nodes             map[roachpb.NodeID]storagepb.Liveness
 		heartbeatCallback HeartbeatCallback
@@ -412,9 +412,9 @@ func (nl *NodeLiveness) StartHeartbeat(
 	retryOpts := base.DefaultRetryOptions()
 	retryOpts.Closer = stopper.ShouldQuiesce()

-	nl.mu.Lock()
+	nl.mu.RLock()
 	nl.mu.heartbeatCallback = alive
-	nl.mu.Unlock()
+	nl.mu.RUnlock()

 	stopper.RunWorker(ctx, func(context.Context) {
 		ambient := nl.ambientCtx
@@ -595,8 +595,8 @@ func (nl *NodeLiveness) heartbeatInternal(
 // liveness record successfully, nor received a gossip message containing
 // a former liveness update on restart.
 func (nl *NodeLiveness) Self() (*storagepb.Liveness, error) {
-	nl.mu.Lock()
-	defer nl.mu.Unlock()
+	nl.mu.RLock()
+	defer nl.mu.RUnlock()
 	return nl.getLivenessLocked(nl.gossip.NodeID.Get())
 }

@@ -614,9 +614,9 @@ type IsLiveMap map[roachpb.NodeID]IsLiveMapEntry
 // each node. This excludes nodes that were removed completely (dead +
 // decommissioning).
 func (nl *NodeLiveness) GetIsLiveMap() IsLiveMap {
-	nl.mu.Lock()
-	defer nl.mu.Unlock()
 	lMap := IsLiveMap{}
+	nl.mu.RLock()
+	defer nl.mu.RUnlock()
 	now := nl.clock.Now()
 	maxOffset := nl.clock.MaxOffset()
 	for nID, l := range nl.mu.nodes {
@@ -637,8 +637,8 @@ func (nl *NodeLiveness) GetIsLiveMap() IsLiveMap {
 // every node on the cluster known to gossip. Callers should consider
 // calling (statusServer).NodesWithLiveness() instead where possible.
 func (nl *NodeLiveness) GetLivenesses() []storagepb.Liveness {
-	nl.mu.Lock()
-	defer nl.mu.Unlock()
+	nl.mu.RLock()
+	defer nl.mu.RUnlock()
 	livenesses := make([]storagepb.Liveness, 0, len(nl.mu.nodes))
 	for _, l := range nl.mu.nodes {
 		livenesses = append(livenesses, l)
@@ -650,8 +650,8 @@ func (nl *NodeLiveness) GetLivenesses() []storagepb.Liveness {
 // ErrNoLivenessRecord is returned in the event that nothing is yet
 // known about nodeID via liveness gossip.
 func (nl *NodeLiveness) GetLiveness(nodeID roachpb.NodeID) (*storagepb.Liveness, error) {
-	nl.mu.Lock()
-	defer nl.mu.Unlock()
+	nl.mu.RLock()
+	defer nl.mu.RUnlock()
 	return nl.getLivenessLocked(nodeID)
 }

@@ -932,8 +932,8 @@ func (nl *NodeLiveness) numLiveNodes() int64 {
 	now := nl.clock.Now()
 	maxOffset := nl.clock.MaxOffset()

-	nl.mu.Lock()
-	defer nl.mu.Unlock()
+	nl.mu.RLock()
+	defer nl.mu.RUnlock()

 	self, err := nl.getLivenessLocked(selfID)
 	if err == ErrNoLivenessRecord {
@@ -977,8 +977,8 @@ func (nl *NodeLiveness) AsLiveClock() closedts.LiveClockFn {
 // GetNodeCount returns a count of the number of nodes in the cluster,
 // including dead nodes, but excluding decommissioning or decommissioned nodes.
 func (nl *NodeLiveness) GetNodeCount() int {
-	nl.mu.Lock()
-	defer nl.mu.Unlock()
+	nl.mu.RLock()
+	defer nl.mu.RUnlock()
 	var count int
 	for _, l := range nl.mu.nodes {
 		if !l.Decommissioning {

0 commit comments