@@ -57,26 +57,74 @@ var snapshotIngestAsWriteThreshold = settings.RegisterByteSizeSetting(
57
57
}())
58
58
59
59
// replicaRaftStorage implements the raft.Storage interface.
60
+ //
61
+ // All mutating calls to raft.RawNode require that r.mu is held. All read-only
62
+ // calls to raft.RawNode require that r.mu is held at least for reads.
63
+ //
64
+ // All methods implementing raft.Storage are called from within, or on behalf of
65
+ // a RawNode. When called from within RawNode, r.mu is held necessarily (and
66
+ // maybe r.raftMu). Conceptually, r.mu only needs to be locked for reading, but
67
+ // implementation details may require an exclusive lock (see method comments).
68
+ // When called from outside RawNode (on behalf of a RawNode "snapshot"), the
69
+ // caller must hold r.raftMu and/or r.mu.
70
+ //
71
+ // RawNode has the in-memory "unstable" state which services most of its needs.
72
+ // Most RawNode.Step updates are completed in memory, while only holding r.mu.
73
+ //
74
+ // RawNode falls back to reading from Storage when it does not have the needed
75
+ // state in memory. For example, the leader may need to read log entries from
76
+ // storage to construct a log append request for a follower, or a follower may
77
+ // need to interact with its storage upon receiving such a request to check
78
+ // whether the appended log slice is consistent with raft rules.
79
+ //
80
+ // (1) RawNode guarantees that everything it reads from Storage has no in-flight
81
+ // writes. Raft always reads state that it knows to be stable (meaning it does
82
+ // not have pending writes) and, in some cases, also synced / durable. Storage
83
+ // acknowledges completed writes / syncs back to RawNode, under r.mu, so that
84
+ // RawNode can correctly implement this guarantee.
85
+ //
86
+ // (2) The content of raft.Storage is always mutated while holding r.raftMu,
87
+ // which is an un-contended "IO" mutex and is allowed to be held longer. Most
88
+ // writes are extracted from RawNode while holding r.raftMu and r.mu (in the
89
+ // Ready() loop), and handed over to storage under r.raftMu. There are a few
90
+ // cases when CRDB synthesizes the writes (e.g. during a range split / merge, or
91
+ // raft log truncations) under r.raftMu.
92
+ //
93
+ // The guarantees explain why holding only r.mu is sufficient for RawNode or its
94
+ // snapshot to be in a consistent state. Under r.mu, new writes are blocked,
95
+ // because of (2), and by (1) reads never conflict with the in-flight writes.
96
+ //
97
+ // However, r.mu is a widely used mutex, and not recommended for IO. When doing
98
+ // work on behalf RawNode that involves IO (like constructing log appends for a
99
+ // follower), we would like to release r.mu. The two guarantees make it possible
100
+ // to observe a consistent RawNode snapshot while only holding r.raftMu.
101
+ //
102
+ // While both r.raftMu and r.mu are held, we can take a shallow / COW copy of
103
+ // the RawNode or its relevant subset (e.g. the raft log; the Ready struct is
104
+ // also considered such). A subsequent release of r.mu allows RawNode to resume
105
+ // making progress. The raft.Storage does not observe any new writes while
106
+ // r.raftMu is still held, by the guarantee (2). Combined with guarantee (1), it
107
+ // means that both the original and the snapshot RawNode remain consistent. The
108
+ // shallow copy represents a valid past state of the RawNode.
109
+ //
110
+ // TODO(pav-kv): the snapshotting with only r.raftMu held is not implemented,
111
+ // but should be done soon.
112
+ //
113
+ // All the implementation methods assume that the required locks are held, and
114
+ // don't acquire them. The specific locking requirements are noted in each
115
+ // method's comment. The method names do not follow our "Locked" naming
116
+ // conventions, due to being an implementation of raft.Storage interface from a
117
+ // different package.
118
+ //
119
+ // Many of the methods defined in this file are wrappers around static
120
+ // functions. This is done to facilitate their use from Replica.Snapshot(),
121
+ // where it is important that all the data that goes into the snapshot comes
122
+ // from a consistent view of the database, and not the replica's in-memory state
123
+ // or via a reference to Replica.store.Engine().
60
124
type replicaRaftStorage Replica
61
125
62
126
var _ raft.Storage = (* replicaRaftStorage )(nil )
63
127
64
- // All calls to raft.RawNode require that both Replica.raftMu and
65
- // Replica.mu are held. All of the functions exposed via the
66
- // raft.Storage interface will in turn be called from RawNode, so none
67
- // of these methods may acquire either lock, but they may require
68
- // their caller to hold one or both locks (even though they do not
69
- // follow our "Locked" naming convention). Specific locking
70
- // requirements (e.g. whether r.mu must be held for reading or writing)
71
- // are noted in each method's comments.
72
- //
73
- // Many of the methods defined in this file are wrappers around static
74
- // functions. This is done to facilitate their use from
75
- // Replica.Snapshot(), where it is important that all the data that
76
- // goes into the snapshot comes from a consistent view of the
77
- // database, and not the replica's in-memory state or via a reference
78
- // to Replica.store.Engine().
79
-
80
128
// InitialState implements the raft.Storage interface.
81
129
func (r * replicaRaftStorage ) InitialState () (raftpb.HardState , raftpb.ConfState , error ) {
82
130
// The call must synchronize with raft IO. Called when raft is initialized
0 commit comments