
Commit 84ae1cd (parent 79e76a1)

raft: support sending MsgApp from a log snapshot

Epic: none
Release note: none

3 files changed: +107 -0 lines


pkg/kv/kvserver/replica_raft.go (+60)
@@ -1431,6 +1431,66 @@ func (r *Replica) tick(
	return true, nil
}

// processMsgApps sends a MsgApp to all peers whose send stream is ready to
// send.
//
// FIXME: find the right placement in the RACv2 code. Potentially this just
// needs to be inlined into the Ready handler.
func (r *Replica) processMsgApps(_ context.Context) error {
	r.raftMu.Lock()
	defer r.raftMu.Unlock()

	// We are the leader at the given term.
	var term uint64 // FIXME: we should know it

	// Grab the snapshot of the log, if we are still the leader of the term. This
	// only locks Replica.mu for reads, and returns quickly. No IO is performed.
	var logSnap raft.LogSnapshot
	if !func() bool {
		r.mu.RLock()
		defer r.mu.RUnlock()
		rg := r.mu.internalRaftGroup
		// We need to be the leader of the given term to be able to send MsgApps.
		if rg.Term() != term || rg.Lead() != raftpb.PeerID(r.replicaID) {
			return false
		}
		logSnap = rg.LogSnapshot()
		return true
	}() {
		return nil
	}

	// We are still holding raftMu, so it is safe to use the log snapshot for
	// constructing MsgApps. The log will not be mutated in storage. This may
	// incur storage reads.
	//
	// FIXME: iterate over all peers to whom we should send a MsgApp.
	slices := make(map[roachpb.ReplicaID]raft.LogSlice, 5)
	for peer := roachpb.ReplicaID(0); peer < 1; peer++ {
		// FIXME: should know the parameters, as instructed by the send streams.
		var after, last, maxSize uint64
		slices[peer] = logSnap.LogSlice(after, last, maxSize)
	}
	if len(slices) == 0 { // nothing to send
		return nil
	}

	// Now grab Replica.mu again (for writes), and send the MsgApp messages. No
	// IO happens here. The messages are stashed in the RawNode message queue,
	// and will be dispatched with the next Ready handling. Make sure to do all
	// this right before the raft scheduler runs the Ready handler, to minimize
	// latency.
	return r.withRaftGroup(func(rn *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) {
		for peer, slice := range slices {
			// NB: the message sending can fail here if we lost leadership in the
			// meantime, or if the Next index is misaligned with the passed-in
			// slice.
			//
			// Potentially we need to update the send stream accordingly from here.
			_ = rn.SendMsgApp(raftpb.PeerID(peer), slice)
		}
		return true, nil
	})
}

func (r *Replica) processRACv2PiggybackedAdmitted(ctx context.Context) {
	r.raftMu.Lock()
	defer r.raftMu.Unlock()
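
The comments above describe a three-phase locking pattern: grab the snapshot under a read lock, do the storage reads while holding only raftMu, then briefly take the write lock to enqueue the messages. A condensed sketch of that pattern, with buildSlices and sendAll as hypothetical stand-ins for the loops in processMsgApps:

func (r *Replica) msgAppPattern() error {
	r.raftMu.Lock() // held throughout: the log storage is not mutated under
	defer r.raftMu.Unlock() // raftMu, so the snapshot stays valid for the call

	r.mu.RLock() // phase 1: grab the snapshot (leadership check elided); no IO
	snap := r.mu.internalRaftGroup.LogSnapshot()
	r.mu.RUnlock()

	slices := buildSlices(snap) // phase 2: storage reads; Replica.mu not held

	// Phase 3: brief write-locked section. Messages only enter the RawNode
	// queue here; the next Ready handling dispatches them.
	return r.withRaftGroup(func(rn *raft.RawNode) (bool, error) {
		return true, sendAll(rn, slices)
	})
}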

pkg/raft/raft.go (+24)
@@ -660,6 +660,30 @@ func (r *raft) maybeSendAppend(to pb.PeerID) bool {
	return true
}

// sendMsgApp sends a MsgApp containing the given log slice to the given peer,
// provided that this node is the leader of the slice's term, the peer is in
// StateReplicate, and the slice starts exactly at the peer's Next index.
// Returns true iff the message was sent.
func (r *raft) sendMsgApp(to pb.PeerID, ls logSlice) bool {
	if r.state != StateLeader || r.Term != ls.term {
		return false
	}
	pr := r.trk.Progress(to)
	if pr == nil || pr.State != tracker.StateReplicate || pr.Next != ls.prev.index+1 {
		return false
	}
	commit := r.raftLog.committed
	// Send the MsgApp, and update the progress accordingly.
	r.send(pb.Message{
		To:      to,
		Type:    pb.MsgApp,
		Index:   ls.prev.index,
		LogTerm: ls.prev.term,
		Entries: ls.entries,
		Commit:  commit,
		Match:   pr.Match,
	})
	pr.SentEntries(len(ls.entries), uint64(payloadsSize(ls.entries)))
	pr.MaybeUpdateSentCommit(commit)
	return true
}

// maybeSendSnapshot fetches a snapshot from Storage, and sends it to the given
// node. Returns true iff the snapshot message has been emitted successfully.
func (r *raft) maybeSendSnapshot(to pb.PeerID, pr *tracker.Progress) bool {
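
To make the alignment condition concrete, here is a hypothetical slice and the check it must pass (entryID is the assumed name of the index/term pair type behind ls.prev; the values are for illustration only):

ls := logSlice{
	term: 5,                           // leader term the slice was read under
	prev: entryID{index: 11, term: 4}, // entry immediately preceding the slice
	entries: []pb.Entry{{Index: 12, Term: 5}, {Index: 13, Term: 5}},
}
// Sendable only to a peer whose Progress has State == StateReplicate and
// Next == 12 (== ls.prev.index+1), and only while r.Term == 5. The emitted
// MsgApp carries (Index=11, LogTerm=4), which is exactly what the follower
// uses for raft's log-matching consistency check.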

pkg/raft/rawnode.go (+23)
@@ -115,6 +115,29 @@ func (rn *RawNode) Step(m pb.Message) error {
	return rn.raft.Step(m)
}

// LogSnapshot returns the point-in-time state of the raft log.
//
// To use the returned log snapshot correctly (see the SendMsgApp method), the
// caller must ensure that the log storage is not mutated between this call and
// the use of the snapshot.
func (rn *RawNode) LogSnapshot() LogSnapshot {
	return rn.raft.raftLog.snap()
}

// SendMsgApp conditionally sends a MsgApp message containing the given log
// slice to the given peer.
//
// The message can be sent only if all of the following conditions are true:
//   - this node is the leader of the term to which the slice corresponds
//   - the given peer exists
//   - the replication flow to the given peer is in StateReplicate
//   - the first slice index matches the Next index to send to this peer
//
// Returns true iff the message was sent.
func (rn *RawNode) SendMsgApp(to pb.PeerID, slice LogSlice) bool {
	return rn.raft.sendMsgApp(to, slice)
}

// Ready returns the outstanding work that the application needs to handle. This
// includes appending and applying entries or a snapshot, updating the HardState,
// and sending messages. The returned Ready() *must* be handled and subsequently
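
Taken together, the two methods support the flow that processMsgApps above is sketching: snapshot once, slice per peer, hand each slice back. A minimal usage sketch, assuming the peer list and the slice bounds (after, last, maxSize) are supplied by the caller (e.g. the RACv2 send streams):

// Sketch only: peers, after, last and maxSize are hypothetical inputs. The
// caller must keep the log storage unmutated (in CockroachDB, by holding
// raftMu) until the sends are done.
snap := rn.LogSnapshot() // point-in-time view of the log; no IO
for _, peer := range peers {
	slice := snap.LogSlice(after, last, maxSize) // may read log storage
	if !rn.SendMsgApp(peer, slice) {
		// Leadership changed, the peer is unknown or not in StateReplicate,
		// or the slice does not start at the peer's Next index.
	}
}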
