From 6037462dfa080c23df509db0d766d5338eccef21 Mon Sep 17 00:00:00 2001 From: Michal Maslanka Date: Fri, 9 Feb 2024 11:04:18 +0100 Subject: [PATCH 1/2] r/consensus: extend confirmed term semantics to apply for the follower A confirmed term is used to determine if the state of a replica is up to date after the leader election. Only once the confirmed term is equal to the current term can one reason about the Raft group state. On the leader the confirmed term is updated after the first successful replication of a batch subsequent to a leader election. After the replication succeeds, the leader is guaranteed to have up-to-date committed and visible offsets. On the follower the confirmed term is updated only when an append entries request from the current leader may be accepted and the follower may return success. Signed-off-by: Michal Maslanka --- src/v/raft/consensus.cc | 3 ++- src/v/raft/consensus.h | 26 +++++++++++++++----------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc index 50b5a132958a..7b882ad53eb8 100644 --- a/src/v/raft/consensus.cc +++ b/src/v/raft/consensus.cc @@ -1976,7 +1976,7 @@ consensus::do_append_entries(append_entries_request&& r) { maybe_update_last_visible_index(last_visible); _last_leader_visible_offset = std::max( request_metadata.last_visible_index, _last_leader_visible_offset); - + _confirmed_term = _term; if (_follower_recovery_state) { vlog( _ctxlog.debug, @@ -2095,6 +2095,7 @@ consensus::do_append_entries(append_entries_request&& r) { maybe_update_last_visible_index(last_visible); _last_leader_visible_offset = std::max( m.last_visible_index, _last_leader_visible_offset); + _confirmed_term = _term; return maybe_update_follower_commit_idx(model::offset(m.commit_index)) .then([this, m, ofs, target] { if (_follower_recovery_state) { diff --git a/src/v/raft/consensus.h b/src/v/raft/consensus.h index 5613a912c880..80f8c9b0b91a 100644 --- a/src/v/raft/consensus.h +++ 
b/src/v/raft/consensus.h @@ -782,17 +782,21 @@ class consensus { // consensus state model::offset _commit_index; model::term_id _term; - // It's common to use raft log as a foundation for state machines: - // when a node becomes a leader it replays the log, reconstructs - // the state and becomes ready to serve the requests. However it is - // not enough for a node to become a leader, it should successfully - // replicate a new record to be sure that older records stored in - // the local log were actually replicated and do not constitute an - // artifact of the previously crashed leader. Redpanda uses a confi- - // guration batch for the initial replication to gain certainty. When - // commit index moves past the configuration batch _confirmed_term - // gets updated. So when _term==_confirmed_term it's safe to use - // local log to reconstruct the state. + + /** + * A confirmed term is used to determine if the state of a replica is up to + * date after the leader election. Only once the confirmed term is equal to + * the current term can one reason about the Raft group state. + * + * On the leader the confirmed term is updated after the first successful + * replication of a batch subsequent to a leader election. After the + * replication succeeds, the leader is guaranteed to have up-to-date + * committed and visible offsets. + * + * On the follower the confirmed term is updated only when an append entries + * request from the current leader may be accepted and the follower may + * return success. + */ model::term_id _confirmed_term; model::offset _flushed_offset{}; From e746f79bd85a288b378a1dcbcdb6d0c9cf25f853 Mon Sep 17 00:00:00 2001 From: Michal Maslanka Date: Fri, 9 Feb 2024 11:05:51 +0100 Subject: [PATCH 2/2] k/replicated_partition: use confirmed term as a source of leader epoch A leader epoch is used by Kafka clients to determine if a replica is up to date with the leader and to detect truncation. 
The leader epoch differs from the Raft term: the term is updated when a leader election starts, whereas the leader epoch is updated after the state of the replica is determined. Therefore the leader epoch uses the confirmed term instead of the simple term, which is incremented every time a leader election starts. Signed-off-by: Michal Maslanka --- src/v/kafka/server/replicated_partition.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/v/kafka/server/replicated_partition.h b/src/v/kafka/server/replicated_partition.h index a1239557875e..646c5b52143e 100644 --- a/src/v/kafka/server/replicated_partition.h +++ b/src/v/kafka/server/replicated_partition.h @@ -173,9 +173,18 @@ class replicated_partition final : public kafka::partition_proxy::impl { ss::future> get_leader_epoch_last_offset(kafka::leader_epoch) const final; - + /** + * A leader epoch is used by Kafka clients to determine if a replica is up + * to date with the leader and to detect truncation. + * + * The leader epoch differs from the Raft term: the term is updated when a + * leader election starts, whereas the leader epoch is updated after the + * state of the replica is determined. Therefore the leader epoch uses the + * confirmed term instead of the simple term, which is incremented every + * time a leader election starts. + */ kafka::leader_epoch leader_epoch() const final { - return leader_epoch_from_term(_partition->term()); + return leader_epoch_from_term(_partition->raft()->confirmed_term()); } ss::future validate_fetch_offset(