From 7f347934d25b5517002ff2843b0517e0b4785cbf Mon Sep 17 00:00:00 2001
From: drdr xp
Date: Thu, 16 Sep 2021 01:10:55 +0800
Subject: [PATCH] change: simplify membership change

- Change: if leadership is lost, the cluster is left with the **joint** config.
  A client that does not receive a response to a change-membership request should
  always re-send it, to ensure the membership config is applied.
- Change: remove the joint-to-uniform logic from RaftCore, which brought a lot of
  complexity to the raft impl. This logic is now done in Raft (which is a shell
  controlling RaftCore).
- Change: RaftCore.membership is changed to `ActiveMembership`, which includes a
  log id and a membership config. This lets raft check whether a membership config
  is committed by comparing its log index with the committed index.
- Change: when adding an existing non-voter, it returns an `Ok` value instead of
  an `Err`.
- Change: add arg `blocking` to `add_non_voter` and `change_membership`. A blocking
  `change_membership` still waits for the two config-change logs to commit;
  `blocking` only indicates whether to wait for replication to non-voters to be up
  to date.
- Change: remove `non_voters`. Merge it into `nodes`: now both voters and
  non-voters share the same replication handle.
- Change: remove field `ReplicationState.is_ready_to_join`; it can simply be
  calculated when needed.
- Change: remove `is_stepping_down`; `membership.contains()` is quite enough.
- Change: remove `consensus_state`.
---
 async-raft/src/core/admin.rs                  | 281 +++++++-----------
 async-raft/src/core/append_entries.rs         |   9 +-
 async-raft/src/core/client.rs                 | 226 +++++++-------
 async-raft/src/core/mod.rs                    | 131 ++++----
 async-raft/src/core/replication.rs            | 154 +++-------
 async-raft/src/core/replication_state_test.rs |  37 +++
 async-raft/src/core/vote.rs                   |   5 +-
 async-raft/src/error.rs                       |  23 +-
 async-raft/src/lib.rs                         |   1 +
 async-raft/src/metrics.rs                     |  12 +-
 async-raft/src/metrics_wait_test.rs           |  22 +-
 async-raft/src/raft.rs                        | 186 ++++++++----
 async-raft/src/replication/mod.rs             |  48 +--
 async-raft/src/storage.rs                     |  16 +-
 async-raft/tests/add_remove_voter.rs          |  45 +--
 async-raft/tests/fixtures/mod.rs              |  52 +++-
 async-raft/tests/initialization.rs            |  12 +-
 async-raft/tests/leader_metrics.rs            |  72 +++--
 async-raft/tests/members.rs                   | 127 ++++++++
 .../tests/members_leader_fix_partial.rs       |  51 ++--
 async-raft/tests/non_voter_add.rs             | 119 ++++++++
 .../tests/snapshot_overrides_membership.rs    |   4 +-
 .../snapshot_uses_prev_snap_membership.rs     |   4 +-
 .../tests/state_machien_apply_membership.rs   |  34 ++-
 async-raft/tests/stepdown.rs                  |   2 +-
 memstore/src/lib.rs                           |  34 ++-
 memstore/src/test.rs                          |  38 ++-
 27 files changed, 1022 insertions(+), 723 deletions(-)
 create mode 100644 async-raft/src/core/replication_state_test.rs
 create mode 100644 async-raft/tests/members.rs
 create mode 100644 async-raft/tests/non_voter_add.rs

diff --git a/async-raft/src/core/admin.rs b/async-raft/src/core/admin.rs
index 064a7fc40..caf71b5e0 100644
--- a/async-raft/src/core/admin.rs
+++ b/async-raft/src/core/admin.rs
@@ -1,10 +1,10 @@
 use std::collections::BTreeSet;
-use std::collections::HashSet;
+use std::sync::Arc;
+use crate::core::client::ClientOrInternalResponseTx;
 use crate::core::client::ClientRequestEntry;
-use crate::core::ConsensusState;
+use crate::core::ActiveMembership;
 use crate::core::LeaderState;
-use crate::core::NonVoterReplicationState;
 use crate::core::NonVoterState;
 use crate::core::State;
 use crate::core::UpdateCurrentLeader;
@@ -12,10 +12,12 @@ use crate::error::ChangeConfigError;
 use crate::error::InitializeError;
 use
crate::raft::ClientWriteRequest; use crate::raft::MembershipConfig; +use crate::raft::RaftResponse; use crate::raft::ResponseTx; use crate::replication::RaftEvent; use crate::AppData; use crate::AppDataResponse; +use crate::LogId; use crate::NodeId; use crate::RaftError; use crate::RaftNetwork; @@ -40,15 +42,18 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // Build a new membership config from given init data & assign it as the new cluster // membership config in memory only. - self.core.membership = MembershipConfig { - members, - members_after_consensus: None, + self.core.membership = ActiveMembership { + log_id: LogId { term: 1, index: 1 }, + membership: MembershipConfig { + members, + members_after_consensus: None, + }, }; // Become a candidate and start campaigning for leadership. If this node is the only node // in the cluster, then become leader without holding an election. If members len == 1, we // know it is our ID due to the above code where we ensure our own ID is present. - if self.core.membership.members.len() == 1 { + if self.core.membership.membership.members.len() == 1 { self.core.current_term += 1; self.core.voted_for = Some(self.core.id); self.core.set_target_state(State::Leader); @@ -64,50 +69,72 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage> LeaderState<'a, D, R, N, S> { /// Add a new node to the cluster as a non-voter, bringing it up-to-speed, and then responding /// on the given channel. - #[tracing::instrument(level = "trace", skip(self, tx))] - pub(super) fn add_member(&mut self, target: NodeId, tx: ResponseTx) { - // Ensure the node doesn't already exist in the current config, in the set of new nodes - // alreading being synced, or in the nodes being removed. - if self.core.membership.members.contains(&target) - || self - .core - .membership - .members_after_consensus - .as_ref() - .map(|new| new.contains(&target)) - .unwrap_or(false) - || self.non_voters.contains_key(&target) - { + #[tracing::instrument(level = "debug", skip(self, tx))] + pub(super) fn add_member(&mut self, target: NodeId, tx: ResponseTx, blocking: bool) { + // Ensure the node doesn't already exist in the replication. + if target == self.core.id || self.nodes.contains_key(&target) { tracing::debug!("target node is already a cluster member or is being synced"); - let _ = tx.send(Err(ChangeConfigError::Noop.into())); + let _ = tx.send(Ok(RaftResponse::NoChange)); return; } - // Spawn a replication stream for the new member. Track state as a non-voter so that it - // can be updated to be added to the cluster config once it has been brought up-to-date. - let state = self.spawn_replication_stream(target); - self.non_voters.insert(target, NonVoterReplicationState { - state, - is_ready_to_join: false, - tx: Some(tx), - }); + if blocking { + let state = self.spawn_replication_stream(target, Some(tx)); + self.nodes.insert(target, state); + } else { + let state = self.spawn_replication_stream(target, None); + self.nodes.insert(target, state); + + // non-blocking mode, do not know about the replication stat. 
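+            // A zero LogId is returned only as a placeholder: in non-blocking mode the leader does
+            // not yet know how far the new non-voter has replicated.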
+ let _ = tx.send(Ok(RaftResponse::LogId { + log_id: LogId { term: 0, index: 0 }, + })); + } } #[tracing::instrument(level = "trace", skip(self, tx))] - pub(super) async fn change_membership(&mut self, members: BTreeSet, tx: ResponseTx) { + pub(super) async fn change_membership(&mut self, members: BTreeSet, wait: bool, tx: ResponseTx) { // Ensure cluster will have at least one node. if members.is_empty() { let _ = tx.send(Err(ChangeConfigError::InoperableConfig.into())); return; } - // Only allow config updates when currently in a uniform consensus state. - match &self.consensus_state { - ConsensusState::Uniform => (), - ConsensusState::NonVoterSync { .. } | ConsensusState::Joint { .. } => { - let _ = tx.send(Err(ChangeConfigError::ConfigChangeInProgress.into())); + // The last membership config is not committed yet. + // Can not process the next one. + if self.core.commit_index < self.core.membership.log_id.index { + let _ = tx.send(Err(ChangeConfigError::ConfigChangeInProgress { + membership_log_id: self.core.membership.log_id, + } + .into())); + return; + } + + let new_config; + + let curr = &self.core.membership.membership; + + if let Some(ref next_membership) = curr.members_after_consensus { + // When it is in joint state, it is only allowed to change to the `members_after_consensus` + if &members != next_membership { + let _ = tx.send(Err(ChangeConfigError::Incompatible { + curr: curr.clone(), + to: members, + } + .into())); return; + } else { + new_config = MembershipConfig { + members: next_membership.clone(), + members_after_consensus: None, + }; } + } else { + // currently it is uniform config, enter joint state + new_config = MembershipConfig { + members: curr.members.clone(), + members_after_consensus: Some(members.clone()), + }; } // Check the proposed config for any new nodes. If ALL new nodes already have replication @@ -117,63 +144,37 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // Here, all we do is check to see which nodes still need to be synced, which determines // if we can proceed. - // TODO(xp): test change membership without adding as non-voter. - - let mut awaiting = HashSet::new(); - for new_node in members.difference(&self.core.membership.members) { - match self.non_voters.get(&new_node) { + for new_node in members.difference(&self.core.membership.membership.members) { + match self.nodes.get(&new_node) { // Node is ready to join. - Some(node) if node.is_ready_to_join => continue, + Some(node) => { + if node.is_line_rate(&self.core.last_log_id, &self.core.config) { + continue; + } - // Node has repl stream, but is not yet ready to join. - Some(_) => (), + if !wait { + // Node has repl stream, but is not yet ready to join. + let _ = tx.send(Err(ChangeConfigError::NonVoterIsLagging { + node_id: *new_node, + distance: self.core.last_log_id.index.saturating_sub(node.matched.index), + } + .into())); + return; + } + } // Node does not yet have a repl stream, spawn one. None => { - // Spawn a replication stream for the new member. Track state as a non-voter so that it - // can be updated to be added to the cluster config once it has been brought up-to-date. 
- let state = self.spawn_replication_stream(*new_node); - self.non_voters.insert(*new_node, NonVoterReplicationState { - state, - is_ready_to_join: false, - tx: None, - }); + let _ = tx.send(Err(ChangeConfigError::NonVoterNotFound { node_id: *new_node }.into())); + return; } } - awaiting.insert(*new_node); - } - // If there are new nodes which need to sync, then we need to wait until they are synced. - // Once they've finished, this routine will be called again to progress further. - if !awaiting.is_empty() { - self.consensus_state = ConsensusState::NonVoterSync { awaiting, members, tx }; - return; } - // Enter into joint consensus if we are not awaiting any new nodes. - if !members.contains(&self.core.id) { - self.is_stepping_down = true; - } - self.consensus_state = ConsensusState::Joint { is_committed: false }; - self.core.membership.members_after_consensus = Some(members.clone()); - - // Create final_config first, the joint config may be committed at once if the cluster has only 1 node - // and changes core.membership. - let final_config = MembershipConfig { - members: members.clone(), - members_after_consensus: None, - }; - - let joint_config = self.core.membership.clone(); - - let res = self.append_membership_log(joint_config, None).await; + let res = self.append_membership_log(new_config, Some(tx)).await; if let Err(e) = res { tracing::error!("append joint log error: {:?}", e); } - - let res = self.append_membership_log(final_config, Some(tx)).await; - if let Err(e) = res { - tracing::error!("append final log error: {:?}", e); - } } #[tracing::instrument(level = "trace", skip(self, resp_tx), fields(id=self.core.id))] @@ -182,8 +183,17 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage mem: MembershipConfig, resp_tx: Option, ) -> Result<(), RaftError> { - let payload = ClientWriteRequest::::new_config(mem); + let payload = ClientWriteRequest::::new_config(mem.clone()); let res = self.append_payload_to_log(payload.entry).await; + + // Caveat: membership must be updated before commit check is done with the new config. + self.core.membership = ActiveMembership { + log_id: self.core.last_log_id, + membership: mem, + }; + + self.leader_report_metrics(); + let entry = match res { Ok(entry) => entry, Err(err) => { @@ -198,103 +208,27 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage } }; - let cr_entry = ClientRequestEntry::from_entry(entry, resp_tx); + let cr_entry = ClientRequestEntry { + entry: Arc::new(entry), + tx: ClientOrInternalResponseTx::Internal(resp_tx), + }; + self.replicate_client_request(cr_entry).await; Ok(()) } - /// Handle the commitment of a joint consensus cluster configuration. - #[tracing::instrument(level = "trace", skip(self))] - pub(super) fn handle_joint_consensus_committed(&mut self) { - if let ConsensusState::Joint { is_committed, .. } = &mut self.consensus_state { - *is_committed = true; // Mark as committed. - } - // Only proceed to finalize this joint consensus if there are no remaining nodes being synced. - if self.consensus_state.is_joint_consensus_safe_to_finalize() { - self.update_replication_state(); - self.finalize_joint_consensus(); - } - } - - /// When the joint membership is committed(not the uniform membership), - /// a new added node turns from a NonVoter to a Follower. - /// Thus we need to move replication state from `non_voters` to `nodes`. + /// Handle the commitment of a uniform consensus cluster configuration. 
/// - /// There are two place in this code base where `nodes` are changed: - /// - When a leader is established it adds all node_id found in `membership` to `nodes`. - /// - When membership change is committed, i.e., a joint membership or a uniform membership. - #[tracing::instrument(level = "trace", skip(self))] - pub(super) fn update_replication_state(&mut self) { - tracing::debug!("update_replication_state"); - - let new_node_ids = self - .core - .membership - .all_nodes() - .into_iter() - .filter(|elem| elem != &self.core.id) - .collect::>(); - - let old_node_ids = self.core.membership.members.clone(); - let node_ids_to_add = new_node_ids.difference(&old_node_ids); - - // move replication state from non_voters to nodes. - for node_id in node_ids_to_add { - if !self.non_voters.contains_key(node_id) { - // Just a probe for bug - panic!( - "joint membership contains node_id:{} not in non_voters:{:?}", - node_id, - self.non_voters.keys().collect::>() - ); - } - - if self.nodes.contains_key(node_id) { - // Just a probe for bug - panic!( - "joint membership contains an existent node_id:{} in nodes:{:?}", - node_id, - self.nodes.keys().collect::>() - ); - } - - let non_voter_state = self.non_voters.remove(node_id).unwrap(); - self.nodes.insert(*node_id, non_voter_state.state); - } - } - - /// Finalize the committed joint consensus. - #[tracing::instrument(level = "trace", skip(self))] - pub(super) fn finalize_joint_consensus(&mut self) { - // Only proceed if it is safe to do so. - if !self.consensus_state.is_joint_consensus_safe_to_finalize() { - tracing::error!("attempted to finalize joint consensus when it was not safe to do so"); - return; - } - - // Cut the cluster config over to the new membership config. - if let Some(new_members) = self.core.membership.members_after_consensus.take() { - self.core.membership.members = new_members; - } - self.consensus_state = ConsensusState::Uniform; - - // NOTE WELL: this implementation uses replication streams (src/replication/**) to replicate - // entries. Nodes which do not exist in the new config will still have an active replication - // stream until the current leader determines that they have replicated the config entry which - // removes them from the cluster. At that point in time, the node will revert to non-voter state. - // - // HOWEVER, if an election takes place, the new leader will not have the old nodes in its config - // and the old nodes may not revert to non-voter state using the above mechanism. That is fine. - // The Raft spec accounts for this using the 3rd safety measure of cluster configuration changes - // described at the very end of §6. This measure is already implemented and in place. - } + /// This is ony called by leader. + #[tracing::instrument(level = "debug", skip(self))] + pub(super) fn handle_uniform_consensus_committed(&mut self, log_id: &LogId) { + let index = log_id.index; - /// Handle the commitment of a uniform consensus cluster configuration. - #[tracing::instrument(level = "trace", skip(self))] - pub(super) fn handle_uniform_consensus_committed(&mut self, index: u64) { // Step down if needed. 
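+        // The `is_stepping_down` flag is gone: the leader decides to step down simply by checking
+        // whether the just-committed membership still contains its own id.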
- if self.is_stepping_down { + if !self.core.membership.membership.contains(&self.core.id) { + // } + // if self.is_stepping_down { tracing::debug!("raft node is stepping down"); self.core.set_target_state(State::NonVoter); self.core.update_current_leader(UpdateCurrentLeader::Unknown); @@ -304,7 +238,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // Remove any replication streams which have replicated this config & which are no longer // cluster members. All other replication streams which are no longer cluster members, but // which have not yet replicated this config will be marked for removal. - let membership = &self.core.membership; + let membership = &self.core.membership.membership; let nodes_to_remove: Vec<_> = self .nodes .iter_mut() @@ -320,14 +254,13 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage .collect(); let follower_ids: Vec = self.nodes.keys().cloned().collect(); - let non_voter_ids: Vec = self.non_voters.keys().cloned().collect(); tracing::debug!("nodes: {:?}", follower_ids); - tracing::debug!("non_voters: {:?}", non_voter_ids); tracing::debug!("membership: {:?}", self.core.membership); tracing::debug!("nodes_to_remove: {:?}", nodes_to_remove); for target in nodes_to_remove { tracing::debug!(target, "removing target node from replication pool"); + // TODO(xp): just drop the replication then the task will be terminated. if let Some(node) = self.nodes.remove(&target) { let _ = node.replstream.repl_tx.send((RaftEvent::Terminate, tracing::debug_span!("CH"))); diff --git a/async-raft/src/core/append_entries.rs b/async-raft/src/core/append_entries.rs index 5a93c4c6a..d5e6259c5 100644 --- a/async-raft/src/core/append_entries.rs +++ b/async-raft/src/core/append_entries.rs @@ -8,6 +8,7 @@ use crate::raft::AppendEntriesResponse; use crate::raft::ConflictOpt; use crate::raft::Entry; use crate::raft::EntryPayload; +use crate::ActiveMembership; use crate::AppData; use crate::AppDataResponse; use crate::LogId; @@ -378,13 +379,17 @@ impl, S: RaftStorage> Ra let last_conf_change = entries .iter() .filter_map(|ent| match &ent.payload { - EntryPayload::ConfigChange(conf) => Some(conf), + EntryPayload::ConfigChange(conf) => Some(ActiveMembership { + log_id: ent.log_id, + membership: conf.membership.clone(), + }), _ => None, }) .last(); + if let Some(conf) = last_conf_change { tracing::debug!({membership=?conf}, "applying new membership config received from leader"); - self.update_membership(conf.membership.clone())?; + self.update_membership(conf)?; }; // Replicate entries to log (same as append, but in follower mode). diff --git a/async-raft/src/core/client.rs b/async-raft/src/core/client.rs index 5f45bf4bf..52ddcf321 100644 --- a/async-raft/src/core/client.rs +++ b/async-raft/src/core/client.rs @@ -1,4 +1,3 @@ -use std::collections::BTreeSet; use std::sync::Arc; use anyhow::anyhow; @@ -25,6 +24,7 @@ use crate::raft::ClientWriteResponse; use crate::raft::ClientWriteResponseTx; use crate::raft::Entry; use crate::raft::EntryPayload; +use crate::raft::RaftResponse; use crate::raft::ResponseTx; use crate::replication::RaftEvent; use crate::AppData; @@ -42,20 +42,12 @@ pub(super) struct ClientRequestEntry { /// This value is Arc'd so that it may be sent across thread boundaries for replication /// without having to clone the data payload itself. pub entry: Arc>, + /// The response channel for the request. 
+ /// TODO(xp): make it an Option pub tx: ClientOrInternalResponseTx, } -impl ClientRequestEntry { - /// Create a new instance from the raw components of a client request. - pub(crate) fn from_entry>>(entry: Entry, tx: T) -> Self { - Self { - entry: Arc::new(entry), - tx: tx.into(), - } - } -} - impl MessageSummary for ClientRequestEntry { fn summary(&self) -> String { format!("entry:{}", self.entry.summary()) @@ -78,45 +70,21 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage let last_index = self.core.last_log_id.index; let req: ClientWriteRequest = if last_index == 0 { - ClientWriteRequest::new_config(self.core.membership.clone()) + ClientWriteRequest::new_config(self.core.membership.membership.clone()) } else { - // Complete a partial member-change: - // - // Raft appends two consecutive membership change logs: the joint config and the final config, - // to impl a membership change. - // - // It is possible only the first one, the joint config log is written in to storage or replicated. - // Thus if a new leader sees only the first one, it needs to append the final config log to let - // the change-membership operation to finish. - - let last_logs = self - .core - .storage - .get_log_entries(last_index..=last_index) - .await - .map_err(|x| RaftError::RaftStorage(x.into()))?; - let last_log = &last_logs[0]; - - let req = match last_log.payload { - EntryPayload::ConfigChange(ref mem) => { - if mem.membership.members_after_consensus.is_some() { - let final_config = mem.membership.to_final_config(); - Some(ClientWriteRequest::new_config(final_config)) - } else { - None - } - } - _ => None, - }; - - req.unwrap_or_else(ClientWriteRequest::new_blank_payload) + ClientWriteRequest::new_blank_payload() }; // Commit the initial payload to the cluster. let entry = self.append_payload_to_log(req.entry).await?; self.core.last_log_id.term = self.core.current_term; // This only ever needs to be updated once per term. - let cr_entry = ClientRequestEntry::from_entry(entry, None); + self.leader_report_metrics(); + + let cr_entry = ClientRequestEntry { + entry: Arc::new(entry), + tx: ClientOrInternalResponseTx::Internal(None), + }; self.replicate_client_request(cr_entry).await; Ok(()) @@ -139,13 +107,13 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // Setup sentinel values to track when we've received majority confirmation of leadership. let mut c0_confirmed = 0usize; // Will never be zero, as we don't allow it when proposing config changes. - let len_members = self.core.membership.members.len(); + let len_members = self.core.membership.membership.members.len(); let c0_needed = quorum::majority_of(len_members); let mut c1_confirmed = 0usize; let mut c1_needed = 0usize; - if let Some(joint_members) = &self.core.membership.members_after_consensus { + if let Some(joint_members) = &self.core.membership.membership.members_after_consensus { let len = joint_members.len(); // Will never be zero, as we don't allow it when proposing config changes. 
c1_needed = quorum::majority_of(len); } @@ -155,10 +123,12 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage let is_in_post_join_consensus_config = self .core .membership + .membership .members_after_consensus .as_ref() .map(|members| members.contains(&self.core.id)) .unwrap_or(false); + if is_in_post_join_consensus_config { c1_confirmed += 1; } @@ -172,7 +142,11 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // Spawn parallel requests, all with the standard timeout for heartbeats. let mut pending = FuturesUnordered::new(); + let all_members = self.core.membership.membership.all_nodes(); for (id, node) in self.nodes.iter() { + if !all_members.contains(id) { + continue; + } let rpc = AppendEntriesRequest { term: self.core.current_term, leader_id: self.core.id, @@ -219,12 +193,13 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage } // If the term is the same, then it means we are still the leader. - if self.core.membership.members.contains(&target) { + if self.core.membership.membership.members.contains(&target) { c0_confirmed += 1; } if self .core .membership + .membership .members_after_consensus .as_ref() .map(|members| members.contains(&target)) @@ -253,12 +228,19 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage tx: ClientWriteResponseTx, ) { let entry = match self.append_payload_to_log(rpc.entry).await { - Ok(entry) => ClientRequestEntry::from_entry(entry, tx), + Ok(entry) => ClientRequestEntry { + entry: Arc::new(entry), + tx: ClientOrInternalResponseTx::Client(tx), + }, + Err(err) => { let _ = tx.send(Err(ClientWriteError::RaftError(err))); return; } }; + + self.leader_report_metrics(); + self.replicate_client_request(entry).await; } @@ -275,8 +257,6 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage self.core.storage.append_to_log(&[&entry]).await.map_err(|err| self.core.map_storage_error(err))?; self.core.last_log_id.index = entry.log_id.index; - self.leader_report_metrics(); - Ok(entry) } @@ -294,15 +274,13 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // TODO(xp): calculate nodes set that need to replicate to, when updating membership // TODO(xp): Or add to-non-voter replication into self.nodes. - let all_members = self.core.membership.all_nodes(); - let non_voter_ids = self.non_voters.keys().copied().collect::>(); - - let joint_non_voter_ids = all_members.intersection(&non_voter_ids).collect::>(); + let all_members = self.core.membership.membership.all_nodes(); let nodes = self.nodes.keys().collect::>(); - tracing::debug!(?nodes, ?joint_non_voter_ids, "replicate_client_request"); + tracing::debug!(?nodes, ?all_members, "replicate_client_request"); - let await_quorum = !self.nodes.is_empty() || !joint_non_voter_ids.is_empty(); + // Except the leader itself, there are other nodes that need to replicate log to. + let await_quorum = all_members.len() > 1; if await_quorum { self.awaiting_committed.push(req); @@ -314,29 +292,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage self.client_request_post_commit(req).await; } - if !self.nodes.is_empty() { - for node in self.nodes.values() { - let _ = node.replstream.repl_tx.send(( - RaftEvent::Replicate { - entry: entry_arc.clone(), - commit_index: self.core.commit_index, - }, - tracing::debug_span!("CH"), - )); - } - } - - if !self.non_voters.is_empty() { - // Replicate to non-voters. 
- for node in self.non_voters.values() { - let _ = node.state.replstream.repl_tx.send(( - RaftEvent::Replicate { - entry: entry_arc.clone(), - commit_index: self.core.commit_index, - }, - tracing::debug_span!("CH"), - )); - } + for node in self.nodes.values() { + let _ = node.replstream.repl_tx.send(( + RaftEvent::Replicate { + entry: entry_arc.clone(), + commit_index: self.core.commit_index, + }, + tracing::debug_span!("CH"), + )); } } @@ -345,73 +308,86 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage pub(super) async fn client_request_post_commit(&mut self, req: ClientRequestEntry) { let entry = &req.entry; - match req.tx { - ClientOrInternalResponseTx::Client(tx) => { - match &entry.payload { - EntryPayload::Normal(_) => match self.apply_entry_to_state_machine(&entry).await { - Ok(data) => { - let _ = tx.send(Ok(ClientWriteResponse { - index: req.entry.log_id.index, - data, - })); - } - Err(err) => { - let _ = tx.send(Err(ClientWriteError::RaftError(err))); - } - }, - _ => { - // Why is this a bug, and why are we shutting down? This is because we can not easily - // encode these constraints in the type system, and client requests should be the only - // log entry types for which a `ClientOrInternalResponseTx::Client` type is used. This - // error should never be hit unless we've done a poor job in code review. - tracing::error!("critical error in Raft, this is a programming bug, please open an issue"); - self.core.set_target_state(State::Shutdown); - } - } - } - ClientOrInternalResponseTx::Internal(tx) => { - self.handle_special_log(entry); + let apply_res = self.apply_entry_to_state_machine(&entry).await; + + self.send_response(entry, apply_res, req.tx).await; - // TODO(xp): copied from above, need refactor. - let res = self.apply_entry_to_state_machine(&entry).await; - let res = match res { - Ok(_data) => Ok(entry.log_id.index), + // Trigger log compaction if needed. + self.core.trigger_log_compaction_if_needed(false); + } + + #[tracing::instrument(level = "debug", skip(self, entry, resp, tx), fields(entry=%entry.summary()))] + pub(super) async fn send_response( + &mut self, + entry: &Entry, + resp: RaftResult, + tx: ClientOrInternalResponseTx, + ) { + match tx { + ClientOrInternalResponseTx::Client(tx) => { + let res = match resp { + Ok(data) => Ok(ClientWriteResponse { + index: entry.log_id.index, + data, + }), Err(err) => { - tracing::error!("res of applying to state machine: {:?}", err); - Err(err) + tracing::error!(err=?err, entry=%entry.summary(), "apply client entry"); + Err(ClientWriteError::RaftError(err)) } }; + let send_res = tx.send(res); + tracing::debug!( + "send client response through tx, send_res is error: {}", + send_res.is_err() + ); + } + ClientOrInternalResponseTx::Internal(tx) => { // TODO(xp): if there is error, shall we go on? 
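+                // A config-change entry is answered with `RaftResponse::Membership` (see the match below),
+                // so that `Raft::change_membership` can observe which membership config was just committed.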
- self.core.last_applied = entry.log_id; - self.leader_report_metrics(); - - match tx { + let tx = match tx { None => { tracing::debug!("no response tx to send res"); + return; } - Some(tx) => { - let send_res = tx.send(res.map_err(ResponseError::from)); - tracing::debug!("send internal response through tx, res: {:?}", send_res); + Some(tx) => tx, + }; + + let res = match resp { + Ok(_data) => { + // + match entry.payload { + EntryPayload::Blank => Ok(RaftResponse::LogId { log_id: entry.log_id }), + EntryPayload::Normal(_) => Ok(RaftResponse::LogId { log_id: entry.log_id }), + EntryPayload::ConfigChange(ref c) => Ok(RaftResponse::Membership { + log_id: entry.log_id, + membership: c.membership.clone(), + }), + } } - } + Err(err) => { + tracing::error!("res of applying to state machine: {:?}", err); + Err(ResponseError::from(err)) + } + }; + + // let resp = res.map_err(ResponseError::from).map(|x| RaftResponse::LogIndex { log_index: x.index }); + + let send_res = tx.send(res); + tracing::debug!("send internal response through tx, res: {:?}", send_res); } } - - // Trigger log compaction if needed. - self.core.trigger_log_compaction_if_needed(false); } - pub fn handle_special_log(&mut self, entry: &Arc>) { + pub fn handle_special_log(&mut self, entry: &Entry) { match &entry.payload { EntryPayload::ConfigChange(ref mem) => { let m = &mem.membership; if m.is_in_joint_consensus() { - self.handle_joint_consensus_committed(); + // nothing to do } else { - self.handle_uniform_consensus_committed(entry.log_id.index); + self.handle_uniform_consensus_committed(&entry.log_id); } } EntryPayload::Blank => {} @@ -422,6 +398,8 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage /// Apply the given log entry to the state machine. #[tracing::instrument(level = "debug", skip(self, entry))] pub(super) async fn apply_entry_to_state_machine(&mut self, entry: &Entry) -> RaftResult { + self.handle_special_log(entry); + // First, we just ensure that we apply any outstanding up to, but not including, the index // of the given entry. We need to be able to return the data response from applying this // entry to the state machine. diff --git a/async-raft/src/core/mod.rs b/async-raft/src/core/mod.rs index 54e3f6237..8923428e5 100644 --- a/async-raft/src/core/mod.rs +++ b/async-raft/src/core/mod.rs @@ -5,11 +5,11 @@ mod append_entries; mod client; mod install_snapshot; pub(crate) mod replication; +#[cfg(test)] +mod replication_state_test; mod vote; use std::collections::BTreeMap; -use std::collections::BTreeSet; -use std::collections::HashSet; use std::sync::Arc; use futures::future::AbortHandle; @@ -60,6 +60,21 @@ use crate::RaftStorage; use crate::StorageError; use crate::Update; +/// The currently active membership config. +/// +/// It includes: +/// - the id of the log that sets this membership config, +/// - and the config. +/// +/// An active config is just the last seen config in raft spec. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ActiveMembership { + /// The id of the log that applies this membership config + pub log_id: LogId, + + pub membership: MembershipConfig, +} + /// The core type implementing the Raft protocol. pub struct RaftCore, S: RaftStorage> { /// This node's ID. @@ -67,7 +82,7 @@ pub struct RaftCore, S: RaftSt /// This node's runtime config. config: Arc, /// The cluster's current membership configuration. - membership: MembershipConfig, + membership: ActiveMembership, /// The `RaftNetwork` implementation. 
network: Arc, /// The `RaftStorage` implementation. @@ -152,7 +167,10 @@ impl, S: RaftStorage> Ra let this = Self { id, config, - membership, + membership: ActiveMembership { + log_id: LogId::default(), + membership, + }, network, storage, target_state: State::Follower, @@ -185,7 +203,7 @@ impl, S: RaftStorage> Ra self.last_log_id = state.last_log_id; self.current_term = state.hard_state.current_term; self.voted_for = state.hard_state.voted_for; - self.membership = state.membership; + self.membership = state.last_membership.clone(); self.last_applied = state.last_applied; // NOTE: this is repeated here for clarity. It is unsafe to initialize the node's commit // index to any other value. The commit index must be determined by a leader after @@ -199,8 +217,8 @@ impl, S: RaftStorage> Ra } let has_log = self.last_log_id.index != u64::MIN; - let single = self.membership.members.len() == 1; - let is_voter = self.membership.contains(&self.id); + let single = self.membership.membership.members.len() == 1; + let is_voter = self.membership.membership.contains(&self.id); self.target_state = match (has_log, single, is_voter) { // A restarted raft that already received some logs but was not yet added to a cluster. @@ -292,7 +310,7 @@ impl, S: RaftStorage> Ra /// Update core's target state, ensuring all invariants are upheld. #[tracing::instrument(level = "trace", skip(self))] fn set_target_state(&mut self, target_state: State) { - if target_state == State::Follower && !self.membership.contains(&self.id) { + if target_state == State::Follower && !self.membership.membership.contains(&self.id) { self.target_state = State::NonVoter; } else { self.target_state = target_state; @@ -375,7 +393,7 @@ impl, S: RaftStorage> Ra /// Update the node's current membership config & save hard state. #[tracing::instrument(level = "trace", skip(self))] - fn update_membership(&mut self, cfg: MembershipConfig) -> RaftResult<()> { + fn update_membership(&mut self, cfg: ActiveMembership) -> RaftResult<()> { // If the given config does not contain this node's ID, it means one of the following: // // - the node is currently a non-voter and is replicating an old config to which it has @@ -384,9 +402,9 @@ impl, S: RaftStorage> Ra // transition to the non-voter state as a signal for when it is safe to shutdown a node // being removed. self.membership = cfg; - if !self.membership.contains(&self.id) { + if !self.membership.membership.contains(&self.id) { self.set_target_state(State::NonVoter); - } else if self.target_state == State::NonVoter && self.membership.members.contains(&self.id) { + } else if self.target_state == State::NonVoter && self.membership.membership.members.contains(&self.id) { // The node is a NonVoter and the new config has it configured as a normal member. // Transition to follower. self.set_target_state(State::Follower); @@ -613,12 +631,9 @@ impl State { /// Volatile state specific to the Raft leader. struct LeaderState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage> { pub(super) core: &'a mut RaftCore, + /// A mapping of node IDs the replication state of the target node. pub(super) nodes: BTreeMap>, - /// A mapping of new nodes (non-voters) which are being synced in order to join the cluster. - pub(super) non_voters: BTreeMap>, - /// A bool indicating if this node will be stepping down after committing the current config change. 
- pub(super) is_stepping_down: bool, /// The metrics about a leader pub leader_metrics: LeaderMetrics, @@ -631,29 +646,18 @@ struct LeaderState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: Raf /// A buffer of client requests which have been appended locally and are awaiting to be committed to the cluster. pub(super) awaiting_committed: Vec>, - - /// A field tracking the cluster's current consensus state, which is used for dynamic membership. - pub(super) consensus_state: ConsensusState, } impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage> LeaderState<'a, D, R, N, S> { /// Create a new instance. pub(self) fn new(core: &'a mut RaftCore) -> Self { - let consensus_state = if core.membership.is_in_joint_consensus() { - ConsensusState::Joint { is_committed: false } - } else { - ConsensusState::Uniform - }; let (replication_tx, replication_rx) = mpsc::unbounded_channel(); Self { core, nodes: BTreeMap::new(), - non_voters: BTreeMap::new(), - is_stepping_down: false, leader_metrics: LeaderMetrics::default(), replication_tx, replication_rx, - consensus_state, awaiting_committed: Vec::new(), } } @@ -665,13 +669,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage let targets = self .core .membership + .membership .all_nodes() .into_iter() .filter(|elem| elem != &self.core.id) .collect::>(); for target in targets { - let state = self.spawn_replication_stream(target); + let state = self.spawn_replication_stream(target, None); self.nodes.insert(target, state); } @@ -691,9 +696,6 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage for node in self.nodes.values() { let _ = node.replstream.repl_tx.send((RaftEvent::Terminate, tracing::debug_span!("CH"))); } - for node in self.non_voters.values() { - let _ = node.state.replstream.repl_tx.send((RaftEvent::Terminate, tracing::debug_span!("CH"))); - } return Ok(()); } @@ -728,13 +730,13 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage tracing::info!("leader recv from rx_api: Initialize"); self.core.reject_init_with_config(tx); } - RaftMsg::AddNonVoter{id, tx} => { + RaftMsg::AddNonVoter{id, tx, blocking} => { tracing::info!("leader recv from rx_api: AddNonVoter, {}", id); - self.add_member(id, tx); + self.add_member(id, tx, blocking); } - RaftMsg::ChangeMembership{members, tx} => { + RaftMsg::ChangeMembership{members, blocking, tx} => { tracing::info!("leader recv from rx_api: ChangeMembership, {:?}", members); - self.change_membership(members, tx).await; + self.change_membership(members, blocking, tx).await; } } }, @@ -767,60 +769,27 @@ struct ReplicationState { pub matched: LogId, pub remove_after_commit: Option, pub replstream: ReplicationStream, -} - -/// The same as `ReplicationState`, except for non-voters. -struct NonVoterReplicationState { - /// The replication stream state. - pub state: ReplicationState, - /// A bool indicating if this non-voters is ready to join the cluster. - pub is_ready_to_join: bool, /// The response channel to use for when this node has successfully synced with the cluster. pub tx: Option, } -/// A state enum used by Raft leaders to navigate the joint consensus protocol. -pub enum ConsensusState { - /// The cluster is preparring to go into joint consensus, but the leader is still syncing - /// some non-voters to prepare them for cluster membership. - NonVoterSync { - /// The set of non-voters nodes which are still being synced. - awaiting: HashSet, - /// The full membership change which has been proposed. 
- members: BTreeSet, - - /// The response channel to use once the consensus state is back into uniform state. - tx: ResponseTx, - }, - /// The cluster is in a joint consensus state and is syncing new nodes. - Joint { - /// A bool indicating if the associated config which started this joint consensus has yet been comitted. - /// - /// NOTE: when a new leader is elected, it will initialize this value to false, and then - /// update this value to true once the new leader's blank payload has been committed. - is_committed: bool, - }, - /// The cluster consensus is uniform; not in a joint consensus state. - Uniform, -} +impl ReplicationState +where D: AppData +{ + // TODO(xp): make this a method of Config? -impl ConsensusState { - /// Check the current state to determine if it is in joint consensus, and if it is safe to finalize the joint - /// consensus. - /// - /// The return value will be true if: - /// 1. this object currently represents a joint consensus state. - /// 2. the corresponding config for this consensus state has been committed to the cluster. - pub fn is_joint_consensus_safe_to_finalize(&self) -> bool { - match self { - ConsensusState::Joint { is_committed } => *is_committed, - _ => false, - } + /// Return true if the distance behind last_log_id is smaller than the threshold to join. + pub fn is_line_rate(&self, last_log_id: &LogId, config: &Config) -> bool { + is_matched_upto_date(&self.matched, last_log_id, config) } } -/////////////////////////////////////////////////////////////////////////////////////////////////// +pub fn is_matched_upto_date(matched: &LogId, last_log_id: &LogId, config: &Config) -> bool { + let my_index = matched.index; + let distance = last_log_id.index.saturating_sub(my_index); + distance <= config.replication_lag_threshold +} /// Volatile state specific to a Raft node in candidate state. struct CandidateState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage> { @@ -857,8 +826,8 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // Setup initial state per term. self.votes_granted_old = 1; // We must vote for ourselves per the Raft spec. - self.votes_needed_old = ((self.core.membership.members.len() / 2) + 1) as u64; // Just need a majority. - if let Some(nodes) = &self.core.membership.members_after_consensus { + self.votes_needed_old = ((self.core.membership.membership.members.len() / 2) + 1) as u64; // Just need a majority. + if let Some(nodes) = &self.core.membership.membership.members_after_consensus { self.votes_granted_new = 1; // We must vote for ourselves per the Raft spec. self.votes_needed_new = ((nodes.len() / 2) + 1) as u64; // Just need a majority. 
} diff --git a/async-raft/src/core/replication.rs b/async-raft/src/core/replication.rs index fc48728ac..3b2a01c25 100644 --- a/async-raft/src/core/replication.rs +++ b/async-raft/src/core/replication.rs @@ -4,7 +4,6 @@ use tokio::sync::oneshot; use tracing_futures::Instrument; use crate::config::SnapshotPolicy; -use crate::core::ConsensusState; use crate::core::LeaderState; use crate::core::ReplicationState; use crate::core::SnapshotState; @@ -12,6 +11,8 @@ use crate::core::State; use crate::core::UpdateCurrentLeader; use crate::error::RaftResult; use crate::quorum; +use crate::raft::RaftResponse; +use crate::raft::ResponseTx; use crate::replication::RaftEvent; use crate::replication::ReplicaEvent; use crate::replication::ReplicationStream; @@ -27,8 +28,12 @@ use crate::ReplicationMetrics; impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage> LeaderState<'a, D, R, N, S> { /// Spawn a new replication stream returning its replication state handle. - #[tracing::instrument(level = "debug", skip(self))] - pub(super) fn spawn_replication_stream(&self, target: NodeId) -> ReplicationState { + #[tracing::instrument(level = "debug", skip(self, caller_tx))] + pub(super) fn spawn_replication_stream( + &self, + target: NodeId, + caller_tx: Option, + ) -> ReplicationState { let replstream = ReplicationStream::new( self.core.id, target, @@ -44,6 +49,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage matched: LogId { term: 0, index: 0 }, replstream, remove_after_commit: None, + tx: caller_tx, } } @@ -67,77 +73,6 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage } } - /// Handle events from replication streams updating their replication rate tracker. - #[tracing::instrument(level = "debug", skip(self))] - async fn handle_rate_update(&mut self, target: NodeId, matched: LogId) -> RaftResult<()> { - { - let voter = self.nodes.get(&target); - let non_voter = self.non_voters.get(&target); - - assert!( - !(voter.is_some() && non_voter.is_some()), - "target {} can not be in both voter and non-voter", - target - ); - } - - let state = self.non_voters.get_mut(&target); - let state = match state { - None => { - // target is a follower. - return Ok(()); - } - Some(x) => x, - }; - - // the matched increments monotonically. - - tracing::debug!("state.matched: {}, update to matched: {}", state.state.matched, matched); - assert!(matched >= state.state.matched); - - state.state.matched = matched; - - // TODO(xp): use Vec<_> to replace the two membership configs. - - state.is_ready_to_join = - self.core.last_log_id.index.saturating_sub(matched.index) < self.core.config.replication_lag_threshold; - - // Issue a response on the non-voters response channel if needed. - if !state.is_ready_to_join { - return Ok(()); - } - - // ready to join - - if let Some(tx) = state.tx.take() { - // TODO(xp): no log index to send - let _ = tx.send(Ok(0)); - } - - // If we are in NonVoterSync state, and this is one of the nodes being awaiting, then update. - match std::mem::replace(&mut self.consensus_state, ConsensusState::Uniform) { - ConsensusState::NonVoterSync { - mut awaiting, - members, - tx, - } => { - awaiting.remove(&target); - - if awaiting.is_empty() { - // We are ready to move forward with entering joint consensus. - self.consensus_state = ConsensusState::Uniform; - self.change_membership(members, tx).await; - } else { - // We are still awaiting additional nodes, so replace our original state. 
- self.consensus_state = ConsensusState::NonVoterSync { awaiting, members, tx }; - } - } - - other => self.consensus_state = other, // Set the original value back to what it was. - } - Ok(()) - } - /// Handle events from replication streams for when this node needs to revert to follower state. #[tracing::instrument(level = "trace", skip(self, term))] async fn handle_revert_to_follower(&mut self, _: NodeId, term: u64) -> RaftResult<()> { @@ -150,56 +85,51 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage Ok(()) } - #[tracing::instrument(level = "trace", skip(self))] + #[tracing::instrument(level = "debug", skip(self))] async fn handle_update_mactched_and_rate(&mut self, target: NodeId, matched: LogId) -> RaftResult<()> { - self.handle_update_matched(target, matched).await?; - self.handle_rate_update(target, matched).await - } - - /// Handle events from a replication stream which updates the target node's match index. - #[tracing::instrument(level = "trace", skip(self))] - async fn handle_update_matched(&mut self, target: NodeId, matched: LogId) -> RaftResult<()> { - let mut found = false; - - if let Some(state) = self.non_voters.get_mut(&target) { - tracing::debug!("state.matched: {}, update to matched: {}", state.state.matched, matched); - assert!(matched >= state.state.matched); - - state.state.matched = matched; - found = true; - } - // Update target's match index & check if it is awaiting removal. let mut needs_removal = false; if let Some(state) = self.nodes.get_mut(&target) { + tracing::debug!("state.matched: {}, update to matched: {}", state.matched, matched); + + assert!(matched >= state.matched, "the matched increments monotonically"); + state.matched = matched; - found = true; + // Issue a response on the non-voters response channel if needed. + if state.is_line_rate(&self.core.last_log_id, &self.core.config) { + // This replication became line rate. + + // When adding a non-voter, it blocks until the replication becomes line-rate. + if let Some(tx) = state.tx.take() { + // TODO(xp): define a specific response type for non-voter matched event. + let x = RaftResponse::LogId { log_id: state.matched }; + let _ = tx.send(Ok(x)); + } + } + + // TODO(xp): stop replication only when commit index is replicated? if let Some(threshold) = &state.remove_after_commit { if &matched.index >= threshold { needs_removal = true; } } - } - - if !found { + } else { + panic!("repliation is removed: {}", target); // no such node - return Ok(()); + // return Ok(()); } - self.update_leader_metrics(target, matched); - + // TODO(xp): use Vec<_> to replace the two membership configs. // Drop replication stream if needed. - // TODO(xp): is it possible to merge the two node remove routines? - // here and that in handle_uniform_consensus_committed() if needs_removal { if let Some(node) = self.nodes.remove(&target) { let _ = node.replstream.repl_tx.send((RaftEvent::Terminate, tracing::debug_span!("CH"))); - - // remove metrics entry self.leader_metrics.replication.remove(&target); } + } else { + self.update_leader_metrics(target, matched); } let commit_index = self.calc_commit_index(); @@ -219,14 +149,6 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage tracing::debug_span!("CH"), )); } - for node in self.non_voters.values() { - let _ = node.state.replstream.repl_tx.send(( - RaftEvent::UpdateCommitIndex { - commit_index: self.core.commit_index, - }, - tracing::debug_span!("CH"), - )); - } // Check if there are any pending requests which need to be processed. 
let filter = self @@ -258,12 +180,12 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage #[tracing::instrument(level = "trace", skip(self))] fn calc_commit_index(&self) -> u64 { - let c0_index = self.calc_members_commit_index(&self.core.membership.members, "c0"); + let c0_index = self.calc_members_commit_index(&self.core.membership.membership.members, "c0"); // If we are in joint consensus, then calculate the new commit index of the new membership config nodes. let mut c1_index = c0_index; // Defaults to just matching C0. - if let Some(members) = &self.core.membership.members_after_consensus { + if let Some(members) = &self.core.membership.membership.members_after_consensus { c1_index = self.calc_members_commit_index(members, "c1"); } @@ -300,12 +222,6 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage continue; } - // this node is a non-voter - let repl_state = self.non_voters.get(id); - if let Some(x) = repl_state { - rst.push(x.state.matched); - continue; - } panic!("node {} not found in nodes or non-voters", id); } diff --git a/async-raft/src/core/replication_state_test.rs b/async-raft/src/core/replication_state_test.rs new file mode 100644 index 000000000..b0510cae2 --- /dev/null +++ b/async-raft/src/core/replication_state_test.rs @@ -0,0 +1,37 @@ +use crate::core::is_matched_upto_date; +use crate::Config; +use crate::LogId; + +#[test] +fn test_is_line_rate() -> anyhow::Result<()> { + let m = LogId { term: 1, index: 10 }; + assert!( + is_matched_upto_date(&m, &LogId { term: 2, index: 10 }, &Config { + replication_lag_threshold: 0, + ..Default::default() + }), + "matched, threshold=0" + ); + assert!( + is_matched_upto_date(&m, &LogId { term: 2, index: 9 }, &Config { + replication_lag_threshold: 0, + ..Default::default() + }), + "overflow, threshold=0" + ); + assert!( + !is_matched_upto_date(&m, &LogId { term: 2, index: 11 }, &Config { + replication_lag_threshold: 0, + ..Default::default() + }), + "not caught up, threshold=0" + ); + assert!( + is_matched_upto_date(&m, &LogId { term: 2, index: 11 }, &Config { + replication_lag_threshold: 1, + ..Default::default() + }), + "caught up, threshold=1" + ); + Ok(()) +} diff --git a/async-raft/src/core/vote.rs b/async-raft/src/core/vote.rs index df50c379d..c6248f6c9 100644 --- a/async-raft/src/core/vote.rs +++ b/async-raft/src/core/vote.rs @@ -143,13 +143,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage // If peer granted vote, then update campaign state. if res.vote_granted { // Handle vote responses from the C0 config group. - if self.core.membership.members.contains(&target) { + if self.core.membership.membership.members.contains(&target) { self.votes_granted_old += 1; } // Handle vote responses from members of C1 config group. if self .core .membership + .membership .members_after_consensus .as_ref() .map(|members| members.contains(&target)) @@ -172,7 +173,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork, S: RaftStorage /// Spawn parallel vote requests to all cluster members. 
#[tracing::instrument(level = "trace", skip(self))] pub(super) fn spawn_parallel_vote_requests(&self) -> mpsc::Receiver<(VoteResponse, NodeId)> { - let all_members = self.core.membership.all_nodes(); + let all_members = self.core.membership.membership.all_nodes(); let (tx, rx) = mpsc::channel(all_members.len()); for member in all_members.into_iter().filter(|member| member != &self.core.id) { let rpc = VoteRequest::new( diff --git a/async-raft/src/error.rs b/async-raft/src/error.rs index 9dc5706d7..3d0210661 100644 --- a/async-raft/src/error.rs +++ b/async-raft/src/error.rs @@ -1,9 +1,12 @@ //! Error types exposed by this crate. +use std::collections::BTreeSet; use std::fmt; +use crate::raft::MembershipConfig; use crate::raft_types::SnapshotSegmentId; use crate::AppData; +use crate::LogId; use crate::NodeId; /// A result type where the error variant is always a `RaftError`. @@ -73,7 +76,7 @@ impl fmt::Debug for ClientWriteError { } /// Error variants related to configuration. -#[derive(Debug, thiserror::Error, Eq, PartialEq)] +#[derive(Debug, thiserror::Error, PartialEq)] #[non_exhaustive] pub enum ConfigError { /// A configuration error indicating that the given values for election timeout min & max are invalid: max must be @@ -116,8 +119,8 @@ pub enum ChangeConfigError { RaftError(#[from] RaftError), /// The cluster is already undergoing a configuration change. - #[error("the cluster is already undergoing a configuration change")] - ConfigChangeInProgress, + #[error("the cluster is already undergoing a configuration change at log {membership_log_id}")] + ConfigChangeInProgress { membership_log_id: LogId }, /// The given config would leave the cluster in an inoperable state. /// @@ -131,6 +134,20 @@ pub enum ChangeConfigError { #[error("this node is not the Raft leader")] NodeNotLeader(Option), + #[error("to add a member {node_id} first need to add it as non-voter")] + NonVoterNotFound { node_id: NodeId }, + + #[error("replication to non voter {node_id} is lagging {distance}, can not add as member")] + NonVoterIsLagging { node_id: NodeId, distance: u64 }, + + // TODO(xp): test it in unittest + // TOOO(xp): rename this error to some elaborated name. + #[error("now allowed to change from {curr:?} to {to:?}")] + Incompatible { + curr: MembershipConfig, + to: BTreeSet, + }, + /// The proposed config changes would make no difference to the current config. /// /// This takes into account a current joint consensus and the end result of the config. diff --git a/async-raft/src/lib.rs b/async-raft/src/lib.rs index ddc03c3eb..ce8accf40 100644 --- a/async-raft/src/lib.rs +++ b/async-raft/src/lib.rs @@ -22,6 +22,7 @@ use serde::Serialize; pub use crate::config::Config; pub use crate::config::SnapshotPolicy; +pub use crate::core::ActiveMembership; pub use crate::core::State; pub use crate::error::ChangeConfigError; pub use crate::error::ClientWriteError; diff --git a/async-raft/src/metrics.rs b/async-raft/src/metrics.rs index 06c529b8d..3e8e091fc 100644 --- a/async-raft/src/metrics.rs +++ b/async-raft/src/metrics.rs @@ -16,6 +16,7 @@ use thiserror::Error; use tokio::sync::watch; use tokio::time::Duration; +use crate::core::ActiveMembership; use crate::core::State; use crate::raft::MembershipConfig; use crate::LogId; @@ -39,7 +40,7 @@ pub struct RaftMetrics { /// The current cluster leader. pub current_leader: Option, /// The current membership config of the cluster. - pub membership_config: MembershipConfig, + pub membership_config: ActiveMembership, /// The id of the last log included in snapshot. 
/// If there is no snapshot, it is (0,0). @@ -66,7 +67,10 @@ impl RaftMetrics { last_log_index: 0, last_applied: 0, current_leader: None, - membership_config, + membership_config: ActiveMembership { + log_id: LogId::default(), + membership: membership_config, + }, snapshot: LogId { term: 0, index: 0 }, leader_metrics: None, } @@ -173,7 +177,7 @@ impl Wait { #[tracing::instrument(level = "debug", skip(self), fields(msg=msg.to_string().as_str()))] pub async fn members(&self, want_members: BTreeSet, msg: impl ToString) -> Result { self.metrics( - |x| x.membership_config.members == want_members, + |x| x.membership_config.membership.members == want_members, &format!("{} .membership_config.members -> {:?}", msg.to_string(), want_members), ) .await @@ -187,7 +191,7 @@ impl Wait { msg: impl ToString, ) -> Result { self.metrics( - |x| x.membership_config.members_after_consensus == want_members, + |x| x.membership_config.membership.members_after_consensus == want_members, &format!( "{} .membership_config.members_after_consensus -> {:?}", msg.to_string(), diff --git a/async-raft/src/metrics_wait_test.rs b/async-raft/src/metrics_wait_test.rs index 8e885b40f..43842a65a 100644 --- a/async-raft/src/metrics_wait_test.rs +++ b/async-raft/src/metrics_wait_test.rs @@ -4,6 +4,7 @@ use maplit::btreeset; use tokio::sync::watch; use tokio::time::sleep; +use crate::core::ActiveMembership; use crate::metrics::Wait; use crate::metrics::WaitError; use crate::raft::MembershipConfig; @@ -73,14 +74,14 @@ async fn test_wait() -> anyhow::Result<()> { let h = tokio::spawn(async move { sleep(Duration::from_millis(10)).await; let mut update = init.clone(); - update.membership_config.members = btreeset![1, 2]; + update.membership_config.membership.members = btreeset![1, 2]; let rst = tx.send(update); assert!(rst.is_ok()); }); let got = w.members(btreeset![1, 2], "members").await?; h.await?; - assert_eq!(btreeset![1, 2], got.membership_config.members); + assert_eq!(btreeset![1, 2], got.membership_config.membership.members); } { @@ -90,14 +91,17 @@ async fn test_wait() -> anyhow::Result<()> { let h = tokio::spawn(async move { sleep(Duration::from_millis(10)).await; let mut update = init.clone(); - update.membership_config.members_after_consensus = Some(btreeset![1, 2]); + update.membership_config.membership.members_after_consensus = Some(btreeset![1, 2]); let rst = tx.send(update); assert!(rst.is_ok()); }); let got = w.next_members(Some(btreeset![1, 2]), "next_members").await?; h.await?; - assert_eq!(Some(btreeset![1, 2]), got.membership_config.members_after_consensus); + assert_eq!( + Some(btreeset![1, 2]), + got.membership_config.membership.members_after_consensus + ); } tracing::info!("--- wait for snapshot, Ok"); @@ -175,10 +179,14 @@ fn init_wait_test() -> (RaftMetrics, Wait, watch::Sender) { last_log_index: 0, last_applied: 0, current_leader: None, - membership_config: MembershipConfig { - members: Default::default(), - members_after_consensus: None, + membership_config: ActiveMembership { + log_id: LogId::default(), + membership: MembershipConfig { + members: Default::default(), + members_after_consensus: None, + }, }, + snapshot: LogId { term: 0, index: 0 }, leader_metrics: None, }; diff --git a/async-raft/src/raft.rs b/async-raft/src/raft.rs index 827185bcc..c0620aa5a 100644 --- a/async-raft/src/raft.rs +++ b/async-raft/src/raft.rs @@ -8,6 +8,7 @@ use serde::Deserialize; use serde::Serialize; use tokio::sync::mpsc; use tokio::sync::oneshot; +use tokio::sync::oneshot::Receiver; use tokio::sync::watch; use 
tokio::sync::Mutex; use tokio::task::JoinHandle; @@ -273,76 +274,130 @@ impl, S: RaftStorage> Ra rx.await.map_err(|_| InitializeError::RaftError(RaftError::ShuttingDown)).and_then(|res| res) } - /// Synchronize a new Raft node, bringing it up-to-speed (§6). + /// Synchronize a new Raft node, optionally blocking until it is up-to-speed (§6). /// - /// Applications built on top of Raft will typically have some peer discovery mechanism for - /// detecting when new nodes come online and need to be added to the cluster. This API - /// facilitates the ability to request that a new node be synchronized with the leader, so - /// that it is up-to-date and ready to be added to the cluster. + /// - Add a node to the cluster as a non-voter. + /// - Set up replication from the leader to it. /// - /// Calling this API will add the target node as a non-voter, starting the syncing process. - /// Once the node is up-to-speed, this function will return. It is the responsibility of the - /// application to then call `change_membership` once all of the new nodes are synced. + /// If blocking is true, this function blocks until the leader believes the logs on the new node are up to date, + /// i.e., it is ready to join the cluster as a voter via `change_membership`. + /// When finished, it returns the last log id on the new node in a `RaftResponse::LogId`. /// - /// If this Raft node is not the cluster leader, then this call will fail. + /// If blocking is false, this function returns as soon as replication has been set up. + /// + /// If the node to add is already a voter or non-voter, it returns `RaftResponse::NoChange` at once. #[tracing::instrument(level = "debug", skip(self, id), fields(target=id))] - pub async fn add_non_voter(&self, id: NodeId) -> Result<(), ResponseError> { - let span = tracing::debug_span!("CH"); - + pub async fn add_non_voter(&self, id: NodeId, blocking: bool) -> Result { let (tx, rx) = oneshot::channel(); + self.call_core(RaftMsg::AddNonVoter { id, blocking, tx }, rx).await + } - self.inner - .tx_api - .send((RaftMsg::AddNonVoter { id, tx }, span)) - .map_err(|_| RaftError::ShuttingDown)?; + /// Propose a cluster configuration change (§6). + /// + /// If a node is in the proposed config but is not yet a voter or non-voter, it first calls `add_non_voter` to set up + /// replication to the new node. + /// + /// Internal: + /// - It proposes a **joint** config. + /// - When the **joint** config is committed, it proposes a uniform config. + /// + /// If blocking is true, it blocks until every non-voter becomes up to date. + /// Otherwise it returns a `ChangeConfigError::NonVoterIsLagging` error if there is a lagging non-voter. + /// + /// If the leader loses leadership or crashes before committing the second **uniform** config log, the cluster is left in the + /// **joint** config.
+ #[tracing::instrument(level = "debug", skip(self))] + pub async fn change_membership( + &self, + members: BTreeSet, + blocking: bool, + ) -> Result { + tracing::info!(?members, "change_membership: add every member as non-voter"); - let recv_res = rx.await; - let res = match recv_res { - Ok(x) => x, - Err(e) => { - tracing::error!("recv rx error: {}", e); - return Err(ChangeConfigError::RaftError(RaftError::ShuttingDown).into()); + for id in members.iter() { + let (tx, rx) = oneshot::channel(); + let res = self.call_core(RaftMsg::AddNonVoter { id: *id, blocking, tx }, rx).await; + + let res_err = match res { + Ok(_) => { + continue; + } + Err(e) => e, + }; + + if matches!(res_err, ResponseError::ChangeConfig(ChangeConfigError::Noop)) { + tracing::info!(%res_err, "add non_voter: already exists"); + // this node is already a non-voter + continue; + } + + tracing::error!(%res_err, "add non_voter"); + + // unhandle-able error + return Err(res_err); + } + + tracing::info!("change_membership: start to commit joint config"); + + let (tx, rx) = oneshot::channel(); + // res is error if membership can not be changed. + // If it is not error, it will go into a joint state + let res = self + .call_core( + RaftMsg::ChangeMembership { + members: members.clone(), + blocking, + tx, + }, + rx, + ) + .await?; + + tracing::info!("res of first change_membership: {:?}", res); + + let (log_id, joint) = match res { + RaftResponse::Membership { log_id, membership } => { + // There is a previously in progress joint state and it becomes the membership config we want. + if !membership.is_in_joint_consensus() { + return Ok(RaftResponse::Membership { log_id, membership }); + } + + (log_id, membership) + } + _ => { + panic!("expect membership response") } }; - res?; + tracing::debug!("committed a joint config: {} {:?}", log_id, joint); + tracing::debug!("the second step is to change to uniform config: {:?}", members); - Ok(()) + let (tx, rx) = oneshot::channel(); + let res = self.call_core(RaftMsg::ChangeMembership { members, blocking, tx }, rx).await?; + + tracing::info!("res of second change_membership: {:?}", res); + + Ok(res) } - /// Propose a cluster configuration change (§6). - /// - /// This will cause the leader to begin a cluster membership configuration change. If there - /// are new nodes in the proposed config which are not already registered as non-voters — from - /// an earlier call to `add_non_voter` — then the new nodes will first be synced as non-voters - /// before moving the cluster into joint consensus. As this process may take some time, it is - /// recommended that `add_non_voter` be called first for new nodes, and then once all new nodes - /// have been synchronized, call this method to start reconfiguration. - /// - /// If this Raft node is not the cluster leader, then the proposed configuration change will be - /// rejected. 
- #[tracing::instrument(level = "debug", skip(self))] - pub async fn change_membership(&self, members: BTreeSet) -> Result<(), ResponseError> { + #[tracing::instrument(level = "debug", skip(self, mes, rx))] + pub(crate) async fn call_core( + &self, + mes: RaftMsg, + rx: Receiver>, + ) -> Result { let span = tracing::debug_span!("CH"); - let (tx, rx) = oneshot::channel(); - self.inner - .tx_api - .send((RaftMsg::ChangeMembership { members, tx }, span)) - .map_err(|_| RaftError::ShuttingDown)?; + self.inner.tx_api.send((mes, span)).map_err(|_| RaftError::ShuttingDown)?; let recv_res = rx.await; - let res = match recv_res { + match recv_res { Ok(x) => x, Err(e) => { tracing::error!("recv rx error: {}", e); - return Err(ChangeConfigError::RaftError(RaftError::ShuttingDown).into()); + Err(ChangeConfigError::RaftError(RaftError::ShuttingDown).into()) } - }; - - res?; - - Ok(()) + } } /// Get a handle to the metrics channel. @@ -400,7 +455,25 @@ impl, S: RaftStorage> Cl pub(crate) type ClientWriteResponseTx = oneshot::Sender, ClientWriteError>>; pub(crate) type ClientReadResponseTx = oneshot::Sender>; -pub(crate) type ResponseTx = oneshot::Sender>; +pub(crate) type ResponseTx = oneshot::Sender>; + +/// Response type to send back to a caller +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RaftResponse { + LogIndex { + log_index: u64, + }, + LogId { + log_id: LogId, + }, + Membership { + log_id: LogId, + membership: MembershipConfig, + }, + + /// An Ok response indicating nothing is affected. + NoChange, +} /// A message coming from the Raft API. pub(crate) enum RaftMsg { @@ -427,12 +500,24 @@ pub(crate) enum RaftMsg { members: BTreeSet, tx: oneshot::Sender>, }, + // TODO(xp): make tx a field of a struct + /// Request raft core to setup a new replication to a non-voter. AddNonVoter { id: NodeId, + + /// If block until the newly added non-voter becomes line-rate. + blocking: bool, + + /// Send the log id when the replication becomes line-rate. tx: ResponseTx, }, ChangeMembership { members: BTreeSet, + /// with blocking==false, respond to client a ChangeConfigError::NonVoterIsLagging error at once if a + /// non-member is lagging. + /// + /// Otherwise, wait for commit of the member change log. + blocking: bool, tx: ResponseTx, }, } @@ -625,6 +710,7 @@ impl MembershipConfig { self.members_after_consensus.is_some() } + // TODO(xp): rename this /// Create a new initial config containing only the given node ID. pub fn new_initial(id: NodeId) -> Self { let mut members = BTreeSet::new(); diff --git a/async-raft/src/replication/mod.rs b/async-raft/src/replication/mod.rs index 049d733e8..236dc067a 100644 --- a/async-raft/src/replication/mod.rs +++ b/async-raft/src/replication/mod.rs @@ -295,7 +295,7 @@ impl, S: RaftStorage> Re } // Replication was not successful, handle conflict optimization record, else decrement `next_index`. - let conflict = match res.conflict_opt { + let mut conflict = match res.conflict_opt { None => { panic!("append_entries failed but without a reason: {:?}", res); } @@ -304,15 +304,6 @@ impl, S: RaftStorage> Re tracing::debug!(?conflict, res.term, "append entries failed, handling conflict opt"); - // If the returned conflict opt index is greater than last_log_index, then this is a - // logical error, and no action should be taken. This represents a replication failure. 
- if conflict.log_id.index > self.last_log_index { - panic!( - "impossible: conflict.log_it({}) > last_log_index({})", - conflict.log_id, self.last_log_index - ); - } - // If conflict index is 0, we will not be able to fetch that index from storage because // it will never exist. So instead, we just return, and accept the conflict data. if conflict.log_id.index == 0 { @@ -322,6 +313,19 @@ impl, S: RaftStorage> Re return; } + // The follower has more logs and sets conflict.log_id to its own last_log_id when: + // - req.prev_log_id.index is 0, or + // - req.prev_log_id.index is already applied, in which case the follower does not know whether prev_log_id is valid. + // + // In such a case, fake a conflict log_id whose term never matches the log term in the local store, so that + // update_matched() is not called. + if conflict.log_id.index > self.last_log_index { + conflict.log_id = LogId { + term: 0, + index: self.last_log_index, + }; + } + // Fetch the entry at conflict index and use the term specified there. let ent = self.storage.try_get_log_entry(conflict.log_id.index).await; let ent = match ent { @@ -334,24 +338,24 @@ impl, S: RaftStorage> Re } }; - let ent_term = ent.map(|entry| entry.log_id.term); - match ent_term { - Some(term) => { - self.matched = LogId { - term, - index: conflict.log_id.index, - }; - - if term == conflict.log_id.term { - self.update_matched(); - } - } + let ent = match ent { + Some(x) => x, None => { // This condition would only ever be reached if the log has been removed due to // log compaction (barring critical storage failure), so transition to snapshotting. self.set_target_state(TargetReplState::Snapshotting); + return; } }; + + let term = ent.log_id.term; + + // Next time try sending from the log at conflict.log_id.index. + self.matched = ent.log_id; + + if term == conflict.log_id.term { + self.update_matched(); + } } #[tracing::instrument(level = "debug", skip(self))] diff --git a/async-raft/src/storage.rs b/async-raft/src/storage.rs index c91a70837..e07eb6352 100644 --- a/async-raft/src/storage.rs +++ b/async-raft/src/storage.rs @@ -10,6 +10,7 @@ use tokio::io::AsyncRead; use tokio::io::AsyncSeek; use tokio::io::AsyncWrite; +use crate::core::ActiveMembership; use crate::raft::Entry; use crate::raft::MembershipConfig; use crate::raft_types::SnapshotId; @@ -68,9 +69,9 @@ pub struct InitialState { /// The saved hard state of the node. pub hard_state: HardState, - /// The latest cluster membership configuration found in the log, else a new initial + /// The latest cluster membership configuration found in the log or in the state machine, else a new initial /// membership config consisting only of this node's ID. - pub membership: MembershipConfig, + pub last_membership: ActiveMembership, } impl InitialState { @@ -86,7 +87,10 @@ impl InitialState { current_term: 0, voted_for: None, }, - membership: MembershipConfig::new_initial(id), + last_membership: ActiveMembership { + log_id: LogId { term: 0, index: 0 }, + membership: MembershipConfig::new_initial(id), + }, } } } @@ -114,7 +118,7 @@ where false } - /// Get the latest membership config found in the log. + /// Get the latest membership config found in the log or in the state machine. /// /// This must always be implemented as a reverse search through the log to find the most /// recent membership config to be appended to the log. @@ -127,7 +131,7 @@ where /// the node's ID so that it is consistent across restarts. /// /// Errors returned from this method will cause Raft to go into shutdown.
- async fn get_membership_config(&self) -> Result; + async fn get_membership_config(&self) -> Result; /// Get Raft's state information from storage. /// @@ -168,7 +172,7 @@ where /// Returns the last applied log id which is recorded in state machine, and the last applied membership log id and /// membership config. - async fn last_applied_state(&self) -> Result<(LogId, Option<(LogId, MembershipConfig)>), StorageError>; + async fn last_applied_state(&self) -> Result<(LogId, Option), StorageError>; /// Delete all logs in a `range`. /// diff --git a/async-raft/tests/add_remove_voter.rs b/async-raft/tests/add_remove_voter.rs index 8d010f24d..95631c113 100644 --- a/async-raft/tests/add_remove_voter.rs +++ b/async-raft/tests/add_remove_voter.rs @@ -86,32 +86,37 @@ async fn add_remove_voter() -> Result<()> { wait_log(router.clone(), &all_members, want).await?; tracing::info!("--- changing cluster config"); - router.change_membership(0, all_members.clone()).await?; - want += 2; // 2 member-change logs + { + router.change_membership(0, all_members.clone()).await?; + want += 2; // 2 member-change logs - wait_log(router.clone(), &all_members, want).await?; - router.assert_stable_cluster(Some(1), Some(want)).await; // Still in term 1, so leader is still node 0. + wait_log(router.clone(), &all_members, want).await?; + router.assert_stable_cluster(Some(1), Some(want)).await; // Still in term 1, so leader is still node 0. + } - // Send some requests - router.client_request_many(0, "client", 100).await; - want += 100; + tracing::info!("--- write 100 logs"); + { + router.client_request_many(0, "client", 100).await; + want += 100; - wait_log(router.clone(), &all_members, want).await?; + wait_log(router.clone(), &all_members, want).await?; + } - // Remove Node 4 tracing::info!("--- remove n{}", 4); - router.change_membership(0, left_members.clone()).await?; - want += 2; // two member-change logs + { + router.change_membership(0, left_members.clone()).await?; + want += 2; // two member-change logs - wait_log(router.clone(), &left_members, want).await?; - router - .wait_for_metrics( - &4u64, - |x| x.state == State::NonVoter, - Some(timeout), - &format!("n{}.state -> {:?}", 4, State::NonVoter), - ) - .await?; + wait_log(router.clone(), &left_members, want).await?; + router + .wait_for_metrics( + &4u64, + |x| x.state == State::NonVoter, + Some(timeout), + &format!("n{}.state -> {:?}", 4, State::NonVoter), + ) + .await?; + } // Send some requests router.client_request_many(0, "client", 100).await; diff --git a/async-raft/tests/fixtures/mod.rs b/async-raft/tests/fixtures/mod.rs index 466732820..db7a4fa88 100644 --- a/async-raft/tests/fixtures/mod.rs +++ b/async-raft/tests/fixtures/mod.rs @@ -7,6 +7,7 @@ use std::collections::BTreeSet; use std::collections::HashSet; use std::env; use std::sync::Arc; +use std::sync::Once; use std::time::Duration; use anyhow::anyhow; @@ -23,6 +24,7 @@ use async_raft::raft::ClientWriteRequest; use async_raft::raft::InstallSnapshotRequest; use async_raft::raft::InstallSnapshotResponse; use async_raft::raft::MembershipConfig; +use async_raft::raft::RaftResponse; use async_raft::raft::VoteRequest; use async_raft::raft::VoteResponse; use async_raft::storage::RaftStorage; @@ -73,6 +75,14 @@ pub type MemRaft = Raft Result<(), ResponseError> { + pub async fn add_non_voter(&self, leader: NodeId, target: NodeId) -> Result { let rt = self.routing_table.read().await; let node = rt.get(&leader).unwrap_or_else(|| panic!("node with ID {} does not exist", leader)); - 
node.0.add_non_voter(target).await + node.0.add_non_voter(target, true).await } - pub async fn change_membership(&self, leader: NodeId, members: BTreeSet) -> Result<(), ResponseError> { + pub async fn add_non_voter_with_blocking( + &self, + leader: NodeId, + target: NodeId, + blocking: bool, + ) -> Result { + let rt = self.routing_table.read().await; + let node = rt.get(&leader).unwrap_or_else(|| panic!("node with ID {} does not exist", leader)); + node.0.add_non_voter(target, blocking).await + } + + pub async fn change_membership( + &self, + leader: NodeId, + members: BTreeSet, + ) -> Result { + let rt = self.routing_table.read().await; + let node = rt.get(&leader).unwrap_or_else(|| panic!("node with ID {} does not exist", leader)); + node.0.change_membership(members, true).await + } + + pub async fn change_membership_with_blocking( + &self, + leader: NodeId, + members: BTreeSet, + blocking: bool, + ) -> Result { let rt = self.routing_table.read().await; let node = rt.get(&leader).unwrap_or_else(|| panic!("node with ID {} does not exist", leader)); - node.0.change_membership(members).await + node.0.change_membership(members, blocking).await } /// Send a client read request to the target node. @@ -476,7 +512,7 @@ impl RaftRouter { "node {} has last_log_index {}, expected 0", node.id, node.last_log_index ); - let members = node.membership_config.members.iter().collect::>(); + let members = node.membership_config.membership.members.iter().collect::>(); assert_eq!( members, vec![&node.id], @@ -485,7 +521,7 @@ impl RaftRouter { members ); assert!( - node.membership_config.members_after_consensus.is_none(), + node.membership_config.membership.members_after_consensus.is_none(), "node {} is in joint consensus, expected uniform consensus", node.id ); @@ -556,7 +592,7 @@ impl RaftRouter { "node {} has last_log_index {}, expected {}", node.id, node.last_log_index, expected_last_log ); - let mut members = node.membership_config.members.iter().cloned().collect::>(); + let mut members = node.membership_config.membership.members.iter().cloned().collect::>(); members.sort_unstable(); assert_eq!( members, all_nodes, @@ -564,7 +600,7 @@ impl RaftRouter { node.id, members, all_nodes ); assert!( - node.membership_config.members_after_consensus.is_none(), + node.membership_config.membership.members_after_consensus.is_none(), "node {} was not in uniform consensus state", node.id ); diff --git a/async-raft/tests/initialization.rs b/async-raft/tests/initialization.rs index 96857ff7b..af8432d79 100644 --- a/async-raft/tests/initialization.rs +++ b/async-raft/tests/initialization.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::Result; use async_raft::raft::EntryPayload; use async_raft::raft::MembershipConfig; +use async_raft::ActiveMembership; use async_raft::Config; use async_raft::LogId; use async_raft::RaftStorage; @@ -67,10 +68,13 @@ async fn initialization() -> Result<()> { let sm_mem = sto.last_applied_state().await?.1; assert_eq!( - Some((LogId { term: 1, index: 1 }, MembershipConfig { - members: btreeset![0, 1, 2], - members_after_consensus: None, - })), + Some(ActiveMembership { + log_id: LogId { term: 1, index: 1 }, + membership: MembershipConfig { + members: btreeset![0, 1, 2], + members_after_consensus: None, + } + }), sm_mem ); } diff --git a/async-raft/tests/leader_metrics.rs b/async-raft/tests/leader_metrics.rs index f64fe0fdc..70e802392 100644 --- a/async-raft/tests/leader_metrics.rs +++ b/async-raft/tests/leader_metrics.rs @@ -131,39 +131,51 @@ async fn leader_metrics() -> Result<()> { want += 
10; tracing::info!("--- remove n{}", 4); + { + router.change_membership(0, left_members.clone()).await?; + want += 2; // two member-change logs - router.change_membership(0, left_members.clone()).await?; - want += 2; // two member-change logs - - router - .wait_for_metrics( - &4, - |x| x.state == State::NonVoter, - timeout, - &format!("n{}.state -> {:?}", 4, State::NonVoter), - ) - .await?; + tracing::info!("--- n{} should revert to non-voter", 4); + router + .wait_for_metrics( + &4, + |x| x.state == State::NonVoter, + timeout, + &format!("n{}.state -> {:?}", 4, State::NonVoter), + ) + .await?; - router.wait_for_log(&left_members, want, timeout, "removed node 4").await?; + router + .wait_for_log( + &left_members, + want, + timeout, + "other nodes should commit the membership change log", + ) + .await?; + } - let ww = ReplicationMetrics { - matched: LogId { term: 1, index: want }, - }; - let want_repl = hashmap! { 1=>ww.clone(), 2=>ww.clone(), 3=>ww.clone()}; - router - .wait_for_metrics( - &0, - |x| { - if let Some(ref q) = x.leader_metrics { - q.replication == want_repl - } else { - false - } - }, - timeout, - "replication metrics to 3 nodes", - ) - .await?; + tracing::info!("--- replication metrics should reflect the replication state"); + { + let ww = ReplicationMetrics { + matched: LogId { term: 1, index: want }, + }; + let want_repl = hashmap! { 1=>ww.clone(), 2=>ww.clone(), 3=>ww.clone()}; + router + .wait_for_metrics( + &0, + |x| { + if let Some(ref q) = x.leader_metrics { + q.replication == want_repl + } else { + false + } + }, + timeout, + "replication metrics to 3 nodes", + ) + .await?; + } let leader = router.current_leader(0).await.unwrap(); diff --git a/async-raft/tests/members.rs b/async-raft/tests/members.rs new file mode 100644 index 000000000..857d372a7 --- /dev/null +++ b/async-raft/tests/members.rs @@ -0,0 +1,127 @@ +use std::sync::Arc; +use std::time::Duration; + +use anyhow::Result; +use async_raft::error::ChangeConfigError; +use async_raft::error::ResponseError; +use async_raft::Config; +use async_raft::RaftStorage; +use fixtures::RaftRouter; +use maplit::btreeset; + +#[macro_use] +mod fixtures; + +/// RUST_LOG=async_raft,memstore,members=trace cargo test -p async-raft --test members +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn members_add_lagging_non_voter_non_blocking() -> Result<()> { + // Add a lagging non-voter into the membership config, expect error NonVoterIsLagging. + + let (_log_guard, ut_span) = init_ut!(); + let _ent = ut_span.enter(); + + let lag_threshold = 1; + + let config = Arc::new( + Config { + replication_lag_threshold: lag_threshold, + ..Default::default() + } + .validate()?, + ); + let router = Arc::new(RaftRouter::new(config.clone())); + + let mut n_logs = router.new_nodes_from_single(btreeset! {0}, btreeset! {1}).await?; + + tracing::info!("--- stop replication by isolating node 1"); + { + router.isolate_node(1).await; + } + + tracing::info!("--- write up to 500 logs"); + { + router.client_request_many(0, "non_voter_add", 500 - n_logs as usize).await; + n_logs = 500; + + router.wait(&0, timeout()).await?.log(n_logs, "received 500 logs").await?; + } + + tracing::info!("--- restore replication and change membership at once, expect NonVoterIsLagging"); + { + router.restore_node(1).await; + let res = router.change_membership_with_blocking(0, btreeset!
{0,1}, false).await; + + tracing::info!("--- got res: {:?}", res); + + let err = res.unwrap_err(); + + match err { + ResponseError::ChangeConfig(e) => match e { + ChangeConfigError::NonVoterIsLagging { node_id, distance } => { + tracing::info!(distance, "--- distance"); + assert_eq!(1, node_id); + assert!(distance >= lag_threshold); + assert!(distance < 500); + } + _ => { + panic!("expect ChangeConfigError::NonVoterIsLagging"); + } + }, + _ => { + panic!("expect ResponseError::ChangeConfig"); + } + } + } + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn members_add_absent_non_voter_blocking() -> Result<()> { + // Add a member without first adding it as a non-voter; in blocking mode it should finish successfully. + + let (_log_guard, ut_span) = init_ut!(); + let _ent = ut_span.enter(); + + let lag_threshold = 1; + + let config = Arc::new( + Config { + replication_lag_threshold: lag_threshold, + ..Default::default() + } + .validate()?, + ); + let router = Arc::new(RaftRouter::new(config.clone())); + + let mut n_logs = router.new_nodes_from_single(btreeset! {0}, btreeset! {}).await?; + + tracing::info!("--- write up to 100 logs"); + { + router.client_request_many(0, "non_voter_add", 100 - n_logs as usize).await; + n_logs = 100; + + router.wait(&0, timeout()).await?.log(n_logs, "received 100 logs").await?; + } + + tracing::info!("--- change membership without adding a non-voter first"); + { + router.new_raft_node(1).await; + + let res = router.change_membership_with_blocking(0, btreeset! {0,1}, true).await?; + n_logs += 2; + tracing::info!("--- change_membership blocks until success: {:?}", res); + + for node_id in 0..2 { + let sto = router.get_storage_handle(&node_id).await?; + let logs = sto.get_log_entries(..).await?; + assert_eq!(n_logs, logs.len() as u64); + } + } + + Ok(()) +} + +fn timeout() -> Option { + Some(Duration::from_micros(500)) +} diff --git a/async-raft/tests/members_leader_fix_partial.rs b/async-raft/tests/members_leader_fix_partial.rs index 29c434a74..e81b28709 100644 --- a/async-raft/tests/members_leader_fix_partial.rs +++ b/async-raft/tests/members_leader_fix_partial.rs @@ -1,5 +1,4 @@ use std::sync::Arc; -use std::time::Duration; use anyhow::Result; use async_raft::raft::Entry; @@ -20,7 +19,7 @@ mod fixtures; /// /// - brings up 1 leader. /// - manually append a joint config log. -/// - shutdown and restart, it should add another final config log to complete the partial +/// - shutdown and restart, it should NOT add another final config log to complete the partial /// membership changing /// /// RUST_LOG=async_raft,memstore,members_leader_fix_partial=trace cargo test -p async-raft --test @@ -58,35 +57,39 @@ async fn members_leader_fix_partial() -> Result<()> { // A joint log and the leader should add a new final config log. want += 2; + let _ = want; + // To let the router not panic router.new_raft_node(1).await; router.new_raft_node(2).await; let node = Raft::new(0, config.clone(), router.clone(), sto.clone()); - node.wait(Some(Duration::from_millis(500))) - .metrics( - |x| x.last_log_index == want, - "wait for leader to complete the final config log", - ) - .await?; - - let final_log = sto.get_log_entries(want..=want).await?[0].clone(); - - let m = match final_log.payload { - EntryPayload::ConfigChange(ref m) => m.membership.clone(), - _ => { - panic!("expect membership config log") - } - }; + let _ = node; - assert_eq!( - MembershipConfig { - members: btreeset!
{0,1,2}, - members_after_consensus: None, - }, - m - ); + // node.wait(Some(Duration::from_millis(500))) + // .metrics( + // |x| x.last_log_index == want, + // "wait for leader to complete the final config log", + // ) + // .await?; + // + // let final_log = sto.get_log_entries(want..=want).await?[0].clone(); + // + // let m = match final_log.payload { + // EntryPayload::ConfigChange(ref m) => m.membership.clone(), + // _ => { + // panic!("expect membership config log") + // } + // }; + // + // assert_eq!( + // MembershipConfig { + // members: btreeset! {0,1,2}, + // members_after_consensus: None, + // }, + // m + // ); Ok(()) } diff --git a/async-raft/tests/non_voter_add.rs b/async-raft/tests/non_voter_add.rs new file mode 100644 index 000000000..4964d4785 --- /dev/null +++ b/async-raft/tests/non_voter_add.rs @@ -0,0 +1,119 @@ +use std::sync::Arc; +use std::time::Duration; + +use anyhow::Result; +use async_raft::raft::RaftResponse; +use async_raft::Config; +use async_raft::LogId; +use async_raft::RaftStorage; +use fixtures::RaftRouter; +use maplit::btreeset; + +#[macro_use] +mod fixtures; + +/// RUST_LOG=async_raft,memstore,non_voter_add=trace cargo test -p async-raft --test non_voter_add +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn non_voter_add_readd() -> Result<()> { + // + // - Add leader, expect NoChange + // - Add a non-voter, expect raft to block until catching up. + // - Re-adding it should return NoChange instead of an error. + + let (_log_guard, ut_span) = init_ut!(); + let _ent = ut_span.enter(); + + let config = Arc::new( + Config { + replication_lag_threshold: 0, + ..Default::default() + } + .validate()?, + ); + let router = Arc::new(RaftRouter::new(config.clone())); + + let mut n_logs = router.new_nodes_from_single(btreeset! {0}, btreeset! {}).await?; + + tracing::info!("--- re-adding leader does nothing"); + { + let res = router.add_non_voter(0, 0).await?; + assert_eq!(RaftResponse::NoChange, res); + } + + tracing::info!("--- add new node node-1"); + { + tracing::info!("--- write up to 1000 logs"); + + router.client_request_many(0, "non_voter_add", 1000 - n_logs as usize).await; + n_logs = 1000; + + tracing::info!("--- write up to 1000 logs done"); + + router.wait_for_log(&btreeset! {0}, n_logs, timeout(), "write 1000 logs to leader").await?; + + router.new_raft_node(1).await; + router.add_non_voter(0, 1).await?; + + tracing::info!("--- add_non_voter blocks until the replication catches up"); + let sto1 = router.get_storage_handle(&1).await?; + assert_eq!(n_logs, sto1.get_log_entries(..).await?.len() as u64); + + router.wait_for_log(&btreeset! {0,1}, n_logs, timeout(), "replication to non_voter").await?; + } + + tracing::info!("--- re-add node-1, expect NoChange"); + { + let res = router.add_non_voter(0, 1).await?; + assert_eq!(RaftResponse::NoChange, res); + } + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn non_voter_add_non_blocking() -> Result<()> { + // + // - Add a non-voter in non-blocking mode. + // - The call should return at once, before replication catches up. + + let (_log_guard, ut_span) = init_ut!(); + let _ent = ut_span.enter(); + + let config = Arc::new( + Config { + replication_lag_threshold: 0, + ..Default::default() + } + .validate()?, + ); + let router = Arc::new(RaftRouter::new(config.clone())); + + let mut n_logs = router.new_nodes_from_single(btreeset! {0}, btreeset!
{}).await?; + + tracing::info!("--- add new node node-1, in non blocking mode"); + { + tracing::info!("--- write up to 100 logs"); + + router.client_request_many(0, "non_voter_add", 100 - n_logs as usize).await; + n_logs = 100; + + router.wait(&0, timeout()).await?.log(n_logs, "received 100 logs").await?; + + router.new_raft_node(1).await; + let res = router.add_non_voter_with_blocking(0, 1, false).await?; + + assert_eq!( + RaftResponse::LogId { + log_id: LogId { term: 0, index: 0 } + }, + res + ); + } + + Ok(()) +} + +fn timeout() -> Option { + Some(Duration::from_micros(500)) +} diff --git a/async-raft/tests/snapshot_overrides_membership.rs b/async-raft/tests/snapshot_overrides_membership.rs index 7870ca0aa..9205b6e81 100644 --- a/async-raft/tests/snapshot_overrides_membership.rs +++ b/async-raft/tests/snapshot_overrides_membership.rs @@ -119,7 +119,7 @@ async fn snapshot_overrides_membership() -> Result<()> { members: btreeset![2, 3], members_after_consensus: None, }, - m + m.membership ); } } @@ -154,7 +154,7 @@ async fn snapshot_overrides_membership() -> Result<()> { members: btreeset![0], members_after_consensus: None, }, - m, + m.membership, "membership should be overridden by the snapshot" ); } diff --git a/async-raft/tests/snapshot_uses_prev_snap_membership.rs b/async-raft/tests/snapshot_uses_prev_snap_membership.rs index bbf60c98a..dca0ed085 100644 --- a/async-raft/tests/snapshot_uses_prev_snap_membership.rs +++ b/async-raft/tests/snapshot_uses_prev_snap_membership.rs @@ -87,7 +87,7 @@ async fn snapshot_uses_prev_snap_membership() -> Result<()> { members: btreeset![0, 1], members_after_consensus: None, }, - m, + m.membership, "membership " ); @@ -130,7 +130,7 @@ async fn snapshot_uses_prev_snap_membership() -> Result<()> { members: btreeset![0, 1], members_after_consensus: None, }, - m, + m.membership, "membership " ); } diff --git a/async-raft/tests/state_machien_apply_membership.rs b/async-raft/tests/state_machien_apply_membership.rs index 4e4cc64ee..b900212f3 100644 --- a/async-raft/tests/state_machien_apply_membership.rs +++ b/async-raft/tests/state_machien_apply_membership.rs @@ -2,6 +2,7 @@ use std::sync::Arc; use anyhow::Result; use async_raft::raft::MembershipConfig; +use async_raft::ActiveMembership; use async_raft::Config; use async_raft::LogId; use async_raft::RaftStorage; @@ -50,10 +51,13 @@ async fn state_machine_apply_membership() -> Result<()> { for i in 0..=0 { let sto = router.get_storage_handle(&i).await?; assert_eq!( - Some((LogId { term: 1, index: 1 }, MembershipConfig { - members: btreeset![0], - members_after_consensus: None, - })), + Some(ActiveMembership { + log_id: LogId { term: 1, index: 1 }, + membership: MembershipConfig { + members: btreeset![0], + members_after_consensus: None, + } + }), sto.last_applied_state().await?.1 ); } @@ -78,17 +82,27 @@ async fn state_machine_apply_membership() -> Result<()> { router.change_membership(0, btreeset![0, 1, 2]).await?; want += 2; - router.wait_for_log(&btreeset![0, 1, 2, 3, 4], want, None, "cluster of 5 candidates").await?; + // router.wait_for_log(&btreeset![0, 1, 2, 3, 4], want, None, "cluster of 5 candidates").await?; - tracing::info!("--- check applied membership config"); + tracing::info!("--- every node receives joint log"); for i in 0..5 { + router.wait(&i, None).await?.metrics(|x| x.last_applied >= want - 1, "joint log applied").await?; + } + + tracing::info!("--- only 3 node applied membership config"); + for i in 0..3 { + router.wait(&i, None).await?.metrics(|x| x.last_applied == want, "uniform log 
applied").await?; + let sto = router.get_storage_handle(&i).await?; let (_, last_membership) = sto.last_applied_state().await?; assert_eq!( - Some((LogId { term: 1, index: 3 }, MembershipConfig { - members: btreeset![0, 1, 2], - members_after_consensus: None, - })), + Some(ActiveMembership { + log_id: LogId { term: 1, index: 3 }, + membership: MembershipConfig { + members: btreeset![0, 1, 2], + members_after_consensus: None, + } + }), last_membership ); } diff --git a/async-raft/tests/stepdown.rs b/async-raft/tests/stepdown.rs index ae4c23faf..ada6c0d08 100644 --- a/async-raft/tests/stepdown.rs +++ b/async-raft/tests/stepdown.rs @@ -83,7 +83,7 @@ async fn stepdown() -> Result<()> { .into_iter() .find(|node| node.id == 0) .expect("expected to find metrics on original leader node"); - let cfg = metrics.membership_config; + let cfg = metrics.membership_config.membership; assert!( metrics.state != State::Leader, "expected old leader to have stepped down" diff --git a/memstore/src/lib.rs b/memstore/src/lib.rs index ac5170ee2..eb92e14f6 100644 --- a/memstore/src/lib.rs +++ b/memstore/src/lib.rs @@ -22,6 +22,7 @@ use async_raft::raft::MembershipConfig; use async_raft::storage::HardState; use async_raft::storage::InitialState; use async_raft::storage::Snapshot; +use async_raft::ActiveMembership; use async_raft::AppData; use async_raft::AppDataResponse; use async_raft::DefensiveError; @@ -78,7 +79,7 @@ pub struct MemStoreSnapshot { pub struct MemStoreStateMachine { pub last_applied_log: LogId, - pub last_membership: Option<(LogId, MembershipConfig)>, + pub last_membership: Option, /// A mapping of client IDs to their state info. pub client_serial_responses: HashMap)>, @@ -472,20 +473,23 @@ impl RaftStorageDebug for MemStore { } impl MemStore { - fn find_first_membership_log<'a, T, D>(mut it: T) -> Option<(LogId, MembershipConfig)> + fn find_first_membership_log<'a, T, D>(mut it: T) -> Option where T: 'a + Iterator>, D: AppData, { it.find_map(|entry| match &entry.payload { - EntryPayload::ConfigChange(cfg) => Some((entry.log_id, cfg.membership.clone())), + EntryPayload::ConfigChange(cfg) => Some(ActiveMembership { + log_id: entry.log_id, + membership: cfg.membership.clone(), + }), _ => None, }) } /// Go backwards through the log to find the most recent membership config <= `upto_index`. #[tracing::instrument(level = "trace", skip(self))] - pub async fn get_membership_from_log(&self, upto_index: Option) -> Result { + pub async fn get_membership_from_log(&self, upto_index: Option) -> Result { self.defensive_no_dirty_log().await?; let membership_in_log = { @@ -506,7 +510,7 @@ impl MemStore { let (_, membership_in_sm) = self.last_applied_state().await?; let membership = - if membership_in_log.as_ref().map(|(id, _)| id.index) > membership_in_sm.as_ref().map(|(id, _)| id.index) { + if membership_in_log.as_ref().map(|x| x.log_id.index) > membership_in_sm.as_ref().map(|x| x.log_id.index) { membership_in_log } else { membership_in_sm @@ -515,8 +519,11 @@ impl MemStore { // Create a default one if both are None. 
Ok(match membership { - Some((_id, cfg)) => cfg, - None => MembershipConfig::new_initial(self.id), + Some(x) => x, + None => ActiveMembership { + log_id: LogId { term: 0, index: 0 }, + membership: MembershipConfig::new_initial(self.id), + }, }) } } @@ -532,7 +539,7 @@ impl RaftStorage for MemStore { } #[tracing::instrument(level = "trace", skip(self))] - async fn get_membership_config(&self) -> Result { + async fn get_membership_config(&self) -> Result { self.get_membership_from_log(None).await } @@ -559,7 +566,7 @@ impl RaftStorage for MemStore { last_log_id, last_applied, hard_state: inner.clone(), - membership, + last_membership: membership, }) } None => { @@ -609,7 +616,7 @@ impl RaftStorage for MemStore { Ok(last) } - async fn last_applied_state(&self) -> Result<(LogId, Option<(LogId, MembershipConfig)>), StorageError> { + async fn last_applied_state(&self) -> Result<(LogId, Option), StorageError> { let sm = self.sm.read().await; Ok((sm.last_applied_log, sm.last_membership.clone())) } @@ -678,7 +685,10 @@ impl RaftStorage for MemStore { res.push(ClientResponse(previous)); } EntryPayload::ConfigChange(ref mem) => { - sm.last_membership = Some((entry.log_id, mem.membership.clone())); + sm.last_membership = Some(ActiveMembership { + log_id: entry.log_id, + membership: mem.membership.clone(), + }); res.push(ClientResponse(None)) } }; @@ -701,7 +711,7 @@ impl RaftStorage for MemStore { membership_config = sm .last_membership .clone() - .map(|(_id, mem)| mem) + .map(|x| x.membership) .unwrap_or_else(|| MembershipConfig::new_initial(self.id)); } diff --git a/memstore/src/test.rs b/memstore/src/test.rs index 3fddff9fa..2a88a63f7 100644 --- a/memstore/src/test.rs +++ b/memstore/src/test.rs @@ -136,7 +136,7 @@ where members: btreeset! {NODE_ID}, members_after_consensus: None, }, - membership, + membership.membership, ); Ok(()) @@ -172,7 +172,7 @@ where members: btreeset! {3,4,5}, members_after_consensus: None, }, - mem, + mem.membership, ); } @@ -197,7 +197,7 @@ where members: btreeset! {3, 4, 5}, members_after_consensus: None, }, - mem, + mem.membership, ); } @@ -222,7 +222,7 @@ where members: btreeset! {1,2,3}, members_after_consensus: None, }, - mem, + mem.membership, ); } @@ -255,7 +255,7 @@ where members: btreeset! {NODE_ID}, members_after_consensus: None, }, - initial.membership, + initial.last_membership.membership, ); assert_eq!( @@ -341,7 +341,7 @@ where members: btreeset! {3,4,5}, members_after_consensus: None, }, - initial.membership, + initial.last_membership.membership, ); } @@ -366,7 +366,7 @@ where members: btreeset! {3, 4, 5}, members_after_consensus: None, }, - initial.membership, + initial.last_membership.membership, ); } @@ -391,7 +391,7 @@ where members: btreeset! {1,2,3}, members_after_consensus: None, }, - initial.membership, + initial.last_membership.membership, ); } @@ -592,10 +592,13 @@ where let (applied, membership) = store.last_applied_state().await?; assert_eq!(LogId { term: 1, index: 3 }, applied); assert_eq!( - Some((LogId { term: 1, index: 3 }, MembershipConfig { - members: btreeset! {1,2}, - members_after_consensus: None - })), + Some(ActiveMembership { + log_id: LogId { term: 1, index: 3 }, + membership: MembershipConfig { + members: btreeset! {1,2}, + members_after_consensus: None + } + }), membership ); } @@ -612,10 +615,13 @@ where let (applied, membership) = store.last_applied_state().await?; assert_eq!(LogId { term: 1, index: 5 }, applied); assert_eq!( - Some((LogId { term: 1, index: 3 }, MembershipConfig { - members: btreeset! 
{1,2}, - members_after_consensus: None - })), + Some(ActiveMembership { + log_id: LogId { term: 1, index: 3 }, + membership: MembershipConfig { + members: btreeset! {1,2}, + members_after_consensus: None + } + }), membership ); }
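
Usage sketch (reviewer note, not part of the patch): the snippet below shows how an application might drive the two-argument `add_non_voter` / `change_membership` API introduced in async-raft/src/raft.rs. The helper name `join_cluster`, the retry interval, and the exact trait bounds are assumptions for illustration; the calls, the `RaftResponse` variants, and the `NonVoterIsLagging` error are the ones added by this change. Per the commit notes, a caller that never receives the response of a change-membership request should simply re-send it until the uniform config is applied.

    use std::collections::BTreeSet;
    use std::time::Duration;

    use async_raft::error::ChangeConfigError;
    use async_raft::error::ResponseError;
    use async_raft::raft::RaftResponse;
    use async_raft::AppData;
    use async_raft::AppDataResponse;
    use async_raft::NodeId;
    use async_raft::Raft;
    use async_raft::RaftNetwork;
    use async_raft::RaftStorage;
    use tokio::time::sleep;

    /// Hypothetical helper: add `new_node` to the cluster, then change the membership to `members`.
    async fn join_cluster<D, R, N, S>(
        raft: &Raft<D, R, N, S>,
        new_node: NodeId,
        members: BTreeSet<NodeId>,
    ) -> Result<(), ResponseError>
    where
        D: AppData,
        R: AppDataResponse,
        N: RaftNetwork<D>,
        S: RaftStorage<D, R>,
    {
        // Step 1: set up replication to the new node. With blocking == false the call
        // returns as soon as replication is set up; `NoChange` means the node is
        // already a voter or non-voter, which is fine here.
        let _ = raft.add_non_voter(new_node, false).await?;

        // Step 2: propose the joint config and then the uniform config. With
        // blocking == false this fails fast with `NonVoterIsLagging` while the new
        // node is still catching up, so the same request is simply re-sent.
        loop {
            match raft.change_membership(members.clone(), false).await {
                Ok(resp) => {
                    // Either the final `Membership` response, or `NoChange` if nothing differs.
                    tracing::info!(?resp, "membership change applied");
                    return Ok(());
                }
                Err(ResponseError::ChangeConfig(ChangeConfigError::NonVoterIsLagging { node_id, distance })) => {
                    tracing::info!(node_id, distance, "non-voter still lagging, retrying");
                    sleep(Duration::from_millis(100)).await;
                }
                Err(e) => return Err(e),
            }
        }
    }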