From e332ae4444a7e46162693125680dcbdbf5c253d0 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 12 Apr 2022 20:54:36 +0800 Subject: [PATCH 1/4] raftstore: Introduce force leader state (#11932) close tikv/tikv#6107, ref tikv/tikv#10483 Signed-off-by: Connor1996 --- Cargo.lock | 1 + components/error_code/src/raftstore.rs | 3 + components/raftstore/Cargo.toml | 1 + components/raftstore/src/errors.rs | 9 + components/raftstore/src/store/fsm/peer.rs | 305 +++++++- components/raftstore/src/store/msg.rs | 6 +- components/raftstore/src/store/peer.rs | 89 ++- components/raftstore/src/store/util.rs | 14 - components/test_raftstore/src/cluster.rs | 22 + components/test_raftstore/src/pd.rs | 5 - tests/integrations/raftstore/mod.rs | 2 +- .../raftstore/test_unsafe_recover.rs | 86 -- .../raftstore/test_unsafe_recovery.rs | 739 ++++++++++++++++++ 13 files changed, 1168 insertions(+), 114 deletions(-) delete mode 100644 tests/integrations/raftstore/test_unsafe_recover.rs create mode 100644 tests/integrations/raftstore/test_unsafe_recovery.rs diff --git a/Cargo.lock b/Cargo.lock index 3b204784429..040735c07ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3491,6 +3491,7 @@ dependencies = [ "fs2", "futures 0.3.15", "futures-util", + "getset", "into_other", "itertools 0.10.0", "keys", diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index eebd6fbe403..d7bf1af4ad5 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -28,6 +28,7 @@ define_error_codes!( DATA_IS_NOT_READY => ("DataIsNotReady", "", ""), DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), PENDING_PREPARE_MERGE => ("PendingPrepareMerge", "", ""), + RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -60,6 +61,8 @@ impl ErrorCodeExt for errorpb::Error { PROPOSAL_IN_MERGING_MODE } else if self.has_data_is_not_ready() { DATA_IS_NOT_READY + } else if self.has_recovery_in_progress() { + RECOVERY_IN_PROGRESS } else { UNKNOWN } diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 935e5f59f71..d90c0b3563d 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -42,6 +42,7 @@ file_system = { path = "../file_system", default-features = false } fs2 = "0.4" futures = "0.3" futures-util = { version = "0.3.1", default-features = false, features = ["io"] } +getset = "0.1" into_other = { path = "../into_other", default-features = false } itertools = "0.10" keys = { path = "../keys", default-features = false } diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 801626ac5fc..9929f43b7f7 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -60,6 +60,9 @@ pub enum Error { #[error("store ids {0:?}, errmsg {1}")] DiskFull(Vec, String), + #[error("region {0} is in the recovery progress")] + RecoveryInProgress(u64), + #[error( "key {} is not in region key range [{}, {}) for region {}", log_wrappers::Value::key(.0), @@ -238,6 +241,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_region_not_initialized(e); } + Error::RecoveryInProgress(region_id) => { + let mut e = errorpb::RecoveryInProgress::default(); + e.set_region_id(region_id); + errorpb.set_recovery_in_progress(e); + } _ => {} }; @@ -271,6 +279,7 @@ impl ErrorCodeExt for Error { Error::RegionNotFound(_) => error_code::raftstore::REGION_NOT_FOUND, Error::NotLeader(..) 
=> error_code::raftstore::NOT_LEADER, Error::DiskFull(..) => error_code::raftstore::DISK_FULL, + Error::RecoveryInProgress(..) => error_code::raftstore::RECOVERY_IN_PROGRESS, Error::StaleCommand => error_code::raftstore::STALE_COMMAND, Error::RegionNotInitialized(_) => error_code::raftstore::REGION_NOT_INITIALIZED, Error::KeyNotInRegion(..) => error_code::raftstore::KEY_NOT_IN_REGION, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 6b3ecdc32ad..f747309f0e4 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -66,11 +66,12 @@ use crate::store::memory::*; use crate::store::metrics::*; use crate::store::msg::{Callback, ExtCallback, InspectedRaftMessage}; use crate::store::peer::{ - ConsistencyState, Peer, PersistSnapshotResult, StaleState, TRANSFER_LEADER_COMMAND_REPLY_CTX, + ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, StaleState, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }; use crate::store::peer_storage::write_peer_state; use crate::store::transport::Transport; -use crate::store::util::{is_learner, KeysInfoFormatter}; +use crate::store::util::{is_learner, KeysInfoFormatter, LeaseState}; use crate::store::worker::{ ConsistencyCheckTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, @@ -1269,6 +1270,275 @@ where SignificantMsg::RaftlogFetched { context, res } => { self.on_raft_log_fetched(context, res); } + SignificantMsg::EnterForceLeaderState { failed_stores } => { + self.on_enter_pre_force_leader(failed_stores) + } + SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(), + } + } + + fn on_enter_pre_force_leader(&mut self, failed_stores: HashSet) { + match self.fsm.peer.force_leader { + Some(ForceLeaderState::PreForceLeader { .. }) => { + self.on_force_leader_fail(); + } + Some(ForceLeaderState::ForceLeader { .. }) => { + // already is a force leader, do nothing + return; + } + None => {} + } + + if self.fsm.peer.is_leader() { + warn!( + "reject pre force leader due to already being leader"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + + // When election timeout is triggered, leader_id is set to INVALID_ID. + // But learner(not promotable) is a exception here as it wouldn't tick + // election. + if self.fsm.peer.raft_group.raft.promotable() + && self.fsm.hibernate_state.group_state() != GroupState::Idle + && self.fsm.peer.leader_id() != raft::INVALID_ID + { + warn!( + "reject pre force leader due to leader lease may not expired"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + + let expected_alive_voter = self.get_force_leader_expected_alive_voter(&failed_stores); + if self + .fsm + .peer + .raft_group + .raft + .prs() + .has_quorum(&expected_alive_voter) + { + warn!( + "reject pre force leader due to has quorum"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + + info!( + "enter pre force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "alive_voter" => ?expected_alive_voter, + ); + + // Do not use prevote as prevote won't set `vote` to itself. + // When PD issues force leader on two different peer, it may cause + // two force leader in same term. 
+ self.fsm.peer.raft_group.raft.pre_vote = false; + // trigger vote request to all voters, will check the vote result in `check_force_leader` + self.fsm.peer.raft_group.campaign().unwrap(); + assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); + if !self + .fsm + .peer + .raft_group + .raft + .prs() + .votes() + .get(&self.fsm.peer.peer_id()) + .unwrap() + { + warn!( + "pre force leader failed to campaign"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + self.on_force_leader_fail(); + return; + } + + self.fsm.peer.force_leader = Some(ForceLeaderState::PreForceLeader { failed_stores }); + self.fsm.has_ready = true; + } + + fn on_force_leader_fail(&mut self) { + self.fsm.peer.raft_group.raft.pre_vote = true; + self.fsm.peer.raft_group.raft.set_check_quorum(true); + self.fsm.peer.force_leader = None; + } + + fn on_enter_force_leader(&mut self) { + info!( + "enter force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); + + let failed_stores = match self.fsm.peer.force_leader.take() { + Some(ForceLeaderState::PreForceLeader { failed_stores }) => failed_stores, + _ => unreachable!(), + }; + + let peer_ids: Vec<_> = self.fsm.peer.voters().iter().collect(); + for peer_id in peer_ids { + let store_id = self + .region() + .get_peers() + .iter() + .find(|p| p.get_id() == peer_id) + .unwrap() + .get_store_id(); + if !failed_stores.contains(&store_id) { + continue; + } + + // make fake vote response + let mut msg = raft::eraftpb::Message::new(); + msg.msg_type = MessageType::MsgRequestVoteResponse; + msg.reject = false; + msg.term = self.fsm.peer.term(); + msg.from = peer_id; + msg.to = self.fsm.peer.peer_id(); + self.fsm.peer.raft_group.step(msg).unwrap(); + } + + // after receiving all votes, should become leader + assert!(self.fsm.peer.is_leader()); + self.fsm.peer.raft_group.raft.set_check_quorum(false); + + // make sure it's not hibernated + self.reset_raft_tick(GroupState::Ordered); + + self.fsm.peer.force_leader = Some(ForceLeaderState::ForceLeader { failed_stores }); + self.fsm.has_ready = true; + } + + fn on_exit_force_leader(&mut self) { + if self.fsm.peer.force_leader.is_none() { + return; + } + + info!( + "exit force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + self.fsm.peer.force_leader = None; + // make sure it's not hibernated + self.reset_raft_tick(GroupState::Ordered); + // leader lease shouldn't be renewed in force leader state. + assert_eq!( + self.fsm.peer.leader_lease().inspect(None), + LeaseState::Expired + ); + self.fsm + .peer + .raft_group + .raft + .become_follower(self.fsm.peer.term(), raft::INVALID_ID); + + self.fsm.peer.raft_group.raft.set_check_quorum(true); + self.fsm.peer.raft_group.raft.pre_vote = true; + if self.fsm.peer.raft_group.raft.promotable() { + // let it trigger election immediately. 
+ let _ = self.fsm.peer.raft_group.campaign(); + } + self.fsm.has_ready = true; + } + + #[inline] + fn get_force_leader_expected_alive_voter(&self, failed_stores: &HashSet) -> HashSet { + let region = self.region(); + self.fsm + .peer + .voters() + .iter() + .filter(|peer_id| { + let store_id = region + .get_peers() + .iter() + .find(|p| p.get_id() == *peer_id) + .unwrap() + .get_store_id(); + !failed_stores.contains(&store_id) + }) + .collect() + } + + #[inline] + fn check_force_leader(&mut self) { + let failed_stores = match &self.fsm.peer.force_leader { + None => return, + Some(ForceLeaderState::ForceLeader { .. }) => { + if self.fsm.peer.maybe_force_forward_commit_index() { + self.fsm.has_ready = true; + } + return; + } + Some(ForceLeaderState::PreForceLeader { failed_stores, .. }) => failed_stores, + }; + + if self.fsm.peer.raft_group.raft.election_elapsed + 1 + < self.ctx.cfg.raft_election_timeout_ticks + { + // wait as longer as it can to collect responses of request vote + return; + } + + let expected_alive_voter: HashSet<_> = + self.get_force_leader_expected_alive_voter(failed_stores); + let check = || { + if self.fsm.peer.raft_group.raft.state != StateRole::Candidate { + Err(format!( + "unexpected role {:?}", + self.fsm.peer.raft_group.raft.state + )) + } else { + let mut granted = 0; + for (id, vote) in self.fsm.peer.raft_group.raft.prs().votes() { + if expected_alive_voter.contains(id) { + if *vote { + granted += 1; + } else { + return Err(format!("receive reject response from {}", *id)); + } + } else if *id == self.fsm.peer_id() { + // self may be a learner + continue; + } else { + return Err(format!( + "receive unexpected vote from {} vote {}", + *id, *vote + )); + } + } + Ok(granted) + } + }; + + match check() { + Err(err) => { + warn!( + "pre force leader check failed"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "alive_voter" => ?expected_alive_voter, + "reason" => err, + ); + self.on_force_leader_fail(); + } + Ok(granted) => { + if granted == expected_alive_voter.len() { + self.on_enter_force_leader(); + } + } } } @@ -1382,6 +1652,19 @@ where self.register_raft_gc_log_tick(); self.register_check_leader_lease_tick(); } + + if let Some(ForceLeaderState::ForceLeader { .. 
}) = self.fsm.peer.force_leader { + if r != StateRole::Leader { + // for some reason, it's not leader anymore + info!( + "step down in force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "state" => ?r, + ); + self.on_force_leader_fail(); + } + } } } @@ -1512,6 +1795,8 @@ where self.fsm.peer.retry_pending_reads(&self.ctx.cfg); + self.check_force_leader(); + let mut res = None; if self.ctx.cfg.hibernate_regions { if self.fsm.hibernate_state.group_state() == GroupState::Idle { @@ -1847,9 +2132,7 @@ where Either::Right(v) => v, }; - if util::is_vote_msg(msg.get_message()) - || msg.get_message().get_msg_type() == MessageType::MsgTimeoutNow - { + if util::is_vote_msg(msg.get_message()) || msg_type == MessageType::MsgTimeoutNow { if self.fsm.hibernate_state.group_state() != GroupState::Chaos { self.fsm.reset_hibernate_state(GroupState::Chaos); self.register_raft_base_tick(); @@ -1861,7 +2144,7 @@ where let from_peer_id = msg.get_from_peer().get_id(); self.fsm.peer.insert_peer_cache(msg.take_from_peer()); - let result = if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { + let result = if msg_type == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(msg.get_message(), peer_disk_usage); Ok(()) } else { @@ -4161,6 +4444,16 @@ where let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); + if self.fsm.peer.force_leader.is_some() { + // in force leader state, forbid requests to make the recovery progress less error-prone + if !(msg.has_admin_request() + && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer + || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) + { + return Err(Error::RecoveryInProgress(self.region_id())); + } + } + // ReadIndex can be processed on the replicas. let is_read_index_request = request.len() == 1 && request[0].get_cmd_type() == CmdType::ReadIndex; diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 35e7843a0a4..67954e28b19 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -23,8 +23,8 @@ use crate::store::fsm::apply::{CatchUpLogs, ChangeObserver}; use crate::store::metrics::RaftEventDurationType; use crate::store::util::{KeysInfoFormatter, LatencyInspector}; use crate::store::{RaftlogFetchResult, SnapKey}; +use collections::HashSet; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; - use super::{AbstractPeer, RegionSnapshot}; #[derive(Debug)] @@ -313,6 +313,10 @@ where context: GetEntriesContext, res: Box, }, + EnterForceLeaderState { + failed_stores: HashSet, + }, + ExitForceLeaderState, } /// Message that will be sent to a peer. 
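
For reference, the flow these two messages drive can be sketched with the test-cluster helpers added later in this patch (`enter_force_leader` / `exit_force_leader` in `components/test_raftstore/src/cluster.rs`, plus `find_peer` and the mock PD client). This is a sketch only, not part of the patch; it assumes a region keyed by `k1` whose peers on stores 4 and 5 are the unrecoverable ones, and that the surviving peer lives on store 1:

    // Sketch only: relies on the test_raftstore helpers introduced below.
    let region = cluster.get_region(b"k1");
    let failed_stores: HashSet<u64> = HashSet::from_iter(vec![4, 5]);
    // 1. Ask the surviving peer on store 1 to enter (pre) force leader state.
    cluster.enter_force_leader(region.get_id(), 1, failed_stores);
    // 2. In force leader state only ChangePeer/ChangePeerV2 admin commands are
    //    accepted; remove the peers that live on the failed stores.
    cluster
        .pd_client
        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
    cluster
        .pd_client
        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
    // 3. Leave force leader state; the remaining voters form a quorum again and
    //    a normal election is triggered.
    cluster.exit_force_leader(region.get_id(), 1);

The integration tests added in `tests/integrations/raftstore/test_unsafe_recovery.rs` below exercise exactly this sequence.
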
diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9c3520ba330..9ce6965d34c 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -17,6 +17,7 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; +use getset::Getters; use kvproto::errorpb; use kvproto::kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp, LockInfo}; use kvproto::metapb::{self, PeerRole}; @@ -454,6 +455,13 @@ pub struct ReadyResult { pub has_write_ready: bool, } +#[derive(Debug)] +pub enum ForceLeaderState { + PreForceLeader { failed_stores: HashSet }, + ForceLeader { failed_stores: HashSet }, +} + +#[derive(Getters)] pub struct Peer where EK: KvEngine, @@ -478,6 +486,7 @@ where proposals: ProposalQueue, leader_missing_time: Option, + #[getset(get = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue, @@ -491,6 +500,20 @@ where /// 2. all read requests must be rejected. pub pending_remove: bool, + /// Force leader state is only used in online recovery when the majority of + /// peers are missing. In this state, it forces one peer to become leader out + /// of accordance with Raft election rule, and forbids any read/write proposals. + /// With that, we can further propose remove failed-nodes conf-change, to make + /// the Raft group forms majority and works normally later on. + /// + /// ForceLeader process would be: + /// 1. Enter pre force leader state, become candidate and send request vote to all peers + /// 2. Wait for the responses of the request vote, no reject should be received. + /// 3. Enter force leader state, become leader without leader lease + /// 4. Execute recovery plan(some remove-peer commands) + /// 5. After the plan steps are all applied, exit force leader state + pub force_leader: Option, + /// Record the instants of peers being added into the configuration. /// Remove them after they are not pending any more. pub peers_start_pending_time: Vec<(u64, Instant)>, @@ -682,6 +705,7 @@ where leader_unreachable: false, pending_remove: false, should_wake_up: false, + force_leader: None, pending_merge_state: None, want_rollback_merge_peers: HashSet::default(), pending_request_snapshot_count: Arc::new(AtomicUsize::new(0)), @@ -1605,6 +1629,39 @@ where false } + pub fn maybe_force_forward_commit_index(&mut self) -> bool { + let failed_stores = match &self.force_leader { + Some(ForceLeaderState::ForceLeader { failed_stores }) => failed_stores, + _ => unreachable!(), + }; + + let region = self.region(); + let mut replicated_idx = self.raft_group.raft.raft_log.persisted; + for (peer_id, p) in self.raft_group.raft.prs().iter() { + let store_id = region + .get_peers() + .iter() + .find(|p| p.get_id() == *peer_id) + .unwrap() + .get_store_id(); + if failed_stores.contains(&store_id) { + continue; + } + if replicated_idx > p.matched { + replicated_idx = p.matched; + } + } + + if self.raft_group.store().term(replicated_idx).unwrap_or(0) < self.term() { + // do not commit logs of previous term directly + return false; + } + + self.raft_group.raft.raft_log.committed = + std::cmp::max(self.raft_group.raft.raft_log.committed, replicated_idx); + true + } + pub fn check_stale_state(&mut self, ctx: &mut PollContext) -> StaleState { if self.is_leader() { // Leaders always have valid state. @@ -2495,6 +2552,11 @@ where let persist_index = self.raft_group.raft.raft_log.persisted; self.mut_store().update_cache_persisted(persist_index); + + if let Some(ForceLeaderState::ForceLeader { .. 
}) = self.force_leader { + // forward commit index, the committed entries will be applied in the next raft base tick round + self.maybe_force_forward_commit_index(); + } } if self.apply_snap_ctx.is_some() && self.unpersisted_readies.is_empty() { @@ -2533,6 +2595,10 @@ where self.report_commit_log_duration(pre_commit_index, &ctx.raft_metrics); let persist_index = self.raft_group.raft.raft_log.persisted; + if let Some(ForceLeaderState::ForceLeader { .. }) = self.force_leader { + // forward commit index, the committed entries will be applied in the next raft base tick round + self.maybe_force_forward_commit_index(); + } self.mut_store().update_cache_persisted(persist_index); self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); @@ -2841,6 +2907,13 @@ where "peer_id" => self.peer.get_id(), ); None + } else if self.force_leader.is_some() { + debug!( + "prevents renew lease while in force leader state"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + ); + None } else { self.leader_lease.renew(ts); let term = self.term(); @@ -3153,7 +3226,7 @@ where let promoted_commit_index = after_progress.maximal_committed_index().0; if current_progress.is_singleton() // It's always safe if there is only one node in the cluster. - || promoted_commit_index >= self.get_store().truncated_index() + || promoted_commit_index >= self.get_store().truncated_index() || self.force_leader.is_some() { return Ok(()); } @@ -3778,6 +3851,16 @@ where poll_ctx: &mut PollContext, mut req: RaftCmdRequest, ) -> Result> { + // Should not propose normal in force leader state. + // In `pre_propose_raft_command`, it rejects all the requests expect conf-change + // if in force leader state. + if self.force_leader.is_some() { + panic!( + "{} propose normal in force leader state {:?}", + self.tag, self.force_leader + ); + }; + if (self.pending_merge_state.is_some() && req.get_admin_request().get_cmd_type() != AdminCmdType::RollbackMerge) || (self.prepare_merge_fence > 0 @@ -4139,6 +4222,10 @@ where resp } + pub fn voters(&self) -> raft::util::Union<'_> { + self.raft_group.raft.prs().conf().voters().ids() + } + pub fn term(&self) -> u64 { self.raft_group.raft.term } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 9043982901c..2d0f5dd66c0 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -780,10 +780,6 @@ impl< } } -pub fn integration_on_half_fail_quorum_fn(voters: usize) -> usize { - (voters + 1) / 2 + 1 -} - #[derive(PartialEq, Eq, Debug)] pub enum ConfChangeKind { // Only contains one configuration change @@ -1935,16 +1931,6 @@ mod tests { } } - #[test] - fn test_integration_on_half_fail_quorum_fn() { - let voters = vec![1, 2, 3, 4, 5, 6, 7]; - let quorum = vec![2, 2, 3, 3, 4, 4, 5]; - for (voter_count, expected_quorum) in voters.into_iter().zip(quorum) { - let quorum = super::integration_on_half_fail_quorum_fn(voter_count); - assert_eq!(quorum, expected_quorum); - } - } - #[test] fn test_is_region_initialized() { let mut region = metapb::Region::default(); diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 55f5b991e7c..3e402a03713 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1460,6 +1460,28 @@ impl Cluster { .unwrap(); } + pub fn enter_force_leader( + &mut self, + region_id: u64, + store_id: u64, + failed_stores: HashSet, + ) { + let router = self.sim.rl().get_router(store_id).unwrap(); + 
router + .significant_send( + region_id, + SignificantMsg::EnterForceLeaderState { failed_stores }, + ) + .unwrap(); + } + + pub fn exit_force_leader(&mut self, region_id: u64, store_id: u64) { + let router = self.sim.rl().get_router(store_id).unwrap(); + router + .significant_send(region_id, SignificantMsg::ExitForceLeaderState) + .unwrap(); + } + pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { let mut try_cnt = 0; let split_count = self.pd_client.get_split_count(); diff --git a/components/test_raftstore/src/pd.rs b/components/test_raftstore/src/pd.rs index a436dae8d82..e3238480a31 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_raftstore/src/pd.rs @@ -1462,11 +1462,6 @@ impl PdClient for TestPdClient { { let cluster1 = Arc::clone(&self.cluster); let timer = self.timer.clone(); - { - if let Err(e) = self.cluster.try_write() { - println!("try write {:?}", e); - } - } let mut cluster = self.cluster.wl(); let store = cluster .stores diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index decc5ea1ab7..efa118fb8f1 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -28,5 +28,5 @@ mod test_status_command; mod test_tombstone; mod test_transfer_leader; mod test_transport; -mod test_unsafe_recover; +mod test_unsafe_recovery; mod test_update_region_size; diff --git a/tests/integrations/raftstore/test_unsafe_recover.rs b/tests/integrations/raftstore/test_unsafe_recover.rs deleted file mode 100644 index 5e7b774a6d1..00000000000 --- a/tests/integrations/raftstore/test_unsafe_recover.rs +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -use std::iter::FromIterator; -use std::sync::Arc; - -use futures::executor::block_on; -use kvproto::metapb; -use pd_client::PdClient; -use test_raftstore::*; - -#[test] -fn test_unsafe_recover_update_region() { - let mut cluster = new_server_cluster(0, 3); - cluster.run(); - let nodes = Vec::from_iter(cluster.get_node_ids()); - assert_eq!(nodes.len(), 3); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer number check. - pd_client.disable_default_operator(); - - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - - configure_for_lease_read(&mut cluster, None, None); - cluster.stop_node(nodes[1]); - cluster.stop_node(nodes[2]); - cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey2".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); - } - } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the boostrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - assert_eq!(region.get_end_key(), b"anykey2"); -} - -#[test] -fn test_unsafe_recover_create_region() { - let mut cluster = new_server_cluster(0, 3); - cluster.run(); - let nodes = Vec::from_iter(cluster.get_node_ids()); - assert_eq!(nodes.len(), 3); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer number check. 
- pd_client.disable_default_operator(); - - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - - configure_for_lease_read(&mut cluster, None, None); - cluster.stop_node(nodes[1]); - cluster.stop_node(nodes[2]); - cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); - } - } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the boostrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); - block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - let mut create = metapb::Region::default(); - create.set_id(101); - create.set_start_key(b"anykey".to_vec()); - let mut peer = metapb::Peer::default(); - peer.set_id(102); - peer.set_store_id(nodes[0]); - create.mut_peers().push(peer); - cluster.must_recreate_region_for_unsafe_recover(nodes[0], &create); - let region = pd_client.get_region(b"anykey1").unwrap(); - assert_eq!(create.get_id(), region.get_id()); -} diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs new file mode 100644 index 00000000000..c97c5fa63c5 --- /dev/null +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -0,0 +1,739 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. + +use std::iter::FromIterator; +use std::sync::Arc; +use std::time::Duration; + +use collections::HashSet; +use futures::executor::block_on; +use kvproto::metapb; +use pd_client::PdClient; +use raft::eraftpb::ConfChangeType; +use raft::eraftpb::MessageType; +use test_raftstore::*; +use tikv_util::config::ReadableDuration; +use tikv_util::HandyRwLock; + +#[test] +fn test_unsafe_recover_update_region() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + configure_for_lease_read(&mut cluster, None, None); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); + + let mut update = metapb::Region::default(); + update.set_id(1); + update.set_end_key(b"anykey2".to_vec()); + for p in region.get_peers() { + if p.get_store_id() == nodes[0] { + update.mut_peers().push(p.clone()); + } + } + update.mut_region_epoch().set_version(1); + update.mut_region_epoch().set_conf_ver(1); + // Removes the boostrap region, since it overlaps with any regions we create. + cluster.must_update_region_for_unsafe_recover(nodes[0], &update); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + assert_eq!(region.get_end_key(), b"anykey2"); +} + +#[test] +fn test_unsafe_recover_create_region() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. 
+ pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + configure_for_lease_read(&mut cluster, None, None); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); + + let mut update = metapb::Region::default(); + update.set_id(1); + update.set_end_key(b"anykey".to_vec()); + for p in region.get_peers() { + if p.get_store_id() == nodes[0] { + update.mut_peers().push(p.clone()); + } + } + update.mut_region_epoch().set_version(1); + update.mut_region_epoch().set_conf_ver(1); + // Removes the bootstrap region, since it overlaps with any regions we create. + cluster.must_update_region_for_unsafe_recover(nodes[0], &update); + block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_start_key(b"anykey".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + cluster.must_recreate_region_for_unsafe_recover(nodes[0], &create); + let region = pd_client.get_region(b"anykey1").unwrap(); + assert_eq!(create.get_id(), region.get_id()); +} + +fn must_get_error_recovery_in_progress( + cluster: &mut Cluster, + region: &metapb::Region, + cmd: kvproto::raft_cmdpb::Request, +) { + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![cmd], + true, + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); +} + +// Test the case that two of three nodes fail and force leader on the rest node. +#[test] +fn test_force_leader_three_nodes() { + let mut cluster = new_node_cluster(0, 3); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let mut region = cluster.get_region(b"k2"); + let peer_on_store3 = find_peer(®ion, 3).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store3.clone()); + + cluster.stop_node(2); + cluster.stop_node(3); + + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true); + // quorum is lost, can't propose command successfully. 
+ assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![2, 3])); + // remove the peers on failed nodes + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 2).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + // forbid writes in force leader state + let put = new_put_cmd(b"k3", b"v3"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, put); + // forbid reads in force leader state + let get = new_get_cmd(b"k1"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, get); + // forbid read index in force leader state + let read_index = new_read_index_cmd(); + must_get_error_recovery_in_progress(&mut cluster, ®ion, read_index); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that three of five nodes fail and force leader on one of the +// rest nodes. +#[test] +fn test_force_leader_five_nodes() { + let mut cluster = new_node_cluster(0, 5); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let mut region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true); + // quorum is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // remove the peers on failed nodes + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + // forbid writes in force leader state + let put = new_put_cmd(b"k3", b"v3"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, put); + // forbid reads in force leader state + let get = new_get_cmd(b"k1"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, get); + // forbid read index in force leader state + let read_index = new_read_index_cmd(); + must_get_error_recovery_in_progress(&mut cluster, ®ion, read_index); + + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that three of five nodes fail and force leader on the rest node +// which is a learner. 
+#[test] +fn test_force_leader_for_learner() { + let mut cluster = new_node_cluster(0, 5); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let mut region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + let peer_on_store1 = find_peer(®ion, 1).unwrap(); + // replace one peer with learner + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store1.clone()); + cluster.pd_client.must_add_peer( + region.get_id(), + new_learner_peer(peer_on_store1.get_store_id(), peer_on_store1.get_id()), + ); + + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true); + // quorum is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + + // wait election timeout + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 2, + )); + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // promote the learner first and remove the peers on failed nodes + cluster + .pd_client + .must_add_peer(region.get_id(), find_peer(®ion, 1).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); + cluster.must_transfer_leader(region.get_id(), find_peer(®ion, 1).unwrap().clone()); +} + +// Test the case that three of five nodes fail and force leader on the rest node +// with triggering snapshot. +#[test] +fn test_force_leader_trigger_snapshot() { + let mut cluster = new_node_cluster(0, 5); + configure_for_snapshot(&mut cluster); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_election_timeout_ticks = 10; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store1 = find_peer(®ion, 1).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // Isolate node 2 + cluster.add_send_filter(IsolationFilterFactory::new(2)); + + // Compact logs to force requesting snapshot after clearing send filters. 
+ let state = cluster.truncated_state(region.get_id(), 1); + // Write some data to trigger snapshot. + for i in 100..150 { + let key = format!("k{}", i); + let value = format!("v{}", i); + cluster.must_put(key.as_bytes(), value.as_bytes()); + } + cluster.wait_log_truncated(region.get_id(), 1, state.get_index() + 40); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + // Recover the isolation of 2, but still don't permit snapshot + let recv_filter = Box::new( + RegionPacketFilter::new(region.get_id(), 2) + .direction(Direction::Recv) + .msg_type(MessageType::MsgSnapshot), + ); + cluster.sim.wl().add_recv_filter(2, recv_filter); + cluster.clear_send_filters(); + + // wait election timeout + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 2, + )); + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + + let cmd = new_change_peer_request( + ConfChangeType::RemoveNode, + find_peer(®ion, 3).unwrap().clone(), + ); + let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); + // Though it has a force leader now, but the command can't committed because the log is not replicated to all the alive peers. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap() + .get_header() + .has_error() // error "there is a pending conf change" indicating no committed log after being the leader + ); + + // Permit snapshot message, snapshot should be applied and advance commit index now. + cluster.sim.wl().clear_recv_filters(2); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); + cluster.must_transfer_leader(region.get_id(), find_peer(®ion, 1).unwrap().clone()); +} + +// Test the case that three of five nodes fail and force leader on the rest node +// with uncommitted conf change. 
+#[test] +fn test_force_leader_with_uncommitted_conf_change() { + let mut cluster = new_node_cluster(0, 5); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_election_timeout_ticks = 10; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store1 = find_peer(®ion, 1).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + + // an uncommitted conf-change + let cmd = new_change_peer_request( + ConfChangeType::RemoveNode, + find_peer(®ion, 2).unwrap().clone(), + ); + let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + + // wait election timeout + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 2, + )); + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // the uncommitted conf-change is committed successfully after being force leader + cluster + .pd_client + .must_none_peer(region.get_id(), find_peer(®ion, 2).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), Some(b"v2".to_vec())); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that none of five nodes fails and force leader on one of the nodes. +// Note: It still can't defend extreme misuse cases. For example, a group of a, +// b and c. c is isolated from a, a is the leader. If c has increased its term +// by 2 somehow (for example false prevote success twice) and force leader is +// sent to b and break lease constrain, then b will reject a's heartbeat while +// can vote for c. So c becomes leader and there are two leaders in the group. 
+#[test] +fn test_force_leader_on_healthy_region() { + let mut cluster = new_node_cluster(0, 5); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(30); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + // try to enter force leader, it can't succeed due to quorum isn't lost + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // make sure it leaves pre force leader state. + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 3, + )); + // put and get can propose successfully. + assert_eq!(cluster.must_get(b"k1"), Some(b"v1".to_vec())); + cluster.must_put(b"k2", b"v2"); + + // try to exit force leader, it will be ignored silently as it's not in the force leader state + cluster.exit_force_leader(region.get_id(), 1); + + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that three of five nodes fail and force leader on the one not +// having latest log +#[test] +fn test_force_leader_on_wrong_leader() { + let mut cluster = new_node_cluster(0, 5); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let mut region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + // peer on node2 doesn't have latest committed log + cluster.stop_node(2); + cluster.must_put(b"k2", b"v2"); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + cluster.run_node(2).unwrap(); + + // restart to clean lease + cluster.stop_node(1); + cluster.run_node(1).unwrap(); + + let put = new_put_cmd(b"k3", b"v3"); + let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true); + // quorum is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .is_err() + ); + + // try to force leader on peer of node2 which is stale + cluster.enter_force_leader(region.get_id(), 2, HashSet::from_iter(vec![3, 4, 5])); + // can't propose confchange as it's not in force leader state + let cmd = new_change_peer_request( + ConfChangeType::RemoveNode, + find_peer(®ion, 3).unwrap().clone(), + ); + let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + cluster.exit_force_leader(region.get_id(), 2); + + // peer on node2 still doesn't have the latest committed log. 
+ must_get_none(&cluster.get_engine(2), b"k2"); +} + +// Test the case that three of five nodes fail and force leader twice on +// peers on different nodes +#[test] +fn test_force_leader_twice_on_different_peers() { + let mut cluster = new_node_cluster(0, 5); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + // restart to clean lease + cluster.stop_node(1); + cluster.run_node(1).unwrap(); + cluster.stop_node(2); + cluster.run_node(2).unwrap(); + + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + std::thread::sleep(Duration::from_millis(100)); + // enter force leader on a different peer + cluster.enter_force_leader(region.get_id(), 2, HashSet::from_iter(vec![3, 4, 5])); + // leader is the peer of store 2 + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap(), + *find_peer(®ion, 2).unwrap() + ); + + // peer of store 1 should exit force leader state, and propose conf-change on it should fail + let conf_change = new_change_peer_request(ConfChangeType::RemoveNode, new_peer(3, 3)); + let mut req = new_admin_request(region.get_id(), region.get_region_epoch(), conf_change); + req.mut_header() + .set_peer(find_peer(®ion, 1).unwrap().clone()); + let resp = cluster + .call_command(req, Duration::from_millis(10)) + .unwrap(); + let mut not_leader = kvproto::errorpb::NotLeader { + region_id: region.get_id(), + ..Default::default() + }; + not_leader.set_leader(find_peer(®ion, 2).unwrap().clone()); + assert_eq!(resp.get_header().get_error().get_not_leader(), ¬_leader,); + + // remove the peers on failed nodes + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 2); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that three of five nodes fail and force leader twice on +// peer on the same node +#[test] +fn test_force_leader_twice_on_same_peer() { + let mut cluster = new_node_cluster(0, 5); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + // restart to clean lease + cluster.stop_node(1); + cluster.run_node(1).unwrap(); + cluster.stop_node(2); + cluster.run_node(2).unwrap(); + + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + std::thread::sleep(Duration::from_millis(100)); + // enter force leader on the same peer + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // remove the peers on failed nodes + cluster + .pd_client + 
.must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that three of five nodes fail and force leader doesn't finish +// in one election rounds due to network partition. +#[test] +fn test_force_leader_multiple_election_rounds() { + let mut cluster = new_node_cluster(0, 5); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(30); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + cluster.add_send_filter(IsolationFilterFactory::new(1)); + cluster.add_send_filter(IsolationFilterFactory::new(2)); + + // wait election timeout + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 2, + )); + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // wait multiple election rounds + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 6, + )); + + cluster.clear_send_filters(); + // remove the peers on failed nodes + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} From c010a5f8f7aa7389fc23bfa1bf1bbc91af9ee33c Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 20 Apr 2022 19:14:04 +0800 Subject: [PATCH 2/4] raftstore: Wait ticks for hibernated peer when doing force leader (#12364) ref tikv/tikv#10483 Force leader is rejected on a peer who is already a leader. For the hibernated leader, it doesn't step down to follower when quorum is missing due to not ticking election. So wait ticks in force leader process for hibernated peers to make sure election ticking is performed. 
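
Concretely, the number of ticks to wait is derived from the peer's raft state. A rough sketch of the computation this patch adds in `on_enter_pre_force_leader` (names follow the diff below; `raft` stands for the peer's raft-rs `Raft`, i.e. `raft_group.raft`, and the rejection paths for peers that are not actually hibernated are omitted):

    // Sketch of the wait-ticks computation added below; `is_leader` and
    // `leader_id` stand for `peer.is_leader()` and `peer.leader_id()`.
    let ticks = if is_leader {
        // A hibernated leader never ticked, so wait two election timeouts for
        // check-quorum to step it down before retrying pre force leader.
        Some(raft.election_timeout() * 2 - raft.election_elapsed)
    } else if raft.promotable() && leader_id != raft::INVALID_ID {
        // A promotable follower that still remembers a leader waits one
        // randomized election timeout so its stale leader_id is cleared first.
        Some(raft.randomized_election_timeout() - raft.election_elapsed)
    } else {
        // Learners (not promotable) and already-leaderless peers can enter
        // pre force leader immediately.
        None
    };

When `ticks` is `Some`, the peer enters the new `WaitTicks` state and `check_force_leader` counts the ticks down before re-running `on_enter_pre_force_leader`.
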
Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/peer.rs | 86 +++++++++++++--- components/raftstore/src/store/peer.rs | 30 ++++-- .../raftstore/test_unsafe_recovery.rs | 97 +++++++++++++++++++ 3 files changed, 189 insertions(+), 24 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index f747309f0e4..d631d977a96 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1286,30 +1286,69 @@ where // already is a force leader, do nothing return; } + Some(ForceLeaderState::WaitTicks { .. }) => { + self.fsm.peer.force_leader = None; + } None => {} } - if self.fsm.peer.is_leader() { - warn!( - "reject pre force leader due to already being leader"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return; - } - + let ticks = if self.fsm.peer.is_leader() { + if self.fsm.hibernate_state.group_state() == GroupState::Ordered { + warn!( + "reject pre force leader due to already being leader"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + // wait two rounds of election timeout to trigger check quorum to step down the leader + // note: check quorum is triggered every `election_timeout` instead of `randomized_election_timeout` + Some( + self.fsm.peer.raft_group.raft.election_timeout() * 2 + - self.fsm.peer.raft_group.raft.election_elapsed, + ) // When election timeout is triggered, leader_id is set to INVALID_ID. // But learner(not promotable) is a exception here as it wouldn't tick // election. - if self.fsm.peer.raft_group.raft.promotable() - && self.fsm.hibernate_state.group_state() != GroupState::Idle + } else if self.fsm.peer.raft_group.raft.promotable() && self.fsm.peer.leader_id() != raft::INVALID_ID { - warn!( - "reject pre force leader due to leader lease may not expired"; + if self.fsm.hibernate_state.group_state() == GroupState::Ordered + || self.fsm.hibernate_state.group_state() == GroupState::Chaos + { + warn!( + "reject pre force leader due to leader lease may not expired"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + // wait one round of election timeout to make sure leader_id is invalid + Some( + self.fsm.peer.raft_group.raft.randomized_election_timeout() + - self.fsm.peer.raft_group.raft.election_elapsed, + ) + } else { + None + }; + + if let Some(ticks) = ticks { + info!( + "enter wait ticks"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "ticks" => ticks, ); + self.fsm.peer.force_leader = Some(ForceLeaderState::WaitTicks { + failed_stores, + ticks, + }); + self.reset_raft_tick(if self.fsm.peer.is_leader() { + GroupState::Ordered + } else { + GroupState::Chaos + }); + self.fsm.has_ready = true; return; } @@ -1432,7 +1471,7 @@ where ); self.fsm.peer.force_leader = None; // make sure it's not hibernated - self.reset_raft_tick(GroupState::Ordered); + assert_eq!(self.fsm.hibernate_state.group_state(), GroupState::Ordered); // leader lease shouldn't be renewed in force leader state. 
assert_eq!( self.fsm.peer.leader_lease().inspect(None), @@ -1474,6 +1513,20 @@ where #[inline] fn check_force_leader(&mut self) { + if let Some(ForceLeaderState::WaitTicks { + failed_stores, + ticks, + }) = &mut self.fsm.peer.force_leader + { + if *ticks == 0 { + let s = mem::take(failed_stores); + self.on_enter_pre_force_leader(s); + } else { + *ticks -= 1; + } + return; + }; + let failed_stores = match &self.fsm.peer.force_leader { None => return, Some(ForceLeaderState::ForceLeader { .. }) => { @@ -1483,6 +1536,7 @@ where return; } Some(ForceLeaderState::PreForceLeader { failed_stores, .. }) => failed_stores, + Some(ForceLeaderState::WaitTicks { .. }) => unreachable!(), }; if self.fsm.peer.raft_group.raft.election_elapsed + 1 @@ -1804,14 +1858,14 @@ where // follower may tick more than an election timeout in chaos state. // Before stopping tick, `missing_tick` should be `raft_election_timeout_ticks` - 2 // - `raft_heartbeat_ticks` (default 10 - 2 - 2 = 6) - // and the follwer's `election_elapsed` in raft-rs is 1. + // and the follower's `election_elapsed` in raft-rs is 1. // After the group state becomes Chaos, the next tick will call `raft_group.tick` // `missing_tick` + 1 times(default 7). // Then the follower's `election_elapsed` will be 1 + `missing_tick` + 1 // (default 1 + 6 + 1 = 8) which is less than the min election timeout. // The reason is that we don't want let all followers become (pre)candidate if one // follower may receive a request, then becomes (pre)candidate and sends (pre)vote msg - // to others. As long as the leader can wake up and broadcast hearbeats in one `raft_heartbeat_ticks` + // to others. As long as the leader can wake up and broadcast heartbeats in one `raft_heartbeat_ticks` // time(default 2s), no more followers will wake up and sends vote msg again. if self.fsm.missing_ticks + 1 /* for the next tick after the peer isn't Idle */ + self.fsm.peer.raft_group.raft.election_elapsed diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9ce6965d34c..0d489996b02 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -456,9 +456,24 @@ pub struct ReadyResult { } #[derive(Debug)] +/// ForceLeader process would be: +/// 1. If it's hibernated, enter wait ticks state, and wake up the peer +/// 2. Enter pre force leader state, become candidate and send request vote to all peers +/// 3. Wait for the responses of the request vote, no reject should be received. +/// 4. Enter force leader state, become leader without leader lease +/// 5. Execute recovery plan(some remove-peer commands) +/// 6. After the plan steps are all applied, exit force leader state pub enum ForceLeaderState { - PreForceLeader { failed_stores: HashSet }, - ForceLeader { failed_stores: HashSet }, + WaitTicks { + failed_stores: HashSet, + ticks: usize, + }, + PreForceLeader { + failed_stores: HashSet, + }, + ForceLeader { + failed_stores: HashSet, + }, } #[derive(Getters)] @@ -506,12 +521,7 @@ where /// With that, we can further propose remove failed-nodes conf-change, to make /// the Raft group forms majority and works normally later on. /// - /// ForceLeader process would be: - /// 1. Enter pre force leader state, become candidate and send request vote to all peers - /// 2. Wait for the responses of the request vote, no reject should be received. - /// 3. Enter force leader state, become leader without leader lease - /// 4. Execute recovery plan(some remove-peer commands) - /// 5. 
After the plan steps are all applied, exit force leader state + /// For details, see the comment of `ForceLeaderState`. pub force_leader: Option, /// Record the instants of peers being added into the configuration. @@ -1103,6 +1113,10 @@ where res.reason = "transfer leader"; return res; } + if self.force_leader.is_some() { + res.reason = "force leader"; + return res; + } // Unapplied entries can change the configuration of the group. if self.get_store().applied_index() < last_index { res.reason = "unapplied"; diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index c97c5fa63c5..e3740dd300e 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -300,6 +300,103 @@ fn test_force_leader_for_learner() { cluster.must_transfer_leader(region.get_id(), find_peer(®ion, 1).unwrap().clone()); } +// Test the case that three of five nodes fail and force leader on a hibernated +// previous leader. +#[test] +fn test_force_leader_on_hibernated_leader() { + let mut cluster = new_node_cluster(0, 5); + configure_for_hibernate(&mut cluster); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store1 = find_peer(®ion, 1).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // wait a while to hibernate + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 3, + )); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // remove the peers on failed nodes + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that three of five nodes fail and force leader on a hibernated +// previous follower. 
+#[test] +fn test_force_leader_on_hibernated_follower() { + test_util::init_log_for_test(); + let mut cluster = new_node_cluster(0, 5); + configure_for_hibernate(&mut cluster); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store5 = find_peer(®ion, 5).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); + + // wait a while to hibernate + std::thread::sleep(Duration::from_millis( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 3, + )); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + // remove the peers on failed nodes + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + // Test the case that three of five nodes fail and force leader on the rest node // with triggering snapshot. #[test] From 7acedff294669d93b32ebb9eb3e805964e1a8a47 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 14 Apr 2022 19:10:35 -0700 Subject: [PATCH 3/4] raftstore: Make unsafe recovery wait apply cover snapshot apply cases ref #10483 (#12308) ref tikv/tikv#10483 Signed-off-by: v01dstar --- components/raftstore/src/store/fsm/peer.rs | 92 ++++--------- components/raftstore/src/store/fsm/store.rs | 7 +- components/raftstore/src/store/msg.rs | 2 + components/raftstore/src/store/peer.rs | 52 +++++++- .../raftstore/src/store/worker/raftlog_gc.rs | 1 + .../failpoints/cases/test_unsafe_recovery.rs | 126 +++++++++++++----- .../raftstore/test_unsafe_recovery.rs | 13 +- 7 files changed, 187 insertions(+), 106 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d631d977a96..f1bd0f6be7f 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -23,7 +23,7 @@ use kvproto::errorpb; use kvproto::import_sstpb::SwitchMode; use kvproto::kvrpcpb::DiskFullOpt; use kvproto::metapb::{self, Region, RegionEpoch}; -use kvproto::pdpb::{CheckPolicy, StoreStats}; +use kvproto::pdpb::CheckPolicy; use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, CmdType, PutRequest, RaftCmdRequest, RaftCmdResponse, Request, StatusCmdType, StatusResponse, @@ -58,7 +58,7 @@ use crate::store::cmd_resp::{bind_term, new_error}; use crate::store::fsm::store::{PollContext, StoreMeta}; use crate::store::fsm::{ apply, ApplyMetrics, ApplyTask, ApplyTaskRes, CatchUpLogs, ChangeObserver, ChangePeer, - ExecResult, StoreInfo, + ExecResult, }; use crate::store::hibernate_state::{GroupState, HibernateState}; use crate::store::local_metrics::RaftMetrics; @@ -67,7 +67,7 @@ use crate::store::metrics::*; use crate::store::msg::{Callback, ExtCallback, InspectedRaftMessage}; use crate::store::peer::{ ConsistencyState, 
ForceLeaderState, Peer, PersistSnapshotResult, StaleState, - TRANSFER_LEADER_COMMAND_REPLY_CTX, + UnsafeRecoveryState, TRANSFER_LEADER_COMMAND_REPLY_CTX, }; use crate::store::peer_storage::write_peer_state; use crate::store::transport::Transport; @@ -149,12 +149,6 @@ where /// Before actually destroying a peer, ensure all log gc tasks are finished, so we /// can start destroying without seeking. logs_gc_flushed: bool, - - /// To make sure the reported store/peer meta is up to date, each peer has to wait for the log - /// at its target commit index to be applied. The last peer does so triggers the next procedure - /// which is reporting the store/peer meta to PD. - unsafe_recovery_target_commit_index: Option, - unsafe_recovery_wait_apply_counter: Option>, } pub struct BatchRaftCmdRequestBuilder @@ -273,8 +267,6 @@ where trace: PeerMemoryTrace::default(), delayed_destroy: None, logs_gc_flushed: false, - unsafe_recovery_target_commit_index: None, - unsafe_recovery_wait_apply_counter: None, }), )) } @@ -329,8 +321,6 @@ where trace: PeerMemoryTrace::default(), delayed_destroy: None, logs_gc_flushed: false, - unsafe_recovery_target_commit_index: None, - unsafe_recovery_wait_apply_counter: None, }), )) } @@ -880,54 +870,22 @@ where self.register_raft_base_tick(); } - fn finish_unsafe_recovery_wait_apply(&mut self) { - if self - .fsm - .unsafe_recovery_wait_apply_counter - .as_ref() - .unwrap() - .fetch_sub(1, Ordering::Relaxed) - == 1 - { - let mut stats = StoreStats::default(); - stats.set_store_id(self.store_id()); - let store_info = StoreInfo { - kv_engine: self.ctx.engines.kv.clone(), - raft_engine: self.ctx.engines.raft.clone(), - capacity: self.ctx.cfg.capacity.0, - }; - let task = PdTask::StoreHeartbeat { - stats, - store_info, - send_detailed_report: true, - dr_autosync_status: self - .ctx - .global_replication_state - .lock() - .unwrap() - .store_dr_autosync_status(), - }; - if let Err(e) = self.ctx.pd_scheduler.schedule(task) { - panic!("fail to send detailed report to pd {:?}", e); - } - } - self.fsm.unsafe_recovery_target_commit_index = None; - self.fsm.unsafe_recovery_wait_apply_counter = None; - } - fn on_unsafe_recovery_wait_apply(&mut self, counter: Arc) { - self.fsm.unsafe_recovery_target_commit_index = - Some(self.fsm.peer.raft_group.store().commit_index()); - self.fsm.unsafe_recovery_wait_apply_counter = Some(counter); - // If the applied index equals to the commit index, there is nothing to wait for, proceeds - // to the next step immediately. If they are not equal, further checks will be performed in - // on_apply_res(). - if self.fsm.stopped - || self.fsm.peer.raft_group.store().applied_index() - == self.fsm.peer.raft_group.store().commit_index() - { - self.finish_unsafe_recovery_wait_apply(); - } + info!( + "unsafe recovery wait apply"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => self.fsm.peer.raft_group.raft.raft_log.committed, + "task_counter" => counter.load(Ordering::Relaxed) + ); + + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { + target_index: self.fsm.peer.raft_group.raft.raft_log.committed, + task_counter: counter, + }); + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(self.ctx, /*force=*/ self.fsm.stopped); } fn on_casual_msg(&mut self, msg: CasualMessage) { @@ -1998,10 +1956,10 @@ where } } // After a log has been applied, check if we need to trigger the unsafe recovery reporting procedure. 
- if let Some(target_commit_index) = self.fsm.unsafe_recovery_target_commit_index { - if self.fsm.peer.raft_group.store().applied_index() >= target_commit_index { - self.finish_unsafe_recovery_wait_apply(); - } + if self.fsm.peer.unsafe_recovery_state.is_some() { + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(self.ctx, /*force=*/ false); } } @@ -3084,8 +3042,10 @@ where assert!(!self.fsm.peer.is_handling_snapshot()); // No need to wait for the apply anymore. - if self.fsm.unsafe_recovery_target_commit_index.is_some() { - self.finish_unsafe_recovery_wait_apply(); + if self.fsm.peer.unsafe_recovery_state.is_some() { + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(self.ctx, /*force=*/ true); } { diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3aee7df4259..bc6051dc6ef 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -621,6 +621,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> inspector.record_store_wait(send_time.saturating_elapsed()); self.ctx.pending_latency_inspect.push(inspector); } + StoreMsg::UnsafeRecoveryReport => self.store_heartbeat_pd(true), StoreMsg::CreatePeer(region) => self.on_create_peer(region), } } @@ -2157,7 +2158,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - fn store_heartbeat_pd(&mut self) { + fn store_heartbeat_pd(&mut self, send_detailed_report: bool) { let mut stats = StoreStats::default(); stats.set_store_id(self.ctx.store_id()); @@ -2235,7 +2236,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let task = PdTask::StoreHeartbeat { stats, store_info, - send_detailed_report: false, + send_detailed_report, dr_autosync_status: self .ctx .global_replication_state @@ -2252,7 +2253,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } fn on_pd_store_heartbeat_tick(&mut self) { - self.store_heartbeat_pd(); + self.store_heartbeat_pd(false); self.register_pd_store_heartbeat_tick(); } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 67954e28b19..f075ac67b4a 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -621,6 +621,7 @@ where #[cfg(any(test, feature = "testexport"))] Validate(Box), + UnsafeRecoveryReport, CreatePeer(metapb::Region), } @@ -650,6 +651,7 @@ where StoreMsg::Validate(_) => write!(fmt, "Validate config"), StoreMsg::UpdateReplicationMode(_) => write!(fmt, "UpdateReplicationMode"), StoreMsg::LatencyInspect { .. 
} => write!(fmt, "LatencyInspect"), + StoreMsg::UnsafeRecoveryReport => write!(fmt, "UnsafeRecoveryReport"), StoreMsg::CreatePeer(_) => write!(fmt, "CreatePeer"), } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 0d489996b02..cabaddaa06d 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -54,7 +54,7 @@ use crate::store::fsm::store::PollContext; use crate::store::fsm::{apply, Apply, ApplyMetrics, ApplyTask, Proposal}; use crate::store::hibernate_state::GroupState; use crate::store::memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}; -use crate::store::msg::RaftCommand; +use crate::store::msg::{RaftCommand, StoreMsg}; use crate::store::txn_ext::LocksStatus; use crate::store::util::{admin_cmd_epoch_lookup, RegionReadProgress}; use crate::store::worker::{ @@ -476,6 +476,18 @@ pub enum ForceLeaderState { }, } +pub enum UnsafeRecoveryState { + // Stores the state that is necessary for the wait apply stage of unsafe recovery process. + // This state is set by the peer fsm. Once set, it is checked every time this peer applies a + // new entry or a snapshot, if the target index is met, the caller substract 1 from the task + // counter, whoever executes the last task is responsible for triggering the reporting logic by + // sending a store heartbeat message to store fsm. + WaitApply { + target_index: u64, + task_counter: Arc, + }, +} + #[derive(Getters)] pub struct Peer where @@ -643,6 +655,7 @@ where pub region_buckets: Option, /// lead_transferee if the peer is in a leadership transferring. pub lead_transferee: u64, + pub unsafe_recovery_state: Option, } impl Peer @@ -770,6 +783,7 @@ where apply_snap_ctx: None, region_buckets: None, lead_transferee: raft::INVALID_ID, + unsafe_recovery_state: None, }; // If this region has only one peer and I am the one, campaign directly. @@ -2021,6 +2035,11 @@ where self.term(), self.raft_group.store().region(), ); + + if self.unsafe_recovery_state.is_some() { + debug!("unsafe recovery finishes applying a snapshot"); + self.unsafe_recovery_maybe_finish_wait_apply(ctx, /*force=*/ false); + } } // If `apply_snap_ctx` is none, it means this snapshot does not // come from the ready but comes from the unfinished snapshot task @@ -4471,6 +4490,37 @@ where .propose_check_epoch(cmd, self.term()) .is_none() } + + pub fn unsafe_recovery_maybe_finish_wait_apply( + &mut self, + ctx: &PollContext, + force: bool, + ) { + if let Some(unsafe_recovery_state) = &self.unsafe_recovery_state { + let UnsafeRecoveryState::WaitApply { + target_index, + task_counter, + } = unsafe_recovery_state; + if self.raft_group.raft.raft_log.applied >= *target_index || force { + info!( + "unsafe recovery finish wait apply"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_index" => target_index, + "applied" => self.raft_group.raft.raft_log.applied, + "force" => force, + "counter" => task_counter.load(Ordering::SeqCst) + ); + if task_counter.fetch_sub(1, Ordering::SeqCst) == 1 { + if let Err(e) = ctx.router.send_control(StoreMsg::UnsafeRecoveryReport) { + error!("fail to send detailed report after recovery tasks finished"; "err" => ?e); + } + } + // Reset the state if the wait is finished. 
+ self.unsafe_recovery_state = None; + } + } + } } #[derive(Default, Debug)] diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index 05d10f03dfb..067983d2805 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -89,6 +89,7 @@ impl Runner { Ok(s.and_then(|s| s.parse().ok()).unwrap_or(0)) }); let deleted = box_try!(self.engines.raft.batch_gc(regions)); + fail::fail_point!("worker_gc_raft_log_finished", |_| { Ok(deleted) }); Ok(deleted) } diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index 293e3620f5c..cc090bbb782 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -1,14 +1,15 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::iter::FromIterator; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::Arc; +use std::time::Duration; use futures::executor::block_on; use pd_client::PdClient; use raftstore::store::util::find_peer; use test_raftstore::*; +use tikv_util::{config::ReadableDuration, mpsc}; -#[allow(clippy::mutex_atomic)] #[test] fn test_unsafe_recover_send_report() { let mut cluster = new_server_cluster(0, 3); @@ -27,36 +28,19 @@ fn test_unsafe_recover_send_report() { cluster.put(b"random_key1", b"random_val1").unwrap(); // Blocks the raft apply process on store 1 entirely . - let apply_triggered_pair = Arc::new((Mutex::new(false), Condvar::new())); - let apply_triggered_pair2 = Arc::clone(&apply_triggered_pair); - let apply_released_pair = Arc::new((Mutex::new(false), Condvar::new())); - let apply_released_pair2 = Arc::clone(&apply_released_pair); + let (apply_triggered_tx, apply_triggered_rx) = mpsc::bounded::<()>(1); + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); fail::cfg_callback("on_handle_apply_store_1", move || { - { - let (lock, cvar) = &*apply_triggered_pair2; - let mut triggered = lock.lock().unwrap(); - *triggered = true; - cvar.notify_one(); - } - { - let (lock2, cvar2) = &*apply_released_pair2; - let mut released = lock2.lock().unwrap(); - while !*released { - released = cvar2.wait(released).unwrap(); - } - } + let _ = apply_triggered_tx.send(()); + let _ = apply_released_rx.recv(); }) .unwrap(); // Mannually makes an update, and wait for the apply to be triggered, to simulate "some entries are commited but not applied" scenario. cluster.put(b"random_key2", b"random_val2").unwrap(); - { - let (lock, cvar) = &*apply_triggered_pair; - let mut triggered = lock.lock().unwrap(); - while !*triggered { - triggered = cvar.wait(triggered).unwrap(); - } - } + apply_triggered_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); // Makes the group lose its quorum. cluster.stop_node(nodes[1]); @@ -73,12 +57,7 @@ fn test_unsafe_recover_send_report() { } // Unblocks the apply process. - { - let (lock2, cvar2) = &*apply_released_pair; - let mut released = lock2.lock().unwrap(); - *released = true; - cvar2.notify_all(); - } + drop(apply_released_tx); // Store reports are sent once the entries are applied. 
let mut reported = false; @@ -92,3 +71,86 @@ fn test_unsafe_recover_send_report() { assert_eq!(reported, true); fail::remove("on_handle_apply_store_1"); } + +#[test] +fn test_unsafe_recover_wait_for_snapshot_apply() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = 8; + cluster.cfg.raft_store.merge_max_log_gap = 3; + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.stop_node(nodes[1]); + let (raft_gc_triggered_tx, raft_gc_triggered_rx) = mpsc::bounded::<()>(1); + let (raft_gc_finished_tx, raft_gc_finished_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("worker_gc_raft_log", move || { + let _ = raft_gc_triggered_rx.recv(); + }) + .unwrap(); + fail::cfg_callback("worker_gc_raft_log_finished", move || { + let _ = raft_gc_finished_tx.send(()); + }) + .unwrap(); + // Add at least 4m data + (0..10).for_each(|_| cluster.must_put(b"random_k", b"random_v")); + // Unblock raft log GC. + drop(raft_gc_triggered_tx); + // Wait until logs are GCed. + raft_gc_finished_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + // Makes the group lose its quorum. + cluster.stop_node(nodes[2]); + + // Blocks the raft snap apply process. + let (apply_triggered_tx, apply_triggered_rx) = mpsc::bounded::<()>(1); + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("region_apply_snap", move || { + let _ = apply_triggered_tx.send(()); + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + cluster.run_node(nodes[1]).unwrap(); + + apply_triggered_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + + // Triggers the unsafe recovery store reporting process. + pd_client.must_set_require_report(true); + cluster.must_send_store_heartbeat(nodes[1]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..20 { + assert_eq!(pd_client.must_get_store_reported(&nodes[1]), 0); + sleep_ms(100); + } + + // Unblocks the snap apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. 
+ let mut reported = false; + for _ in 0..20 { + if pd_client.must_get_store_reported(&nodes[1]) > 0 { + reported = true; + break; + } + sleep_ms(100); + } + assert_eq!(reported, true); + fail::remove("worker_gc_raft_log"); + fail::remove("worker_gc_raft_log_finished"); + fail::remove("raft_before_apply_snap_callback"); +} diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index e3740dd300e..0b1c533b542 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -444,13 +444,18 @@ fn test_force_leader_trigger_snapshot() { cluster.clear_send_filters(); // wait election timeout - std::thread::sleep(Duration::from_millis( + sleep_ms( cluster.cfg.raft_store.raft_election_timeout_ticks as u64 * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() - * 2, - )); + * 5, + ); cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5])); + sleep_ms( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 3, + ); let cmd = new_change_peer_request( ConfChangeType::RemoveNode, find_peer(®ion, 3).unwrap().clone(), @@ -459,7 +464,7 @@ fn test_force_leader_trigger_snapshot() { // Though it has a force leader now, but the command can't committed because the log is not replicated to all the alive peers. assert!( cluster - .call_command_on_leader(req, Duration::from_millis(10)) + .call_command_on_leader(req, Duration::from_millis(1000)) .unwrap() .get_header() .has_error() // error "there is a pending conf change" indicating no committed log after being the leader From f9659e273d135801991ef48e89e6992fde49bc88 Mon Sep 17 00:00:00 2001 From: Connor1996 Date: Wed, 18 May 2022 14:43:35 +0800 Subject: [PATCH 4/4] raftstore: Execute recovery plan via raft (#12022) Signed-off-by: Connor1996 --- Cargo.lock | 2 +- components/pd_client/src/client.rs | 4 +- components/pd_client/src/util.rs | 2 + components/raftstore/src/store/fsm/peer.rs | 534 ++++++++++++------ components/raftstore/src/store/fsm/store.rs | 147 +++-- components/raftstore/src/store/msg.rs | 41 +- components/raftstore/src/store/peer.rs | 257 +++++++-- .../raftstore/src/store/peer_storage.rs | 5 + components/raftstore/src/store/worker/mod.rs | 3 +- components/raftstore/src/store/worker/pd.rs | 191 +++---- components/test_raftstore/src/cluster.rs | 83 +-- components/test_raftstore/src/pd.rs | 53 +- .../failpoints/cases/test_unsafe_recovery.rs | 336 ++++++++++- .../raftstore/test_unsafe_recovery.rs | 421 ++++++++++---- 14 files changed, 1471 insertions(+), 608 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 040735c07ca..9c884dd33ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2231,7 +2231,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#e22d8864c9579660424f83b62ca196c03ef85927" +source = "git+https://github.com/pingcap/kvproto.git#0e2f26c0a46ae7d666d6ca4410046a39e0c96f36" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index b9fdce8740b..9d97682302e 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -719,7 +719,7 @@ impl PdClient for RpcClient { fn store_heartbeat( &self, mut stats: pdpb::StoreStats, - report_opt: Option, + store_report: Option, dr_autosync_status: Option, ) -> PdFuture { let timer = Instant::now(); @@ -730,7 
+730,7 @@ impl PdClient for RpcClient { .mut_interval() .set_end_timestamp(UnixSecs::now().into_inner()); req.set_stats(stats); - if let Some(report) = report_opt { + if let Some(report) = store_report { req.set_store_report(report); } if let Some(status) = dr_autosync_status { diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index dc58506cdbd..970fcc8feff 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -548,6 +548,8 @@ impl PdConnector { let addr_trim = trim_http_prefix(addr); let channel = { let cb = ChannelBuilder::new(self.env.clone()) + .max_send_message_len(-1) + .max_receive_message_len(-1) .keepalive_time(Duration::from_secs(10)) .keepalive_timeout(Duration::from_secs(3)); self.security_mgr.connect(cb, addr_trim) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index f1bd0f6be7f..c26c2a3ad21 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5,16 +5,14 @@ use std::borrow::Cow; use std::cell::Cell; use std::collections::Bound::{Excluded, Unbounded}; use std::collections::VecDeque; -use std::iter::Iterator; -use std::sync::{atomic::AtomicUsize, atomic::Ordering, Arc}; +use std::iter::{FromIterator, Iterator}; +use std::sync::{Arc, Mutex}; use std::time::Instant; use std::{cmp, mem, u64}; use batch_system::{BasicMailbox, Fsm}; use collections::{HashMap, HashSet}; -use engine_traits::{ - Engines, KvEngine, RaftEngine, SSTMetaInfo, WriteBatch, WriteBatchExt, WriteOptions, -}; +use engine_traits::{Engines, KvEngine, RaftEngine, SSTMetaInfo, WriteBatchExt}; use engine_traits::{CF_LOCK, CF_RAFT}; use error_code::ErrorCodeExt; use fail::fail_point; @@ -23,7 +21,7 @@ use kvproto::errorpb; use kvproto::import_sstpb::SwitchMode; use kvproto::kvrpcpb::DiskFullOpt; use kvproto::metapb::{self, Region, RegionEpoch}; -use kvproto::pdpb::CheckPolicy; +use kvproto::pdpb::{self, CheckPolicy}; use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, CmdType, PutRequest, RaftCmdRequest, RaftCmdResponse, Request, StatusCmdType, StatusResponse, @@ -67,14 +65,15 @@ use crate::store::metrics::*; use crate::store::msg::{Callback, ExtCallback, InspectedRaftMessage}; use crate::store::peer::{ ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, StaleState, - UnsafeRecoveryState, TRANSFER_LEADER_COMMAND_REPLY_CTX, + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }; -use crate::store::peer_storage::write_peer_state; use crate::store::transport::Transport; use crate::store::util::{is_learner, KeysInfoFormatter, LeaseState}; use crate::store::worker::{ - ConsistencyCheckTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, - SplitCheckTask, + new_change_peer_v2_request, ConsistencyCheckTask, RaftlogFetchTask, RaftlogGcTask, + ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }; use crate::store::PdTask; use crate::store::RaftlogFetchResult; @@ -668,12 +667,6 @@ where } } } - PeerMsg::UpdateRegionForUnsafeRecover(region) => { - self.on_update_region_for_unsafe_recover(region) - } - PeerMsg::UnsafeRecoveryWaitApply(counter) => { - self.on_unsafe_recovery_wait_apply(counter) - } } } // Propose batch request which may be still waiting for more raft-command @@ -735,157 +728,163 @@ where } } - fn on_update_region_for_unsafe_recover(&mut self, region: Region) 
{ - let mut new_peer_list = HashSet::default(); - for peer in region.get_peers() { - new_peer_list.insert(peer.get_id()); - } - let to_be_removed: Vec = self - .region() - .get_peers() - .iter() - .filter(|&peer| !new_peer_list.contains(&peer.get_id())) - .map(|peer| peer.get_id()) - .collect(); - if to_be_removed.is_empty() - && self.region().get_start_key() == region.get_start_key() - && self.region().get_end_key() == region.get_end_key() - { - // Nothing to be updated, return directly. - return; - } - info!( - "updating the reigon for unsafe recover, original: {:?}, target: {:?}", - self.region(), - region - ); - if self.fsm.peer.has_valid_leader() { - panic!("region update for unsafe recover should only occur in leaderless reigons"); - } - if self.fsm.peer.raft_group.store().applied_index() - != self.fsm.peer.raft_group.store().commit_index() - { + fn on_unsafe_recovery_pre_demote_failed_voters( + &mut self, + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + ) { + if self.fsm.peer.unsafe_recovery_state.is_some() { warn!( - "cannot proceed region update for unsafe recover, applied index is not equal to commit index" + "Unsafe recovery, demote failed voters has already been initiated"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), ); + syncer.abort(); return; } - let region_state_key = keys::region_state_key(region.get_id()); - let original_region_state = match self - .ctx - .engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) - { - Ok(Some(region_state)) => region_state, - Ok(None) => { - panic!("Can't find RegionLocalState while updating {:?}", region); - } - Err(e) => { - panic!( - "Fail to look up RegionLocalState while updating {:?} err {:?}", - region, e - ); - } - }; - let mut kv_wb = self.ctx.engines.kv.write_batch(); - write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None).unwrap_or_else(|e| { - panic!( - "fails to write RegionLocalState {:?} into write brach, err {:?}", - region, e - ) - }); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - if let Err(e) = kv_wb.write_opt(&write_opts) { - panic!("fail to update RegionLocalstate {:?} err {:?}", region, e); + if !self.fsm.peer.is_force_leader() { + error!( + "Unsafe recovery, demoting failed voters failed, since this peer is not forced leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + ); + return; } - { - let mut meta = self.ctx.store_meta.lock().unwrap(); - meta.set_region( - &self.ctx.coprocessor_host, - region.clone(), - &mut self.fsm.peer, + if self.fsm.peer.in_joint_state() { + info!( + "Unsafe recovery, already in joint state, exit first"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), ); - if meta - .region_ranges - .remove(&enc_end_key(original_region_state.get_region())) - .is_none() - { - panic!( - "{} original region does not exist in store meta", - self.fsm.peer.tag - ); - } - for (_, id) in meta.region_ranges.range(( - Excluded(keys::data_key(region.get_start_key())), - Unbounded::>, - )) { - let exist_region = &meta.regions[id]; - if enc_start_key(exist_region) >= keys::data_end_key(region.get_end_key()) { - break; + let failed = Arc::new(Mutex::new(false)); + let failed_clone = failed.clone(); + let callback = Callback::::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + *failed_clone.lock().unwrap() = true; + error!( + "Unsafe recovery, fail to exit residual joint state"; + "err" => 
?resp.response.get_header().get_error(), + ); } - panic!( - "{:?} is overlapped with an existing region {:?}", - region, exist_region - ); - } - if meta - .region_ranges - .insert(enc_end_key(®ion), region.get_id()) - .is_some() - { - panic!( - "key conflicts while inserting region {:?} into store meta", - region - ); + })); + self.propose_raft_command_internal( + exit_joint_request(self.region(), &self.fsm.peer.peer), + callback, + DiskFullOpt::AllowedOnAlmostFull, + ); + + if !*failed.lock().unwrap() { + self.fsm.peer.unsafe_recovery_state = + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters, + target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), + demote_after_exit: true, + }); } + } else { + self.unsafe_recovery_demote_failed_voters(syncer, failed_voters); } - for peer_id in to_be_removed.clone() { - let mut cc = eraftpb::ConfChangeV2::default(); - let mut ccs = eraftpb::ConfChangeSingle::default(); - ccs.set_change_type(eraftpb::ConfChangeType::RemoveNode); - ccs.set_node_id(peer_id); - cc.set_transition(eraftpb::ConfChangeTransition::Auto); - cc.mut_changes().push(ccs); - if let Err(e) = self.fsm.peer.raft_group.apply_conf_change(&cc) { - panic!("fail to apply conf change for unsafe recover {:?}", e); + } + + fn unsafe_recovery_demote_failed_voters( + &mut self, + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + ) { + if let Some(req) = + demote_failed_voters_request(self.region(), &self.fsm.peer.peer, failed_voters) + { + info!( + "Unsafe recovery, demoting failed voters"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + "req" => ?req); + let failed = Arc::new(Mutex::new(false)); + let failed_clone = failed.clone(); + let callback = Callback::::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + *failed_clone.lock().unwrap() = true; + error!( + "Unsafe recovery, fail to finish demotion"; + "err" => ?resp.response.get_header().get_error(), + ); + } + })); + self.propose_raft_command_internal(req, callback, DiskFullOpt::AllowedOnAlmostFull); + if !*failed.lock().unwrap() { + self.fsm.peer.unsafe_recovery_state = + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters: vec![], // No longer needed since here. 
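+                        // (the demotion request has just been proposed above; from this
+                        // point on only `target_index` needs to be tracked)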
+ target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), + demote_after_exit: false, + }); } + } else { + warn!( + "Unsafe recovery, no need to demote failed voters"; + "region" => ?self.region(), + ); } - self.fsm - .peer - .peer_heartbeats - .retain(|&k, _| new_peer_list.contains(&k)); - self.fsm - .peer - .peers_start_pending_time - .retain(|&(k, _)| new_peer_list.contains(&k)); - for peer in to_be_removed { - self.fsm.peer.remove_peer_from_cache(peer); + } + + fn on_unsafe_recovery_destroy(&mut self, syncer: UnsafeRecoveryExecutePlanSyncer) { + if self.fsm.peer.unsafe_recovery_state.is_some() { + warn!( + "Unsafe recovery, can't destroy, another plan is executing in progress"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + syncer.abort(); + return; } - self.fsm.peer.post_split(); - self.fsm.reset_hibernate_state(GroupState::Chaos); - self.register_raft_base_tick(); + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Destroy(syncer)); + self.handle_destroy_peer(DestroyPeerJob { + initialized: self.fsm.peer.is_initialized(), + region_id: self.region_id(), + peer: self.fsm.peer.peer.clone(), + }); } - fn on_unsafe_recovery_wait_apply(&mut self, counter: Arc) { - info!( - "unsafe recovery wait apply"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id(), - "target_index" => self.fsm.peer.raft_group.raft.raft_log.committed, - "task_counter" => counter.load(Ordering::Relaxed) - ); + fn on_unsafe_recovery_wait_apply(&mut self, syncer: UnsafeRecoveryWaitApplySyncer) { + if self.fsm.peer.unsafe_recovery_state.is_some() { + warn!( + "Unsafe recovery, can't wait apply, another plan is executing in progress"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + syncer.abort(); + return; + } + let target_index = if self.fsm.peer.force_leader.is_some() { + // For regions that lose quorum (or regions have force leader), whatever has been + // proposed will be committed. Based on that fact, we simply use "last index" here to + // avoid implementing another "wait commit" process. 
+ self.fsm.peer.raft_group.raft.raft_log.last_index() + } else { + self.fsm.peer.raft_group.raft.raft_log.committed + }; self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { - target_index: self.fsm.peer.raft_group.raft.raft_log.committed, - task_counter: counter, + target_index, + syncer, }); self.fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(self.ctx, /*force=*/ self.fsm.stopped); + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ self.fsm.stopped); + } + + fn on_unsafe_recovery_fill_out_report(&mut self, syncer: UnsafeRecoveryFillOutReportSyncer) { + let mut self_report = pdpb::PeerReport::default(); + self_report.set_raft_state(self.fsm.peer.get_store().raft_state().clone()); + let mut region_local_state = RegionLocalState::default(); + region_local_state.set_region(self.region().clone()); + self_report.set_region_state(region_local_state); + self_report.set_is_force_leader(self.fsm.peer.force_leader.is_some()); + syncer.report_for_self(self_report); } fn on_casual_msg(&mut self, msg: CasualMessage) { @@ -993,6 +992,10 @@ where CasualMessage::SnapshotApplied => { self.fsm.has_ready = true; } + CasualMessage::Campaign => { + let _ = self.fsm.peer.raft_group.campaign(); + self.fsm.has_ready = true; + } } } @@ -1228,14 +1231,34 @@ where SignificantMsg::RaftlogFetched { context, res } => { self.on_raft_log_fetched(context, res); } - SignificantMsg::EnterForceLeaderState { failed_stores } => { - self.on_enter_pre_force_leader(failed_stores) + SignificantMsg::EnterForceLeaderState { + syncer, + failed_stores, + } => { + self.on_enter_pre_force_leader(syncer, failed_stores); } SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(), + SignificantMsg::UnsafeRecoveryDemoteFailedVoters { + syncer, + failed_voters, + } => self.on_unsafe_recovery_pre_demote_failed_voters(syncer, failed_voters), + SignificantMsg::UnsafeRecoveryDestroy(syncer) => { + self.on_unsafe_recovery_destroy(syncer) + } + SignificantMsg::UnsafeRecoveryWaitApply(syncer) => { + self.on_unsafe_recovery_wait_apply(syncer) + } + SignificantMsg::UnsafeRecoveryFillOutReport(syncer) => { + self.on_unsafe_recovery_fill_out_report(syncer) + } } } - fn on_enter_pre_force_leader(&mut self, failed_stores: HashSet) { + fn on_enter_pre_force_leader( + &mut self, + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + ) { match self.fsm.peer.force_leader { Some(ForceLeaderState::PreForceLeader { .. 
}) => { self.on_force_leader_fail(); @@ -1250,10 +1273,19 @@ where None => {} } + if !self.fsm.peer.is_initialized() { + warn!( + "Unsafe recovery, cannot force leader since this peer is not initialized"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + let ticks = if self.fsm.peer.is_leader() { if self.fsm.hibernate_state.group_state() == GroupState::Ordered { warn!( - "reject pre force leader due to already being leader"; + "Unsafe recovery, reject pre force leader due to already being leader"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); @@ -1275,7 +1307,7 @@ where || self.fsm.hibernate_state.group_state() == GroupState::Chaos { warn!( - "reject pre force leader due to leader lease may not expired"; + "Unsafe recovery, reject pre force leader due to leader lease may not expired"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); @@ -1292,12 +1324,13 @@ where if let Some(ticks) = ticks { info!( - "enter wait ticks"; + "Unsafe recovery, enter wait ticks"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), "ticks" => ticks, ); self.fsm.peer.force_leader = Some(ForceLeaderState::WaitTicks { + syncer, failed_stores, ticks, }); @@ -1311,16 +1344,17 @@ where } let expected_alive_voter = self.get_force_leader_expected_alive_voter(&failed_stores); - if self - .fsm - .peer - .raft_group - .raft - .prs() - .has_quorum(&expected_alive_voter) + if !expected_alive_voter.is_empty() + && self + .fsm + .peer + .raft_group + .raft + .prs() + .has_quorum(&expected_alive_voter) { warn!( - "reject pre force leader due to has quorum"; + "Unsafe recovery, reject pre force leader due to has quorum"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); @@ -1328,7 +1362,7 @@ where } info!( - "enter pre force leader state"; + "Unsafe recovery, enter pre force leader state"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), "alive_voter" => ?expected_alive_voter, @@ -1339,7 +1373,14 @@ where // two force leader in same term. self.fsm.peer.raft_group.raft.pre_vote = false; // trigger vote request to all voters, will check the vote result in `check_force_leader` - self.fsm.peer.raft_group.campaign().unwrap(); + if let Err(e) = self.fsm.peer.raft_group.campaign() { + warn!( + "Unsafe recovery, campaign failed"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => ?e, + ); + } assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); if !self .fsm @@ -1352,7 +1393,7 @@ where .unwrap() { warn!( - "pre force leader failed to campaign"; + "Unsafe recovery, pre force leader failed to campaign"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); @@ -1360,7 +1401,10 @@ where return; } - self.fsm.peer.force_leader = Some(ForceLeaderState::PreForceLeader { failed_stores }); + self.fsm.peer.force_leader = Some(ForceLeaderState::PreForceLeader { + syncer, + failed_stores, + }); self.fsm.has_ready = true; } @@ -1372,14 +1416,14 @@ where fn on_enter_force_leader(&mut self) { info!( - "enter force leader state"; + "Unsafe recovery, enter force leader state"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); let failed_stores = match self.fsm.peer.force_leader.take() { - Some(ForceLeaderState::PreForceLeader { failed_stores }) => failed_stores, + Some(ForceLeaderState::PreForceLeader { failed_stores, .. 
}) => failed_stores, _ => unreachable!(), }; @@ -1413,7 +1457,10 @@ where // make sure it's not hibernated self.reset_raft_tick(GroupState::Ordered); - self.fsm.peer.force_leader = Some(ForceLeaderState::ForceLeader { failed_stores }); + self.fsm.peer.force_leader = Some(ForceLeaderState::ForceLeader { + time: TiInstant::now_coarse(), + failed_stores, + }); self.fsm.has_ready = true; } @@ -1444,8 +1491,11 @@ where self.fsm.peer.raft_group.raft.set_check_quorum(true); self.fsm.peer.raft_group.raft.pre_vote = true; if self.fsm.peer.raft_group.raft.promotable() { - // let it trigger election immediately. - let _ = self.fsm.peer.raft_group.campaign(); + // Do not campaign directly here, otherwise on_role_changed() won't called for follower state + let _ = self.ctx.router.send( + self.region_id(), + PeerMsg::CasualMessage(CasualMessage::Campaign), + ); } self.fsm.has_ready = true; } @@ -1472,13 +1522,15 @@ where #[inline] fn check_force_leader(&mut self) { if let Some(ForceLeaderState::WaitTicks { + syncer, failed_stores, ticks, }) = &mut self.fsm.peer.force_leader { if *ticks == 0 { + let syncer_clone = syncer.clone(); let s = mem::take(failed_stores); - self.on_enter_pre_force_leader(s); + self.on_enter_pre_force_leader(syncer_clone, s); } else { *ticks -= 1; } @@ -1538,7 +1590,7 @@ where match check() { Err(err) => { warn!( - "pre force leader check failed"; + "Unsafe recovery, pre force leader check failed"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), "alive_voter" => ?expected_alive_voter, @@ -1547,6 +1599,11 @@ where self.on_force_leader_fail(); } Ok(granted) => { + info!( + "Unsafe recovery, expected live voters:"; + "voters" => ?expected_alive_voter, + "granted" => granted + ); if granted == expected_alive_voter.len() { self.on_enter_force_leader(); } @@ -1891,6 +1948,69 @@ where } } + fn check_unsafe_recovery_state(&mut self) { + match &self.fsm.peer.unsafe_recovery_state { + Some(UnsafeRecoveryState::WaitApply { .. }) => self + .fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false), + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters, + target_index, + demote_after_exit, + }) => { + if self.fsm.peer.raft_group.raft.raft_log.applied >= *target_index { + if *demote_after_exit { + if !self.fsm.peer.is_force_leader() { + error!( + "Unsafe recovery, lost forced leadership after exiting joint state"; + "region_id" => self.region().get_id(), + ); + return; + } + let syncer_clone = syncer.clone(); + let failed_voters_clone = failed_voters.clone(); + self.unsafe_recovery_demote_failed_voters( + syncer_clone, + failed_voters_clone, + ); + } else { + if self.fsm.peer.in_joint_state() { + info!( + "Unsafe recovery, exiting joint state"; + "region_id" => self.region().get_id() + ); + if self.fsm.peer.is_force_leader() { + self.propose_raft_command_internal( + exit_joint_request(self.region(), &self.fsm.peer.peer), + Callback::::write(Box::new(|resp| { + if resp.response.get_header().has_error() { + error!( + "Unsafe recovery, fail to exit joint state"; + "err" => ?resp.response.get_header().get_error(), + ); + } + })), + DiskFullOpt::AllowedOnAlmostFull, + ); + } else { + error!( + "Unsafe recovery, lost forced leadership while trying to exit joint state"; + "region_id" => self.region().get_id(), + ); + } + } + + self.fsm.peer.unsafe_recovery_state = None; + } + } + } + // Destroy does not need be processed, the state is cleaned up together with peer. 
+ Some(_) | None => {} + } + } + fn on_apply_res(&mut self, res: ApplyTaskRes) { fail_point!("on_apply_res", |_| {}); match res { @@ -1955,11 +2075,8 @@ where } } } - // After a log has been applied, check if we need to trigger the unsafe recovery reporting procedure. if self.fsm.peer.unsafe_recovery_state.is_some() { - self.fsm - .peer - .unsafe_recovery_maybe_finish_wait_apply(self.ctx, /*force=*/ false); + self.check_unsafe_recovery_state(); } } @@ -3045,7 +3162,7 @@ where if self.fsm.peer.unsafe_recovery_state.is_some() { self.fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(self.ctx, /*force=*/ true); + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ true); } { @@ -3308,7 +3425,7 @@ where // until new leader elected, but we can't revert this operation // because its result is already persisted in apply worker // TODO: should we transfer leader here? - let demote_self = is_learner(&self.fsm.peer.peer); + let demote_self = is_learner(&self.fsm.peer.peer) && !self.fsm.peer.is_force_leader(); if remove_self || demote_self { warn!( "Removing or demoting leader"; @@ -5146,6 +5263,18 @@ where return; } + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // If the force leader state lasts a long time, it probably means PD recovery process aborts for some reasons. + // So just exit to avoid blocking the read and write requests for this peer. + if time.saturating_elapsed() > self.ctx.cfg.peer_stale_state_check_interval.0 { + warn!( + "Unsafe recovery, step down as force leader due to holding it too long"; + "duration" => ?time.saturating_elapsed(), + ); + self.on_exit_force_leader(); + } + } + if self.ctx.cfg.hibernate_regions { let group_state = self.fsm.hibernate_state.group_state(); if group_state == GroupState::Idle { @@ -5529,6 +5658,57 @@ fn new_compact_log_request( request } +fn demote_failed_voters_request( + region: &metapb::Region, + peer: &metapb::Peer, + failed_voters: Vec, +) -> Option { + let failed_voter_ids = HashSet::from_iter(failed_voters.iter().map(|voter| voter.get_id())); + let mut req = new_admin_request(region.get_id(), peer.clone()); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + let mut change_peer_reqs: Vec = region + .get_peers() + .iter() + .filter_map(|peer| { + if failed_voter_ids.contains(&peer.get_id()) + && peer.get_role() == metapb::PeerRole::Voter + { + let mut peer_clone = peer.clone(); + peer_clone.set_role(metapb::PeerRole::Learner); + let mut cp = pdpb::ChangePeer::default(); + cp.set_change_type(ConfChangeType::AddLearnerNode); + cp.set_peer(peer_clone); + return Some(cp); + } + None + }) + .collect(); + + // Promote self if it is a learner. 
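+    // The peer driving the recovery plan has to end up as a voter: once the failed
+    // voters are demoted to learners it may be the only voter left, and a learner
+    // can neither vote nor campaign, so the region could not elect a leader again.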
+ if peer.get_role() == metapb::PeerRole::Learner { + let mut cp = pdpb::ChangePeer::default(); + cp.set_change_type(ConfChangeType::AddNode); + let mut promote = peer.clone(); + promote.set_role(metapb::PeerRole::Voter); + cp.set_peer(promote); + change_peer_reqs.push(cp); + } + if change_peer_reqs.is_empty() { + return None; + } + req.set_admin_request(new_change_peer_v2_request(change_peer_reqs)); + Some(req) +} + +fn exit_joint_request(region: &metapb::Region, peer: &metapb::Peer) -> RaftCmdRequest { + let mut req = new_admin_request(region.get_id(), peer.clone()); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.set_admin_request(new_change_peer_v2_request(vec![])); + req +} + impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> where EK: KvEngine, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index bc6051dc6ef..c3836807647 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -16,7 +16,10 @@ use batch_system::{ HandlerBuilder, PollHandler, Priority, }; use crossbeam::channel::{unbounded, Sender, TryRecvError, TrySendError}; -use engine_traits::{Engines, KvEngine, Mutable, PerfContextKind, WriteBatch}; +use engine_traits::{ + CompactedEvent, DeleteStrategy, Engines, KvEngine, Mutable, PerfContextKind, RaftEngine, + RaftLogBatch, Range, WriteBatch, WriteOptions, +}; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use fail::fail_point; use futures::compat::Future01CompatExt; @@ -24,8 +27,7 @@ use futures::FutureExt; use kvproto::import_sstpb::SstMeta; use kvproto::import_sstpb::SwitchMode; use kvproto::metapb::{self, Region, RegionEpoch}; -use kvproto::pdpb::QueryStats; -use kvproto::pdpb::StoreStats; +use kvproto::pdpb::{self, QueryStats, StoreStats}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest}; use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}; use kvproto::replication_modepb::{ReplicationMode, ReplicationStatus}; @@ -34,8 +36,6 @@ use raft::StateRole; use time::{self, Timespec}; use collections::HashMap; -use engine_traits::CompactedEvent; -use engine_traits::{RaftEngine, RaftLogBatch, WriteOptions}; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use pd_client::{FeatureGate, PdClient}; use sst_importer::SSTImporter; @@ -621,8 +621,11 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> inspector.record_store_wait(send_time.saturating_elapsed()); self.ctx.pending_latency_inspect.push(inspector); } - StoreMsg::UnsafeRecoveryReport => self.store_heartbeat_pd(true), - StoreMsg::CreatePeer(region) => self.on_create_peer(region), + StoreMsg::UnsafeRecoveryReport(report) => self.store_heartbeat_pd(Some(report)), + StoreMsg::UnsafeRecoveryCreatePeer { syncer, create } => { + self.on_unsafe_recovery_create_peer(create); + drop(syncer); + } } } } @@ -2158,7 +2161,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - fn store_heartbeat_pd(&mut self, send_detailed_report: bool) { + fn store_heartbeat_pd(&mut self, report: Option) { let mut stats = StoreStats::default(); stats.set_store_id(self.ctx.store_id()); @@ -2236,7 +2239,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let task = PdTask::StoreHeartbeat { stats, store_info, - send_detailed_report, + report, dr_autosync_status: self .ctx .global_replication_state @@ -2253,7 +2256,7 @@ impl<'a, EK: KvEngine, ER: 
RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } fn on_pd_store_heartbeat_tick(&mut self) { - self.store_heartbeat_pd(false); + self.store_heartbeat_pd(None); self.register_pd_store_heartbeat_tick(); } @@ -2640,39 +2643,35 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.router.report_status_update() } - fn on_create_peer(&self, region: Region) { - info!("creating a peer"; "peer" => ?region); - let mut kv_wb = self.ctx.engines.kv.write_batch(); - let region_state_key = keys::region_state_key(region.get_id()); - match self - .ctx - .engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) + fn on_unsafe_recovery_create_peer(&self, region: Region) { + info!("Unsafe recovery, creating a peer"; "peer" => ?region); + let mut meta = self.ctx.store_meta.lock().unwrap(); + if let Some((_, id)) = meta + .region_ranges + .range(( + Excluded(data_key(region.get_start_key())), + Unbounded::>, + )) + .next() { - Ok(Some(region_state)) => { - info!( - "target region already exists, existing region: {:?}, want to create: {:?}", - region_state, region - ); - return; - } - Ok(None) => {} - Err(e) => { - panic!("cannot determine whether {:?} exists, err {:?}", region, e) + let exist_region = &meta.regions[id]; + if enc_start_key(exist_region) < data_end_key(region.get_end_key()) { + if exist_region.get_id() == region.get_id() { + warn!( + "Unsafe recovery, region has already been created."; + "region" => ?region, + "exist_region" => ?exist_region, + ); + return; + } else { + error!( + "Unsafe recovery, region to be created overlaps with an existing region"; + "region" => ?region, + "exist_region" => ?exist_region, + ); + return; + } } - }; - peer_storage::write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None) - .unwrap_or_else(|e| { - panic!( - "fail to add peer state into write batch while creating {:?} err {:?}", - region, e - ) - }); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - if let Err(e) = kv_wb.write_opt(&write_opts) { - panic!("fail to write while creating {:?} err {:?}", region, e); } let (sender, mut peer) = match PeerFsm::create( self.ctx.store.get_id(), @@ -2684,33 +2683,29 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER ) { Ok((sender, peer)) => (sender, peer), Err(e) => { - panic!( - "fail to create peer fsm while creating {:?} err {:?}", - region, e + error!( + "Unsafe recovery, fail to create peer fsm"; + "region" => ?region, + "err" => ?e, ); + return; } }; let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); peer.peer.init_replication_mode(&mut *replication_state); + drop(replication_state); peer.peer.activate(self.ctx); - let mut meta = self.ctx.store_meta.lock().unwrap(); - for (_, id) in meta.region_ranges.range(( - Excluded(data_key(region.get_start_key())), - Unbounded::>, - )) { - let exist_region = &meta.regions[id]; - if enc_start_key(exist_region) >= data_end_key(region.get_end_key()) { - break; - } - panic!( - "{:?} is overlapped with an existing region {:?}", - region, exist_region - ); - } + + let start_key = keys::enc_start_key(®ion); + let end_key = keys::enc_end_key(®ion); if meta - .region_ranges - .insert(enc_end_key(®ion), region.get_id()) + .regions + .insert(region.get_id(), region.clone()) .is_some() + || meta + .region_ranges + .insert(end_key.clone(), region.get_id()) + .is_some() || meta .readers .insert(region.get_id(), ReadDelegate::from_peer(peer.get_peer())) @@ -2721,10 +2716,38 @@ impl<'a, EK: KvEngine, ER: 
RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .is_some() { panic!( - "key conflicts while insert region {:?} into store meta", - region + "Unsafe recovery, key conflicts while inserting region {:?} into store meta", + region, ); } + drop(meta); + + if let Err(e) = self.ctx.engines.kv.delete_all_in_range( + DeleteStrategy::DeleteByKey, + &[Range::new(&start_key, &end_key)], + ) { + panic!( + "Unsafe recovery, fail to clean up stale data while creating the new region {:?}, the error is {:?}", + region, e, + ); + } + let mut kv_wb = self.ctx.engines.kv.write_batch(); + if let Err(e) = peer_storage::write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None) + { + panic!( + "Unsafe recovery, fail to add peer state for {:?} into write batch, the error is {:?}", + region, e, + ); + } + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + if let Err(e) = kv_wb.write_opt(&write_opts) { + panic!( + "Unsafe recovery, fail to write to disk while creating peer {:?}, the error is {:?}", + region, e, + ); + } + let mailbox = BasicMailbox::new(sender, peer, self.ctx.router.state_cnt().clone()); self.ctx.router.register(region.get_id(), mailbox); self.ctx diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index f075ac67b4a..7efd4e4882c 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -3,14 +3,12 @@ // #[PerformanceCriticalPath] use std::borrow::Cow; use std::fmt; -use std::sync::atomic::AtomicUsize; -use std::sync::Arc; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use kvproto::kvrpcpb::ExtraOp as TxnExtraOp; use kvproto::metapb; use kvproto::metapb::RegionEpoch; -use kvproto::pdpb::CheckPolicy; +use kvproto::pdpb::{self, CheckPolicy}; use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; use kvproto::raft_serverpb::RaftMessage; use kvproto::replication_modepb::ReplicationStatus; @@ -18,14 +16,18 @@ use kvproto::{import_sstpb::SstMeta, kvrpcpb::DiskFullOpt}; use raft::{GetEntriesContext, SnapshotStatus}; use smallvec::{smallvec, SmallVec}; +use super::{AbstractPeer, RegionSnapshot}; use crate::store::fsm::apply::TaskRes as ApplyTaskRes; use crate::store::fsm::apply::{CatchUpLogs, ChangeObserver}; use crate::store::metrics::RaftEventDurationType; +use crate::store::peer::{ + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, +}; use crate::store::util::{KeysInfoFormatter, LatencyInspector}; use crate::store::{RaftlogFetchResult, SnapKey}; use collections::HashSet; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; -use super::{AbstractPeer, RegionSnapshot}; #[derive(Debug)] pub struct ReadResponse { @@ -314,9 +316,17 @@ where res: Box, }, EnterForceLeaderState { + syncer: UnsafeRecoveryForceLeaderSyncer, failed_stores: HashSet, }, ExitForceLeaderState, + UnsafeRecoveryDemoteFailedVoters { + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + }, + UnsafeRecoveryDestroy(UnsafeRecoveryExecutePlanSyncer), + UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), + UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), } /// Message that will be sent to a peer. 
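The `UnsafeRecovery*Syncer` handles attached to these messages act as shared completion trackers: every targeted peer receives a clone, and whichever peer drops the last clone triggers the next recovery phase (for example, sending the store report). The following is a minimal, self-contained sketch of that drop-triggered pattern using generic names, not the actual TiKV syncer types or their constructors:

use std::sync::{Arc, Mutex};
use std::thread;

// Invokes the stored closure exactly once, when the value is dropped.
struct InvokeOnDrop(Box<dyn FnMut() + Send>);

impl Drop for InvokeOnDrop {
    fn drop(&mut self) {
        (self.0)();
    }
}

// Cloneable handle; the closure fires only when the *last* clone is dropped.
#[derive(Clone)]
struct Syncer(Arc<Mutex<InvokeOnDrop>>);

impl Syncer {
    fn new(on_all_done: impl FnMut() + Send + 'static) -> Self {
        Syncer(Arc::new(Mutex::new(InvokeOnDrop(Box::new(on_all_done)))))
    }
}

fn main() {
    let syncer = Syncer::new(|| println!("all peers reported, trigger next phase"));
    let workers: Vec<_> = (0..3)
        .map(|peer_id| {
            let handle = syncer.clone();
            thread::spawn(move || {
                // Each "peer" keeps its handle until its own work is finished.
                println!("peer {} finished wait-apply", peer_id);
                drop(handle);
            })
        })
        .collect();
    drop(syncer); // the coordinator releases its own handle as well
    for w in workers {
        w.join().unwrap();
    }
}

Compared with the per-peer atomic counter used in the earlier patch, a drop-based handle cannot leak a pending phase when a peer exits early, since releasing the handle on any path (including abort) still decrements the shared count.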
@@ -396,6 +406,9 @@ pub enum CasualMessage { // Snapshot is applied SnapshotApplied, + + // Trigger raft to campaign which is used after exiting force leader + Campaign, } impl fmt::Debug for CasualMessage { @@ -454,6 +467,7 @@ impl fmt::Debug for CasualMessage { CasualMessage::RefreshRegionBuckets { .. } => write!(fmt, "RefreshRegionBuckets"), CasualMessage::RenewLease => write!(fmt, "RenewLease"), CasualMessage::SnapshotApplied => write!(fmt, "SnapshotApplied"), + CasualMessage::Campaign => write!(fmt, "Campaign"), } } } @@ -543,8 +557,6 @@ pub enum PeerMsg { /// Asks region to change replication mode. UpdateReplicationMode, Destroy(u64), - UpdateRegionForUnsafeRecover(metapb::Region), - UnsafeRecoveryWaitApply(Arc), } impl fmt::Debug for PeerMsg { @@ -573,10 +585,6 @@ impl fmt::Debug for PeerMsg { PeerMsg::HeartbeatPd => write!(fmt, "HeartbeatPd"), PeerMsg::UpdateReplicationMode => write!(fmt, "UpdateReplicationMode"), PeerMsg::Destroy(peer_id) => write!(fmt, "Destroy {}", peer_id), - PeerMsg::UpdateRegionForUnsafeRecover(region) => { - write!(fmt, "Update Region {} to {:?}", region.get_id(), region) - } - PeerMsg::UnsafeRecoveryWaitApply(counter) => write!(fmt, "WaitApply {:?}", *counter), } } } @@ -621,8 +629,11 @@ where #[cfg(any(test, feature = "testexport"))] Validate(Box), - UnsafeRecoveryReport, - CreatePeer(metapb::Region), + UnsafeRecoveryReport(pdpb::StoreReport), + UnsafeRecoveryCreatePeer { + syncer: UnsafeRecoveryExecutePlanSyncer, + create: metapb::Region, + }, } impl fmt::Debug for StoreMsg @@ -651,8 +662,10 @@ where StoreMsg::Validate(_) => write!(fmt, "Validate config"), StoreMsg::UpdateReplicationMode(_) => write!(fmt, "UpdateReplicationMode"), StoreMsg::LatencyInspect { .. } => write!(fmt, "LatencyInspect"), - StoreMsg::UnsafeRecoveryReport => write!(fmt, "UnsafeRecoveryReport"), - StoreMsg::CreatePeer(_) => write!(fmt, "CreatePeer"), + StoreMsg::UnsafeRecoveryReport(..) => write!(fmt, "UnsafeRecoveryReport"), + StoreMsg::UnsafeRecoveryCreatePeer { .. 
} => { + write!(fmt, "UnsafeRecoveryCreatePeer") + } } } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index cabaddaa06d..d9c635a97db 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -6,7 +6,7 @@ use std::collections::VecDeque; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use std::{cmp, mem, u64, usize}; +use std::{cmp, fmt, mem, u64, usize}; use bitflags::bitflags; use bytes::Bytes; @@ -21,7 +21,7 @@ use getset::Getters; use kvproto::errorpb; use kvproto::kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp, LockInfo}; use kvproto::metapb::{self, PeerRole}; -use kvproto::pdpb::PeerStats; +use kvproto::pdpb::{self, PeerStats}; use kvproto::raft_cmdpb::{ self, AdminCmdType, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, PutRequest, RaftCmdRequest, RaftCmdResponse, Request, TransferLeaderRequest, TransferLeaderResponse, @@ -33,7 +33,7 @@ use kvproto::replication_modepb::{ DrAutoSyncState, RegionReplicationState, RegionReplicationStatus, ReplicationMode, }; use parking_lot::RwLockUpgradableReadGuard; -use protobuf::Message; +use protobuf::{Message, RepeatedField}; use raft::eraftpb::{self, ConfChangeType, Entry, EntryType, MessageType}; use raft::{ self, Changer, GetEntriesContext, LightReady, ProgressState, ProgressTracker, RawNode, Ready, @@ -50,11 +50,11 @@ use crate::errors::RAFTSTORE_IS_BUSY; use crate::store::async_io::write::WriteMsg; use crate::store::async_io::write_router::WriteRouter; use crate::store::fsm::apply::CatchUpLogs; -use crate::store::fsm::store::PollContext; +use crate::store::fsm::store::{PollContext, RaftRouter}; use crate::store::fsm::{apply, Apply, ApplyMetrics, ApplyTask, Proposal}; use crate::store::hibernate_state::GroupState; use crate::store::memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}; -use crate::store::msg::{RaftCommand, StoreMsg}; +use crate::store::msg::{PeerMsg, RaftCommand, SignificantMsg, StoreMsg}; use crate::store::txn_ext::LocksStatus; use crate::store::util::{admin_cmd_epoch_lookup, RegionReadProgress}; use crate::store::worker::{ @@ -465,27 +465,213 @@ pub struct ReadyResult { /// 6. After the plan steps are all applied, exit force leader state pub enum ForceLeaderState { WaitTicks { + syncer: UnsafeRecoveryForceLeaderSyncer, failed_stores: HashSet, ticks: usize, }, PreForceLeader { + syncer: UnsafeRecoveryForceLeaderSyncer, failed_stores: HashSet, }, ForceLeader { + time: TiInstant, failed_stores: HashSet, }, } +// Following shared states are used while reporting to PD for unsafe recovery and shared among +// all the regions per their life cycle. +// The work flow is like: +// 1. report phase +// start_unsafe_recovery_report +// -> broadcast wait-apply commands +// -> wait for all the peers' apply indices meet their targets +// -> broadcast fill out report commands +// -> wait for all the peers fill out the reports for themselves +// -> send a store report (through store heartbeat) +// 2. force leader phase +// dispatch force leader commands +// -> wait for all the peers that received the command become force leader +// -> start_unsafe_recovery_report +// 3. 
plan execution phase +// dispatch recovery plans +// -> wait for all the creates, deletes and demotes to finish, for the demotes, +// procedures are: +// -> exit joint state if it is already in joint state +// -> demote failed voters, and promote self to be a voter if it is a learner +// -> exit joint state +// -> start_unsafe_recovery_report + +// Intends to use RAII to sync unsafe recovery procedures between peers, in addition to that, +// it uses a closure to avoid having a raft router as a member variable, which is statically +// dispatched, thus needs to propagate the generics everywhere. +pub struct InvokeClosureOnDrop(Box); + +impl fmt::Debug for InvokeClosureOnDrop { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "InvokeClosureOnDrop") + } +} + +impl Drop for InvokeClosureOnDrop { + fn drop(&mut self) { + self.0(); + } +} + +pub fn start_unsafe_recovery_report( + router: &RaftRouter, + report_id: u64, + exit_force_leader_before_reporting: bool, +) { + if exit_force_leader_before_reporting { + router.broadcast_normal(|| PeerMsg::SignificantMsg(SignificantMsg::ExitForceLeaderState)); + } + let wait_apply = UnsafeRecoveryWaitApplySyncer::new(report_id, router.clone()); + router.broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::UnsafeRecoveryWaitApply(wait_apply.clone())) + }); +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryForceLeaderSyncer(Arc); + +impl UnsafeRecoveryForceLeaderSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let inner = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, force leader finished."); + let router_ptr = thread_safe_router.lock().unwrap(); + start_unsafe_recovery_report(&*router_ptr, report_id, false); + })); + UnsafeRecoveryForceLeaderSyncer(Arc::new(inner)) + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryExecutePlanSyncer { + _closure: Arc, + abort: Arc>, +} + +impl UnsafeRecoveryExecutePlanSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let abort = Arc::new(Mutex::new(false)); + let abort_clone = abort.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, plan execution finished"); + if *abort_clone.lock().unwrap() { + warn!("Unsafe recovery, plan execution aborted"); + return; + } + let router_ptr = thread_safe_router.lock().unwrap(); + start_unsafe_recovery_report(&*router_ptr, report_id, true); + })); + UnsafeRecoveryExecutePlanSyncer { + _closure: Arc::new(closure), + abort, + } + } + + pub fn abort(&self) { + *self.abort.lock().unwrap() = true; + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryWaitApplySyncer { + _closure: Arc, + abort: Arc>, +} + +impl UnsafeRecoveryWaitApplySyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let abort = Arc::new(Mutex::new(false)); + let abort_clone = abort.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, wait apply finished"); + if *abort_clone.lock().unwrap() { + warn!("Unsafe recovery, wait apply aborted"); + return; + } + let router_ptr = thread_safe_router.lock().unwrap(); + let fill_out_report = + UnsafeRecoveryFillOutReportSyncer::new(report_id, (*router_ptr).clone()); + (*router_ptr).broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::UnsafeRecoveryFillOutReport( + fill_out_report.clone(), + )) + }); + })); + 
UnsafeRecoveryWaitApplySyncer { + _closure: Arc::new(closure), + abort, + } + } + + pub fn abort(&self) { + *self.abort.lock().unwrap() = true; + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryFillOutReportSyncer { + _closure: Arc, + reports: Arc>>, +} + +impl UnsafeRecoveryFillOutReportSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let reports = Arc::new(Mutex::new(vec![])); + let reports_clone = reports.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, peer reports collected"); + let mut store_report = pdpb::StoreReport::default(); + { + let reports_ptr = reports_clone.lock().unwrap(); + store_report.set_peer_reports(RepeatedField::from_vec((*reports_ptr).to_vec())); + } + store_report.set_step(report_id); + let router_ptr = thread_safe_router.lock().unwrap(); + if let Err(e) = (*router_ptr).send_control(StoreMsg::UnsafeRecoveryReport(store_report)) + { + error!("Unsafe recovery, fail to schedule reporting"; "err" => ?e); + } + })); + UnsafeRecoveryFillOutReportSyncer { + _closure: Arc::new(closure), + reports, + } + } + + pub fn report_for_self(&self, report: pdpb::PeerReport) { + let mut reports_ptr = self.reports.lock().unwrap(); + (*reports_ptr).push(report); + } +} + pub enum UnsafeRecoveryState { // Stores the state that is necessary for the wait apply stage of unsafe recovery process. // This state is set by the peer fsm. Once set, it is checked every time this peer applies a - // new entry or a snapshot, if the target index is met, the caller substract 1 from the task - // counter, whoever executes the last task is responsible for triggering the reporting logic by - // sending a store heartbeat message to store fsm. + // new entry or a snapshot, if the target index is met, this state is reset / droppeds. The + // syncer holds a reference counted inner object that is shared among all the peers, whose + // destructor triggers the next step of unsafe recovery report process. WaitApply { target_index: u64, - task_counter: Arc, + syncer: UnsafeRecoveryWaitApplySyncer, + }, + DemoteFailedVoters { + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + target_index: u64, + // Failed regions may be stuck in joint state, if that is the case, we need to ask the + // region to exit joint state before proposing the demotion. + demote_after_exit: bool, }, + Destroy(UnsafeRecoveryExecutePlanSyncer), } #[derive(Getters)] @@ -1659,7 +1845,7 @@ where pub fn maybe_force_forward_commit_index(&mut self) -> bool { let failed_stores = match &self.force_leader { - Some(ForceLeaderState::ForceLeader { failed_stores }) => failed_stores, + Some(ForceLeaderState::ForceLeader { failed_stores, .. 
}) => failed_stores, _ => unreachable!(), }; @@ -2038,7 +2224,7 @@ where if self.unsafe_recovery_state.is_some() { debug!("unsafe recovery finishes applying a snapshot"); - self.unsafe_recovery_maybe_finish_wait_apply(ctx, /*force=*/ false); + self.unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false); } } // If `apply_snap_ctx` is none, it means this snapshot does not @@ -3186,7 +3372,7 @@ where let kind = ConfChangeKind::confchange_kind(change_peers.len()); if kind == ConfChangeKind::LeaveJoint { - if self.peer.get_role() == PeerRole::DemotingVoter { + if self.peer.get_role() == PeerRole::DemotingVoter && !self.is_force_leader() { return Err(box_err!( "{} ignore leave joint command that demoting leader", self.tag @@ -4490,33 +4676,30 @@ where .propose_check_epoch(cmd, self.term()) .is_none() } - - pub fn unsafe_recovery_maybe_finish_wait_apply( - &mut self, - ctx: &PollContext, - force: bool, - ) { - if let Some(unsafe_recovery_state) = &self.unsafe_recovery_state { - let UnsafeRecoveryState::WaitApply { - target_index, - task_counter, - } = unsafe_recovery_state; + + #[inline] + pub fn is_force_leader(&self) -> bool { + matches!( + self.force_leader, + Some(ForceLeaderState::ForceLeader { .. }) + ) + } + + pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { + if let Some(UnsafeRecoveryState::WaitApply { target_index, .. }) = + &self.unsafe_recovery_state + { if self.raft_group.raft.raft_log.applied >= *target_index || force { - info!( - "unsafe recovery finish wait apply"; - "region_id" => self.region().get_id(), - "peer_id" => self.peer_id(), - "target_index" => target_index, - "applied" => self.raft_group.raft.raft_log.applied, - "force" => force, - "counter" => task_counter.load(Ordering::SeqCst) - ); - if task_counter.fetch_sub(1, Ordering::SeqCst) == 1 { - if let Err(e) = ctx.router.send_control(StoreMsg::UnsafeRecoveryReport) { - error!("fail to send detailed report after recovery tasks finished"; "err" => ?e); - } + if self.is_force_leader() { + info!( + "Unsafe recovery, finish wait apply"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_index" => target_index, + "applied" => self.raft_group.raft.raft_log.applied, + "force" => force, + ); } - // Reset the state if the wait is finished. 
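The rewritten check above no longer decrements a shared task counter; clearing unsafe_recovery_state is itself the completion signal, because the state owns a clone of the wait-apply syncer. A simplified, standalone sketch of that check follows; DoneOnDrop stands in for the real syncer and the function name is illustrative only.

// Stand-in for UnsafeRecoveryWaitApplySyncer: dropping it marks this peer as done.
struct DoneOnDrop(&'static str);

impl Drop for DoneOnDrop {
    fn drop(&mut self) {
        println!("{}: wait apply finished on this peer", self.0);
    }
}

struct WaitApply {
    target_index: u64,
    _syncer: DoneOnDrop,
}

// Called after each applied entry or snapshot; resets the state once the target is met.
fn maybe_finish_wait_apply(state: &mut Option<WaitApply>, applied_index: u64, force: bool) {
    let done = match state {
        Some(wait) => force || applied_index >= wait.target_index,
        None => false,
    };
    if done {
        // Dropping the state drops the syncer it holds; once every peer has done this,
        // the shared closure fires and the next reporting step starts.
        *state = None;
    }
}

fn main() {
    let mut state = Some(WaitApply { target_index: 10, _syncer: DoneOnDrop("region 1") });
    maybe_finish_wait_apply(&mut state, 8, false); // not applied far enough yet
    assert!(state.is_some());
    maybe_finish_wait_apply(&mut state, 12, false); // target met, state cleared
    assert!(state.is_none());
}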
self.unsafe_recovery_state = None; } } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index d056e0d5497..d8640fc979b 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1093,6 +1093,11 @@ where self.last_term } + #[inline] + pub fn raft_state(&self) -> &RaftLocalState { + &self.raft_state + } + #[inline] pub fn applied_index(&self) -> u64 { self.apply_state.get_applied_index() diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 9c46daf201e..1f2d357b80c 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -23,7 +23,8 @@ pub use self::cleanup_sst::{Runner as CleanupSSTRunner, Task as CleanupSSTTask}; pub use self::compact::{Runner as CompactRunner, Task as CompactTask}; pub use self::consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}; pub use self::pd::{ - FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, Task as PdTask, + new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, + Runner as PdRunner, Task as PdTask, }; pub use self::query_stats::QueryStats; pub use self::raftlog_fetch::{Runner as RaftlogFetchRunner, Task as RaftlogFetchTask}; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index bdca2d161f5..4652a35876a 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -3,12 +3,12 @@ use std::cmp::Ordering as CmpOrdering; use std::fmt::{self, Display, Formatter}; use std::sync::mpsc::{self, Receiver, Sender}; -use std::sync::{atomic::AtomicUsize, atomic::Ordering, Arc}; +use std::sync::{atomic::Ordering, Arc}; use std::thread::{Builder, JoinHandle}; use std::time::{Duration, Instant}; use std::{cmp, io}; -use engine_traits::{KvEngine, RaftEngine, CF_RAFT}; +use engine_traits::{KvEngine, RaftEngine}; #[cfg(feature = "failpoints")] use fail::fail_point; use kvproto::kvrpcpb::DiskFullOpt; @@ -16,7 +16,7 @@ use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, SplitRequest, }; -use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState}; +use kvproto::raft_serverpb::RaftMessage; use kvproto::replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}; use kvproto::{metapb, pdpb}; use ordered_float::OrderedFloat; @@ -26,31 +26,31 @@ use yatp::Remote; use crate::store::cmd_resp::new_error; use crate::store::metrics::*; -use crate::store::util::{ - is_epoch_stale, ConfChangeKind, KeysInfoFormatter, LatencyInspector, RaftstoreDuration, -}; +use crate::store::util::{is_epoch_stale, KeysInfoFormatter, LatencyInspector, RaftstoreDuration}; use crate::store::worker::query_stats::QueryStats; use crate::store::worker::split_controller::{SplitInfo, TOP_N}; use crate::store::worker::{AutoSplitController, ReadStats, WriteStats}; use crate::store::{ + peer::{UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryForceLeaderSyncer}, + transport::SignificantRouter, Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, - RegionReadProgressRegistry, SnapManager, StoreInfo, StoreMsg, TxnExt, + RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, }; use collections::HashMap; +use collections::HashSet; use concurrency_manager::ConcurrencyManager; use 
futures::compat::Future01CompatExt; use futures::FutureExt; use pd_client::metrics::*; use pd_client::{Error, PdClient, RegionStat}; -use protobuf::Message; use resource_metering::{Collector, CollectorGuard, CollectorRegHandle, RawRecords}; use tikv_util::metrics::ThreadInfoStatistics; use tikv_util::time::UnixSecs; use tikv_util::timer::GLOBAL_TIMER_HANDLE; use tikv_util::topn::TopN; use tikv_util::worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}; -use tikv_util::{box_err, box_try, debug, error, info, thd_name, warn}; +use tikv_util::{box_err, debug, error, info, thd_name, warn}; type RecordPairVec = Vec; @@ -135,7 +135,7 @@ where StoreHeartbeat { stats: pdpb::StoreStats, store_info: StoreInfo, - send_detailed_report: bool, + report: Option, dr_autosync_status: Option, }, ReportBatchSplit { @@ -1024,7 +1024,7 @@ where &mut self, mut stats: pdpb::StoreStats, store_info: StoreInfo, - send_detailed_report: bool, + store_report: Option, dr_autosync_status: Option, ) { let store_stats = self.engine_store_server_helper.handle_compute_store_stats(); @@ -1117,119 +1117,72 @@ where let slow_score = self.slow_score.get(); stats.set_slow_score(slow_score as u64); - let mut optional_report = None; - if send_detailed_report { - let mut store_report = pdpb::StoreReport::new(); - store_info - .kv_engine - .scan_cf( - CF_RAFT, - keys::REGION_META_MIN_KEY, - keys::REGION_META_MAX_KEY, - false, - |key, value| { - let (_, suffix) = box_try!(keys::decode_region_meta_key(key)); - if suffix != keys::REGION_STATE_SUFFIX { - return Ok(true); - } - - let mut region_local_state = RegionLocalState::default(); - region_local_state.merge_from_bytes(value)?; - if region_local_state.get_state() == PeerState::Tombstone { - return Ok(true); - } - let raft_local_state = match store_info - .raft_engine - .get_raft_state(region_local_state.get_region().get_id()) - .unwrap() - { - None => return Ok(true), - Some(value) => value, - }; - let mut peer_report = pdpb::PeerReport::new(); - peer_report.set_region_state(region_local_state); - peer_report.set_raft_state(raft_local_state); - store_report.mut_peer_reports().push(peer_report); - Ok(true) - }, - ) - .unwrap(); - optional_report = Some(store_report); - } let router = self.router.clone(); - let scheduler = self.scheduler.clone(); - let stats_copy = stats.clone(); - let resp = - self.pd_client - .store_heartbeat(stats, optional_report, dr_autosync_status.clone()); + let resp = self + .pd_client + .store_heartbeat(stats, store_report, dr_autosync_status); let f = async move { match resp.await { Ok(mut resp) => { if let Some(status) = resp.replication_status.take() { let _ = router.send_control(StoreMsg::UpdateReplicationMode(status)); } - if resp.get_require_detailed_report() { - // This store needs to report detailed info of hosted regions to PD. - // - // The info has to be up to date, meaning that all committed changes til now have to be applied before the report is sent. - // The entire process may include: - // 1. `broadcast_normal` "wait apply" messsages to all peers. - // 2. `on_unsafe_recovery_wait_apply` examines whether the peer have not-yet-applied entries, if so, memorize the target index. - // 3. `on_apply_res` checks whether entries before the "unsafe recovery report target commit index" have all been applied. - // The one who finally finds out the number of remaining tasks is 0 schedules an unsafe recovery reporting store heartbeat. 
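The counter-based hand-off removed in this hunk is replaced by chained RAII syncers: finishing force leader starts a report, the wait-apply syncer's destructor broadcasts the fill-report step, and the fill-report syncer's destructor sends the assembled store report to PD. The following compressed, standard-library-only sketch shows the chaining mechanism; the stage names and the broadcast helper are illustrative, not the raftstore API.

use std::sync::{Arc, Mutex};

// Fires `on_done` when the last clone is dropped (the same idea as the syncers in peer.rs).
#[derive(Clone)]
struct Stage(Arc<StageInner>);

struct StageInner {
    name: &'static str,
    on_done: Mutex<Option<Box<dyn FnOnce() + Send>>>,
}

impl Stage {
    fn new(name: &'static str, on_done: impl FnOnce() + Send + 'static) -> Self {
        Stage(Arc::new(StageInner {
            name,
            on_done: Mutex::new(Some(Box::new(on_done))),
        }))
    }
}

impl Drop for StageInner {
    fn drop(&mut self) {
        println!("stage `{}` finished on every peer", self.name);
        if let Some(next) = self.on_done.lock().unwrap().take() {
            next();
        }
    }
}

// Hand one clone per peer; in this sketch each peer "finishes" immediately by dropping it.
fn broadcast(stage: &Stage, peers: usize) {
    for _ in 0..peers {
        drop(stage.clone());
    }
}

fn main() {
    // wait apply -> fill out report -> send the store report, each step triggered by the
    // previous stage's destructor once all of its clones are gone.
    let wait_apply = Stage::new("wait apply", || {
        let fill_report = Stage::new("fill out report", || {
            println!("send the StoreReport to PD");
        });
        broadcast(&fill_report, 3);
    });
    broadcast(&wait_apply, 3);
}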
- info!("required to send detailed report in the next heartbeat"); - // Init the counter with 1 in case the msg processing is faster than the distributing thus cause FSMs race to send a report. - let counter = Arc::new(AtomicUsize::new(1)); - let counter_clone = counter.clone(); - router.broadcast_normal(|| { - let _ = counter_clone.fetch_add(1, Ordering::Relaxed); - PeerMsg::UnsafeRecoveryWaitApply(counter_clone.clone()) - }); - // Reporting needs to be triggered here in case there is no message to be sent or messages processing finished before above function returns. - if counter.fetch_sub(1, Ordering::Relaxed) == 1 { - let task = Task::StoreHeartbeat { - stats: stats_copy, - store_info, - send_detailed_report: true, - dr_autosync_status, - }; - if let Err(e) = scheduler.schedule(task) { - error!("notify pd failed"; "err" => ?e); + if let Some(mut plan) = resp.recovery_plan.take() { + info!("Unsafe recovery, received a recovery plan"); + if plan.has_force_leader() { + let mut failed_stores = HashSet::default(); + for failed_store in plan.get_force_leader().get_failed_stores() { + failed_stores.insert(*failed_store); } - } - } else if resp.has_plan() { - info!("asked to execute recovery plan"); - for create in resp.get_plan().get_creates() { - info!("asked to create region"; "region" => ?create); - if let Err(e) = - router.send_control(StoreMsg::CreatePeer(create.clone())) - { - error!("fail to send creat peer message for recovery"; "err" => ?e); + let syncer = UnsafeRecoveryForceLeaderSyncer::new( + plan.get_step(), + router.clone(), + ); + for region in plan.get_force_leader().get_enter_force_leaders() { + if let Err(e) = router.significant_send( + *region, + SignificantMsg::EnterForceLeaderState { + syncer: syncer.clone(), + failed_stores: failed_stores.clone(), + }, + ) { + error!("fail to send force leader message for recovery"; "err" => ?e); + } } - } - for delete in resp.get_plan().get_deletes() { - info!("asked to delete peer"; "peer" => delete); - if let Err(e) = router.force_send(*delete, PeerMsg::Destroy(*delete)) { - error!("fail to send delete peer message for recovery"; "err" => ?e); + } else { + let syncer = UnsafeRecoveryExecutePlanSyncer::new( + plan.get_step(), + router.clone(), + ); + for create in plan.take_creates().into_iter() { + if let Err(e) = + router.send_control(StoreMsg::UnsafeRecoveryCreatePeer { + syncer: syncer.clone(), + create, + }) + { + error!("fail to send create peer message for recovery"; "err" => ?e); + } } - } - for update in resp.get_plan().get_updates() { - info!("asked to update region's range"; "region" => ?update); - if let Err(e) = router.force_send( - update.get_id(), - PeerMsg::UpdateRegionForUnsafeRecover(update.clone()), - ) { - error!("fail to send update range message for recovery"; "err" => ?e); + for delete in plan.take_tombstones().into_iter() { + if let Err(e) = router.significant_send( + delete, + SignificantMsg::UnsafeRecoveryDestroy(syncer.clone()), + ) { + error!("fail to send delete peer message for recovery"; "err" => ?e); + } + } + for mut demote in plan.take_demotes().into_iter() { + if let Err(e) = router.significant_send( + demote.get_region_id(), + SignificantMsg::UnsafeRecoveryDemoteFailedVoters { + syncer: syncer.clone(), + failed_voters: demote.take_failed_voters().into_vec(), + }, + ) { + error!("fail to send update peer list message for recovery"; "err" => ?e); + } } - } - let task = Task::StoreHeartbeat { - stats: stats_copy, - store_info, - send_detailed_report: true, - dr_autosync_status, - }; - if let Err(e) = 
scheduler.schedule(task) { - error!("notify pd failed"; "err" => ?e); } } } @@ -1358,7 +1311,6 @@ where "try to change peer"; "region_id" => region_id, "changes" => ?change_peer_v2.get_changes(), - "kind" => ?ConfChangeKind::confchange_kind(change_peer_v2.get_changes().len()), ); let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); send_admin_request(&router, region_id, epoch, peer, req, Callback::None, Default::default()); @@ -1787,14 +1739,9 @@ where Task::StoreHeartbeat { stats, store_info, - send_detailed_report, - dr_autosync_status, - } => self.handle_store_heartbeat( - stats, - store_info, - send_detailed_report, + report, dr_autosync_status, - ), + } => self.handle_store_heartbeat(stats, store_info, report, dr_autosync_status), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::ValidatePeer { region, peer } => self.handle_validate_peer(region, peer), Task::ReadStats { read_stats } => self.handle_read_stats(read_stats), @@ -1889,7 +1836,7 @@ fn new_change_peer_request(change_type: ConfChangeType, peer: metapb::Peer) -> A req } -fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { +pub fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::ChangePeerV2); let change_peer_reqs = changes diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 3e402a03713..833920be724 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1460,19 +1460,33 @@ impl Cluster { .unwrap(); } - pub fn enter_force_leader( + pub fn enter_force_leader(&mut self, region_id: u64, store_id: u64, failed_stores: Vec) { + let mut plan = pdpb::RecoveryPlan::default(); + let mut force_leader = pdpb::ForceLeader::default(); + force_leader.set_enter_force_leaders([region_id].to_vec()); + force_leader.set_failed_stores(failed_stores.to_vec()); + plan.set_force_leader(force_leader); + // Triggers the unsafe recovery plan execution. + self.pd_client.must_set_unsafe_recovery_plan(store_id, plan); + self.must_send_store_heartbeat(store_id); + } + + pub fn must_enter_force_leader( &mut self, region_id: u64, store_id: u64, - failed_stores: HashSet, + failed_stores: Vec, ) { - let router = self.sim.rl().get_router(store_id).unwrap(); - router - .significant_send( - region_id, - SignificantMsg::EnterForceLeaderState { failed_stores }, - ) - .unwrap(); + self.enter_force_leader(region_id, store_id, failed_stores); + let mut store_report = None; + for _ in 0..20 { + store_report = self.pd_client.must_get_store_report(store_id); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); } pub fn exit_force_leader(&mut self, region_id: u64, store_id: u64) { @@ -1722,57 +1736,6 @@ impl Cluster { StoreRouter::send(&router, StoreMsg::Tick(StoreTick::PdStoreHeartbeat)).unwrap(); } - pub fn must_update_region_for_unsafe_recover(&mut self, node_id: u64, region: &metapb::Region) { - let router = self.sim.rl().get_router(node_id).unwrap(); - let mut try_cnt = 0; - loop { - if try_cnt % 50 == 0 { - // In case the message is ignored, re-send it every 50 tries. 
- router - .force_send( - region.get_id(), - PeerMsg::UpdateRegionForUnsafeRecover(region.clone()), - ) - .unwrap(); - } - if let Ok(Some(current)) = block_on(self.pd_client.get_region_by_id(region.get_id())) { - if current.get_start_key() == region.get_start_key() - && current.get_end_key() == region.get_end_key() - { - return; - } - } - if try_cnt > 500 { - panic!("region {:?} is not updated", region); - } - try_cnt += 1; - sleep_ms(20); - } - } - - pub fn must_recreate_region_for_unsafe_recover( - &mut self, - node_id: u64, - region: &metapb::Region, - ) { - let router = self.sim.rl().get_router(node_id).unwrap(); - let mut try_cnt = 0; - loop { - if try_cnt % 50 == 0 { - // In case the message is ignored, re-send it every 50 tries. - StoreRouter::send(&router, StoreMsg::CreatePeer(region.clone())).unwrap(); - } - if let Ok(Some(_)) = block_on(self.pd_client.get_region_by_id(region.get_id())) { - return; - } - if try_cnt > 250 { - panic!("region {:?} is not created", region); - } - try_cnt += 1; - sleep_ms(20); - } - } - pub fn gc_peer( &mut self, region_id: u64, diff --git a/components/test_raftstore/src/pd.rs b/components/test_raftstore/src/pd.rs index e3238480a31..ab494178820 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_raftstore/src/pd.rs @@ -325,7 +325,8 @@ struct PdCluster { pub check_merge_target_integrity: bool, unsafe_recovery_require_report: bool, - unsafe_recovery_store_reported: HashMap, + unsafe_recovery_store_reports: HashMap, + unsafe_recovery_plan: HashMap, } impl PdCluster { @@ -360,7 +361,8 @@ impl PdCluster { region_replication_status: HashMap::default(), check_merge_target_integrity: true, unsafe_recovery_require_report: false, - unsafe_recovery_store_reported: HashMap::default(), + unsafe_recovery_store_reports: HashMap::default(), + unsafe_recovery_plan: HashMap::default(), } } @@ -517,11 +519,11 @@ impl PdCluster { region.get_region_epoch().clone(), ); assert!(end_key > start_key); - let created_by_unsafe_recover = (!start_key.is_empty() || !end_key.is_empty()) - && incoming_epoch.get_version() == 1 - && incoming_epoch.get_conf_ver() == 1; + let created_by_unsafe_recovery = (!start_key.is_empty() || !end_key.is_empty()) + && incoming_epoch.get_version() == 0 + && incoming_epoch.get_conf_ver() == 0; let overlaps = self.get_overlap(start_key, end_key); - if created_by_unsafe_recover { + if created_by_unsafe_recovery { // Allow recreated region by unsafe recover to overwrite other regions with a "older" // epoch. 
return Ok(overlaps); @@ -727,10 +729,14 @@ impl PdCluster { self.min_resolved_ts } - fn handle_store_heartbeat(&mut self) -> Result { + fn handle_store_heartbeat(&mut self, store_id: u64) -> Result { let mut resp = pdpb::StoreHeartbeatResponse::default(); resp.set_require_detailed_report(self.unsafe_recovery_require_report); self.unsafe_recovery_require_report = false; + if let Some((_, plan)) = self.unsafe_recovery_plan.remove_entry(&store_id) { + debug!("Unsafe recovery, sending recovery plan"; "store_id" => store_id, "plan" => ?plan); + resp.set_recovery_plan(plan); + } Ok(resp) } @@ -739,19 +745,16 @@ impl PdCluster { self.unsafe_recovery_require_report = require_report; } - fn get_store_reported(&self, store_id: &u64) -> i32 { - *self - .unsafe_recovery_store_reported - .get(store_id) - .unwrap_or(&0) + fn set_unsafe_recovery_plan(&mut self, store_id: u64, recovery_plan: pdpb::RecoveryPlan) { + self.unsafe_recovery_plan.insert(store_id, recovery_plan); } - fn store_reported_inc(&mut self, store_id: u64) { - let reported = self - .unsafe_recovery_store_reported - .entry(store_id) - .or_insert(0); - *reported += 1; + fn get_store_report(&mut self, store_id: u64) -> Option { + self.unsafe_recovery_store_reports.remove(&store_id) + } + + fn set_store_report(&mut self, store_id: u64, report: pdpb::StoreReport) { + let _ = self.unsafe_recovery_store_reports.insert(store_id, report); } } @@ -1308,8 +1311,12 @@ impl TestPdClient { self.cluster.wl().set_require_report(require_report); } - pub fn must_get_store_reported(&self, store_id: &u64) -> i32 { - self.cluster.rl().get_store_reported(store_id) + pub fn must_get_store_report(&self, store_id: u64) -> Option { + self.cluster.wl().get_store_report(store_id) + } + + pub fn must_set_unsafe_recovery_plan(&self, store_id: u64, plan: pdpb::RecoveryPlan) { + self.cluster.wl().set_unsafe_recovery_plan(store_id, plan) } } @@ -1595,11 +1602,11 @@ impl PdClient for TestPdClient { cluster.store_stats.insert(store_id, stats); - if report.is_some() { - cluster.store_reported_inc(store_id); + if let Some(store_report) = report { + cluster.set_store_report(store_id, store_report); } - let mut resp = cluster.handle_store_heartbeat().unwrap(); + let mut resp = cluster.handle_store_heartbeat(store_id).unwrap(); if let Some(ref status) = cluster.replication_status { resp.set_replication_status(status.clone()); diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index cc090bbb782..d3a9d2a6d43 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -5,13 +5,14 @@ use std::sync::Arc; use std::time::Duration; use futures::executor::block_on; +use kvproto::{metapb, pdpb}; use pd_client::PdClient; use raftstore::store::util::find_peer; use test_raftstore::*; use tikv_util::{config::ReadableDuration, mpsc}; #[test] -fn test_unsafe_recover_send_report() { +fn test_unsafe_recovery_send_report() { let mut cluster = new_server_cluster(0, 3); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); @@ -47,12 +48,13 @@ fn test_unsafe_recover_send_report() { cluster.stop_node(nodes[2]); // Triggers the unsafe recovery store reporting process. - pd_client.must_set_require_report(true); + let plan = pdpb::RecoveryPlan::default(); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); cluster.must_send_store_heartbeat(nodes[0]); // No store report is sent, since there are peers have unapplied entries. 
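All of these tests wait for the store report by polling must_get_store_report a bounded number of times. The shared loop shape can be captured by a tiny generic helper; the sketch below is standalone, and the helper name poll_until is hypothetical rather than part of test_raftstore.

use std::thread::sleep;
use std::time::Duration;

// Retries `f` up to `attempts` times, sleeping `interval` between tries, and returns
// the first Some(..) observed (or None if the condition is never met).
fn poll_until<T>(
    mut f: impl FnMut() -> Option<T>,
    attempts: usize,
    interval: Duration,
) -> Option<T> {
    for _ in 0..attempts {
        if let Some(v) = f() {
            return Some(v);
        }
        sleep(interval);
    }
    None
}

fn main() {
    // In the tests the closure would be `|| pd_client.must_get_store_report(nodes[0])`.
    let mut calls = 0;
    let report = poll_until(
        || {
            calls += 1;
            if calls >= 3 { Some("store report") } else { None }
        },
        20,
        Duration::from_millis(100),
    );
    assert_eq!(report, Some("store report"));
}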
for _ in 0..20 { - assert_eq!(pd_client.must_get_store_reported(&nodes[0]), 0); + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); sleep_ms(100); } @@ -60,15 +62,139 @@ fn test_unsafe_recover_send_report() { drop(apply_released_tx); // Store reports are sent once the entries are applied. - let mut reported = false; + let mut store_report = None; for _ in 0..20 { - if pd_client.must_get_store_reported(&nodes[0]) > 0 { - reported = true; + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { break; } sleep_ms(100); } - assert_eq!(reported, true); + assert_ne!(store_report, None); + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recovery_execution_result_report() { + let mut cluster = new_server_cluster(0, 3); + // Prolong force leader time. + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::minutes(5); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::minutes(10); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(2); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.put(b"random_key1", b"random_val1").unwrap(); + + // Split the region into 2, and remove one of them, so that we can test both region peer list + // update and region creation. + pd_client.must_split_region( + region, + pdpb::CheckPolicy::Usekey, + vec![b"random_key1".to_vec()], + ); + let region1 = pd_client.get_region(b"random_key".as_ref()).unwrap(); + let region2 = pd_client.get_region(b"random_key1".as_ref()).unwrap(); + let region1_store0_peer = find_peer(®ion1, nodes[0]).unwrap().to_owned(); + pd_client.must_remove_peer(region1.get_id(), region1_store0_peer); + cluster.must_remove_region(nodes[0], region1.get_id()); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region2.get_id(), + region2.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + } + + cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let to_be_removed: Vec = region2 + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region2.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_end_key(b"random_key1".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + plan.mut_creates().push(create); + + // Blocks the raft apply process on store 1 entirely . 
+ let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("on_handle_apply_store_1", move || { + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + // Triggers the unsafe recovery plan execution. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..20 { + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); + sleep_ms(100); + } + + // Unblocks the apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + for peer_report in store_report.unwrap().get_peer_reports() { + let region = peer_report.get_region_state().get_region(); + if region.get_id() == 101 { + assert_eq!(region.get_end_key(), b"random_key1".to_vec()); + } else { + assert_eq!(region.get_id(), region2.get_id()); + for peer in region.get_peers() { + if peer.get_store_id() != nodes[0] { + assert_eq!(peer.get_role(), metapb::PeerRole::Learner); + } + } + } + } fail::remove("on_handle_apply_store_1"); } @@ -128,12 +254,13 @@ fn test_unsafe_recover_wait_for_snapshot_apply() { .unwrap(); // Triggers the unsafe recovery store reporting process. - pd_client.must_set_require_report(true); + let plan = pdpb::RecoveryPlan::default(); + pd_client.must_set_unsafe_recovery_plan(nodes[1], plan); cluster.must_send_store_heartbeat(nodes[1]); // No store report is sent, since there are peers have unapplied entries. for _ in 0..20 { - assert_eq!(pd_client.must_get_store_reported(&nodes[1]), 0); + assert_eq!(pd_client.must_get_store_report(nodes[1]), None); sleep_ms(100); } @@ -141,16 +268,199 @@ fn test_unsafe_recover_wait_for_snapshot_apply() { drop(apply_released_tx); // Store reports are sent once the entries are applied. - let mut reported = false; + let mut store_report = None; for _ in 0..20 { - if pd_client.must_get_store_reported(&nodes[1]) > 0 { - reported = true; + store_report = pd_client.must_get_store_report(nodes[1]); + if store_report.is_some() { break; } sleep_ms(100); } - assert_eq!(reported, true); + assert_ne!(store_report, None); fail::remove("worker_gc_raft_log"); fail::remove("worker_gc_raft_log_finished"); fail::remove("raft_before_apply_snap_callback"); } + +#[test] +fn test_unsafe_recovery_demotion_reentrancy() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[2]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. 
+ assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + } + + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + + // Blocks the raft apply process on store 1 entirely . + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("on_handle_apply_store_1", move || { + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + // Triggers the unsafe recovery plan execution. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..10 { + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); + sleep_ms(100); + } + + // Send the plan again. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + // Unblocks the apply process. + drop(apply_released_tx); + + let mut demoted = false; + for _ in 0..10 { + let region_in_pd = block_on(pd_client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + assert_eq!(region_in_pd.get_peers().len(), 3); + demoted = region_in_pd + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + sleep_ms(100); + } + assert_eq!(demoted, true); + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recovery_create_destroy_reentrancy() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.put(b"random_key1", b"random_val1").unwrap(); + + // Split the region into 2, and remove one of them, so that we can test both region peer list + // update and region creation. + pd_client.must_split_region( + region, + pdpb::CheckPolicy::Usekey, + vec![b"random_key1".to_vec()], + ); + let region1 = pd_client.get_region(b"random_key".as_ref()).unwrap(); + let region2 = pd_client.get_region(b"random_key1".as_ref()).unwrap(); + let region1_store0_peer = find_peer(®ion1, nodes[0]).unwrap().to_owned(); + pd_client.must_remove_peer(region1.get_id(), region1_store0_peer); + cluster.must_remove_region(nodes[0], region1.get_id()); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region2.get_id(), + region2.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. 
+ assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + } + + cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_end_key(b"random_key1".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + plan.mut_creates().push(create); + + plan.mut_tombstones().push(region2.get_id()); + + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + sleep_ms(100); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + + // Store reports are sent once the entries are applied. + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + let report = store_report.unwrap(); + let peer_reports = report.get_peer_reports(); + assert_eq!(peer_reports.len(), 1); + let reported_region = peer_reports[0].get_region_state().get_region(); + assert_eq!(reported_region.get_id(), 101); + assert_eq!(reported_region.get_peers().len(), 1); + assert_eq!(reported_region.get_peers()[0].get_id(), 102); + fail::remove("on_handle_apply_store_1"); +} diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index 0b1c533b542..487389e407f 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -4,18 +4,33 @@ use std::iter::FromIterator; use std::sync::Arc; use std::time::Duration; -use collections::HashSet; use futures::executor::block_on; -use kvproto::metapb; +use kvproto::{metapb, pdpb}; use pd_client::PdClient; -use raft::eraftpb::ConfChangeType; -use raft::eraftpb::MessageType; +use raft::eraftpb::{ConfChangeType, MessageType}; +use raftstore::store::util::find_peer; use test_raftstore::*; use tikv_util::config::ReadableDuration; use tikv_util::HandyRwLock; +fn confirm_quorum_is_lost(cluster: &mut Cluster, region: &metapb::Region) { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. 
+ assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); +} + #[test] -fn test_unsafe_recover_update_region() { +fn test_unsafe_recovery_demote_failed_voters() { let mut cluster = new_server_cluster(0, 3); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); @@ -26,30 +41,118 @@ fn test_unsafe_recover_update_region() { pd_client.disable_default_operator(); let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); cluster.stop_node(nodes[1]); cluster.stop_node(nodes[2]); - cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey2".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); + confirm_quorum_is_lost(&mut cluster, ®ion); + + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = true; + for _ in 0..10 { + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + demoted = true; + for peer in region.get_peers() { + if peer.get_id() != nodes[0] && peer.get_role() == metapb::PeerRole::Voter { + demoted = false; + } + } + if demoted { + break; } + sleep_ms(200); } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the boostrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); + assert_eq!(demoted, true); +} + +// Demote non-exist voters will not work, but TiKV should still report to PD. +#[test] +fn test_unsafe_recovery_demote_non_exist_voters() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. 
+ pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - assert_eq!(region.get_end_key(), b"anykey2"); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + confirm_quorum_is_lost(&mut cluster, ®ion); + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + let mut peer = metapb::Peer::default(); + peer.set_id(12345); + peer.set_store_id(region.get_id()); + peer.set_role(metapb::PeerRole::Voter); + demote.mut_failed_voters().push(peer); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + let report = store_report.unwrap(); + let peer_reports = report.get_peer_reports(); + assert_eq!(peer_reports.len(), 1); + let reported_region = peer_reports[0].get_region_state().get_region(); + assert_eq!(reported_region.get_id(), region.get_id()); + assert_eq!(reported_region.get_peers().len(), 3); + let demoted = reported_region + .get_peers() + .iter() + .any(|peer| peer.get_role() != metapb::PeerRole::Voter); + assert_eq!(demoted, false); + + let region_in_pd = block_on(pd_client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + assert_eq!(region_in_pd.get_peers().len(), 3); + let demoted = region_in_pd + .get_peers() + .iter() + .any(|peer| peer.get_role() != metapb::PeerRole::Voter); + assert_eq!(demoted, false); } #[test] -fn test_unsafe_recover_create_region() { +fn test_unsafe_recovery_auto_promote_learner() { let mut cluster = new_server_cluster(0, 3); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); @@ -60,25 +163,178 @@ fn test_unsafe_recover_create_region() { pd_client.disable_default_operator(); let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store0 = find_peer(®ion, nodes[0]).unwrap(); + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + // replace one peer with learner + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store0.clone()); + cluster.pd_client.must_add_peer( + region.get_id(), + new_learner_peer(nodes[0], peer_on_store0.get_id()), + ); + // Sleep 100 ms to wait for the new learner to be initialized. 
+ sleep_ms(100); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + confirm_quorum_is_lost(&mut cluster, ®ion); + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = true; + let mut promoted = false; + for _ in 0..10 { + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + promoted = region + .get_peers() + .iter() + .find(|peer| peer.get_store_id() == nodes[0]) + .unwrap() + .get_role() + == metapb::PeerRole::Voter; + + demoted = region + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + if demoted && promoted { + break; + } + sleep_ms(100); + } + assert_eq!(demoted, true); + assert_eq!(promoted, true); +} + +#[test] +fn test_unsafe_recovery_already_in_joint_state() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store0 = find_peer(®ion, nodes[0]).unwrap(); + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store2.clone()); + cluster.pd_client.must_add_peer( + region.get_id(), + new_learner_peer(nodes[2], peer_on_store2.get_id()), + ); + // Wait the new learner to be initialized. 
+ sleep_ms(100); + pd_client.must_joint_confchange( + region.get_id(), + vec![ + ( + ConfChangeType::AddLearnerNode, + new_learner_peer(nodes[0], peer_on_store0.get_id()), + ), + ( + ConfChangeType::AddNode, + new_peer(nodes[2], peer_on_store2.get_id()), + ), + ], + ); cluster.stop_node(nodes[1]); cluster.stop_node(nodes[2]); cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); + confirm_quorum_is_lost(&mut cluster, ®ion); + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = true; + let mut promoted = false; + for _ in 0..10 { + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + promoted = region + .get_peers() + .iter() + .find(|peer| peer.get_store_id() == nodes[0]) + .unwrap() + .get_role() + == metapb::PeerRole::Voter; + + demoted = region + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + if demoted && promoted { + break; } + sleep_ms(100); } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the bootstrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); - block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + assert_eq!(demoted, true); + assert_eq!(promoted, true); +} + +#[test] +fn test_unsafe_recovery_create_region() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let store0_peer = find_peer(®ion, nodes[0]).unwrap().to_owned(); + + // Removes the boostrap region, since it overlaps with any regions we create. 
+ pd_client.must_remove_peer(region.get_id(), store0_peer); + cluster.must_remove_region(nodes[0], region.get_id()); + + configure_for_lease_read(&mut cluster, None, None); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); + let mut create = metapb::Region::default(); create.set_id(101); create.set_start_key(b"anykey".to_vec()); @@ -86,9 +342,19 @@ fn test_unsafe_recover_create_region() { peer.set_id(102); peer.set_store_id(nodes[0]); create.mut_peers().push(peer); - cluster.must_recreate_region_for_unsafe_recover(nodes[0], &create); - let region = pd_client.get_region(b"anykey1").unwrap(); - assert_eq!(create.get_id(), region.get_id()); + let mut plan = pdpb::RecoveryPlan::default(); + plan.mut_creates().push(create); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + let mut created = false; + for _ in 1..11 { + let region = pd_client.get_region(b"anykey1").unwrap(); + if region.get_id() == 101 { + created = true; + } + sleep_ms(200); + } + assert_eq!(created, true); } fn must_get_error_recovery_in_progress( @@ -125,23 +391,17 @@ fn test_force_leader_three_nodes() { let region = cluster.get_region(b"k1"); cluster.must_split(®ion, b"k9"); - let mut region = cluster.get_region(b"k2"); + let region = cluster.get_region(b"k2"); let peer_on_store3 = find_peer(®ion, 3).unwrap(); cluster.must_transfer_leader(region.get_id(), peer_on_store3.clone()); cluster.stop_node(2); cluster.stop_node(3); - let put = new_put_cmd(b"k2", b"v2"); - let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true); // quorum is lost, can't propose command successfully. - assert!( - cluster - .call_command_on_leader(req, Duration::from_millis(10)) - .is_err() - ); + confirm_quorum_is_lost(&mut cluster, ®ion); - cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![2, 3])); + cluster.must_enter_force_leader(region.get_id(), 1, vec![2, 3]); // remove the peers on failed nodes cluster .pd_client @@ -179,7 +439,7 @@ fn test_force_leader_five_nodes() { let region = cluster.get_region(b"k1"); cluster.must_split(®ion, b"k9"); - let mut region = cluster.get_region(b"k2"); + let region = cluster.get_region(b"k2"); let peer_on_store5 = find_peer(®ion, 5).unwrap(); cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone()); @@ -187,16 +447,10 @@ fn test_force_leader_five_nodes() { cluster.stop_node(4); cluster.stop_node(5); - let put = new_put_cmd(b"k2", b"v2"); - let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true); // quorum is lost, can't propose command successfully. 
-    assert!(
-        cluster
-            .call_command_on_leader(req, Duration::from_millis(10))
-            .is_err()
-    );
+    confirm_quorum_is_lost(&mut cluster, &region);

-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // remove the peers on failed nodes
     cluster
         .pd_client
@@ -241,7 +495,7 @@ fn test_force_leader_for_learner() {
     let region = cluster.get_region(b"k1");
     cluster.must_split(&region, b"k9");
-    let mut region = cluster.get_region(b"k2");
+    let region = cluster.get_region(b"k2");
     let peer_on_store5 = find_peer(&region, 5).unwrap();
     cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
@@ -254,6 +508,8 @@ fn test_force_leader_for_learner() {
         region.get_id(),
         new_learner_peer(peer_on_store1.get_store_id(), peer_on_store1.get_id()),
     );
+    // Sleep 100 ms to wait for the new learner to be initialized.
+    sleep_ms(100);
     must_get_equal(&cluster.get_engine(1), b"k1", b"v1");
     cluster.stop_node(3);
     cluster.stop_node(4);
     cluster.stop_node(5);
-    let put = new_put_cmd(b"k2", b"v2");
-    let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true);
-    // quorum is lost, can't propose command successfully.
-    assert!(
-        cluster
-            .call_command_on_leader(req, Duration::from_millis(10))
-            .is_err()
-    );
+    confirm_quorum_is_lost(&mut cluster, &region);
     // wait election timeout
     std::thread::sleep(Duration::from_millis(
@@ -276,7 +525,7 @@ fn test_force_leader_for_learner() {
             * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
             * 2,
     ));
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // promote the learner first and remove the peers on failed nodes
     cluster
         .pd_client
@@ -328,7 +577,7 @@ fn test_force_leader_on_hibernated_leader() {
     cluster.stop_node(4);
     cluster.stop_node(5);
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // remove the peers on failed nodes
     cluster
         .pd_client
@@ -377,7 +626,7 @@ fn test_force_leader_on_hibernated_follower() {
     cluster.stop_node(4);
     cluster.stop_node(5);
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // remove the peers on failed nodes
     cluster
         .pd_client
@@ -449,7 +698,7 @@ fn test_force_leader_trigger_snapshot() {
             * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
             * 5,
     );
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     sleep_ms(
         cluster.cfg.raft_store.raft_election_timeout_ticks as u64
@@ -514,18 +763,7 @@ fn test_force_leader_with_uncommitted_conf_change() {
     cluster.stop_node(4);
     cluster.stop_node(5);
-    let put = new_put_cmd(b"k2", b"v2");
-    let req = new_request(
-        region.get_id(),
-        region.get_region_epoch().clone(),
-        vec![put],
-        true,
-    );
-    assert!(
-        cluster
-            .call_command_on_leader(req, Duration::from_millis(10))
-            .is_err()
-    );
+    confirm_quorum_is_lost(&mut cluster, &region);
     // an uncommitted conf-change
     let cmd = new_change_peer_request(
@@ -545,7 +783,7 @@
             * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
             * 2,
     ));
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // the uncommitted conf-change is committed successfully after being force leader
     cluster
         .pd_client
@@ -592,7 +830,7 @@ fn test_force_leader_on_healthy_region() {
     cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
     // try to enter force leader, it can't succeed due to quorum isn't lost
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // make sure it leaves pre force leader state.
     std::thread::sleep(Duration::from_millis(
         cluster.cfg.raft_store.raft_election_timeout_ticks as u64
@@ -622,7 +860,7 @@ fn test_force_leader_on_wrong_leader() {
     let region = cluster.get_region(b"k1");
     cluster.must_split(&region, b"k9");
-    let mut region = cluster.get_region(b"k2");
+    let region = cluster.get_region(b"k2");
     let peer_on_store5 = find_peer(&region, 5).unwrap();
     cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
@@ -639,17 +877,10 @@ fn test_force_leader_on_wrong_leader() {
     cluster.stop_node(1);
     cluster.run_node(1).unwrap();
-    let put = new_put_cmd(b"k3", b"v3");
-    let req = new_request(region.get_id(), region.take_region_epoch(), vec![put], true);
-    // quorum is lost, can't propose command successfully.
-    assert!(
-        cluster
-            .call_command_on_leader(req, Duration::from_millis(100))
-            .is_err()
-    );
+    confirm_quorum_is_lost(&mut cluster, &region);
     // try to force leader on peer of node2 which is stale
-    cluster.enter_force_leader(region.get_id(), 2, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 2, vec![3, 4, 5]);
     // can't propose confchange as it's not in force leader state
     let cmd = new_change_peer_request(
         ConfChangeType::RemoveNode,
@@ -692,11 +923,11 @@ fn test_force_leader_twice_on_different_peers() {
     cluster.run_node(1).unwrap();
     cluster.stop_node(2);
     cluster.run_node(2).unwrap();
+    confirm_quorum_is_lost(&mut cluster, &region);

-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
-    std::thread::sleep(Duration::from_millis(100));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // enter force leader on a different peer
-    cluster.enter_force_leader(region.get_id(), 2, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 2, vec![3, 4, 5]);
     // leader is the peer of store 2
     assert_eq!(
         cluster.leader_of_region(region.get_id()).unwrap(),
@@ -761,10 +992,8 @@ fn test_force_leader_twice_on_same_peer() {
     cluster.stop_node(2);
     cluster.run_node(2).unwrap();
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
-    std::thread::sleep(Duration::from_millis(100));
-    // enter force leader on the same peer
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // remove the peers on failed nodes
     cluster
         .pd_client
@@ -814,7 +1043,7 @@ fn test_force_leader_multiple_election_rounds() {
             * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
             * 2,
     ));
-    cluster.enter_force_leader(region.get_id(), 1, HashSet::from_iter(vec![3, 4, 5]));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
     // wait multiple election rounds
     std::thread::sleep(Duration::from_millis(
         cluster.cfg.raft_store.raft_election_timeout_ticks as u64