From 228cf2bb424c46bae652dd936072d4b62d45dc16 Mon Sep 17 00:00:00 2001 From: Wen Date: Wed, 20 Sep 2023 23:59:55 -0700 Subject: [PATCH 01/20] Add wen_restart module: - Implement reading LastVotedForkSlots from blockstore. - Add proto file to record the intermediate results. - Also link wen_restart into validator. - Move recreation of tower outside replay_stage so we can get last_vote. --- Cargo.lock | 23 ++++ Cargo.toml | 2 + core/Cargo.toml | 1 + core/src/replay_stage.rs | 19 ++-- core/src/tvu.rs | 7 +- core/src/validator.rs | 27 ++++- local-cluster/src/validator_configs.rs | 1 + validator/src/cli.rs | 22 ++++ wen-restart/Cargo.toml | 43 +++++++ wen-restart/build.rs | 41 +++++++ wen-restart/protos/wen_restart.proto | 24 ++++ wen-restart/src/lib.rs | 7 ++ wen-restart/src/wen_restart.rs | 151 +++++++++++++++++++++++++ 13 files changed, 354 insertions(+), 14 deletions(-) create mode 100644 wen-restart/Cargo.toml create mode 100644 wen-restart/build.rs create mode 100644 wen-restart/protos/wen_restart.proto create mode 100644 wen-restart/src/lib.rs create mode 100644 wen-restart/src/wen_restart.rs diff --git a/Cargo.lock b/Cargo.lock index 30729d6ab7dc08..9a16d89331296e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5786,6 +5786,7 @@ dependencies = [ "solana-version", "solana-vote", "solana-vote-program", + "solana-wen-restart", "static_assertions", "strum", "strum_macros", @@ -7531,6 +7532,28 @@ dependencies = [ "solana-version", ] +[[package]] +name = "solana-wen-restart" +version = "1.17.0" +dependencies = [ + "log", + "prost", + "prost-build", + "prost-types", + "protobuf-src", + "rustc_version 0.4.0", + "serial_test", + "solana-entry", + "solana-gossip", + "solana-ledger", + "solana-logger", + "solana-program", + "solana-runtime", + "solana-sdk", + "solana-streamer", + "solana-vote-program", +] + [[package]] name = "solana-zk-keygen" version = "1.17.0" diff --git a/Cargo.toml b/Cargo.toml index 58cb4f83055604..739104e354fc9f 100644 --- a/Cargo.toml +++ b/Cargo.toml 
@@ -111,6 +111,7 @@ members = [ "version", "vote", "watchtower", + "wen-restart", "zk-keygen", "zk-token-sdk", ] @@ -370,6 +371,7 @@ solana-udp-client = { path = "udp-client", version = "=1.17.0" } solana-version = { path = "version", version = "=1.17.0" } solana-vote = { path = "vote", version = "=1.17.0" } solana-vote-program = { path = "programs/vote", version = "=1.17.0" } +solana-wen-restart = { path = "wen-restart", version = "=1.17.0" } solana-zk-keygen = { path = "zk-keygen", version = "=1.17.0" } solana-zk-token-proof-program = { path = "programs/zk-token-proof", version = "=1.17.0" } solana-zk-token-sdk = { path = "zk-token-sdk", version = "=1.17.0" } diff --git a/core/Cargo.toml b/core/Cargo.toml index fcab8ff8775912..c3923613b768a2 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -71,6 +71,7 @@ solana-turbine = { workspace = true } solana-version = { workspace = true } solana-vote = { workspace = true } solana-vote-program = { workspace = true } +solana-wen-restart = { workspace = true } strum = { workspace = true, features = ["derive"] } strum_macros = { workspace = true } sys-info = { workspace = true } diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 0fec5020d6dcb9..f30eb59a246f20 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -29,7 +29,6 @@ use { }, rewards_recorder_service::{RewardsMessage, RewardsRecorderSender}, unfrozen_gossip_verified_vote_hashes::UnfrozenGossipVerifiedVoteHashes, - validator::ProcessBlockStore, voting_service::VoteOp, window_service::DuplicateSlotReceiver, }, @@ -483,7 +482,7 @@ impl ReplayStage { ledger_signal_receiver: Receiver, duplicate_slots_receiver: DuplicateSlotReceiver, poh_recorder: Arc>, - maybe_process_blockstore: Option, + maybe_tower: Option, vote_tracker: Arc, cluster_slots: Arc, retransmit_slots_sender: Sender, @@ -502,13 +501,15 @@ impl ReplayStage { banking_tracer: Arc, popular_pruned_forks_receiver: PopularPrunedForksReceiver, ) -> Result { - let mut tower 
= if let Some(process_blockstore) = maybe_process_blockstore { - let tower = process_blockstore.process_to_create_tower()?; - info!("Tower state: {:?}", tower); - tower - } else { - warn!("creating default tower...."); - Tower::default() + let mut tower = match maybe_tower { + Some(tower) => { + info!("Tower state: {:?}", tower); + tower + } + None => { + warn!("creating default tower...."); + Tower::default() + } }; let ReplayStageConfig { diff --git a/core/src/tvu.rs b/core/src/tvu.rs index 0b8358863fbceb..aee7ecc2146a15 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -11,7 +11,7 @@ use { }, cluster_slots_service::{cluster_slots::ClusterSlots, ClusterSlotsService}, completed_data_sets_service::CompletedDataSetsSender, - consensus::tower_storage::TowerStorage, + consensus::{tower_storage::TowerStorage, Tower}, cost_update_service::CostUpdateService, drop_bank_service::DropBankService, ledger_cleanup_service::LedgerCleanupService, @@ -19,7 +19,6 @@ use { replay_stage::{ReplayStage, ReplayStageConfig}, rewards_recorder_service::RewardsRecorderSender, shred_fetch_stage::ShredFetchStage, - validator::ProcessBlockStore, voting_service::VotingService, warm_quic_cache_service::WarmQuicCacheService, window_service::WindowService, @@ -109,7 +108,7 @@ impl Tvu { ledger_signal_receiver: Receiver, rpc_subscriptions: &Arc, poh_recorder: &Arc>, - maybe_process_block_store: Option, + maybe_tower: Option, tower_storage: Arc, leader_schedule_cache: &Arc, exit: Arc, @@ -292,7 +291,7 @@ impl Tvu { ledger_signal_receiver, duplicate_slots_receiver, poh_recorder.clone(), - maybe_process_block_store, + maybe_tower, vote_tracker, cluster_slots, retransmit_slots_sender, diff --git a/core/src/validator.rs b/core/src/validator.rs index a0c39da764239b..0054d4a125417b 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -119,6 +119,7 @@ use { solana_streamer::{socket::SocketAddrSpace, streamer::StakedNodes}, solana_turbine::{self, broadcast_stage::BroadcastStageType}, 
solana_vote_program::vote_state, + solana_wen_restart::wen_restart::wait_for_wen_restart, std::{ collections::{HashMap, HashSet}, net::SocketAddr, @@ -259,6 +260,7 @@ pub struct ValidatorConfig { pub block_production_method: BlockProductionMethod, pub generator_config: Option, pub use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup, + pub wen_restart_proto_path: Option, } impl Default for ValidatorConfig { @@ -326,6 +328,7 @@ impl Default for ValidatorConfig { block_production_method: BlockProductionMethod::default(), generator_config: None, use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup::default(), + wen_restart_proto_path: None, } } } @@ -1202,6 +1205,10 @@ impl Validator { ) .unwrap(); + let in_wen_restart = config.wen_restart_proto_path.is_some() && !waited_for_supermajority; + let tower = process_blockstore.process_to_create_tower()?; + let last_vote = tower.last_vote(); + let (replay_vote_sender, replay_vote_receiver) = unbounded(); let tvu = Tvu::new( vote_account, @@ -1218,7 +1225,7 @@ impl Validator { ledger_signal_receiver, &rpc_subscriptions, &poh_recorder, - Some(process_blockstore), + Some(tower), config.tower_storage.clone(), &leader_schedule_cache, exit.clone(), @@ -1257,6 +1264,24 @@ impl Validator { repair_quic_endpoint_sender, )?; + if in_wen_restart { + info!("Waiting for wen_restart phase one to finish"); + match wait_for_wen_restart( + &config.wen_restart_proto_path.clone().unwrap(), + last_vote, + blockstore.clone(), + cluster_info.clone(), + ) { + Ok(()) => { + return Err( + "wen_restart phase one completed, will restart to wait for supermajority" + .to_string(), + ); + } + Err(e) => return Err(format!("wait_for_wen_restart failed: {e:?}")), + }; + } + let tpu = Tpu::new( &cluster_info, &poh_recorder, diff --git a/local-cluster/src/validator_configs.rs b/local-cluster/src/validator_configs.rs index 70211b5dac666b..d480dc2653567e 100644 --- a/local-cluster/src/validator_configs.rs +++ 
b/local-cluster/src/validator_configs.rs @@ -68,6 +68,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig { block_production_method: config.block_production_method.clone(), generator_config: config.generator_config.clone(), use_snapshot_archives_at_startup: config.use_snapshot_archives_at_startup, + wen_restart_proto_path: config.wen_restart_proto_path.clone(), } } diff --git a/validator/src/cli.rs b/validator/src/cli.rs index 1fbe16cec77639..466c4968ccb1e3 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -1374,6 +1374,25 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .possible_values(BlockProductionMethod::cli_names()) .help(BlockProductionMethod::cli_message()) ) + .arg( + Arg::with_name("wen_restart") + .long("wen-restart") + .value_name("DIR") + .takes_value(true) + .required(false) + .default_value(&default_args.wen_restart_path) + .conflicts_with("wait_for_supermajority") + .help( + "When specified, make validator enter Wen Restart, where it doesn't + vote, create new blocks, or transmit new blocks. The only thing it + does is Gossip last vote information with other validators in Wen + Restart and figure out whether consensus can be reached to proceed + into a cluster restart. + The progress will be saved in the file location provided. When all is + done, exit the validator and use the progress and snapshot generated + previously to enter wait_for_supermajority mode automatically. 
+ ") + ) .args(&get_deprecated_arguments()) .after_help("The default subcommand is run") .subcommand( @@ -1923,6 +1942,8 @@ pub struct DefaultArgs { pub wait_for_restart_window_max_delinquent_stake: String, pub banking_trace_dir_byte_limit: String, + + pub wen_restart_path: String, } impl DefaultArgs { @@ -2001,6 +2022,7 @@ impl DefaultArgs { wait_for_restart_window_min_idle_time: "10".to_string(), wait_for_restart_window_max_delinquent_stake: "5".to_string(), banking_trace_dir_byte_limit: BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT.to_string(), + wen_restart_path: "wen_restart_progress.proto".to_string(), } } } diff --git a/wen-restart/Cargo.toml b/wen-restart/Cargo.toml new file mode 100644 index 00000000000000..b67e56b026f6d3 --- /dev/null +++ b/wen-restart/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "solana-wen-restart" +description = "Automatically repair and restart protocol" +documentation = "https://github.com/solana-foundation/solana-improvement-documents/pull/46" +version = { workspace = true } +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +license = { workspace = true } +edition = { workspace = true } +publish = false + +[dependencies] +log = { workspace = true } +prost = { workspace = true } +prost-types = { workspace = true } +solana-gossip = { workspace = true } +solana-ledger = { workspace = true } +solana-logger = { workspace = true } +solana-program = { workspace = true } +solana-runtime = { workspace = true } +solana-sdk = { workspace = true } +solana-vote-program = { workspace = true } + +[dev-dependencies] +serial_test = { workspace = true } +solana-entry = { workspace = true } +solana-streamer = { workspace = true } + +[build-dependencies] +rustc_version = { workspace = true } +prost-build = "0.11.4" + +# windows users should install the protobuf compiler manually and set the PROTOC +# envar to point to the installed binary +[target."cfg(not(windows))".build-dependencies] +protobuf-src = { 
workspace = true } + +[lib] +name = "solana_wen_restart" + +[package.metadata.docs.rs] +targets = ["x86_64-unknown-linux-gnu"] diff --git a/wen-restart/build.rs b/wen-restart/build.rs new file mode 100644 index 00000000000000..a5a44bfee11cdd --- /dev/null +++ b/wen-restart/build.rs @@ -0,0 +1,41 @@ +extern crate rustc_version; + +use { + rustc_version::{version_meta, Channel}, + std::io::Result, +}; + +fn main() -> Result<()> { + const PROTOC_ENVAR: &str = "PROTOC"; + if std::env::var(PROTOC_ENVAR).is_err() { + #[cfg(not(windows))] + std::env::set_var(PROTOC_ENVAR, protobuf_src::protoc()); + } + + // Copied and adapted from + // https://github.com/Kimundi/rustc-version-rs/blob/1d692a965f4e48a8cb72e82cda953107c0d22f47/README.md#example + // Licensed under Apache-2.0 + MIT + match version_meta().unwrap().channel { + Channel::Stable => { + println!("cargo:rustc-cfg=RUSTC_WITHOUT_SPECIALIZATION"); + } + Channel::Beta => { + println!("cargo:rustc-cfg=RUSTC_WITHOUT_SPECIALIZATION"); + } + Channel::Nightly => { + println!("cargo:rustc-cfg=RUSTC_WITH_SPECIALIZATION"); + } + Channel::Dev => { + println!("cargo:rustc-cfg=RUSTC_WITH_SPECIALIZATION"); + // See https://github.com/solana-labs/solana/issues/11055 + // We may be running the custom `rust-bpf-builder` toolchain, + // which currently needs `#![feature(proc_macro_hygiene)]` to + // be applied. + println!("cargo:rustc-cfg=RUSTC_NEEDS_PROC_MACRO_HYGIENE"); + } + } + + // Generate rust files from protos. 
+ prost_build::compile_protos(&["protos/wen_restart.proto"], &["protos/"])?; + Ok(()) +} diff --git a/wen-restart/protos/wen_restart.proto b/wen-restart/protos/wen_restart.proto new file mode 100644 index 00000000000000..fe5dfd98629b88 --- /dev/null +++ b/wen-restart/protos/wen_restart.proto @@ -0,0 +1,24 @@ +syntax = "proto3"; +package solana.wen_restart_proto; + +message WenRestartProgress { + enum State { + INIT = 0; + LAST_VOTED_FORK_SLOTS = 1; + HEAVIEST_FORK = 2; + GENERATING_SNAPSHOT = 3; + FINISHED_SNAPSHOT = 4; + WAITING_FOR_SUPERMAJORITY = 5; + DONE = 6; + } + + message InitRecord { + uint64 last_vote_slot = 1; + string last_vote_bankhash = 2; + uint32 shred_version = 3; + + } + + State state = 1; + optional InitRecord init_record = 2; +} \ No newline at end of file diff --git a/wen-restart/src/lib.rs b/wen-restart/src/lib.rs new file mode 100644 index 00000000000000..e58a6d04bf831f --- /dev/null +++ b/wen-restart/src/lib.rs @@ -0,0 +1,7 @@ +pub(crate) mod solana { + pub(crate) mod wen_restart_proto { + include!(concat!(env!("OUT_DIR"), "/solana.wen_restart_proto.rs")); + } +} + +pub mod wen_restart; diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs new file mode 100644 index 00000000000000..927106e5739002 --- /dev/null +++ b/wen-restart/src/wen_restart.rs @@ -0,0 +1,151 @@ +//! The `wen-restart` module handles automatically repair in cluster restart + +use { + crate::solana::wen_restart_proto, + log::*, + prost::Message, + solana_gossip::cluster_info::ClusterInfo, + solana_ledger::blockstore::Blockstore, + solana_vote_program::vote_state::VoteTransaction, + std::{ + fs::File, + io::{Error, Write}, + path::PathBuf, + sync::Arc, + }, +}; + +// The number of ancestor slots sent is hard coded at 81000, because that's +// 400ms * 81000 = 9 hours, we assume most restart decisions to be made in 9 +// hours. 
+const MAX_SLOTS_ON_VOTED_FORKS: u32 = 81000; + +pub fn wait_for_wen_restart( + wen_restart_path: &PathBuf, + last_vote: VoteTransaction, + blockstore: Arc, + cluster_info: Arc, +) -> Result<(), Box> { + // repair and restart option does not work without last voted slot. + let last_vote_slot = last_vote.last_voted_slot().unwrap(); + let mut last_vote_fork = vec![last_vote_slot]; + let mut slot = last_vote_slot; + for _ in 0..MAX_SLOTS_ON_VOTED_FORKS { + match blockstore.meta(slot) { + Ok(Some(slot_meta)) => { + match slot_meta.parent_slot { + Some(parent_slot) => { + last_vote_fork.push(parent_slot); + slot = parent_slot; + } + None => break, + }; + } + _ => break, + } + } + info!( + "wen_restart last voted fork {} {:?}", + last_vote_slot, last_vote_fork + ); + last_vote_fork.sort(); + // Todo(wen): add the following back in after Gossip code is checked in. + // cluster_info.push_last_voted_fork_slots(&last_voted_fork, last_vote.hash()); + // The rest of the protocol will be in another PR. 
+ let cur_progress = wen_restart_proto::WenRestartProgress { + state: wen_restart_proto::wen_restart_progress::State::Init.into(), + init_record: Some(wen_restart_proto::wen_restart_progress::InitRecord { + last_vote_slot, + last_vote_bankhash: last_vote.hash().to_string(), + shred_version: cluster_info.my_shred_version() as u32, + }), + }; + write_wen_restart_records(wen_restart_path, cur_progress)?; + Ok(()) +} + +fn write_wen_restart_records( + records_path: &PathBuf, + new_progress: wen_restart_proto::WenRestartProgress, +) -> Result<(), Error> { + // overwrite anything if exists + let mut file = File::create(records_path)?; + info!("writing new record {:?}", new_progress); + let mut buf = Vec::new(); + buf.reserve(new_progress.encoded_len()); + new_progress.encode(&mut buf)?; + file.write_all(&buf)?; + Ok(()) +} +#[cfg(test)] +mod tests { + use { + crate::wen_restart::*, + solana_entry::entry, + solana_gossip::{cluster_info::ClusterInfo, contact_info::ContactInfo}, + solana_ledger::{blockstore, get_tmp_ledger_path_auto_delete}, + solana_program::{hash::Hash, vote::state::Vote}, + solana_sdk::{ + signature::{Keypair, Signer}, + timing::timestamp, + }, + solana_streamer::socket::SocketAddrSpace, + std::{fs::read, sync::Arc}, + }; + + #[test] + fn test_wen_restart_normal_flow() { + solana_logger::setup(); + let node_keypair = Arc::new(Keypair::new()); + let cluster_info = Arc::new(ClusterInfo::new( + { + let mut contact_info = + ContactInfo::new_localhost(&node_keypair.pubkey(), timestamp()); + contact_info.set_shred_version(2); + contact_info + }, + node_keypair, + SocketAddrSpace::Unspecified, + )); + let ledger_path = get_tmp_ledger_path_auto_delete!(); + let mut wen_restart_proto_path = ledger_path.path().to_path_buf(); + wen_restart_proto_path.push("wen_restart_status.proto"); + let blockstore = Arc::new(blockstore::Blockstore::open(ledger_path.path()).unwrap()); + let last_vote_slot = 400; + for i in 0..last_vote_slot { + let entries = 
entry::create_ticks(1, 0, Hash::default()); + let shreds = blockstore::entries_to_test_shreds( + &entries, + i + 1, + i, + false, + 0, + true, // merkle_variant + ); + blockstore.insert_shreds(shreds, None, false).unwrap(); + } + let last_vote_bankhash = Hash::new_unique(); + assert!(wait_for_wen_restart( + &wen_restart_proto_path, + VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash.clone())), + blockstore, + cluster_info + ) + .is_ok()); + let buffer = read(wen_restart_proto_path).unwrap(); + let progress = + wen_restart_proto::WenRestartProgress::decode(&mut std::io::Cursor::new(buffer)) + .unwrap(); + assert_eq!( + progress, + wen_restart_proto::WenRestartProgress { + state: wen_restart_proto::wen_restart_progress::State::Init.into(), + init_record: Some(wen_restart_proto::wen_restart_progress::InitRecord { + last_vote_slot, + last_vote_bankhash: last_vote_bankhash.to_string(), + shred_version: 2, + }), + } + ) + } +} From d575ff8fb285882fb75eef382cb8807030bd37e8 Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 00:12:42 -0700 Subject: [PATCH 02/20] Update lock file. 
--- programs/sbf/Cargo.lock | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index f0d50d53911c8a..639d21eb43b340 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -4839,6 +4839,7 @@ dependencies = [ "solana-version", "solana-vote", "solana-vote-program", + "solana-wen-restart", "strum", "strum_macros", "sys-info", @@ -6487,6 +6488,25 @@ dependencies = [ "thiserror", ] +[[package]] +name = "solana-wen-restart" +version = "1.17.0" +dependencies = [ + "log", + "prost", + "prost-build", + "prost-types", + "protobuf-src", + "rustc_version", + "solana-gossip", + "solana-ledger", + "solana-logger", + "solana-program", + "solana-runtime", + "solana-sdk", + "solana-vote-program", +] + [[package]] name = "solana-zk-token-proof-program" version = "1.17.0" From 652f7240983e6975333d598db0ba5d9094718632 Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 00:26:19 -0700 Subject: [PATCH 03/20] Fix linter errors. 
--- wen-restart/src/wen_restart.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 927106e5739002..2ba89d8be82ec2 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -71,8 +71,7 @@ fn write_wen_restart_records( // overwrite anything if exists let mut file = File::create(records_path)?; info!("writing new record {:?}", new_progress); - let mut buf = Vec::new(); - buf.reserve(new_progress.encoded_len()); + let mut buf = Vec::with_capacity(new_progress.encoded_len()); new_progress.encode(&mut buf)?; file.write_all(&buf)?; Ok(()) @@ -127,7 +126,7 @@ mod tests { let last_vote_bankhash = Hash::new_unique(); assert!(wait_for_wen_restart( &wen_restart_proto_path, - VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash.clone())), + VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)), blockstore, cluster_info ) From d9e2f93c6d6914364c84bb6a69844fadc023b627 Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 00:42:06 -0700 Subject: [PATCH 04/20] Fix depencies order. --- wen-restart/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wen-restart/Cargo.toml b/wen-restart/Cargo.toml index b67e56b026f6d3..496a14045201a1 100644 --- a/wen-restart/Cargo.toml +++ b/wen-restart/Cargo.toml @@ -28,8 +28,8 @@ solana-entry = { workspace = true } solana-streamer = { workspace = true } [build-dependencies] -rustc_version = { workspace = true } prost-build = "0.11.4" +rustc_version = { workspace = true } # windows users should install the protobuf compiler manually and set the PROTOC # envar to point to the installed binary From baebed81ec51099235c896722df0b34fb73fcdca Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 12:33:18 -0700 Subject: [PATCH 05/20] Update wen_restart explanation and small fixes. 
--- validator/src/cli.rs | 25 +++++++++++++++++-------- wen-restart/src/wen_restart.rs | 6 ++++-- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/validator/src/cli.rs b/validator/src/cli.rs index e4ad7e3d5fa38d..5c0fe1e0d25b15 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -1382,14 +1382,23 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .default_value(&default_args.wen_restart_path) .conflicts_with("wait_for_supermajority") .help( - "When specified, make validator enter Wen Restart, where it doesn't - vote, create new blocks, or transmit new blocks. The only thing it - does is Gossip last vote information with other validators in Wen - Restart and figure out whether consensus can be reached to proceed - into a cluster restart. - The progress will be saved in the file location provided. When all is - done, exit the validator and use the progress and snapshot generated - previously to enter wait_for_supermajority mode automatically. + "When specified, the validator will enter Wen Restart mode which + pauses normal activity. Validators in this mode will gossip last + vote to reach consensus on a safe restart slot and repair all blocks + on the selected fork. The safe slot will be a descendant of the latest + optimistically confirmed slot to ensure we do not roll back any + optimistically confirmed slots. + + The progress in this mode will be saved in the file location provided. + If consensus is reached, the validator will automatically exit and then + execute wait_for_supermajority logic so the cluster will resume execution. + + After the cluster resumes normal operation, the validator arguments can + be adjusted to remove --wen_restart and update expected_shred_version to + the new shred_version agreed on in the consensus. + + If wen_restart fails, refer to the progress file (in proto3 format) for + further debuggin. 
") ) .args(&get_deprecated_arguments()) diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 2ba89d8be82ec2..33ce4841bba18b 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -27,7 +27,9 @@ pub fn wait_for_wen_restart( cluster_info: Arc, ) -> Result<(), Box> { // repair and restart option does not work without last voted slot. - let last_vote_slot = last_vote.last_voted_slot().unwrap(); + let last_vote_slot = last_vote + .last_voted_slot() + .expect("wen_restart doesn't work if local tower is wiped"); let mut last_vote_fork = vec![last_vote_slot]; let mut slot = last_vote_slot; for _ in 0..MAX_SLOTS_ON_VOTED_FORKS { @@ -48,7 +50,7 @@ pub fn wait_for_wen_restart( "wen_restart last voted fork {} {:?}", last_vote_slot, last_vote_fork ); - last_vote_fork.sort(); + last_vote_fork.reverse(); // Todo(wen): add the following back in after Gossip code is checked in. // cluster_info.push_last_voted_fork_slots(&last_voted_fork, last_vote.hash()); // The rest of the protocol will be in another PR. From 57c922800a890f4e9f88269696b2052eadd24176 Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 14:01:13 -0700 Subject: [PATCH 06/20] Generate tower outside tvu. 
--- core/src/replay_stage.rs | 13 +------------ core/src/tvu.rs | 6 +++--- core/src/validator.rs | 16 ++++++++++++++-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index f30eb59a246f20..e69bb079b8b9c2 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -482,7 +482,7 @@ impl ReplayStage { ledger_signal_receiver: Receiver, duplicate_slots_receiver: DuplicateSlotReceiver, poh_recorder: Arc>, - maybe_tower: Option, + mut tower: Tower, vote_tracker: Arc, cluster_slots: Arc, retransmit_slots_sender: Sender, @@ -501,17 +501,6 @@ impl ReplayStage { banking_tracer: Arc, popular_pruned_forks_receiver: PopularPrunedForksReceiver, ) -> Result { - let mut tower = match maybe_tower { - Some(tower) => { - info!("Tower state: {:?}", tower); - tower - } - None => { - warn!("creating default tower...."); - Tower::default() - } - }; - let ReplayStageConfig { vote_account, authorized_voter_keypairs, diff --git a/core/src/tvu.rs b/core/src/tvu.rs index aee7ecc2146a15..ec444ae4403d7e 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -108,7 +108,7 @@ impl Tvu { ledger_signal_receiver: Receiver, rpc_subscriptions: &Arc, poh_recorder: &Arc>, - maybe_tower: Option, + tower: Tower, tower_storage: Arc, leader_schedule_cache: &Arc, exit: Arc, @@ -291,7 +291,7 @@ impl Tvu { ledger_signal_receiver, duplicate_slots_receiver, poh_recorder.clone(), - maybe_tower, + tower, vote_tracker, cluster_slots, retransmit_slots_sender, @@ -462,7 +462,7 @@ pub mod tests { OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks), )), &poh_recorder, - None, + Tower::default(), Arc::new(FileTowerStorage::default()), &leader_schedule_cache, exit.clone(), diff --git a/core/src/validator.rs b/core/src/validator.rs index 0054d4a125417b..f46a688c6eaabd 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -1206,7 +1206,19 @@ impl Validator { .unwrap(); let in_wen_restart = 
config.wen_restart_proto_path.is_some() && !waited_for_supermajority; - let tower = process_blockstore.process_to_create_tower()?; + let tower = match process_blockstore.process_to_create_tower() { + Ok(tower) => { + info!("Tower state: {:?}", tower); + tower + } + Err(e) => { + warn!( + "Unable to retrieve tower: {:?} creating default tower....", + e + ); + Tower::default() + } + }; let last_vote = tower.last_vote(); let (replay_vote_sender, replay_vote_receiver) = unbounded(); @@ -1225,7 +1237,7 @@ impl Validator { ledger_signal_receiver, &rpc_subscriptions, &poh_recorder, - Some(tower), + tower, config.tower_storage.clone(), &leader_schedule_cache, exit.clone(), From 3ac0a02f8892f6bd79276e0856573353cdbf1030 Mon Sep 17 00:00:00 2001 From: Wen <113942165+wen-coding@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:31:32 -0700 Subject: [PATCH 07/20] Update validator/src/cli.rs Co-authored-by: Tyera --- validator/src/cli.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/src/cli.rs b/validator/src/cli.rs index 5c0fe1e0d25b15..0b55cab1310efb 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -1398,7 +1398,7 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { the new shred_version agreed on in the consensus. If wen_restart fails, refer to the progress file (in proto3 format) for - further debuggin. + further debugging. 
") ) .args(&get_deprecated_arguments()) From fd7e15771fda3112831bd2c30a15d80ebd540e1f Mon Sep 17 00:00:00 2001 From: Wen <113942165+wen-coding@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:32:08 -0700 Subject: [PATCH 08/20] Update wen-restart/protos/wen_restart.proto Co-authored-by: Tyera --- wen-restart/protos/wen_restart.proto | 31 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/wen-restart/protos/wen_restart.proto b/wen-restart/protos/wen_restart.proto index fe5dfd98629b88..43aec7c4bd6a15 100644 --- a/wen-restart/protos/wen_restart.proto +++ b/wen-restart/protos/wen_restart.proto @@ -1,24 +1,23 @@ syntax = "proto3"; package solana.wen_restart_proto; -message WenRestartProgress { - enum State { - INIT = 0; - LAST_VOTED_FORK_SLOTS = 1; - HEAVIEST_FORK = 2; - GENERATING_SNAPSHOT = 3; - FINISHED_SNAPSHOT = 4; - WAITING_FOR_SUPERMAJORITY = 5; - DONE = 6; - } - - message InitRecord { - uint64 last_vote_slot = 1; - string last_vote_bankhash = 2; - uint32 shred_version = 3; +enum State { + INIT = 0; + LAST_VOTED_FORK_SLOTS = 1; + HEAVIEST_FORK = 2; + GENERATING_SNAPSHOT = 3; + FINISHED_SNAPSHOT = 4; + WAITING_FOR_SUPERMAJORITY = 5; + DONE = 6; +} - } +message InitRecord { + uint64 last_vote_slot = 1; + string last_vote_bankhash = 2; + uint32 shred_version = 3; +} +message WenRestartProgress { State state = 1; optional InitRecord init_record = 2; } \ No newline at end of file From 84ee985e68a19cc4a0726a5ad876714860fc2687 Mon Sep 17 00:00:00 2001 From: Wen <113942165+wen-coding@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:32:58 -0700 Subject: [PATCH 09/20] Update wen-restart/build.rs Co-authored-by: Tyera --- wen-restart/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wen-restart/build.rs b/wen-restart/build.rs index a5a44bfee11cdd..4360117bb445d4 100644 --- a/wen-restart/build.rs +++ b/wen-restart/build.rs @@ -36,6 +36,6 @@ fn main() -> Result<()> { } // Generate rust files from protos. 
- prost_build::compile_protos(&["protos/wen_restart.proto"], &["protos/"])?; + prost_build::compile_protos(&["proto/wen_restart.proto"], &["proto/"])?; Ok(()) } From 690cef70f9009b319ab8ab888447fda68285b191 Mon Sep 17 00:00:00 2001 From: Wen <113942165+wen-coding@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:34:31 -0700 Subject: [PATCH 10/20] Update wen-restart/src/wen_restart.rs Co-authored-by: Tyera --- wen-restart/src/wen_restart.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 33ce4841bba18b..25a1fad97b04fe 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -54,9 +54,9 @@ pub fn wait_for_wen_restart( // Todo(wen): add the following back in after Gossip code is checked in. // cluster_info.push_last_voted_fork_slots(&last_voted_fork, last_vote.hash()); // The rest of the protocol will be in another PR. - let cur_progress = wen_restart_proto::WenRestartProgress { - state: wen_restart_proto::wen_restart_progress::State::Init.into(), - init_record: Some(wen_restart_proto::wen_restart_progress::InitRecord { + let current_progress = WenRestartProgress { + state: RestartState::Init.into(), + init_record: Some(InitRecord { last_vote_slot, last_vote_bankhash: last_vote.hash().to_string(), shred_version: cluster_info.my_shred_version() as u32, From 8c0c04ad5da28c6a53574d29cee495479e54a5b0 Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 16:34:53 -0700 Subject: [PATCH 11/20] Rename proto directory. 
--- wen-restart/{protos => proto}/wen_restart.proto | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename wen-restart/{protos => proto}/wen_restart.proto (100%) diff --git a/wen-restart/protos/wen_restart.proto b/wen-restart/proto/wen_restart.proto similarity index 100% rename from wen-restart/protos/wen_restart.proto rename to wen-restart/proto/wen_restart.proto From bcf99423a219aa7973be0a471f64e06f567d56c5 Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 16:50:47 -0700 Subject: [PATCH 12/20] Rename InitRecord to MyLastVotedForkSlots, add imports. --- wen-restart/proto/wen_restart.proto | 4 ++-- wen-restart/src/wen_restart.rs | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/wen-restart/proto/wen_restart.proto b/wen-restart/proto/wen_restart.proto index 43aec7c4bd6a15..1f6423462b55b0 100644 --- a/wen-restart/proto/wen_restart.proto +++ b/wen-restart/proto/wen_restart.proto @@ -11,7 +11,7 @@ enum State { DONE = 6; } -message InitRecord { +message MyLastVotedForkSlots { uint64 last_vote_slot = 1; string last_vote_bankhash = 2; uint32 shred_version = 3; @@ -19,5 +19,5 @@ message InitRecord { message WenRestartProgress { State state = 1; - optional InitRecord init_record = 2; + optional MyLastVotedForkSlots my_last_voted_fork_slots = 2; } \ No newline at end of file diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 25a1fad97b04fe..91e9a8a48c33de 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -1,7 +1,9 @@ //! The `wen-restart` module handles automatically repair in cluster restart use { - crate::solana::wen_restart_proto, + crate::solana::wen_restart_proto::{ + MyLastVotedForkSlots, State as RestartState, WenRestartProgress, + }, log::*, prost::Message, solana_gossip::cluster_info::ClusterInfo, @@ -56,19 +58,19 @@ pub fn wait_for_wen_restart( // The rest of the protocol will be in another PR. 
let current_progress = WenRestartProgress { state: RestartState::Init.into(), - init_record: Some(InitRecord { + my_last_voted_fork_slots: Some(MyLastVotedForkSlots { last_vote_slot, last_vote_bankhash: last_vote.hash().to_string(), shred_version: cluster_info.my_shred_version() as u32, }), }; - write_wen_restart_records(wen_restart_path, cur_progress)?; + write_wen_restart_records(wen_restart_path, current_progress)?; Ok(()) } fn write_wen_restart_records( records_path: &PathBuf, - new_progress: wen_restart_proto::WenRestartProgress, + new_progress: WenRestartProgress, ) -> Result<(), Error> { // overwrite anything if exists let mut file = File::create(records_path)?; @@ -134,14 +136,12 @@ mod tests { ) .is_ok()); let buffer = read(wen_restart_proto_path).unwrap(); - let progress = - wen_restart_proto::WenRestartProgress::decode(&mut std::io::Cursor::new(buffer)) - .unwrap(); + let progress = WenRestartProgress::decode(&mut std::io::Cursor::new(buffer)).unwrap(); assert_eq!( progress, - wen_restart_proto::WenRestartProgress { - state: wen_restart_proto::wen_restart_progress::State::Init.into(), - init_record: Some(wen_restart_proto::wen_restart_progress::InitRecord { + WenRestartProgress { + state: RestartState::Init.into(), + my_last_voted_fork_slots: Some(MyLastVotedForkSlots { last_vote_slot, last_vote_bankhash: last_vote_bankhash.to_string(), shred_version: 2, From d5375bd5ae998f0e5eee8a2f4e6a2e8eb45fc625 Mon Sep 17 00:00:00 2001 From: Wen <113942165+wen-coding@users.noreply.github.com> Date: Thu, 21 Sep 2023 20:24:35 -0700 Subject: [PATCH 13/20] Update wen-restart/Cargo.toml Co-authored-by: Tyera --- wen-restart/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wen-restart/Cargo.toml b/wen-restart/Cargo.toml index 496a14045201a1..48d816c311b068 100644 --- a/wen-restart/Cargo.toml +++ b/wen-restart/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "solana-wen-restart" -description = "Automatically repair and restart protocol" +description 
= "Automatic repair and restart protocol" documentation = "https://github.com/solana-foundation/solana-improvement-documents/pull/46" version = { workspace = true } authors = { workspace = true } From ab88c0ab6dd13528cddda1e5c425107f3f68d2ae Mon Sep 17 00:00:00 2001 From: Wen <113942165+wen-coding@users.noreply.github.com> Date: Thu, 21 Sep 2023 20:25:06 -0700 Subject: [PATCH 14/20] Update wen-restart/src/wen_restart.rs Co-authored-by: Tyera --- wen-restart/src/wen_restart.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 91e9a8a48c33de..6386a794beb38c 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -1,4 +1,4 @@ -//! The `wen-restart` module handles automatically repair in cluster restart +//! The `wen-restart` module handles automatic repair during a cluster restart use { crate::solana::wen_restart_proto::{ From 6c5c8ec91ea708c1015acc6fd46946c94861ad64 Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 21 Sep 2023 20:29:45 -0700 Subject: [PATCH 15/20] Move prost-build dependency to project toml. 
--- Cargo.toml | 1 + wen-restart/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 45ab472a30a610..6ead9f34b81381 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -262,6 +262,7 @@ pretty-hex = "0.3.0" proc-macro2 = "1.0.67" proptest = "1.2" prost = "0.11.9" +prost-build = "0.11.9" prost-types = "0.11.9" protobuf-src = "1.1.0" qstring = "0.7.2" diff --git a/wen-restart/Cargo.toml b/wen-restart/Cargo.toml index 48d816c311b068..b74871801872af 100644 --- a/wen-restart/Cargo.toml +++ b/wen-restart/Cargo.toml @@ -28,7 +28,7 @@ solana-entry = { workspace = true } solana-streamer = { workspace = true } [build-dependencies] -prost-build = "0.11.4" +prost-build = { workspace = true } rustc_version = { workspace = true } # windows users should install the protobuf compiler manually and set the PROTOC From 72ada79b0faf8dc4c1a0b3c82c344245744d2bd4 Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 1 Oct 2023 22:28:49 -0700 Subject: [PATCH 16/20] No need to continue if the distance between slot and last_vote is already larger than MAX_SLOTS_ON_VOTED_FORKS. --- wen-restart/src/wen_restart.rs | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 6386a794beb38c..44d9d27a199040 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -20,7 +20,7 @@ use { // The number of ancestor slots sent is hard coded at 81000, because that's // 400ms * 81000 = 9 hours, we assume most restart decisions to be made in 9 // hours. 
-const MAX_SLOTS_ON_VOTED_FORKS: u32 = 81000; +const MAX_SLOTS_ON_VOTED_FORKS: u64 = 81000; pub fn wait_for_wen_restart( wen_restart_path: &PathBuf, @@ -41,6 +41,9 @@ pub fn wait_for_wen_restart( Some(parent_slot) => { last_vote_fork.push(parent_slot); slot = parent_slot; + if last_vote_slot.saturating_sub(slot) > MAX_SLOTS_ON_VOTED_FORKS { + break; + } } None => break, }; @@ -114,19 +117,37 @@ mod tests { let mut wen_restart_proto_path = ledger_path.path().to_path_buf(); wen_restart_proto_path.push("wen_restart_status.proto"); let blockstore = Arc::new(blockstore::Blockstore::open(ledger_path.path()).unwrap()); - let last_vote_slot = 400; - for i in 0..last_vote_slot { + let expected_slots = 400; + let last_vote_slot = MAX_SLOTS_ON_VOTED_FORKS + expected_slots; + let last_parent = MAX_SLOTS_ON_VOTED_FORKS - (std::u16::MAX as u64) + 1; + for i in 0..expected_slots { let entries = entry::create_ticks(1, 0, Hash::default()); + let parent_slot = if i > 0 { + MAX_SLOTS_ON_VOTED_FORKS + i + } else { + last_parent + }; let shreds = blockstore::entries_to_test_shreds( &entries, - i + 1, - i, + MAX_SLOTS_ON_VOTED_FORKS + i + 1, + parent_slot, false, 0, true, // merkle_variant ); blockstore.insert_shreds(shreds, None, false).unwrap(); } + // link directly to slot 1 whose distance to last_vote > MAX_SLOTS_ON_VOTED_FORKS so it will not be included. + let entries = entry::create_ticks(1, 0, Hash::default()); + let shreds = blockstore::entries_to_test_shreds( + &entries, + last_parent, + 1, + false, + 0, + true, // merkle_variant + ); + blockstore.insert_shreds(shreds, None, false).unwrap(); let last_vote_bankhash = Hash::new_unique(); assert!(wait_for_wen_restart( &wen_restart_proto_path, From caea7d71aca6a44adbc61fa01785e54195ec3ffa Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 5 Oct 2023 22:14:06 -0700 Subject: [PATCH 17/20] Use 16k slots instead of 81k slots, a few more wording changes. 
--- core/src/validator.rs | 5 +---- gossip/src/epoch_slots.rs | 2 +- validator/src/cli.rs | 3 ++- wen-restart/src/wen_restart.rs | 23 ++++++++++------------- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/core/src/validator.rs b/core/src/validator.rs index f46a688c6eaabd..79a7e353beb333 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -1285,10 +1285,7 @@ impl Validator { cluster_info.clone(), ) { Ok(()) => { - return Err( - "wen_restart phase one completed, will restart to wait for supermajority" - .to_string(), - ); + return Err("wen_restart phase one completed".to_string()); } Err(e) => return Err(format!("wait_for_wen_restart failed: {e:?}")), }; diff --git a/gossip/src/epoch_slots.rs b/gossip/src/epoch_slots.rs index dc94380b33e5de..186a17aa6ec255 100644 --- a/gossip/src/epoch_slots.rs +++ b/gossip/src/epoch_slots.rs @@ -13,7 +13,7 @@ use { }, }; -const MAX_SLOTS_PER_ENTRY: usize = 2048 * 8; +pub const MAX_SLOTS_PER_ENTRY: usize = 2048 * 8; #[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, AbiExample)] pub struct Uncompressed { pub first_slot: Slot, diff --git a/validator/src/cli.rs b/validator/src/cli.rs index 0b55cab1310efb..3e7c4ab9193385 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -1383,7 +1383,7 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .conflicts_with("wait_for_supermajority") .help( "When specified, the validator will enter Wen Restart mode which - pauses normal activity. Validators in this mode will gossip last + pauses normal activity. Validators in this mode will gossip their last vote to reach consensus on a safe restart slot and repair all blocks on the selected fork.
The safe slot will be a descendant of the latest optimistically confirmed slot to ensure we do not roll back any @@ -1392,6 +1392,7 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { The progress in this mode will be saved in the file location provided. If consensus is reached, the validator will automatically exit and then execute wait_for_supermajority logic so the cluster will resume execution. + The progress file will be kept around for future debugging. After the cluster resumes normal operation, the validator arguments can be adjusted to remove --wen_restart and update expected_shred_version to diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 44d9d27a199040..7a5b2011a2aa2c 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -6,7 +6,7 @@ use { }, log::*, prost::Message, - solana_gossip::cluster_info::ClusterInfo, + solana_gossip::{cluster_info::ClusterInfo, epoch_slots::MAX_SLOTS_PER_ENTRY}, solana_ledger::blockstore::Blockstore, solana_vote_program::vote_state::VoteTransaction, std::{ @@ -17,11 +17,6 @@ use { }, }; -// The number of ancestor slots sent is hard coded at 81000, because that's -// 400ms * 81000 = 9 hours, we assume most restart decisions to be made in 9 -// hours. 
-const MAX_SLOTS_ON_VOTED_FORKS: u64 = 81000; - pub fn wait_for_wen_restart( wen_restart_path: &PathBuf, last_vote: VoteTransaction, @@ -34,14 +29,16 @@ pub fn wait_for_wen_restart( .expect("wen_restart doesn't work if local tower is wiped"); let mut last_vote_fork = vec![last_vote_slot]; let mut slot = last_vote_slot; - for _ in 0..MAX_SLOTS_ON_VOTED_FORKS { + for _ in 0..MAX_SLOTS_PER_ENTRY { match blockstore.meta(slot) { Ok(Some(slot_meta)) => { match slot_meta.parent_slot { Some(parent_slot) => { last_vote_fork.push(parent_slot); slot = parent_slot; - if last_vote_slot.saturating_sub(slot) > MAX_SLOTS_ON_VOTED_FORKS { + if last_vote_slot.saturating_sub(slot) + > MAX_SLOTS_PER_ENTRY.try_into().unwrap() + { break; } } @@ -118,18 +115,18 @@ mod tests { wen_restart_proto_path.push("wen_restart_status.proto"); let blockstore = Arc::new(blockstore::Blockstore::open(ledger_path.path()).unwrap()); let expected_slots = 400; - let last_vote_slot = MAX_SLOTS_ON_VOTED_FORKS + expected_slots; - let last_parent = MAX_SLOTS_ON_VOTED_FORKS - (std::u16::MAX as u64) + 1; + let last_vote_slot = (MAX_SLOTS_PER_ENTRY + expected_slots).try_into().unwrap(); + let last_parent = (MAX_SLOTS_PER_ENTRY >> 1).try_into().unwrap(); for i in 0..expected_slots { let entries = entry::create_ticks(1, 0, Hash::default()); let parent_slot = if i > 0 { - MAX_SLOTS_ON_VOTED_FORKS + i + (MAX_SLOTS_PER_ENTRY + i).try_into().unwrap() } else { last_parent }; let shreds = blockstore::entries_to_test_shreds( &entries, - MAX_SLOTS_ON_VOTED_FORKS + i + 1, + (MAX_SLOTS_PER_ENTRY + i + 1).try_into().unwrap(), parent_slot, false, 0, @@ -137,7 +134,7 @@ mod tests { ); blockstore.insert_shreds(shreds, None, false).unwrap(); } - // link directly to slot 1 whose distance to last_vote > MAX_SLOTS_ON_VOTED_FORKS so it will not be included. + // link directly to slot 1 whose distance to last_vote > MAX_SLOTS_PER_ENTRY so it will not be included. 
let entries = entry::create_ticks(1, 0, Hash::default()); let shreds = blockstore::entries_to_test_shreds( &entries, From e1fb6922f5c58ce8d2e69edc190a19fa0d337aef Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 5 Oct 2023 22:59:23 -0700 Subject: [PATCH 18/20] Use AncestorIterator which does the same thing. --- wen-restart/src/wen_restart.rs | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 7a5b2011a2aa2c..75e4e21ce9431a 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -7,7 +7,7 @@ use { log::*, prost::Message, solana_gossip::{cluster_info::ClusterInfo, epoch_slots::MAX_SLOTS_PER_ENTRY}, - solana_ledger::blockstore::Blockstore, + solana_ledger::{ancestor_iterator::AncestorIterator, blockstore::Blockstore}, solana_vote_program::vote_state::VoteTransaction, std::{ fs::File, @@ -27,27 +27,9 @@ pub fn wait_for_wen_restart( let last_vote_slot = last_vote .last_voted_slot() .expect("wen_restart doesn't work if local tower is wiped"); - let mut last_vote_fork = vec![last_vote_slot]; - let mut slot = last_vote_slot; - for _ in 0..MAX_SLOTS_PER_ENTRY { - match blockstore.meta(slot) { - Ok(Some(slot_meta)) => { - match slot_meta.parent_slot { - Some(parent_slot) => { - last_vote_fork.push(parent_slot); - slot = parent_slot; - if last_vote_slot.saturating_sub(slot) - > MAX_SLOTS_PER_ENTRY.try_into().unwrap() - { - break; - } - } - None => break, - }; - } - _ => break, - } - } + let mut last_vote_fork: Vec<u64> = AncestorIterator::new_inclusive(last_vote_slot, &blockstore) + .take(MAX_SLOTS_PER_ENTRY) + .collect(); info!( "wen_restart last voted fork {} {:?}", last_vote_slot, last_vote_fork From 433a18cb34db08b5ea234c6ecd93fa82649d070f Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 5 Oct 2023 23:05:42 -0700 Subject: [PATCH 19/20] Update Cargo.lock --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock
b/Cargo.lock index 4672d25bee8c84..fc169af41b9065 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7484,7 +7484,7 @@ dependencies = [ [[package]] name = "solana-wen-restart" -version = "1.17.0" +version = "1.18.0" dependencies = [ "log", "prost", From 2a619440b671db1e4cc604946c8b3e5b6373f15e Mon Sep 17 00:00:00 2001 From: Wen Date: Thu, 5 Oct 2023 23:08:39 -0700 Subject: [PATCH 20/20] Update Cargo.lock --- programs/sbf/Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 9a9a4e211db5f0..04f57847333a87 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -6438,7 +6438,7 @@ dependencies = [ [[package]] name = "solana-wen-restart" -version = "1.17.0" +version = "1.18.0" dependencies = [ "log", "prost",