Skip to content

Commit

Permalink
Add wen_restart module (solana-labs#33344)
Browse files Browse the repository at this point in the history
* Add wen_restart module:
- Implement reading LastVotedForkSlots from blockstore.
- Add proto file to record the intermediate results.
- Also link wen_restart into validator.
- Move recreation of tower outside replay_stage so we can get last_vote.

* Update lock file.

* Fix linter errors.

* Fix depencies order.

* Update wen_restart explanation and small fixes.

* Generate tower outside tvu.

* Update validator/src/cli.rs

Co-authored-by: Tyera <teulberg@gmail.com>

* Update wen-restart/protos/wen_restart.proto

Co-authored-by: Tyera <teulberg@gmail.com>

* Update wen-restart/build.rs

Co-authored-by: Tyera <teulberg@gmail.com>

* Update wen-restart/src/wen_restart.rs

Co-authored-by: Tyera <teulberg@gmail.com>

* Rename proto directory.

* Rename InitRecord to MyLastVotedForkSlots, add imports.

* Update wen-restart/Cargo.toml

Co-authored-by: Tyera <teulberg@gmail.com>

* Update wen-restart/src/wen_restart.rs

Co-authored-by: Tyera <teulberg@gmail.com>

* Move prost-build dependency to project toml.

* No need to continue if the distance between slot and last_vote is
already larger than MAX_SLOTS_ON_VOTED_FORKS.

* Use 16k slots instead of 81k slots, a few more wording changes.

* Use AncestorIterator which does the same thing.

* Update Cargo.lock

* Update Cargo.lock

---------

Co-authored-by: Tyera <teulberg@gmail.com>
  • Loading branch information
2 people authored and deanmlittle committed Oct 8, 2023
1 parent bd3ae3f commit 832b789
Show file tree
Hide file tree
Showing 15 changed files with 387 additions and 18 deletions.
23 changes: 23 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ members = [
"version",
"vote",
"watchtower",
"wen-restart",
"zk-keygen",
"zk-token-sdk",
]
Expand Down Expand Up @@ -261,6 +262,7 @@ pretty-hex = "0.3.0"
proc-macro2 = "1.0.67"
proptest = "1.2"
prost = "0.11.9"
prost-build = "0.11.9"
prost-types = "0.11.9"
protobuf-src = "1.1.0"
qstring = "0.7.2"
Expand Down Expand Up @@ -371,6 +373,7 @@ solana-udp-client = { path = "udp-client", version = "=1.18.0" }
solana-version = { path = "version", version = "=1.18.0" }
solana-vote = { path = "vote", version = "=1.18.0" }
solana-vote-program = { path = "programs/vote", version = "=1.18.0" }
solana-wen-restart = { path = "wen-restart", version = "=1.18.0" }
solana-zk-keygen = { path = "zk-keygen", version = "=1.18.0" }
solana-zk-token-proof-program = { path = "programs/zk-token-proof", version = "=1.18.0" }
solana-zk-token-sdk = { path = "zk-token-sdk", version = "=1.18.0" }
Expand Down
1 change: 1 addition & 0 deletions core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ solana-turbine = { workspace = true }
solana-version = { workspace = true }
solana-vote = { workspace = true }
solana-vote-program = { workspace = true }
solana-wen-restart = { workspace = true }
strum = { workspace = true, features = ["derive"] }
strum_macros = { workspace = true }
sys-info = { workspace = true }
Expand Down
12 changes: 1 addition & 11 deletions core/src/replay_stage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ use {
},
rewards_recorder_service::{RewardsMessage, RewardsRecorderSender},
unfrozen_gossip_verified_vote_hashes::UnfrozenGossipVerifiedVoteHashes,
validator::ProcessBlockStore,
voting_service::VoteOp,
window_service::DuplicateSlotReceiver,
},
Expand Down Expand Up @@ -483,7 +482,7 @@ impl ReplayStage {
ledger_signal_receiver: Receiver<bool>,
duplicate_slots_receiver: DuplicateSlotReceiver,
poh_recorder: Arc<RwLock<PohRecorder>>,
maybe_process_blockstore: Option<ProcessBlockStore>,
mut tower: Tower,
vote_tracker: Arc<VoteTracker>,
cluster_slots: Arc<ClusterSlots>,
retransmit_slots_sender: Sender<Slot>,
Expand All @@ -502,15 +501,6 @@ impl ReplayStage {
banking_tracer: Arc<BankingTracer>,
popular_pruned_forks_receiver: PopularPrunedForksReceiver,
) -> Result<Self, String> {
let mut tower = if let Some(process_blockstore) = maybe_process_blockstore {
let tower = process_blockstore.process_to_create_tower()?;
info!("Tower state: {:?}", tower);
tower
} else {
warn!("creating default tower....");
Tower::default()
};

let ReplayStageConfig {
vote_account,
authorized_voter_keypairs,
Expand Down
9 changes: 4 additions & 5 deletions core/src/tvu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,14 @@ use {
},
cluster_slots_service::{cluster_slots::ClusterSlots, ClusterSlotsService},
completed_data_sets_service::CompletedDataSetsSender,
consensus::tower_storage::TowerStorage,
consensus::{tower_storage::TowerStorage, Tower},
cost_update_service::CostUpdateService,
drop_bank_service::DropBankService,
ledger_cleanup_service::LedgerCleanupService,
repair::{quic_endpoint::LocalRequest, repair_service::RepairInfo},
replay_stage::{ReplayStage, ReplayStageConfig},
rewards_recorder_service::RewardsRecorderSender,
shred_fetch_stage::ShredFetchStage,
validator::ProcessBlockStore,
voting_service::VotingService,
warm_quic_cache_service::WarmQuicCacheService,
window_service::WindowService,
Expand Down Expand Up @@ -109,7 +108,7 @@ impl Tvu {
ledger_signal_receiver: Receiver<bool>,
rpc_subscriptions: &Arc<RpcSubscriptions>,
poh_recorder: &Arc<RwLock<PohRecorder>>,
maybe_process_block_store: Option<ProcessBlockStore>,
tower: Tower,
tower_storage: Arc<dyn TowerStorage>,
leader_schedule_cache: &Arc<LeaderScheduleCache>,
exit: Arc<AtomicBool>,
Expand Down Expand Up @@ -292,7 +291,7 @@ impl Tvu {
ledger_signal_receiver,
duplicate_slots_receiver,
poh_recorder.clone(),
maybe_process_block_store,
tower,
vote_tracker,
cluster_slots,
retransmit_slots_sender,
Expand Down Expand Up @@ -463,7 +462,7 @@ pub mod tests {
OptimisticallyConfirmedBank::locked_from_bank_forks_root(&bank_forks),
)),
&poh_recorder,
None,
Tower::default(),
Arc::new(FileTowerStorage::default()),
&leader_schedule_cache,
exit.clone(),
Expand Down
36 changes: 35 additions & 1 deletion core/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ use {
solana_streamer::{socket::SocketAddrSpace, streamer::StakedNodes},
solana_turbine::{self, broadcast_stage::BroadcastStageType},
solana_vote_program::vote_state,
solana_wen_restart::wen_restart::wait_for_wen_restart,
std::{
collections::{HashMap, HashSet},
net::SocketAddr,
Expand Down Expand Up @@ -259,6 +260,7 @@ pub struct ValidatorConfig {
pub block_production_method: BlockProductionMethod,
pub generator_config: Option<GeneratorConfig>,
pub use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup,
pub wen_restart_proto_path: Option<PathBuf>,
}

impl Default for ValidatorConfig {
Expand Down Expand Up @@ -326,6 +328,7 @@ impl Default for ValidatorConfig {
block_production_method: BlockProductionMethod::default(),
generator_config: None,
use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup::default(),
wen_restart_proto_path: None,
}
}
}
Expand Down Expand Up @@ -1202,6 +1205,22 @@ impl Validator {
)
.unwrap();

let in_wen_restart = config.wen_restart_proto_path.is_some() && !waited_for_supermajority;
let tower = match process_blockstore.process_to_create_tower() {
Ok(tower) => {
info!("Tower state: {:?}", tower);
tower
}
Err(e) => {
warn!(
"Unable to retrieve tower: {:?} creating default tower....",
e
);
Tower::default()
}
};
let last_vote = tower.last_vote();

let (replay_vote_sender, replay_vote_receiver) = unbounded();
let tvu = Tvu::new(
vote_account,
Expand All @@ -1218,7 +1237,7 @@ impl Validator {
ledger_signal_receiver,
&rpc_subscriptions,
&poh_recorder,
Some(process_blockstore),
tower,
config.tower_storage.clone(),
&leader_schedule_cache,
exit.clone(),
Expand Down Expand Up @@ -1257,6 +1276,21 @@ impl Validator {
repair_quic_endpoint_sender,
)?;

if in_wen_restart {
info!("Waiting for wen_restart phase one to finish");
match wait_for_wen_restart(
&config.wen_restart_proto_path.clone().unwrap(),
last_vote,
blockstore.clone(),
cluster_info.clone(),
) {
Ok(()) => {
return Err("wen_restart phase one completedy".to_string());
}
Err(e) => return Err(format!("wait_for_wen_restart failed: {e:?}")),
};
}

let tpu = Tpu::new(
&cluster_info,
&poh_recorder,
Expand Down
2 changes: 1 addition & 1 deletion gossip/src/epoch_slots.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use {
},
};

const MAX_SLOTS_PER_ENTRY: usize = 2048 * 8;
pub const MAX_SLOTS_PER_ENTRY: usize = 2048 * 8;
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, AbiExample)]
pub struct Uncompressed {
pub first_slot: Slot,
Expand Down
1 change: 1 addition & 0 deletions local-cluster/src/validator_configs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
block_production_method: config.block_production_method.clone(),
generator_config: config.generator_config.clone(),
use_snapshot_archives_at_startup: config.use_snapshot_archives_at_startup,
wen_restart_proto_path: config.wen_restart_proto_path.clone(),
}
}

Expand Down
20 changes: 20 additions & 0 deletions programs/sbf/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions validator/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,35 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> {
.possible_values(BlockProductionMethod::cli_names())
.help(BlockProductionMethod::cli_message())
)
.arg(
Arg::with_name("wen_restart")
.long("wen-restart")
.value_name("DIR")
.takes_value(true)
.required(false)
.default_value(&default_args.wen_restart_path)
.conflicts_with("wait_for_supermajority")
.help(
"When specified, the validator will enter Wen Restart mode which
pauses normal activity. Validators in this mode will gossip their last
vote to reach consensus on a safe restart slot and repair all blocks
on the selected fork. The safe slot will be a descendant of the latest
optimistically confirmed slot to ensure we do not roll back any
optimistically confirmed slots.
The progress in this mode will be saved in the file location provided.
If consensus is reached, the validator will automatically exit and then
execute wait_for_supermajority logic so the cluster will resume execution.
The progress file will be kept around for future debugging.
After the cluster resumes normal operation, the validator arguments can
be adjusted to remove --wen_restart and update expected_shred_version to
the new shred_version agreed on in the consensus.
If wen_restart fails, refer to the progress file (in proto3 format) for
further debugging.
")
)
.args(&get_deprecated_arguments())
.after_help("The default subcommand is run")
.subcommand(
Expand Down Expand Up @@ -1931,6 +1960,8 @@ pub struct DefaultArgs {
pub wait_for_restart_window_max_delinquent_stake: String,

pub banking_trace_dir_byte_limit: String,

pub wen_restart_path: String,
}

impl DefaultArgs {
Expand Down Expand Up @@ -2009,6 +2040,7 @@ impl DefaultArgs {
wait_for_restart_window_min_idle_time: "10".to_string(),
wait_for_restart_window_max_delinquent_stake: "5".to_string(),
banking_trace_dir_byte_limit: BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT.to_string(),
wen_restart_path: "wen_restart_progress.proto".to_string(),
}
}
}
Expand Down
43 changes: 43 additions & 0 deletions wen-restart/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
[package]
name = "solana-wen-restart"
description = "Automatic repair and restart protocol"
documentation = "https://github.com/solana-foundation/solana-improvement-documents/pull/46"
version = { workspace = true }
authors = { workspace = true }
repository = { workspace = true }
homepage = { workspace = true }
license = { workspace = true }
edition = { workspace = true }
publish = false

[dependencies]
log = { workspace = true }
prost = { workspace = true }
prost-types = { workspace = true }
solana-gossip = { workspace = true }
solana-ledger = { workspace = true }
solana-logger = { workspace = true }
solana-program = { workspace = true }
solana-runtime = { workspace = true }
solana-sdk = { workspace = true }
solana-vote-program = { workspace = true }

[dev-dependencies]
serial_test = { workspace = true }
solana-entry = { workspace = true }
solana-streamer = { workspace = true }

[build-dependencies]
prost-build = { workspace = true }
rustc_version = { workspace = true }

# windows users should install the protobuf compiler manually and set the PROTOC
# envar to point to the installed binary
[target."cfg(not(windows))".build-dependencies]
protobuf-src = { workspace = true }

[lib]
name = "solana_wen_restart"

[package.metadata.docs.rs]
targets = ["x86_64-unknown-linux-gnu"]
Loading

0 comments on commit 832b789

Please sign in to comment.