Skip to content

Commit

Permalink
Checks if bank snapshot is loadable before fastbooting (solana-labs#343)
Browse files Browse the repository at this point in the history
  • Loading branch information
brooksprumo authored Mar 28, 2024
1 parent b1919bd commit 182d27f
Show file tree
Hide file tree
Showing 5 changed files with 430 additions and 72 deletions.
35 changes: 30 additions & 5 deletions core/src/accounts_hash_verifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use {
hash::Hash,
},
std::{
io::{Error as IoError, Result as IoResult},
sync::{
atomic::{AtomicBool, Ordering},
Arc,
Expand Down Expand Up @@ -71,12 +72,17 @@ impl AccountsHashVerifier {
info!("handling accounts package: {accounts_package:?}");
let enqueued_time = accounts_package.enqueued.elapsed();

let (_, handling_time_us) = measure_us!(Self::process_accounts_package(
let (result, handling_time_us) = measure_us!(Self::process_accounts_package(
accounts_package,
snapshot_package_sender.as_ref(),
&snapshot_config,
&exit,
));
if let Err(err) = result {
error!("Stopping AccountsHashVerifier! Fatal error while processing accounts package: {err}");
exit.store(true, Ordering::Relaxed);
break;
}

datapoint_info!(
"accounts_hash_verifier",
Expand Down Expand Up @@ -208,9 +214,9 @@ impl AccountsHashVerifier {
snapshot_package_sender: Option<&Sender<SnapshotPackage>>,
snapshot_config: &SnapshotConfig,
exit: &AtomicBool,
) {
) -> IoResult<()> {
let accounts_hash =
Self::calculate_and_verify_accounts_hash(&accounts_package, snapshot_config);
Self::calculate_and_verify_accounts_hash(&accounts_package, snapshot_config)?;

Self::save_epoch_accounts_hash(&accounts_package, accounts_hash);

Expand All @@ -221,13 +227,15 @@ impl AccountsHashVerifier {
accounts_hash,
exit,
);

Ok(())
}

/// returns calculated accounts hash
fn calculate_and_verify_accounts_hash(
accounts_package: &AccountsPackage,
snapshot_config: &SnapshotConfig,
) -> AccountsHashKind {
) -> IoResult<AccountsHashKind> {
let accounts_hash_calculation_kind = match accounts_package.package_kind {
AccountsPackageKind::AccountsHashVerifier => CalcAccountsHashKind::Full,
AccountsPackageKind::EpochAccountsHash => CalcAccountsHashKind::Full,
Expand Down Expand Up @@ -303,6 +311,23 @@ impl AccountsHashVerifier {
&accounts_hash_for_reserialize,
bank_incremental_snapshot_persistence.as_ref(),
);

// now write the full snapshot slot file after reserializing so this bank snapshot is loadable
let full_snapshot_archive_slot = match accounts_package.package_kind {
AccountsPackageKind::Snapshot(SnapshotKind::IncrementalSnapshot(base_slot)) => {
base_slot
}
_ => accounts_package.slot,
};
snapshot_utils::write_full_snapshot_slot_file(
&snapshot_info.bank_snapshot_dir,
full_snapshot_archive_slot,
)
.map_err(|err| {
IoError::other(format!(
"failed to calculate accounts hash for {accounts_package:?}: {err}"
))
})?;
}

if accounts_package.package_kind
Expand Down Expand Up @@ -340,7 +365,7 @@ impl AccountsHashVerifier {
);
}

accounts_hash_kind
Ok(accounts_hash_kind)
}

fn _calculate_full_accounts_hash(
Expand Down
126 changes: 61 additions & 65 deletions ledger/src/bank_forks_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,20 +244,70 @@ fn bank_forks_from_snapshot(
.map(SnapshotArchiveInfoGetter::slot)
.unwrap_or(0),
);
let latest_bank_snapshot =
snapshot_utils::get_highest_bank_snapshot_post(&snapshot_config.bank_snapshots_dir);

let will_startup_from_snapshot_archives = match process_options.use_snapshot_archives_at_startup
{
UseSnapshotArchivesAtStartup::Always => true,
UseSnapshotArchivesAtStartup::Never => false,
UseSnapshotArchivesAtStartup::WhenNewest => latest_bank_snapshot
.as_ref()
.map(|bank_snapshot| latest_snapshot_archive_slot > bank_snapshot.slot)
.unwrap_or(true),
let fastboot_snapshot = match process_options.use_snapshot_archives_at_startup {
UseSnapshotArchivesAtStartup::Always => None,
UseSnapshotArchivesAtStartup::Never => {
let Some(bank_snapshot) =
snapshot_utils::get_highest_loadable_bank_snapshot(snapshot_config)
else {
return Err(BankForksUtilsError::NoBankSnapshotDirectory {
flag: use_snapshot_archives_at_startup::cli::LONG_ARG.to_string(),
value: UseSnapshotArchivesAtStartup::Never.to_string(),
});
};
// If a newer snapshot archive was downloaded, it is possible that its slot is
// higher than the local state we will load. Did the user intend for this?
if bank_snapshot.slot < latest_snapshot_archive_slot {
warn!(
"Starting up from local state at slot {}, which is *older* than \
the latest snapshot archive at slot {}. If this is not desired, \
change the --{} CLI option to *not* \"{}\" and restart.",
bank_snapshot.slot,
latest_snapshot_archive_slot,
use_snapshot_archives_at_startup::cli::LONG_ARG,
UseSnapshotArchivesAtStartup::Never.to_string(),
);
}
Some(bank_snapshot)
}
UseSnapshotArchivesAtStartup::WhenNewest => {
snapshot_utils::get_highest_loadable_bank_snapshot(snapshot_config)
.filter(|bank_snapshot| bank_snapshot.slot >= latest_snapshot_archive_slot)
}
};

let bank = if will_startup_from_snapshot_archives {
let bank = if let Some(fastboot_snapshot) = fastboot_snapshot {
let (bank, _) = snapshot_bank_utils::bank_from_snapshot_dir(
&account_paths,
&fastboot_snapshot,
genesis_config,
&process_options.runtime_config,
process_options.debug_keys.clone(),
None,
process_options.account_indexes.clone(),
process_options.limit_load_slot_count_from_snapshot,
process_options.shrink_ratio,
process_options.verify_index,
process_options.accounts_db_config.clone(),
accounts_update_notifier,
exit,
)
.map_err(|err| BankForksUtilsError::BankFromSnapshotsDirectory {
source: err,
path: fastboot_snapshot.snapshot_path(),
})?;

// If the node crashes before taking the next bank snapshot, the next startup will attempt
// to load from the same bank snapshot again. And if `shrink` has run, the account storage
// files that are hard linked in bank snapshot will be *different* than what the bank
// snapshot expects. This would cause the node to crash again. To prevent that, purge all
// the bank snapshots here. In the above scenario, this will cause the node to load from a
// snapshot archive next time, which is safe.
snapshot_utils::purge_all_bank_snapshots(&snapshot_config.bank_snapshots_dir);

bank
} else {
// Given that we are going to boot from an archive, the append vecs held in the snapshot dirs for fast-boot should
// be released. They will be released by the account_background_service anyway. But in the case of the account_paths
// using memory-mounted file system, they are not released early enough to give space for the new append-vecs from
Expand Down Expand Up @@ -292,60 +342,6 @@ fn bank_forks_from_snapshot(
.map(|archive| archive.path().display().to_string())
.unwrap_or("none".to_string()),
})?;
bank
} else {
let bank_snapshot =
latest_bank_snapshot.ok_or_else(|| BankForksUtilsError::NoBankSnapshotDirectory {
flag: use_snapshot_archives_at_startup::cli::LONG_ARG.to_string(),
value: UseSnapshotArchivesAtStartup::Never.to_string(),
})?;

// If a newer snapshot archive was downloaded, it is possible that its slot is
// higher than the local bank we will load. Did the user intend for this?
if bank_snapshot.slot < latest_snapshot_archive_slot {
assert_eq!(
process_options.use_snapshot_archives_at_startup,
UseSnapshotArchivesAtStartup::Never,
);
warn!(
"Starting up from local state at slot {}, which is *older* than \
the latest snapshot archive at slot {}. If this is not desired, \
change the --{} CLI option to *not* \"{}\" and restart.",
bank_snapshot.slot,
latest_snapshot_archive_slot,
use_snapshot_archives_at_startup::cli::LONG_ARG,
UseSnapshotArchivesAtStartup::Never.to_string(),
);
}

let (bank, _) = snapshot_bank_utils::bank_from_snapshot_dir(
&account_paths,
&bank_snapshot,
genesis_config,
&process_options.runtime_config,
process_options.debug_keys.clone(),
None,
process_options.account_indexes.clone(),
process_options.limit_load_slot_count_from_snapshot,
process_options.shrink_ratio,
process_options.verify_index,
process_options.accounts_db_config.clone(),
accounts_update_notifier,
exit,
)
.map_err(|err| BankForksUtilsError::BankFromSnapshotsDirectory {
source: err,
path: bank_snapshot.snapshot_path(),
})?;

// If the node crashes before taking the next bank snapshot, the next startup will attempt
// to load from the same bank snapshot again. And if `shrink` has run, the account storage
// files that are hard linked in bank snapshot will be *different* than what the bank
// snapshot expects. This would cause the node to crash again. To prevent that, purge all
// the bank snapshots here. In the above scenario, this will cause the node to load from a
// snapshot archive next time, which is safe.
snapshot_utils::purge_all_bank_snapshots(&snapshot_config.bank_snapshots_dir);

bank
};

Expand Down
99 changes: 99 additions & 0 deletions local-cluster/tests/local_cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5067,6 +5067,105 @@ fn test_boot_from_local_state() {
}
}

/// Fastboot regression test: a node must still be able to boot if it crashed while it was
/// archiving a full snapshot.
///
/// Outline:
/// 1. Run a single validator until it has taken at least two full snapshots plus one more
///    bank snapshot POST after them (to keep things simple, this waits for two full snapshot
///    archives and then one incremental snapshot archive).
/// 2. Simulate the crash-while-archiving case by stopping the validator and removing the
///    newest full snapshot archive.
/// 3. Restart the validator. The restart is expected to succeed and to boot from the older
///    full snapshot archive, *not* from the latest bank snapshot POST.
/// 4. Wait for one more incremental snapshot. That confirms the right snapshot was loaded and
///    that the accounts hashes required to build the bank snapshot POST for the new
///    incremental snapshot are present.
#[test]
#[serial]
fn test_boot_from_local_state_missing_archive() {
    const FULL_SNAPSHOT_INTERVAL: Slot = 20;
    const INCREMENTAL_SNAPSHOT_INTERVAL: Slot = 10;

    solana_logger::setup_with_default(RUST_LOG_FILTER);

    // Every snapshot wait in this test uses the same five minute upper bound.
    let max_wait = Some(Duration::from_secs(5 * 60));

    let validator_config = SnapshotValidatorConfig::new(
        FULL_SNAPSHOT_INTERVAL,
        INCREMENTAL_SNAPSHOT_INTERVAL,
        INCREMENTAL_SNAPSHOT_INTERVAL,
        7,
    );

    // Single-node cluster built from the snapshot-enabled validator config.
    let validator_configs =
        make_identical_validator_configs(&validator_config.validator_config, 1);
    let mut cluster_config = ClusterConfig {
        node_stakes: vec![100 * DEFAULT_NODE_STAKE],
        cluster_lamports: DEFAULT_CLUSTER_LAMPORTS,
        validator_configs,
        ..ClusterConfig::default()
    };
    let mut cluster = LocalCluster::new(&mut cluster_config, SocketAddrSpace::Unspecified);

    // The scenario requires two full snapshots followed by an incremental snapshot.
    info!("Waiting for validator to create snapshots...");
    for _ in 0..2 {
        LocalCluster::wait_for_next_full_snapshot(
            &cluster,
            &validator_config.full_snapshot_archives_dir,
            max_wait,
        );
    }
    LocalCluster::wait_for_next_incremental_snapshot(
        &cluster,
        &validator_config.full_snapshot_archives_dir,
        &validator_config.incremental_snapshot_archives_dir,
        max_wait,
    );
    debug!(
        "snapshot archives:\n\tfull: {:?}\n\tincr: {:?}",
        snapshot_utils::get_full_snapshot_archives(
            validator_config.full_snapshot_archives_dir.path()
        ),
        snapshot_utils::get_incremental_snapshot_archives(
            validator_config.incremental_snapshot_archives_dir.path()
        ),
    );
    info!("Waiting for validator to create snapshots... DONE");

    // Stop the node, then drop the newest full snapshot archive; together this imitates a
    // crash that happened while a full snapshot package was being archived.
    info!("Stopping validator...");
    let node_pubkeys = cluster.get_node_pubkeys();
    let validator_pubkey = node_pubkeys[0];
    let mut validator_info = cluster.exit_node(&validator_pubkey);
    info!("Stopping validator... DONE");

    info!("Deleting latest full snapshot archive...");
    let full_snapshot_archives_path = validator_config.full_snapshot_archives_dir.path();
    let latest_full_archive =
        snapshot_utils::get_highest_full_snapshot_archive_info(full_snapshot_archives_path)
            .unwrap();
    fs::remove_file(latest_full_archive.path()).unwrap();
    info!("Deleting latest full snapshot archive... DONE");

    info!("Restarting validator...");
    // With `Never` instead of `WhenNewest`, the validator should not boot.
    validator_info.config.use_snapshot_archives_at_startup =
        UseSnapshotArchivesAtStartup::WhenNewest;
    cluster.restart_node(
        &validator_pubkey,
        validator_info,
        SocketAddrSpace::Unspecified,
    );
    info!("Restarting validator... DONE");

    // Producing a new incremental snapshot is the step that used to fail, so wait for one.
    info!("Waiting for validator to create snapshots...");
    LocalCluster::wait_for_next_incremental_snapshot(
        &cluster,
        &validator_config.full_snapshot_archives_dir,
        &validator_config.incremental_snapshot_archives_dir,
        max_wait,
    );
    info!("Waiting for validator to create snapshots... DONE");
}

// We want to simulate the following:
// /--- 1 --- 3 (duplicate block)
// 0
Expand Down
Loading

0 comments on commit 182d27f

Please sign in to comment.