Skip to content

Commit

Permalink
Fix state merkle truncate (#15800)
Browse files Browse the repository at this point in the history
in case of a partial commit on a unsharded state db, two things were
wrong:
1. when we see a partial version, we need to look at the previous version before jumping the the epoch boundary (we forgot to substract 1 from the version)
2. the ledger meta db were not passed in as intended (instead, we passed
   the state merkle meta db)

The test_db_restart smoke test does catch this sometimes but it's not
easy to trigger. I bumpped the number of restarts on that test.

(cherry picked from commit f3d4dba)
  • Loading branch information
msmouse authored and perryjrandall committed Jan 29, 2025
1 parent 6593fb8 commit 8d5d045
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 14 deletions.
2 changes: 1 addition & 1 deletion storage/aptosdb/src/ledger_db/ledger_metadata_db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ impl LedgerMetadataDb {
)
}

pub(super) fn db(&self) -> &DB {
pub(crate) fn db(&self) -> &DB {
&self.db
}

Expand Down
18 changes: 10 additions & 8 deletions storage/aptosdb/src/state_store/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ impl StateStore {
if crash_if_difference_is_too_large {
assert_le!(difference, MAX_COMMIT_PROGRESS_DIFFERENCE);
}
truncate_ledger_db(ledger_db, overall_commit_progress)
truncate_ledger_db(ledger_db.clone(), overall_commit_progress)
.expect("Failed to truncate ledger db.");

// State K/V commit progress isn't (can't be) written atomically with the data,
Expand Down Expand Up @@ -433,16 +433,18 @@ impl StateStore {
assert_le!(difference, MAX_COMMIT_PROGRESS_DIFFERENCE);
}
}
let db = state_merkle_db.metadata_db();
let state_merkle_target_version =
find_tree_root_at_or_before(db, &state_merkle_db, overall_commit_progress)
.expect("DB read failed.")
.unwrap_or_else(|| {
panic!(
let state_merkle_target_version = find_tree_root_at_or_before(
ledger_metadata_db,
&state_merkle_db,
overall_commit_progress,
)
.expect("DB read failed.")
.unwrap_or_else(|| {
panic!(
"Could not find a valid root before or at version {}, maybe it was pruned?",
overall_commit_progress
)
});
});
if state_merkle_target_version < state_merkle_max_version {
info!(
state_merkle_max_version = state_merkle_max_version,
Expand Down
11 changes: 7 additions & 4 deletions storage/aptosdb/src/utils/truncation_helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#![allow(dead_code)]

use crate::{
ledger_db::{LedgerDb, LedgerDbSchemaBatches},
ledger_db::{ledger_metadata_db::LedgerMetadataDb, LedgerDb, LedgerDbSchemaBatches},
schema::{
db_metadata::{DbMetadataKey, DbMetadataSchema, DbMetadataValue},
epoch_by_version::EpochByVersionSchema,
Expand Down Expand Up @@ -198,7 +198,7 @@ pub(crate) fn truncate_state_merkle_db_single_shard(
}

pub(crate) fn find_tree_root_at_or_before(
ledger_metadata_db: &DB,
ledger_metadata_db: &LedgerMetadataDb,
state_merkle_db: &StateMerkleDb,
version: Version,
) -> Result<Option<Version>> {
Expand All @@ -211,16 +211,19 @@ pub(crate) fn find_tree_root_at_or_before(

// It's possible that it's a partial commit when sharding is not enabled,
// look again for the previous version:
if version == 0 {
return Ok(None);
}
if let Some(closest_version) =
find_closest_node_version_at_or_before(state_merkle_db.metadata_db(), version)?
find_closest_node_version_at_or_before(state_merkle_db.metadata_db(), version - 1)?
{
if root_exists_at_version(state_merkle_db, closest_version)? {
return Ok(Some(closest_version));
}

// Now we are probably looking at a pruned version in this epoch, look for the previous
// epoch ending:
let mut iter = ledger_metadata_db.iter::<EpochByVersionSchema>()?;
let mut iter = ledger_metadata_db.db().iter::<EpochByVersionSchema>()?;
iter.seek_for_prev(&version)?;
if let Some((closest_epoch_version, _)) = iter.next().transpose()? {
if root_exists_at_version(state_merkle_db, closest_epoch_version)? {
Expand Down
2 changes: 1 addition & 1 deletion testsuite/smoke-test/src/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ async fn test_db_restart() {
quit_flag.clone(),
));

for round in 0..3 {
for round in 0..10 {
info!("{LINE} Restart round {round}");
for (v, vid) in restarting_validator_ids.iter().enumerate() {
let validator = swarm.validator_mut(*vid).unwrap();
Expand Down

0 comments on commit 8d5d045

Please sign in to comment.