Skip to content

Commit

Permalink
aptos-debugger: gen-replay-verify-jobs command
Browse files Browse the repository at this point in the history
  • Loading branch information
msmouse committed Sep 12, 2024
1 parent 3da5ac6 commit acef07a
Show file tree
Hide file tree
Showing 9 changed files with 266 additions and 22 deletions.
4 changes: 2 additions & 2 deletions .github/actions/build-aptos-debugger/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ runs:
with:
ref: ${{ inputs.GIT_SHA }}

- uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
- uses:
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

Expand All @@ -39,4 +39,4 @@ runs:
- uses: actions/cache/save@v4
with:
path: target/release/aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA }}
key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}
4 changes: 2 additions & 2 deletions .github/actions/get-aptos-debugger/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ runs:
uses: actions/cache/restore@v4
with:
path: target/release/aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA }}
key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}
fail-on-cache-miss: ${{ inputs.EXPECT_CACHE_HIT }}

- uses: aptos-labs/aptos-core/.github/actions/build-aptos-debugger@0911-alden-cache-build
- uses: aptos-labs/aptos-core/.github/actions/build-aptos-debugger@0911-alden-gen-replay-verify-jobs
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
with:
GIT_SHA: ${{ inputs.GIT_SHA }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ on:
pull_request:
paths:
- ".github/workflows/replay-verify.yaml"
- ".github/workflows/workflow-run-replay-verify.yaml"
- "testsuite/replay_verify.py"
schedule:
- cron: "0 22 * * 0,2,4" # The main branch cadence. This runs every Sun,Tues,Thurs
Expand Down
120 changes: 102 additions & 18 deletions .github/workflows/workflow-run-replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,48 +77,132 @@ on:
default: "high-perf-docker-with-local-ssd"

jobs:
build:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
ranges: ${{ steps.gen-jobs.outputs.ranges }}
steps:
- uses: aptos-labs/aptos-core/.github/actions/get-aptos-debugger@0911-alden-cache-build
- name: Checkout code
uses: actions/checkout@v4
with:
GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
GIT_SHA: ${{ inputs.GIT_SHA || github.sha }}
ref: ${{ inputs.GIT_SHA }}

- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
path: target/release/aptos-debugger
key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}

- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
target/release/aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--output-json-file job_ranges.json
echo "ranges=$(cat job_ranges.json)" >> $GITHUB_OUTPUT
cat job_ranges.json | jq || true
- name: Cache backup storage config so the replay jobs don't need to checkout entire repo
uses: actions/cache/save@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}

replay-verify:
needs: build
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
max-parallel: 16
matrix:
number: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] # runner number
range: ${{ fromJson(needs.prepare.outputs.ranges) }}
steps:
- name: Echo Runner Number
run: echo "Runner is ${{ matrix.number }}"
- name: Job ${{ matrix.range }}
  id: parse-job
  shell: bash
  run: |
    # matrix.range is a "begin end name..." string produced by the prepare
    # job; `read` assigns the first two whitespace-separated fields and puts
    # the remainder (the free-form job name, which may contain spaces) in
    # $name. -r keeps backslashes literal.
    read -r begin end name <<< "${{ matrix.range }}"
    # BUG FIX: was `end=$begin`, which made every job's end version equal
    # to its begin version. Quote values so spaces in $name survive.
    echo "begin=$begin" >> "$GITHUB_OUTPUT"
    echo "end=$end" >> "$GITHUB_OUTPUT"
    echo "name=$name" >> "$GITHUB_OUTPUT"
- uses: actions/checkout@v4
- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache/restore@v4
with:
ref: ${{ inputs.GIT_SHA }}
path: target/release/aptos-debugger
key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}
fail-on-cache-miss: true

- uses: aptos-labs/aptos-core/.github/actions/get-aptos-debugger@0911-alden-cache-build
- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
GIT_SHA: ${{ inputs.GIT_SHA || github.sha }}
EXPECT_CACHE_HIT: 'true'
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true

- name: Load cached backup storage config
uses: actions/cache/restore@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}
fail-on-cache-miss: true

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: Run replay-verify in parallel
shell: bash
run: testsuite/replay_verify.py ${{ matrix.number }} 19 # first argument is the runner number, second argument is the total number of runners
- name: run replay-verify with retries
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
echo ${{ steps.parse-job.outputs.begin }}
echo ${{ steps.parse-job.outputs.end }}
echo ${{ steps.parse-job.outputs.name}}
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions storage/backup/backup-cli/src/metadata/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ impl MetadataView {
self.compaction_timestamps.clone()
}

pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
&self.state_snapshot_backups
}

pub fn select_state_snapshot(
&self,
target_version: Version,
Expand Down
1 change: 1 addition & 0 deletions storage/db-tool/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
bcs = { workspace = true }
clap = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

[dev-dependencies]
Expand Down
149 changes: 149 additions & 0 deletions storage/db-tool/src/gen_replay_verify_jobs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// Copyright (c) Aptos Foundation
// SPDX-License-Identifier: Apache-2.0

use aptos_backup_cli::{
metadata::{
cache::{sync_and_load, MetadataCacheOpt},
StateSnapshotBackupMeta,
},
storage::DBToolStorageOpt,
utils::ConcurrentDownloadsOpt,
};
use aptos_logger::warn;
use aptos_types::transaction::Version;
use clap::Parser;
use itertools::Itertools;
use std::{io::Write, iter::once, path::PathBuf};

// CLI options for the `aptos-db gen-replay-verify-jobs` sub-command.
//
// Scans backup-storage metadata and writes a JSON array of job-range
// strings ("begin end name") to `output_json_file`; the replay-verify CI
// workflow fans out over that array as its job matrix.
//
// NOTE: plain `//` comments are used on purpose — `///` doc comments on
// fields would be picked up by clap derive as help text and change the
// generated CLI help.
#[derive(Parser)]
pub struct Opt {
    // Where to sync/cache backup metadata files (flattened `MetadataCacheOpt` flags).
    #[clap(flatten)]
    metadata_cache_opt: MetadataCacheOpt,
    // Backup storage backend configuration (flattened `DBToolStorageOpt` flags).
    #[clap(flatten)]
    storage: DBToolStorageOpt,
    // Download parallelism used when syncing metadata.
    #[clap(flatten)]
    concurrent_downloads: ConcurrentDownloadsOpt,
    // Lower bound on versions that must be replayed; ranges that would begin
    // before this version are dropped entirely (see `run`).
    #[clap(
        long,
        help = "The first transaction version required to be replayed and verified. [Defaults to 0]"
    )]
    start_version: Option<Version>,
    // Approximate number of transactions each emitted job should replay.
    #[clap(
        long,
        help = "Target number of transactions for each job to replay",
        default_value = "20000000"
    )]
    target_job_size: u64,
    // Look-back window: only snapshots within this many epochs of the latest
    // snapshot epoch are considered.
    #[clap(
        long,
        help = "Determines the oldest epoch to replay, relative to the latest",
        default_value = "4000"
    )]
    max_epochs: u64,
    // Path the JSON array of job-range strings is written to.
    #[clap(long, help = "Output job ranges")]
    output_json_file: PathBuf,
}

impl Opt {
    /// Syncs backup metadata, partitions the backed-up version history into
    /// replay-verify job ranges sized around `target_job_size` transactions,
    /// and writes them to `output_json_file` as a JSON array of
    /// "begin end name" strings (whitespace-separated; `name` is free text).
    pub async fn run(self) -> anyhow::Result<()> {
        // Sync metadata from backup storage into the local cache and load it.
        let storage = self.storage.init_storage().await?;
        let metadata_view = sync_and_load(
            &self.metadata_cache_opt,
            storage,
            self.concurrent_downloads.get(),
        )
        .await?;

        let storage_state = metadata_view.get_storage_state()?;
        // Exclusive end of the replayable range: latest backed-up txn version + 1.
        // Panics (by design, via expect) if there are no transaction backups.
        let global_end_version = storage_state
            .latest_transaction_version
            .expect("No transaction backups.")
            + 1;
        let latest_epoch = storage_state
            .latest_state_snapshot_epoch
            .expect("No state snapshots.");
        // Clamp the look-back so `latest_epoch + 1 - max_epochs` can't underflow.
        let max_epochs = self.max_epochs.min(latest_epoch + 1);
        let global_min_epoch = latest_epoch + 1 - max_epochs;

        // Sentinel "snapshot" appended after the newest real snapshot so the
        // span between that snapshot and the latest txn version is also covered.
        let fake_end = StateSnapshotBackupMeta {
            epoch: latest_epoch,
            version: global_end_version,
            manifest: "".to_string(),
        };
        // Walk snapshots newest-to-oldest as (end, begin) windows, merging
        // small adjacent spans and truncating oversized ones.
        let job_ranges = metadata_view
            .all_state_snapshots()
            .iter()
            // Drop snapshots older than the epoch look-back window.
            .skip_while(|s| s.epoch < global_min_epoch)
            .chain(once(&fake_end))
            .collect_vec()
            .iter()
            // Reverse so iteration goes newest -> oldest.
            .rev()
            .tuple_windows()
            // to simplify things, if start_version appears in the middle of a range, give up the range
            .take_while(|(_end, begin)| begin.version >= self.start_version.unwrap_or(0))
            .peekable()
            .batching(|it| {
                match it.next() {
                    Some((end, mut begin)) => {
                        if end.version - begin.version >= self.target_job_size {
                            // cut big range short, this hopefully automatically skips load tests
                            let msg = if end.epoch - begin.epoch > 15 {
                                "!!! Need more snapshots !!!"
                            } else {
                                ""
                            };
                            warn!(
                                begin = begin,
                                end = end,
                                "Big gap between snapshots. {} versions in {} epochs. {}",
                                end.version - begin.version,
                                end.epoch - begin.epoch,
                                msg,
                            );
                            // Partial job: replay only the first
                            // `target_job_size` txns of the gap; the rest is
                            // reported as omitted in the job name.
                            Some((
                                begin.version,
                                begin.version + self.target_job_size,
                                format!(
                                    "Partial replay epoch {} - {}, {} txns starting from version {}, another {} versions omitted, until {}. {}",
                                    begin.epoch,
                                    end.epoch - 1,
                                    self.target_job_size,
                                    begin.version,
                                    end.version - begin.version - self.target_job_size,
                                    end.version,
                                    msg
                                )
                            ))
                        } else {
                            // Small span: greedily absorb older windows while
                            // the combined span stays within `target_job_size`.
                            while let Some((_prev_end, prev_begin)) = it.peek() {
                                if end.version - prev_begin.version > self.target_job_size {
                                    break;
                                }
                                begin = prev_begin;
                                let _ = it.next();
                            }
                            Some((
                                begin.version,
                                end.version,
                                format!(
                                    "Replay epoch {} - {}, {} txns starting from version {}.",
                                    begin.epoch,
                                    end.epoch - 1,
                                    end.version - begin.version,
                                    begin.version,
                                )
                            ))
                        }
                    },
                    None => None,
                }
            })
            // Render each job as "begin end name". NOTE(review): `name`
            // itself contains spaces, so consumers must treat everything
            // after the second field as the name (the CI step's
            // `read begin end name` does exactly that).
            .map(|(begin, end, name)| format!("{} {} {}", begin, end, name))
            .collect_vec();

        // Serialize the ranges as a JSON array of strings.
        std::fs::File::create(&self.output_json_file)?
            .write_all(&serde_json::to_vec(&job_ranges)?)?;

        Ok(())
    }
}
4 changes: 4 additions & 0 deletions storage/db-tool/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ extern crate core;
mod backup;
mod backup_maintenance;
mod bootstrap;
mod gen_replay_verify_jobs;
mod replay_verify;
pub mod restore;
#[cfg(test)]
Expand Down Expand Up @@ -33,6 +34,8 @@ pub enum DBTool {

ReplayVerify(replay_verify::Opt),

GenReplayVerifyJobs(gen_replay_verify_jobs::Opt),

#[clap(subcommand)]
Restore(restore::Command),
}
Expand All @@ -49,6 +52,7 @@ impl DBTool {
info!("Replay verify result: {:?}", ret);
ret
},
DBTool::GenReplayVerifyJobs(cmd) => cmd.run().await,
DBTool::Restore(cmd) => cmd.run().await,
}
}
Expand Down

0 comments on commit acef07a

Please sign in to comment.