diff --git a/.github/actions/build-aptos-debugger/action.yml b/.github/actions/build-aptos-debugger/action.yml
index 6a9945e09b9bae..9a5f81e6a9d1d3 100644
--- a/.github/actions/build-aptos-debugger/action.yml
+++ b/.github/actions/build-aptos-debugger/action.yml
@@ -25,7 +25,7 @@ runs:
       with:
         ref: ${{ inputs.GIT_SHA }}

-    - uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
+    - uses:
       with:
         GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}
@@ -39,4 +39,4 @@ runs:
     - uses: actions/cache/save@v4
       with:
         path: target/release/aptos-debugger
-        key: aptos-debugger-${{ inputs.GIT_SHA }}
+        key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}
diff --git a/.github/actions/get-aptos-debugger/action.yml b/.github/actions/get-aptos-debugger/action.yml
index 3e6d4757a1d95d..5feed2cddeda64 100644
--- a/.github/actions/get-aptos-debugger/action.yml
+++ b/.github/actions/get-aptos-debugger/action.yml
@@ -31,10 +31,10 @@ runs:
       uses: actions/cache/restore@v4
       with:
         path: target/release/aptos-debugger
-        key: aptos-debugger-${{ inputs.GIT_SHA }}
+        key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}
         fail-on-cache-miss: ${{ inputs.EXPECT_CACHE_HIT }}

-    - uses: aptos-labs/aptos-core/.github/actions/build-aptos-debugger@0911-alden-cache-build
+    - uses: aptos-labs/aptos-core/.github/actions/build-aptos-debugger@0911-alden-gen-replay-verify-jobs
       if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
       with:
         GIT_SHA: ${{ inputs.GIT_SHA }}
diff --git a/.github/workflows/replay-verify.yaml b/.github/workflows/replay-verify.yaml
index 9ea1791265993c..4929494b1889cc 100644
--- a/.github/workflows/replay-verify.yaml
+++ b/.github/workflows/replay-verify.yaml
@@ -31,6 +31,7 @@ on:
   pull_request:
     paths:
       - ".github/workflows/replay-verify.yaml"
+      - ".github/workflows/workflow-run-replay-verify.yaml"
       - "testsuite/replay_verify.py"
   schedule:
     - cron: "0 22 * * 0,2,4" # The main branch cadence. This runs every Sun,Tues,Thurs
diff --git a/.github/workflows/workflow-run-replay-verify.yaml b/.github/workflows/workflow-run-replay-verify.yaml
index e41aa4714e0a48..ce3035694aaf40 100644
--- a/.github/workflows/workflow-run-replay-verify.yaml
+++ b/.github/workflows/workflow-run-replay-verify.yaml
@@ -77,35 +77,116 @@ on:
         default: "high-perf-docker-with-local-ssd"

 jobs:
-  build:
+  prepare:
     runs-on: ${{ inputs.RUNS_ON }}
+    outputs:
+      ranges: ${{ steps.gen-jobs.outputs.ranges }}
     steps:
-      - uses: aptos-labs/aptos-core/.github/actions/get-aptos-debugger@0911-alden-cache-build
+      - name: Checkout code
+        uses: actions/checkout@v4
         with:
-          GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
-          GIT_SHA: ${{ inputs.GIT_SHA || github.sha }}
+          ref: ${{ inputs.GIT_SHA }}
+
+      - name: Load cached aptos-debugger binary
+        id: cache-aptos-debugger-binary
+        uses: actions/cache@v4
+        with:
+          path: target/release/aptos-debugger
+          key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}
+
+      - name: Prepare for build if not cached
+        if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
+        uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
+        with:
+          GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}
+
+      - name: Build and strip aptos-debugger binary if not cached
+        if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
+        shell: bash
+        run: |
+          cargo build --release -p aptos-debugger
+          strip -s target/release/aptos-debugger
+
+      - name: Install GCloud SDK
+        uses: "google-github-actions/setup-gcloud@v2"
+        with:
+          version: ">= 418.0.0"
+          install_components: "kubectl,gke-gcloud-auth-plugin"
+
+      - name: get timestamp to use in cache key
+        id: get-timestamp
+        run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT
+
+      - name: Load cached backup storage metadata cache dir (and save back afterwards)
+        uses: actions/cache@v4
+        with:
+          path: metadata_cache
+          key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
+          restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
+
+      - name: Generate job ranges
+        id: gen-jobs
+        env:
+          BUCKET: ${{ inputs.BUCKET }}
+          SUB_DIR: ${{ inputs.SUB_DIR }}
+          HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
+          BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
+        run: |
+          target/release/aptos-debugger aptos-db gen-replay-verify-jobs \
+            --metadata-cache-dir ./metadata_cache \
+            --command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
+            --output-json-file job_ranges.json
+
+          echo "ranges=$(cat job_ranges.json)" >> $GITHUB_OUTPUT
+
+          cat job_ranges.json | jq || true
+
+      - name: Cache backup storage config so the replay jobs don't need to checkout entire repo
+        uses: actions/cache/save@v4
+        with:
+          path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
+          key: backup-config-${{ github.run_id }}

   replay-verify:
-    needs: build
+    needs: prepare
     timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
     runs-on: ${{ inputs.RUNS_ON }}
     strategy:
       fail-fast: false
+      max-parallel: 16
       matrix:
-        number: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] # runner number
+        range: ${{ fromJson(needs.prepare.outputs.ranges) }}
     steps:
-      - name: Echo Runner Number
-        run: echo "Runner is ${{ matrix.number }}"
+      - name: Job ${{ matrix.range }}
+        id: parse-job
+        shell: bash
+        run: |
+          read begin end name <<< "${{ matrix.range }}"
+          echo begin=$begin >> $GITHUB_OUTPUT
+          echo end=$end >> $GITHUB_OUTPUT
+          echo name=$name >> $GITHUB_OUTPUT

-      - uses: actions/checkout@v4
+      - name: Load cached aptos-debugger binary
+        id: cache-aptos-debugger-binary
+        uses: actions/cache/restore@v4
         with:
-          ref: ${{ inputs.GIT_SHA }}
+          path: target/release/aptos-debugger
+          key: alden-hack-0912 #aptos-debugger-${{ inputs.GIT_SHA }}
+          fail-on-cache-miss: true

-      - uses: aptos-labs/aptos-core/.github/actions/get-aptos-debugger@0911-alden-cache-build
+      - name: Load cached backup storage metadata cache dir
+        uses: actions/cache/restore@v4
         with:
-          GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
-          GIT_SHA: ${{ inputs.GIT_SHA || github.sha }}
-          EXPECT_CACHE_HIT: 'true'
+          path: metadata_cache
+          key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
+          fail-on-cache-miss: true
+
+      - name: Load cached backup storage config
+        uses: actions/cache/restore@v4
+        with:
+          path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
+          key: backup-config-${{ github.run_id }}
+          fail-on-cache-miss: true

       - name: Install GCloud SDK
         uses: "google-github-actions/setup-gcloud@v2"
@@ -113,12 +194,15 @@ jobs:
           version: ">= 418.0.0"
           install_components: "kubectl,gke-gcloud-auth-plugin"

-      - name: Run replay-verify in parallel
-        shell: bash
-        run: testsuite/replay_verify.py ${{ matrix.number }} 19 # first argument is the runner number, second argument is the total number of runners
+      - name: run replay-verify with retries
+        id: gen-jobs
         env:
           BUCKET: ${{ inputs.BUCKET }}
           SUB_DIR: ${{ inputs.SUB_DIR }}
-          HISTORY_START: ${{ inputs.HISTORY_START }}
+          HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
           TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
           BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
+        run: |
+          echo ${{ steps.parse-job.outputs.begin }}
+          echo ${{ steps.parse-job.outputs.end }}
+          echo ${{ steps.parse-job.outputs.name }}
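The contract between the two jobs above is a JSON array of "begin end name" strings: the prepare job publishes the array as an output, `fromJson` fans it out into matrix entries, and each replay job splits its entry with `read`, whose last variable absorbs the remainder of the line. A minimal bash sketch of that parsing, using a made-up entry (real entries come from the gen-replay-verify-jobs output further below):

    # hypothetical matrix entry; real ones are generated by gen-replay-verify-jobs
    range='155000000 175000000 Replay epoch 910 - 923, 20000000 txns starting from version 155000000.'
    read begin end name <<< "$range"
    echo "begin=$begin"   # first field: start version
    echo "end=$end"       # second field: end version
    echo "name=$name"     # everything else: the human-readable job description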
diff --git a/Cargo.lock b/Cargo.lock
index a2dd1c92a3084b..620ce85107b54f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1245,6 +1245,7 @@ dependencies = [
  "bcs 0.1.4",
  "clap 4.4.14",
  "itertools 0.13.0",
+ "serde_json",
  "tokio",
 ]
diff --git a/storage/backup/backup-cli/src/metadata/view.rs b/storage/backup/backup-cli/src/metadata/view.rs
index d42c3a39061a3a..3b857651208de7 100644
--- a/storage/backup/backup-cli/src/metadata/view.rs
+++ b/storage/backup/backup-cli/src/metadata/view.rs
@@ -105,6 +105,10 @@ impl MetadataView {
         self.compaction_timestamps.clone()
     }

+    pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
+        &self.state_snapshot_backups
+    }
+
     pub fn select_state_snapshot(
         &self,
         target_version: Version,
diff --git a/storage/db-tool/Cargo.toml b/storage/db-tool/Cargo.toml
index a579e1ef637104..c4859b5f4ad76c 100644
--- a/storage/db-tool/Cargo.toml
+++ b/storage/db-tool/Cargo.toml
@@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
 bcs = { workspace = true }
 clap = { workspace = true }
 itertools = { workspace = true }
+serde_json = { workspace = true }
 tokio = { workspace = true }

 [dev-dependencies]
diff --git a/storage/db-tool/src/gen_replay_verify_jobs.rs b/storage/db-tool/src/gen_replay_verify_jobs.rs
new file mode 100644
index 00000000000000..27dfd18a272ec2
--- /dev/null
+++ b/storage/db-tool/src/gen_replay_verify_jobs.rs
@@ -0,0 +1,149 @@
+// Copyright (c) Aptos Foundation
+// SPDX-License-Identifier: Apache-2.0
+
+use aptos_backup_cli::{
+    metadata::{
+        cache::{sync_and_load, MetadataCacheOpt},
+        StateSnapshotBackupMeta,
+    },
+    storage::DBToolStorageOpt,
+    utils::ConcurrentDownloadsOpt,
+};
+use aptos_logger::warn;
+use aptos_types::transaction::Version;
+use clap::Parser;
+use itertools::Itertools;
+use std::{io::Write, iter::once, path::PathBuf};
+
+#[derive(Parser)]
+pub struct Opt {
+    #[clap(flatten)]
+    metadata_cache_opt: MetadataCacheOpt,
+    #[clap(flatten)]
+    storage: DBToolStorageOpt,
+    #[clap(flatten)]
+    concurrent_downloads: ConcurrentDownloadsOpt,
+    #[clap(
+        long,
+        help = "The first transaction version required to be replayed and verified. [Defaults to 0]"
+    )]
+    start_version: Option<Version>,
+    #[clap(
+        long,
+        help = "Target number of transactions for each job to replay",
+        default_value = "20000000"
+    )]
+    target_job_size: u64,
+    #[clap(
+        long,
+        help = "Determines the oldest epoch to replay, relative to the latest",
+        default_value = "4000"
+    )]
+    max_epochs: u64,
+    #[clap(long, help = "Output job ranges")]
+    output_json_file: PathBuf,
+}
+
+impl Opt {
+    pub async fn run(self) -> anyhow::Result<()> {
+        let storage = self.storage.init_storage().await?;
+        let metadata_view = sync_and_load(
+            &self.metadata_cache_opt,
+            storage,
+            self.concurrent_downloads.get(),
+        )
+        .await?;
+
+        let storage_state = metadata_view.get_storage_state()?;
+        let global_end_version = storage_state
+            .latest_transaction_version
+            .expect("No transaction backups.")
+            + 1;
+        let latest_epoch = storage_state
+            .latest_state_snapshot_epoch
+            .expect("No state snapshots.");
+        let max_epochs = self.max_epochs.min(latest_epoch + 1);
+        let global_min_epoch = latest_epoch + 1 - max_epochs;
+
+        let fake_end = StateSnapshotBackupMeta {
+            epoch: latest_epoch,
+            version: global_end_version,
+            manifest: "".to_string(),
+        };
+        let job_ranges = metadata_view
+            .all_state_snapshots()
+            .iter()
+            .skip_while(|s| s.epoch < global_min_epoch)
+            .chain(once(&fake_end))
+            .collect_vec()
+            .iter()
+            .rev()
+            .tuple_windows()
+            // to simplify things, if start_version appears in the middle of a range, give up the range
+            .take_while(|(_end, begin)| begin.version >= self.start_version.unwrap_or(0))
+            .peekable()
+            .batching(|it| {
+                match it.next() {
+                    Some((end, mut begin)) => {
+                        if end.version - begin.version >= self.target_job_size {
+                            // cut big range short, this hopefully automatically skips load tests
+                            let msg = if end.epoch - begin.epoch > 15 {
+                                "!!! Need more snapshots !!!"
+                            } else {
+                                ""
+                            };
+                            warn!(
+                                begin = begin,
+                                end = end,
+                                "Big gap between snapshots. {} versions in {} epochs. {}",
+                                end.version - begin.version,
+                                end.epoch - begin.epoch,
+                                msg,
+                            );
+                            Some((
+                                begin.version,
+                                begin.version + self.target_job_size,
+                                format!(
+                                    "Partial replay epoch {} - {}, {} txns starting from version {}, another {} versions omitted, until {}. {}",
+                                    begin.epoch,
+                                    end.epoch - 1,
+                                    self.target_job_size,
+                                    begin.version,
+                                    end.version - begin.version - self.target_job_size,
+                                    end.version,
+                                    msg
+                                )
+                            ))
+                        } else {
+                            while let Some((_prev_end, prev_begin)) = it.peek() {
+                                if end.version - prev_begin.version > self.target_job_size {
+                                    break;
+                                }
+                                begin = prev_begin;
+                                let _ = it.next();
+                            }
+                            Some((
+                                begin.version,
+                                end.version,
+                                format!(
+                                    "Replay epoch {} - {}, {} txns starting from version {}.",
+                                    begin.epoch,
+                                    end.epoch - 1,
+                                    end.version - begin.version,
+                                    begin.version,
+                                )
+                            ))
+                        }
+                    },
+                    None => None,
+                }
+            })
+            .map(|(begin, end, name)| format!("{} {} {}", begin, end, name))
+            .collect_vec();
+
+        std::fs::File::create(&self.output_json_file)?
+            .write_all(&serde_json::to_vec(&job_ranges)?)?;
+
+        Ok(())
+    }
+}
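The prepare job above drives this subcommand; run by hand it would look roughly like the sketch below, with the flag spellings taken from the workflow and the cache directory, config path, and output file name chosen by the caller:

    # sync backup metadata into a local cache and emit job ranges as a JSON array of strings
    target/release/aptos-debugger aptos-db gen-replay-verify-jobs \
        --metadata-cache-dir ./metadata_cache \
        --command-adapter-config "$BACKUP_CONFIG_TEMPLATE_PATH" \
        --output-json-file job_ranges.json

    # each element has the form "<begin_version> <end_version> <description>", e.g. (made-up numbers)
    # "155000000 175000000 Replay epoch 910 - 923, 20000000 txns starting from version 155000000."
    jq . job_ranges.json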
diff --git a/storage/db-tool/src/lib.rs b/storage/db-tool/src/lib.rs
index aa5ce03cb0861d..36c55e1493afe2 100644
--- a/storage/db-tool/src/lib.rs
+++ b/storage/db-tool/src/lib.rs
@@ -6,6 +6,7 @@ extern crate core;
 mod backup;
 mod backup_maintenance;
 mod bootstrap;
+mod gen_replay_verify_jobs;
 mod replay_verify;
 pub mod restore;
 #[cfg(test)]
@@ -33,6 +34,8 @@ pub enum DBTool {
     ReplayVerify(replay_verify::Opt),

+    GenReplayVerifyJobs(gen_replay_verify_jobs::Opt),
+
     #[clap(subcommand)]
     Restore(restore::Command),
 }
@@ -49,6 +52,7 @@ impl DBTool {
                 info!("Replay verify result: {:?}", ret);
                 ret
             },
+            DBTool::GenReplayVerifyJobs(cmd) => cmd.run().await,
             DBTool::Restore(cmd) => cmd.run().await,
         }
     }