Skip to content

Commit

Permalink
divide to batches
Browse files Browse the repository at this point in the history
  • Loading branch information
msmouse committed Sep 14, 2024
1 parent f516aa9 commit 18cd4e9
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 117 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ jobs:
TXNS_TO_SKIP: 0
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/gcs.yaml
# workflow config
RUNS_ON: "runs-on,cpu=16,family=m6id,hdd=500,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
RUNS_ON: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
TIMEOUT_MINUTES: 180

replay-mainnet:
Expand All @@ -94,7 +94,7 @@ jobs:
TXNS_TO_SKIP: 12253479 12277499 148358668
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/gcs.yaml
# workflow config
RUNS_ON: "runs-on,cpu=16,family=m6id,hdd=500,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
RUNS_ON: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
TIMEOUT_MINUTES: 180

test-replay:
Expand All @@ -112,5 +112,5 @@ jobs:
TXNS_TO_SKIP: 0
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/gcs.yaml
# workflow config
RUNS_ON: "runs-on,cpu=16,family=m6id,hdd=500,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
RUNS_ON: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
TIMEOUT_MINUTES: 120 # increase test replay timeout to capture more flaky errors
145 changes: 31 additions & 114 deletions .github/workflows/workflow-run-replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ on:
description: "The runner to use for the job."
type: string
required: true
default: "runs-on,cpu=16,family=m6id,hdd=500,image=aptos-ubuntu-x64,spot=false"
default: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false"
TIMEOUT_MINUTES:
description: "Github job timeout in minutes"
type: number
Expand Down Expand Up @@ -74,13 +74,16 @@ on:
description: "The runner to use for the job."
type: string
required: true
default: "runs-on,cpu=16,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false"
default: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false"

jobs:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
ranges: ${{ steps.gen-jobs.outputs.ranges }}
ranges0: ${{ steps.gen-jobs.outputs.ranges0 }}
ranges1: ${{ steps.gen-jobs.outputs.ranges1 }}
ranges2: ${{ steps.gen-jobs.outputs.ranges2 }}
ranges3: ${{ steps.gen-jobs.outputs.ranges3 }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -94,7 +97,7 @@ jobs:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
key: alden-hack-0914 #aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
Expand Down Expand Up @@ -144,132 +147,46 @@ jobs:
./aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--output-json-file job_ranges.json \
--start-version $HISTORY_START
--start-version $HISTORY_START \
--output-json-file job_ranges.0.json \
--output-json-file job_ranges.1.json \
--output-json-file job_ranges.2.json \
--output-json-file job_ranges.3.json
echo "ranges=$(cat job_ranges.json)" >> $GITHUB_OUTPUT
cat job_ranges.json | jq || true
echo "ranges0=$(cat job_ranges.0.json)" >> $GITHUB_OUTPUT
echo "ranges1=$(cat job_ranges.1.json)" >> $GITHUB_OUTPUT
echo "ranges2=$(cat job_ranges.2.json)" >> $GITHUB_OUTPUT
echo "ranges3=$(cat job_ranges.3.json)" >> $GITHUB_OUTPUT
- name: Cache backup storage config so the replay jobs don't need to checkout entire repo
uses: actions/cache/save@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}

replay-verify:
replay-verify-batch:
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
max-parallel: 200
matrix:
range: ${{ fromJson(needs.prepare.outputs.ranges) }}
batch: [
"${{ needs.prepare.outputs.ranges0 }}",
"${{ needs.prepare.outputs.ranges1 }}",
"${{ needs.prepare.outputs.ranges2 }}",
"${{ needs.prepare.outputs.ranges3 }}",
]
steps:
- name: Parse job - ${{ matrix.range }}
id: parse-job
shell: bash
run: |
read name begin end desc <<< "${{ matrix.range }}"
echo name=$name >> $GITHUB_OUTPUT
echo begin=$begin >> $GITHUB_OUTPUT
echo end=$end>> $GITHUB_OUTPUT
echo desc=$desc>> $GITHUB_OUTPUT
- name: Load cached aptos-debugger binary
uses: actions/cache/restore@v4
with:
path: aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
fail-on-cache-miss: true

- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true

- name: Load cached backup storage config
uses: actions/cache/restore@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}
fail-on-cache-miss: true

- id: auth
uses: "google-github-actions/auth@v2"
with:
workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
- name: Call replay-verify-batch workflow
uses: aptos-labs/aptos-core/.github/workflows/workflow-run-replay-verify-batch.yaml@0911-alden-use-gcloud-on-base-image
secrets: inherit
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: phase 1 - restore snapshot, with retries
env:
RANGES_JSON: ${{ matrix.batch }}
GIT_SHA: ${{ inputs.GIT_SHA }}
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
HISTORY_START: ${{ inputs.HISTORY_START }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
for try in {0..3}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--txns-to-skip $TXNS_TO_SKIP \
--start-version ${{ steps.parse-job.outputs.begin }} \
--end-version ${{ steps.parse-job.outputs.begin }} \
\
--lazy-quit \
--enable-storage-sharding \
--target-db-dir db \
--concurrent-downloads 8 \
--replay-concurrency-level 8 \
\
&& exit 0 || true # exit 0 if successful, otherwise retry
done
exit 1
- name: phase 2 - replay-verify transactions, with retries
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
for try in {0..3}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--txns-to-skip $TXNS_TO_SKIP \
--start-version ${{ steps.parse-job.outputs.begin }} \
--end-version ${{ steps.parse-job.outputs.end }} \
\
--lazy-quit \
--enable-storage-sharding \
--target-db-dir db \
--concurrent-downloads 8 \
--replay-concurrency-level 8 \
\
&& exit 0 || true # exit 0 if successful, otherwise retry
done
exit 1
RUNS_ON: ${{ inputs.RUNS_ON }}
TIMEOUT_MINUTES: ${{ inputs.TIMEOUT_MINUTES }}

0 comments on commit 18cd4e9

Please sign in to comment.