Skip to content

Commit

Permalink
feat(Prover CLI): requeue cmd (#1719)
Browse files Browse the repository at this point in the history
## What ❔

<!-- What are the changes this PR brings about? -->
<!-- Example: This PR adds a PR template to the repo. -->
<!-- (For bigger PRs adding more context is appreciated) -->

Adds a `requeue` command that requeues a given batch if it's stuck. It
looks at all jobs that are stuck and requeues them. A job is stuck if it
has a large number of attempts and is not successful.

For now, the attempts are set via a cmd flag (`--max-attempts`) which if
not set, is 10 by default.

### Usage Example

```
cd prover/prover_cli
zk f cargo run --release requeue --batch 1
```

## Why ❔

<!-- Why are these changes done? What goal do they contribute to? What
are the principles behind them? -->
<!-- Example: PR templates ensure PR reviewers, observers, and future
iterators are in context about the evolution of repos. -->

We want to be able to requeue a stuck batch  with the CLI.

## Checklist

<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [ ] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.
- [ ] Spellcheck has been run via `zk spellcheck`.
- [x] Linkcheck has been run via `zk linkcheck`.

---------

Co-authored-by: Ivan Litteri <ivanlitteri@Ivans-MacBook-Pro.local>
Co-authored-by: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Co-authored-by: Joaquin Carletti <56092489+ColoCarletti@users.noreply.github.com>
  • Loading branch information
4 people authored May 14, 2024
1 parent 5f7bda7 commit f722df7
Show file tree
Hide file tree
Showing 12 changed files with 474 additions and 10 deletions.
1 change: 1 addition & 0 deletions core/lib/basic_types/src/prover_dal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ pub struct StuckJobs {
pub id: u64,
pub status: String,
pub attempts: u64,
pub circuit_id: Option<u32>,
}

// TODO (PLA-774): Redundant structure, should be replaced with `std::net::SocketAddr`.
Expand Down
4 changes: 3 additions & 1 deletion prover/prover_cli/src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use clap::{command, Args, Parser, Subcommand};
use zksync_types::url::SensitiveUrl;

use crate::commands::{self, delete, get_file_info, restart};
use crate::commands::{self, delete, get_file_info, requeue, restart};

pub const VERSION_STRING: &str = env!("CARGO_PKG_VERSION");

Expand Down Expand Up @@ -32,6 +32,7 @@ enum ProverCommand {
Delete(delete::Args),
#[command(subcommand)]
Status(commands::StatusCommand),
Requeue(requeue::Args),
Restart(restart::Args),
}

Expand All @@ -41,6 +42,7 @@ pub async fn start() -> anyhow::Result<()> {
ProverCommand::FileInfo(args) => get_file_info::run(args).await?,
ProverCommand::Delete(args) => delete::run(args).await?,
ProverCommand::Status(cmd) => cmd.run(config).await?,
ProverCommand::Requeue(args) => requeue::run(args, config).await?,
ProverCommand::Restart(args) => restart::run(args).await?,
};

Expand Down
1 change: 1 addition & 0 deletions prover/prover_cli/src/commands/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub(crate) mod delete;
pub(crate) mod get_file_info;
pub(crate) mod requeue;
pub(crate) mod restart;
pub(crate) mod status;

Expand Down
87 changes: 87 additions & 0 deletions prover/prover_cli/src/commands/requeue.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use anyhow::Context;
use clap::Args as ClapArgs;
use prover_dal::{ConnectionPool, Prover, ProverDal};
use zksync_types::{basic_fri_types::AggregationRound, prover_dal::StuckJobs, L1BatchNumber};

use crate::cli::ProverCLIConfig;

#[derive(ClapArgs)]
pub struct Args {
#[clap(short, long)]
batch: L1BatchNumber,
/// Maximum number of attempts to re-queue a job.
/// Default value is 10.
/// NOTE: this argument is temporary and will be deprecated once the `config` command is implemented.
#[clap(long, default_value_t = 10)]
max_attempts: u32,
}

pub async fn run(args: Args, config: ProverCLIConfig) -> anyhow::Result<()> {
let pool = ConnectionPool::<Prover>::singleton(config.db_url)
.build()
.await
.context("failed to build a prover_connection_pool")?;

let mut conn = pool
.connection()
.await
.context("failed to acquire a connection")?;

let mut fri_witness_generator_dal = conn.fri_witness_generator_dal();

let stuck_witness_input_jobs = fri_witness_generator_dal
.requeue_stuck_witness_inputs_jobs_for_batch(args.batch, args.max_attempts)
.await;
display_requeued_stuck_jobs(stuck_witness_input_jobs, AggregationRound::BasicCircuits);

let stuck_leaf_aggregations_stuck_jobs = fri_witness_generator_dal
.requeue_stuck_leaf_aggregation_jobs_for_batch(args.batch, args.max_attempts)
.await;
display_requeued_stuck_jobs(
stuck_leaf_aggregations_stuck_jobs,
AggregationRound::LeafAggregation,
);

let stuck_node_aggregations_jobs = fri_witness_generator_dal
.requeue_stuck_node_aggregation_jobs_for_batch(args.batch, args.max_attempts)
.await;
display_requeued_stuck_jobs(
stuck_node_aggregations_jobs,
AggregationRound::NodeAggregation,
);

let stuck_recursion_tip_job = fri_witness_generator_dal
.requeue_stuck_recursion_tip_jobs_for_batch(args.batch, args.max_attempts)
.await;
display_requeued_stuck_jobs(stuck_recursion_tip_job, AggregationRound::RecursionTip);

let stuck_scheduler_jobs = fri_witness_generator_dal
.requeue_stuck_scheduler_jobs_for_batch(args.batch, args.max_attempts)
.await;
display_requeued_stuck_jobs(stuck_scheduler_jobs, AggregationRound::Scheduler);

let stuck_proof_compressor_jobs = conn
.fri_proof_compressor_dal()
.requeue_stuck_jobs_for_batch(args.batch, args.max_attempts)
.await;
for stuck_job in stuck_proof_compressor_jobs {
println!("Re-queuing proof compressor job {stuck_job:?} 🔁",);
}

let stuck_prover_jobs = conn
.fri_prover_jobs_dal()
.requeue_stuck_jobs_for_batch(args.batch, args.max_attempts)
.await;

for stuck_job in stuck_prover_jobs {
println!("Re-queuing prover job {stuck_job:?} 🔁",);
}

Ok(())
}

fn display_requeued_stuck_jobs(stuck_jobs: Vec<StuckJobs>, aggregation_round: AggregationRound) {
for stuck_job in stuck_jobs {
println!("Re-queuing {aggregation_round} stuck job {stuck_job:?} 🔁",);
}
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 41 additions & 0 deletions prover/prover_dal/src/fri_proof_compressor_dal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ impl FriProofCompressorDal<'_, '_> {
id: row.l1_batch_number as u64,
status: row.status,
attempts: row.attempts as u64,
circuit_id: None,
})
.collect()
}
Expand Down Expand Up @@ -374,4 +375,44 @@ impl FriProofCompressorDal<'_, '_> {
.execute(self.storage.conn())
.await
}

pub async fn requeue_stuck_jobs_for_batch(
&mut self,
block_number: L1BatchNumber,
max_attempts: u32,
) -> Vec<StuckJobs> {
{
sqlx::query!(
r#"
UPDATE proof_compression_jobs_fri
SET
status = 'queued',
error = 'Manually requeued',
attempts = 2,
updated_at = NOW(),
processing_started_at = NOW()
WHERE
l1_batch_number = $1
AND attempts >= $2
AND (status = 'in_progress' OR status = 'failed')
RETURNING
status,
attempts
"#,
i64::from(block_number.0),
max_attempts as i32,
)
.fetch_all(self.storage.conn())
.await
.unwrap()
.into_iter()
.map(|row| StuckJobs {
id: block_number.0 as u64,
status: row.status,
attempts: row.attempts as u64,
circuit_id: None,
})
.collect()
}
}
}
Loading

0 comments on commit f722df7

Please sign in to comment.