Skip to content

Commit

Permalink
fix(prover): Disallow state changes from successful (#2233)
Browse files Browse the repository at this point in the history
This PR is done as a fix for boojnet outage.

TL;DR; of outage -- race condition caused by prover jobs moving from
'successful` state to `in_progress`/`in_gpu_proving`.

The PR addresses: 
- no job can move from successful state (considered final state)
- fix local development (contracts were pointing to 0.24.0 instead of
0.24.1) -- can be split to a different PR, if this is problematic.
- add table constraint -- again, can be split in different PR
- add checks for recursion_tip number of jobs (post outage check, should
not happen ever, but better to verify)
  • Loading branch information
EmilLuta authored Jun 13, 2024
1 parent 227e101 commit 2488a76
Show file tree
Hide file tree
Showing 23 changed files with 65 additions and 159 deletions.
2 changes: 1 addition & 1 deletion core/lib/basic_types/src/prover_dal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ pub struct RecursionTipWitnessGeneratorJobInfo {
pub error: Option<String>,
pub created_at: NaiveDateTime,
pub updated_at: NaiveDateTime,
pub number_of_final_node_jobs: Option<i32>,
pub number_of_final_node_jobs: i32,
pub protocol_version: Option<i32>,
pub picked_by: Option<String>,
}
Expand Down

This file was deleted.

2 changes: 1 addition & 1 deletion etc/env/base/contracts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ PRIORITY_TX_MAX_GAS_LIMIT = 72000000
DEPLOY_L2_BRIDGE_COUNTERPART_GAS_LIMIT = 10000000
GENESIS_ROLLUP_LEAF_INDEX = "54"
GENESIS_PROTOCOL_VERSION = "24"
GENESIS_PROTOCOL_SEMANTIC_VERSION = "0.24.0"
GENESIS_PROTOCOL_SEMANTIC_VERSION = "0.24.1"
L1_WETH_BRIDGE_IMPL_ADDR = "0x5E6D086F5eC079ADFF4FB3774CDf3e8D6a34F7E9"
L1_WETH_BRIDGE_PROXY_ADDR = "0x5E6D086F5eC079ADFF4FB3774CDf3e8D6a34F7E9"
L1_WETH_TOKEN_ADDR = "0x5E6D086F5eC079ADFF4FB3774CDf3e8D6a34F7E9"
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

This file was deleted.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

This file was deleted.

This file was deleted.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ALTER TABLE recursion_tip_witness_jobs_fri ALTER COLUMN number_of_final_node_jobs DROP NOT NULL;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ALTER TABLE recursion_tip_witness_jobs_fri ALTER COLUMN number_of_final_node_jobs SET NOT NULL;
23 changes: 5 additions & 18 deletions prover/prover_dal/src/fri_proof_compressor_dal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,6 @@ impl FriProofCompressorDal<'_, '_> {
.unwrap();
}

pub async fn skip_proof_compression_job(&mut self, block_number: L1BatchNumber) {
sqlx::query!(
r#"
INSERT INTO
proof_compression_jobs_fri (l1_batch_number, status, created_at, updated_at)
VALUES
($1, $2, NOW(), NOW())
ON CONFLICT (l1_batch_number) DO NOTHING
"#,
i64::from(block_number.0),
ProofCompressionJobStatus::Skipped.to_string(),
)
.fetch_optional(self.storage.conn())
.await
.unwrap();
}

pub async fn get_next_proof_compression_job(
&mut self,
picked_by: &str,
Expand Down Expand Up @@ -177,10 +160,14 @@ impl FriProofCompressorDal<'_, '_> {
updated_at = NOW()
WHERE
l1_batch_number = $3
AND status != $4
AND status != $5
"#,
ProofCompressionJobStatus::Failed.to_string(),
error,
i64::from(block_number.0)
i64::from(block_number.0),
ProofCompressionJobStatus::Successful.to_string(),
ProofCompressionJobStatus::SentToServer.to_string(),
)
.execute(self.storage.conn())
.await
Expand Down
44 changes: 3 additions & 41 deletions prover/prover_dal/src/fri_prover_dal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::{collections::HashMap, convert::TryFrom, str::FromStr, time::Duration};

use zksync_basic_types::{
basic_fri_types::{AggregationRound, CircuitIdRoundTuple, JobIdentifiers},
protocol_version::{ProtocolSemanticVersion, ProtocolVersionId, VersionPatch},
protocol_version::{ProtocolSemanticVersion, ProtocolVersionId},
prover_dal::{
correct_circuit_id, FriProverJobMetadata, JobCountStatistics, ProverJobFriInfo,
ProverJobStatus, StuckJobs,
Expand Down Expand Up @@ -211,6 +211,7 @@ impl FriProverDal<'_, '_> {
updated_at = NOW()
WHERE
id = $2
AND status != 'successful'
"#,
error,
i64::from(id)
Expand Down Expand Up @@ -520,6 +521,7 @@ impl FriProverDal<'_, '_> {
updated_at = NOW()
WHERE
id = $2
AND status != 'successful'
"#,
status,
i64::from(id)
Expand All @@ -529,23 +531,6 @@ impl FriProverDal<'_, '_> {
.unwrap();
}

pub async fn save_successful_sent_proof(&mut self, l1_batch_number: L1BatchNumber) {
sqlx::query!(
r#"
UPDATE prover_jobs_fri
SET
status = 'sent_to_server',
updated_at = NOW()
WHERE
l1_batch_number = $1
"#,
i64::from(l1_batch_number.0)
)
.execute(self.storage.conn())
.await
.unwrap();
}

pub async fn get_scheduler_proof_job_id(
&mut self,
l1_batch_number: L1BatchNumber,
Expand Down Expand Up @@ -698,29 +683,6 @@ impl FriProverDal<'_, '_> {
.collect()
}

pub async fn protocol_version_for_job(&mut self, job_id: u32) -> ProtocolSemanticVersion {
let result = sqlx::query!(
r#"
SELECT
protocol_version,
protocol_version_patch
FROM
prover_jobs_fri
WHERE
id = $1
"#,
job_id as i32
)
.fetch_one(self.storage.conn())
.await
.unwrap();

ProtocolSemanticVersion::new(
ProtocolVersionId::try_from(result.protocol_version.unwrap() as u16).unwrap(),
VersionPatch(result.protocol_version_patch as u32),
)
}

pub async fn delete_prover_jobs_fri_batch_data(
&mut self,
l1_batch_number: L1BatchNumber,
Expand Down
Loading

0 comments on commit 2488a76

Please sign in to comment.