Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(proof-data-handler): exclude batches without object file in GCS #2980

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
9188f8a
feat(proof-data-handler): exclude batches without object file in GCS
pbeza Sep 27, 2024
851e3ed
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 18, 2024
19401af
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 18, 2024
a559c40
Revert unintended code artifact
pbeza Nov 18, 2024
16a0e55
Remove direct dependency on zksync_basic_types
pbeza Nov 18, 2024
5de035b
Use `break` and `return` consistently
pbeza Nov 18, 2024
6e27751
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 18, 2024
8503149
fixup! Remove direct dependency on zksync_basic_types
pbeza Nov 18, 2024
8826898
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 19, 2024
0dd0121
Make batch_ignored_timeout configurable + refine comments
pbeza Nov 19, 2024
1f70f44
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 19, 2024
8a80b02
fixup! Make batch_ignored_timeout configurable + refine comments
pbeza Nov 19, 2024
fe2534b
fixup! Make batch_ignored_timeout configurable + refine comments
pbeza Nov 19, 2024
7e9f9a8
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 19, 2024
ada62fe
Retry failed batches after 1 minute instead of 10 minutes
pbeza Nov 20, 2024
c61c1fa
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 20, 2024
733c27d
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 16 additions & 3 deletions core/lib/config/src/configs/proof_data_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ pub struct TeeConfig {
pub tee_support: bool,
/// All batches before this one are considered to be processed.
pub first_tee_processed_batch: L1BatchNumber,
/// Timeout in seconds for retrying TEE proof generation if it fails. Retries continue
/// indefinitely until successful.
/// Timeout in seconds for retrying the preparation of input for TEE proof generation if it
/// previously failed (e.g., due to a transient network issue) or if it was picked by a TEE
/// prover but the TEE proof was not submitted within that time.
pub tee_proof_generation_timeout_in_secs: u16,
/// Timeout in hours after which a batch will be permanently ignored if repeated retries failed.
pub tee_batch_permanently_ignored_timeout_in_hours: u16,
}

impl Default for TeeConfig {
Expand All @@ -21,6 +24,8 @@ impl Default for TeeConfig {
first_tee_processed_batch: Self::default_first_tee_processed_batch(),
tee_proof_generation_timeout_in_secs:
Self::default_tee_proof_generation_timeout_in_secs(),
tee_batch_permanently_ignored_timeout_in_hours:
Self::default_tee_batch_permanently_ignored_timeout_in_hours(),
}
}
}
Expand All @@ -35,12 +40,20 @@ impl TeeConfig {
}

pub fn default_tee_proof_generation_timeout_in_secs() -> u16 {
600
60
}

pub fn default_tee_batch_permanently_ignored_timeout_in_hours() -> u16 {
10 * 24
}

pub fn tee_proof_generation_timeout(&self) -> Duration {
Duration::from_secs(self.tee_proof_generation_timeout_in_secs.into())
}

pub fn tee_batch_permanently_ignored_timeout(&self) -> Duration {
Duration::from_secs(3600 * u64::from(self.tee_batch_permanently_ignored_timeout_in_hours))
}
}

#[derive(Debug, Deserialize, Clone, PartialEq)]
Expand Down
1 change: 1 addition & 0 deletions core/lib/config/src/testonly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ impl Distribution<configs::ProofDataHandlerConfig> for EncodeDist {
tee_support: self.sample(rng),
first_tee_processed_batch: L1BatchNumber(rng.gen()),
tee_proof_generation_timeout_in_secs: self.sample(rng),
tee_batch_permanently_ignored_timeout_in_hours: self.sample(rng),
},
}
}
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions core/lib/dal/doc/TeeProofGenerationDal.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
title: Status Diagram
---
stateDiagram-v2
[*] --> unpicked : insert_tee_proof_generation_job
unpicked --> picked_by_prover : lock_batch_for_proving
[*] --> picked_by_prover : lock
picked_by_prover --> generated : save_proof_artifacts_metadata
picked_by_prover --> unpicked : unlock_batch
picked_by_prover --> permanently_ignored : unlock_batch
picked_by_prover --> failed : unlock_batch
failed --> picked_by_prover : lock
permanently_ignored --> [*]
generated --> [*]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- There were manually added tee_proof_generation_details entries with status 'permanently_ignore'.

UPDATE tee_proof_generation_details SET status = 'permanently_ignored' WHERE status = 'permanently_ignore';

-- Entries with the status 'unpicked' were not used at all after the migration to the logic
-- introduced in https://github.com/matter-labs/zksync-era/pull/3017. This was overlooked.

DELETE FROM tee_proof_generation_details WHERE status = 'unpicked';
20 changes: 19 additions & 1 deletion core/lib/dal/src/models/storage_tee_proof.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use chrono::NaiveDateTime;
use chrono::{DateTime, NaiveDateTime, Utc};
use zksync_types::L1BatchNumber;

use crate::tee_proof_generation_dal::LockedBatch;

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageTeeProof {
Expand All @@ -8,3 +11,18 @@ pub struct StorageTeeProof {
pub updated_at: NaiveDateTime,
pub attestation: Option<Vec<u8>>,
}

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageLockedBatch {
pub l1_batch_number: i64,
pub created_at: NaiveDateTime,
}

impl From<StorageLockedBatch> for LockedBatch {
fn from(tx: StorageLockedBatch) -> LockedBatch {
LockedBatch {
l1_batch_number: L1BatchNumber::from(tx.l1_batch_number as u32),
created_at: DateTime::<Utc>::from_naive_utc_and_offset(tx.created_at, Utc),
}
}
}
69 changes: 49 additions & 20 deletions core/lib/dal/src/tee_proof_generation_dal.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#![doc = include_str!("../doc/TeeProofGenerationDal.md")]
use std::time::Duration;

use chrono::{DateTime, Utc};
use strum::{Display, EnumString};
use zksync_db_connection::{
connection::Connection,
Expand All @@ -10,21 +11,47 @@ use zksync_db_connection::{
};
use zksync_types::{tee_types::TeeType, L1BatchNumber};

use crate::{models::storage_tee_proof::StorageTeeProof, Core};
use crate::{
models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof},
Core,
};

#[derive(Debug)]
pub struct TeeProofGenerationDal<'a, 'c> {
pub(crate) storage: &'a mut Connection<'c, Core>,
}

#[derive(Debug, EnumString, Display)]
enum TeeProofGenerationJobStatus {
#[strum(serialize = "unpicked")]
Unpicked,
#[derive(Debug, Clone, Copy, EnumString, Display)]
pub enum TeeProofGenerationJobStatus {
/// The batch has been picked by a TEE prover and is currently being processed.
#[strum(serialize = "picked_by_prover")]
PickedByProver,
/// The proof has been successfully generated and submitted for the batch.
#[strum(serialize = "generated")]
Generated,
/// The proof generation for the batch has failed, which can happen if its inputs (GCS blob
/// files) are incomplete or the API is unavailable. Failed batches are retried for a specified
/// period, as defined in the configuration.
#[strum(serialize = "failed")]
Failed,
/// The batch will not be processed again because the proof generation has been failing for an
/// extended period, as specified in the configuration.
#[strum(serialize = "permanently_ignored")]
PermanentlyIgnored,
}

/// Represents a locked batch picked by a TEE prover. A batch is locked when taken by a TEE prover
/// ([TeeProofGenerationJobStatus::PickedByProver]). It can transition to one of three states:
/// 1. [TeeProofGenerationJobStatus::Generated].
/// 2. [TeeProofGenerationJobStatus::Failed].
/// 3. [TeeProofGenerationJobStatus::PermanentlyIgnored].
#[derive(Clone, Debug)]
pub struct LockedBatch {
/// Locked batch number.
pub l1_batch_number: L1BatchNumber,
/// The creation time of the job for this batch. It is used to determine if the batch should
/// transition to [TeeProofGenerationJobStatus::PermanentlyIgnored] or [TeeProofGenerationJobStatus::Failed].
pub created_at: DateTime<Utc>,
}

impl TeeProofGenerationDal<'_, '_> {
Expand All @@ -33,10 +60,11 @@ impl TeeProofGenerationDal<'_, '_> {
tee_type: TeeType,
processing_timeout: Duration,
min_batch_number: L1BatchNumber,
) -> DalResult<Option<L1BatchNumber>> {
) -> DalResult<Option<LockedBatch>> {
let processing_timeout = pg_interval_from_duration(processing_timeout);
let min_batch_number = i64::from(min_batch_number.0);
sqlx::query!(
let locked_batch = sqlx::query_as!(
StorageLockedBatch,
r#"
WITH upsert AS (
SELECT
Expand All @@ -57,11 +85,8 @@ impl TeeProofGenerationDal<'_, '_> {
AND (
tee.l1_batch_number IS NULL
OR (
tee.status = $3
OR (
tee.status = $2
AND tee.prover_taken_at < NOW() - $4::INTERVAL
)
(tee.status = $2 OR tee.status = $3)
AND tee.prover_taken_at < NOW() - $4::INTERVAL
)
)
FETCH FIRST ROW ONLY
Expand All @@ -87,11 +112,12 @@ impl TeeProofGenerationDal<'_, '_> {
updated_at = NOW(),
prover_taken_at = NOW()
RETURNING
l1_batch_number
l1_batch_number,
created_at
"#,
tee_type.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::Failed.to_string(),
processing_timeout,
min_batch_number
)
Expand All @@ -100,14 +126,17 @@ impl TeeProofGenerationDal<'_, '_> {
.with_arg("processing_timeout", &processing_timeout)
.with_arg("l1_batch_number", &min_batch_number)
.fetch_optional(self.storage)
.await
.map(|record| record.map(|record| L1BatchNumber(record.l1_batch_number as u32)))
.await?
.map(Into::into);

Ok(locked_batch)
}

pub async fn unlock_batch(
&mut self,
l1_batch_number: L1BatchNumber,
tee_type: TeeType,
status: TeeProofGenerationJobStatus,
) -> DalResult<()> {
let batch_number = i64::from(l1_batch_number.0);
sqlx::query!(
Expand All @@ -120,7 +149,7 @@ impl TeeProofGenerationDal<'_, '_> {
l1_batch_number = $2
AND tee_type = $3
"#,
TeeProofGenerationJobStatus::Unpicked.to_string(),
status.to_string(),
batch_number,
tee_type.to_string()
)
Expand Down Expand Up @@ -266,7 +295,7 @@ impl TeeProofGenerationDal<'_, '_> {
"#,
batch_number,
tee_type.to_string(),
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
);
let instrumentation = Instrumented::new("insert_tee_proof_generation_job")
.with_arg("l1_batch_number", &batch_number)
Expand All @@ -281,7 +310,7 @@ impl TeeProofGenerationDal<'_, '_> {
}

/// For testing purposes only.
pub async fn get_oldest_unpicked_batch(&mut self) -> DalResult<Option<L1BatchNumber>> {
pub async fn get_oldest_picked_by_prover_batch(&mut self) -> DalResult<Option<L1BatchNumber>> {
let query = sqlx::query!(
r#"
SELECT
Expand All @@ -295,7 +324,7 @@ impl TeeProofGenerationDal<'_, '_> {
LIMIT
1
"#,
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
);
let batch_number = Instrumented::new("get_oldest_unpicked_batch")
.with(query)
Expand Down
2 changes: 2 additions & 0 deletions core/lib/env_config/src/proof_data_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ mod tests {
tee_support: true,
first_tee_processed_batch: L1BatchNumber(1337),
tee_proof_generation_timeout_in_secs: 600,
tee_batch_permanently_ignored_timeout_in_hours: 240,
},
}
}
Expand All @@ -41,6 +42,7 @@ mod tests {
PROOF_DATA_HANDLER_TEE_SUPPORT="true"
PROOF_DATA_HANDLER_FIRST_TEE_PROCESSED_BATCH="1337"
PROOF_DATA_HANDLER_TEE_PROOF_GENERATION_TIMEOUT_IN_SECS="600"
PROOF_DATA_HANDLER_TEE_BATCH_PERMANENTLY_IGNORED_TIMEOUT_IN_HOURS="240"
"#;
let mut lock = MUTEX.lock();
lock.set_env(config);
Expand Down
1 change: 0 additions & 1 deletion core/lib/object_store/src/retries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ impl Request<'_> {
backoff_secs *= 2;
}
Err(err) => {
tracing::warn!(%err, "Failed request with a fatal error");
break Err(err);
}
}
Expand Down
11 changes: 11 additions & 0 deletions core/lib/protobuf_config/src/proof_data_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ impl ProtoRepr for proto::ProofDataHandler {
.unwrap_or_else(
configs::TeeConfig::default_tee_proof_generation_timeout_in_secs,
),
tee_batch_permanently_ignored_timeout_in_hours: self
.tee_batch_permanently_ignored_timeout_in_hours
.map(|x| x as u16)
.unwrap_or_else(
configs::TeeConfig::default_tee_batch_permanently_ignored_timeout_in_hours,
),
},
})
}
Expand All @@ -42,6 +48,11 @@ impl ProtoRepr for proto::ProofDataHandler {
tee_proof_generation_timeout_in_secs: Some(
this.tee_config.tee_proof_generation_timeout_in_secs.into(),
),
tee_batch_permanently_ignored_timeout_in_hours: Some(
this.tee_config
.tee_batch_permanently_ignored_timeout_in_hours
.into(),
),
}
}
}
1 change: 1 addition & 0 deletions core/lib/protobuf_config/src/proto/config/prover.proto
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,5 @@ message ProofDataHandler {
optional bool tee_support = 3; // optional
optional uint64 first_tee_processed_batch = 4; // optional
optional uint32 tee_proof_generation_timeout_in_secs = 5; // optional
optional uint32 tee_batch_permanently_ignored_timeout_in_hours = 6; // optional
}
2 changes: 1 addition & 1 deletion core/lib/types/src/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use serde_json::Value;
use serde_with::{hex::Hex, serde_as};
use strum::Display;
use zksync_basic_types::{
tee_types::TeeType,
web3::{AccessList, Bytes, Index},
Bloom, L1BatchNumber, H160, H256, H64, U256, U64,
};
Expand All @@ -16,6 +15,7 @@ pub use crate::transaction_request::{
use crate::{
debug_flat_call::{DebugCallFlat, ResultDebugCallFlat},
protocol_version::L1VerifierConfig,
tee_types::TeeType,
Address, L2BlockNumber, ProtocolVersionId,
};

Expand Down
2 changes: 0 additions & 2 deletions core/node/proof_data_handler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ tracing.workspace = true

[dev-dependencies]
hyper.workspace = true
chrono.workspace = true
zksync_multivm.workspace = true
serde_json.workspace = true
tower.workspace = true
zksync_basic_types.workspace = true
zksync_contracts.workspace = true
Loading
Loading