From 7d96c1c745d84f48592456b28a82ed76acd42cc5 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Thu, 28 Nov 2024 12:13:51 +0100 Subject: [PATCH 1/7] fix(tee): fix race condition in batch locking After [scaling][1] [zksync-tee-prover][2] to two instances/replicas on Azure for azure-stage2, azure-testnet2, and azure-mainnet2, we started experiencing [duplicated proving for some batches][3]. While this is not an erroneous situation, it is wasteful from a resource perspective. This was due to a race condition in batch locking. This PR fixes the issue by adding atomic batch locking. [1]: https://github.com/matter-labs/gitops-kubernetes/pull/7033/files [2]: https://github.com/matter-labs/zksync-era/blob/aaca32b6ab411d5cdc1234c20af8b5c1092195d7/core/bin/zksync_tee_prover/src/main.rs [3]: https://grafana.matterlabs.dev/goto/M1I_Bq7HR?orgId=1 --- ...34096d9357e229fc86fa82ace3a4ec32406b7.json | 26 +++++ ...82b0fa233913582fe9091cc1e8954dfd0eb1b.json | 30 ++++++ ...8203a62629904bc4956249e690a8ad7a48983.json | 32 ------ core/lib/dal/src/models/storage_tee_proof.rs | 5 + core/lib/dal/src/tee_proof_generation_dal.rs | 99 +++++++++++-------- 5 files changed, 121 insertions(+), 71 deletions(-) create mode 100644 core/lib/dal/.sqlx/query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json create mode 100644 core/lib/dal/.sqlx/query-4777de5d3f313f1eb8c3b6a4c1782b0fa233913582fe9091cc1e8954dfd0eb1b.json delete mode 100644 core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json diff --git a/core/lib/dal/.sqlx/query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json b/core/lib/dal/.sqlx/query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json new file mode 100644 index 000000000000..7491c656225b --- /dev/null +++ b/core/lib/dal/.sqlx/query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json @@ -0,0 +1,26 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n FETCH FIRST ROW ONLY\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "l1_batch_number", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [ + "Text", + "Text", + "Text", + "Interval", + "Int8" + ] + }, + "nullable": [ + false + ] + }, + "hash": "32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7" +} diff --git a/core/lib/dal/.sqlx/query-4777de5d3f313f1eb8c3b6a4c1782b0fa233913582fe9091cc1e8954dfd0eb1b.json b/core/lib/dal/.sqlx/query-4777de5d3f313f1eb8c3b6a4c1782b0fa233913582fe9091cc1e8954dfd0eb1b.json new file mode 100644 index 000000000000..37adf92582e7 --- /dev/null +++ b/core/lib/dal/.sqlx/query-4777de5d3f313f1eb8c3b6a4c1782b0fa233913582fe9091cc1e8954dfd0eb1b.json @@ -0,0 +1,30 @@ +{ + "db_name": "PostgreSQL", + "query": "\n INSERT INTO\n tee_proof_generation_details (\n l1_batch_number, tee_type, status, created_at, updated_at, prover_taken_at\n )\n VALUES\n (\n $1,\n $2,\n $3,\n NOW(),\n NOW(),\n NOW()\n )\n ON CONFLICT (l1_batch_number, tee_type) DO\n UPDATE\n SET\n status = $3,\n updated_at = NOW(),\n prover_taken_at = NOW()\n RETURNING\n l1_batch_number,\n created_at\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "l1_batch_number", + "type_info": "Int8" + }, + { + "ordinal": 1, + "name": "created_at", + "type_info": "Timestamp" + } + ], + "parameters": { + "Left": [ + "Int8", + "Text", + "Text" + ] + }, + "nullable": [ + false, + false + ] + }, + "hash": "4777de5d3f313f1eb8c3b6a4c1782b0fa233913582fe9091cc1e8954dfd0eb1b" +} diff --git a/core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json b/core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json deleted file mode 100644 index 7ca2c9e7e9fa..000000000000 --- a/core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "\n WITH upsert AS (\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n FETCH FIRST ROW ONLY\n )\n \n INSERT INTO\n tee_proof_generation_details (\n l1_batch_number, tee_type, status, created_at, updated_at, prover_taken_at\n )\n SELECT\n l1_batch_number,\n $1,\n $2,\n NOW(),\n NOW(),\n NOW()\n FROM\n upsert\n ON CONFLICT (l1_batch_number, tee_type) DO\n UPDATE\n SET\n status = $2,\n updated_at = NOW(),\n prover_taken_at = NOW()\n RETURNING\n l1_batch_number,\n created_at\n ", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "l1_batch_number", - "type_info": "Int8" - }, - { - "ordinal": 1, - "name": "created_at", - "type_info": "Timestamp" - } - ], - "parameters": { - "Left": [ - "Text", - "Text", - "Text", - "Interval", - "Int8" - ] - }, - "nullable": [ - false, - false - ] - }, - "hash": "e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983" -} diff --git a/core/lib/dal/src/models/storage_tee_proof.rs b/core/lib/dal/src/models/storage_tee_proof.rs index 6e031674b585..8ab9517cefa9 100644 --- a/core/lib/dal/src/models/storage_tee_proof.rs +++ b/core/lib/dal/src/models/storage_tee_proof.rs @@ -13,6 +13,11 @@ pub struct StorageTeeProof { pub attestation: Option>, } +#[derive(Debug, Clone, sqlx::FromRow)] +pub struct StorageBatch { + pub l1_batch_number: i64, +} + #[derive(Debug, Clone, sqlx::FromRow)] pub struct StorageLockedBatch { pub l1_batch_number: i64, diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index e6b2df974b26..16cc321c080b 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -13,7 +13,7 @@ use zksync_db_connection::{ use zksync_types::{tee_types::TeeType, L1BatchNumber}; use crate::{ - models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof}, + models::storage_tee_proof::{StorageBatch, StorageLockedBatch, StorageTeeProof}, Core, }; @@ -64,72 +64,93 @@ impl TeeProofGenerationDal<'_, '_> { ) -> DalResult> { let processing_timeout = pg_interval_from_duration(processing_timeout); let min_batch_number = i64::from(min_batch_number.0); + let mut transaction = self.storage.start_transaction().await?; + let batch_number = sqlx::query_as!( + StorageBatch, + r#" + SELECT + p.l1_batch_number + FROM + proof_generation_details p + LEFT JOIN + tee_proof_generation_details tee + ON + p.l1_batch_number = tee.l1_batch_number + AND tee.tee_type = $1 + WHERE + ( + p.l1_batch_number >= $5 + AND p.vm_run_data_blob_url IS NOT NULL + AND p.proof_gen_data_blob_url IS NOT NULL + ) + AND ( + tee.l1_batch_number IS NULL + OR ( + (tee.status = $2 OR tee.status = $3) + AND tee.prover_taken_at < NOW() - $4::INTERVAL + ) + ) + FETCH FIRST ROW ONLY + "#, + tee_type.to_string(), + TeeProofGenerationJobStatus::PickedByProver.to_string(), + TeeProofGenerationJobStatus::Failed.to_string(), + processing_timeout, + min_batch_number + ) + .instrument("lock_batch_for_proving#get_batch_no") + .with_arg("tee_type", &tee_type) + .with_arg("processing_timeout", &processing_timeout) + .with_arg("min_batch_number", &min_batch_number) + .fetch_optional(&mut transaction) + .await?; + + let batch_number = match batch_number { + Some(batch) => batch.l1_batch_number, + None => { + transaction.commit().await?; + return Ok(None); + } + }; + let locked_batch = sqlx::query_as!( StorageLockedBatch, r#" - WITH upsert AS ( - SELECT - p.l1_batch_number - FROM - proof_generation_details p - LEFT JOIN - tee_proof_generation_details tee - ON - p.l1_batch_number = tee.l1_batch_number - AND tee.tee_type = $1 - WHERE - ( - p.l1_batch_number >= $5 - AND p.vm_run_data_blob_url IS NOT NULL - AND p.proof_gen_data_blob_url IS NOT NULL - ) - AND ( - tee.l1_batch_number IS NULL - OR ( - (tee.status = $2 OR tee.status = $3) - AND tee.prover_taken_at < NOW() - $4::INTERVAL - ) - ) - FETCH FIRST ROW ONLY - ) - INSERT INTO tee_proof_generation_details ( l1_batch_number, tee_type, status, created_at, updated_at, prover_taken_at ) - SELECT - l1_batch_number, + VALUES + ( $1, $2, + $3, NOW(), NOW(), NOW() - FROM - upsert + ) ON CONFLICT (l1_batch_number, tee_type) DO UPDATE SET - status = $2, + status = $3, updated_at = NOW(), prover_taken_at = NOW() RETURNING l1_batch_number, created_at "#, + batch_number, tee_type.to_string(), TeeProofGenerationJobStatus::PickedByProver.to_string(), - TeeProofGenerationJobStatus::Failed.to_string(), - processing_timeout, - min_batch_number ) - .instrument("lock_batch_for_proving") + .instrument("lock_batch_for_proving#insert") + .with_arg("batch_number", &batch_number) .with_arg("tee_type", &tee_type) - .with_arg("processing_timeout", &processing_timeout) - .with_arg("l1_batch_number", &min_batch_number) - .fetch_optional(self.storage) + .fetch_optional(&mut transaction) .await? .map(Into::into); + transaction.commit().await?; Ok(locked_batch) } From 464b5d4f1a189304fc442eebe847fee2a5d714d5 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Fri, 29 Nov 2024 19:08:35 +0100 Subject: [PATCH 2/7] Addressed review comments --- core/lib/dal/src/models/storage_tee_proof.rs | 5 ----- core/lib/dal/src/tee_proof_generation_dal.rs | 10 +++++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/core/lib/dal/src/models/storage_tee_proof.rs b/core/lib/dal/src/models/storage_tee_proof.rs index 8ab9517cefa9..6e031674b585 100644 --- a/core/lib/dal/src/models/storage_tee_proof.rs +++ b/core/lib/dal/src/models/storage_tee_proof.rs @@ -13,11 +13,6 @@ pub struct StorageTeeProof { pub attestation: Option>, } -#[derive(Debug, Clone, sqlx::FromRow)] -pub struct StorageBatch { - pub l1_batch_number: i64, -} - #[derive(Debug, Clone, sqlx::FromRow)] pub struct StorageLockedBatch { pub l1_batch_number: i64, diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index 16cc321c080b..6ea743afd75d 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -13,7 +13,7 @@ use zksync_db_connection::{ use zksync_types::{tee_types::TeeType, L1BatchNumber}; use crate::{ - models::storage_tee_proof::{StorageBatch, StorageLockedBatch, StorageTeeProof}, + models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof}, Core, }; @@ -65,8 +65,7 @@ impl TeeProofGenerationDal<'_, '_> { let processing_timeout = pg_interval_from_duration(processing_timeout); let min_batch_number = i64::from(min_batch_number.0); let mut transaction = self.storage.start_transaction().await?; - let batch_number = sqlx::query_as!( - StorageBatch, + let batch_number = sqlx::query!( r#" SELECT p.l1_batch_number @@ -90,7 +89,9 @@ impl TeeProofGenerationDal<'_, '_> { AND tee.prover_taken_at < NOW() - $4::INTERVAL ) ) - FETCH FIRST ROW ONLY + LIMIT 1 + FOR UPDATE OF p + SKIP LOCKED "#, tee_type.to_string(), TeeProofGenerationJobStatus::PickedByProver.to_string(), @@ -108,7 +109,6 @@ impl TeeProofGenerationDal<'_, '_> { let batch_number = match batch_number { Some(batch) => batch.l1_batch_number, None => { - transaction.commit().await?; return Ok(None); } }; From 574896b0c344bbf05546d7a88abfafafc3077ef9 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Fri, 29 Nov 2024 19:19:11 +0100 Subject: [PATCH 3/7] fixup! Addressed review comments --- ...09348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename core/lib/dal/.sqlx/{query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json => query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json} (87%) diff --git a/core/lib/dal/.sqlx/query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json b/core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json similarity index 87% rename from core/lib/dal/.sqlx/query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json rename to core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json index 7491c656225b..6266f93e6545 100644 --- a/core/lib/dal/.sqlx/query-32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7.json +++ b/core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n FETCH FIRST ROW ONLY\n ", + "query": "\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n LIMIT 1\n FOR UPDATE OF p\n SKIP LOCKED\n ", "describe": { "columns": [ { @@ -22,5 +22,5 @@ false ] }, - "hash": "32785942bbbcc7e8212c8e401ab34096d9357e229fc86fa82ace3a4ec32406b7" + "hash": "8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa" } From 6796543ce36a68238eaad3211031f421a0b041f3 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Tue, 3 Dec 2024 12:46:58 +0100 Subject: [PATCH 4/7] Address Alex's code review comments --- core/lib/dal/src/tee_proof_generation_dal.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index 6ea743afd75d..5887a0d33313 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -65,6 +65,11 @@ impl TeeProofGenerationDal<'_, '_> { let processing_timeout = pg_interval_from_duration(processing_timeout); let min_batch_number = i64::from(min_batch_number.0); let mut transaction = self.storage.start_transaction().await?; + + // Lock rows in the proof_generation_details table to prevent race conditions. The + // tee_proof_generation_details table may not have corresponding entries yet if this is the + // first time the query is invoked for a batch. Locking rows in proof_generation_details + // ensures that two different TEE prover instances will not try to prove the same batch. let batch_number = sqlx::query!( r#" SELECT From ecd8c335272f6d80ca017f521c5282c469490168 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Tue, 3 Dec 2024 13:11:05 +0100 Subject: [PATCH 5/7] fixup! Address Alex's code review comments --- core/lib/dal/src/tee_proof_generation_dal.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index 5887a0d33313..61a9e23ffea5 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -67,7 +67,7 @@ impl TeeProofGenerationDal<'_, '_> { let mut transaction = self.storage.start_transaction().await?; // Lock rows in the proof_generation_details table to prevent race conditions. The - // tee_proof_generation_details table may not have corresponding entries yet if this is the + // tee_proof_generation_details table does not have corresponding entries yet if this is the // first time the query is invoked for a batch. Locking rows in proof_generation_details // ensures that two different TEE prover instances will not try to prove the same batch. let batch_number = sqlx::query!( From a1f99abbbf3b0b7fbf2aaef2676ba23ad3c77991 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Tue, 3 Dec 2024 14:24:18 +0100 Subject: [PATCH 6/7] Address Harald's code review comments --- ...6822a6e92698fcfcf5d0a9252d84b75459b2664.json} | 4 ++-- core/lib/dal/src/tee_proof_generation_dal.rs | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) rename core/lib/dal/.sqlx/{query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json => query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json} (87%) diff --git a/core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json b/core/lib/dal/.sqlx/query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json similarity index 87% rename from core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json rename to core/lib/dal/.sqlx/query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json index 6266f93e6545..b206d337201b 100644 --- a/core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json +++ b/core/lib/dal/.sqlx/query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n LIMIT 1\n FOR UPDATE OF p\n SKIP LOCKED\n ", + "query": "\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n LIMIT 1\n ", "describe": { "columns": [ { @@ -22,5 +22,5 @@ false ] }, - "hash": "8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa" + "hash": "b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664" } diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index 61a9e23ffea5..12761a3d6d34 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -66,10 +66,16 @@ impl TeeProofGenerationDal<'_, '_> { let min_batch_number = i64::from(min_batch_number.0); let mut transaction = self.storage.start_transaction().await?; - // Lock rows in the proof_generation_details table to prevent race conditions. The - // tee_proof_generation_details table does not have corresponding entries yet if this is the - // first time the query is invoked for a batch. Locking rows in proof_generation_details - // ensures that two different TEE prover instances will not try to prove the same batch. + // Lock the entire tee_proof_generation_details table in EXCLUSIVE mode to prevent race + // conditions. Locking the table ensures that two different TEE prover instances will not + // try to prove the same batch. + sqlx::query("LOCK TABLE tee_proof_generation_details IN EXCLUSIVE MODE") + .instrument("lock_batch_for_proving#lock_table") + .execute(&mut transaction) + .await?; + + // The tee_proof_generation_details table does not have corresponding entries yet if this is + // the first time the query is invoked for a batch. let batch_number = sqlx::query!( r#" SELECT @@ -95,8 +101,6 @@ impl TeeProofGenerationDal<'_, '_> { ) ) LIMIT 1 - FOR UPDATE OF p - SKIP LOCKED "#, tee_type.to_string(), TeeProofGenerationJobStatus::PickedByProver.to_string(), From de8de1148cb8e721b507d291df901320d1b5cec7 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Tue, 3 Dec 2024 17:06:00 +0100 Subject: [PATCH 7/7] Revert "Address Harald's code review comments" This reverts commit a1f99abbbf3b0b7fbf2aaef2676ba23ad3c77991. --- ...989f73e353da3bc6af7ecb81102c4194df631aa.json} | 4 ++-- core/lib/dal/src/tee_proof_generation_dal.rs | 16 ++++++---------- 2 files changed, 8 insertions(+), 12 deletions(-) rename core/lib/dal/.sqlx/{query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json => query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json} (87%) diff --git a/core/lib/dal/.sqlx/query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json b/core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json similarity index 87% rename from core/lib/dal/.sqlx/query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json rename to core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json index b206d337201b..6266f93e6545 100644 --- a/core/lib/dal/.sqlx/query-b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664.json +++ b/core/lib/dal/.sqlx/query-8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n LIMIT 1\n ", + "query": "\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n (tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n LIMIT 1\n FOR UPDATE OF p\n SKIP LOCKED\n ", "describe": { "columns": [ { @@ -22,5 +22,5 @@ false ] }, - "hash": "b6961d273f833f8babaf16f256822a6e92698fcfcf5d0a9252d84b75459b2664" + "hash": "8ead57cdda5909348f31f8c4d989f73e353da3bc6af7ecb81102c4194df631aa" } diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index 12761a3d6d34..61a9e23ffea5 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -66,16 +66,10 @@ impl TeeProofGenerationDal<'_, '_> { let min_batch_number = i64::from(min_batch_number.0); let mut transaction = self.storage.start_transaction().await?; - // Lock the entire tee_proof_generation_details table in EXCLUSIVE mode to prevent race - // conditions. Locking the table ensures that two different TEE prover instances will not - // try to prove the same batch. - sqlx::query("LOCK TABLE tee_proof_generation_details IN EXCLUSIVE MODE") - .instrument("lock_batch_for_proving#lock_table") - .execute(&mut transaction) - .await?; - - // The tee_proof_generation_details table does not have corresponding entries yet if this is - // the first time the query is invoked for a batch. + // Lock rows in the proof_generation_details table to prevent race conditions. The + // tee_proof_generation_details table does not have corresponding entries yet if this is the + // first time the query is invoked for a batch. Locking rows in proof_generation_details + // ensures that two different TEE prover instances will not try to prove the same batch. let batch_number = sqlx::query!( r#" SELECT @@ -101,6 +95,8 @@ impl TeeProofGenerationDal<'_, '_> { ) ) LIMIT 1 + FOR UPDATE OF p + SKIP LOCKED "#, tee_type.to_string(), TeeProofGenerationJobStatus::PickedByProver.to_string(),