From b00e46c6c74cecd54b135870cff960447a1425a5 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 2 Sep 2025 10:51:41 +0800 Subject: [PATCH 01/33] refine --- .../servers/http/v1/http_query_handlers.rs | 11 ++- .../servers/http/v1/query/execute_state.rs | 85 ++++++++++--------- .../src/servers/http/v1/query/http_query.rs | 3 +- .../src/servers/http/v1/query/page_manager.rs | 9 +- .../src/servers/http/v1/streaming_load.rs | 14 +-- 5 files changed, 61 insertions(+), 61 deletions(-) diff --git a/src/query/service/src/servers/http/v1/http_query_handlers.rs b/src/query/service/src/servers/http/v1/http_query_handlers.rs index 8c5cc12775c9b..0aeac2c74af19 100644 --- a/src/query/service/src/servers/http/v1/http_query_handlers.rs +++ b/src/query/service/src/servers/http/v1/http_query_handlers.rs @@ -31,7 +31,6 @@ use databend_common_exception::ErrorCode; use databend_common_expression::DataSchemaRef; use databend_common_management::WorkloadGroupResourceManager; use databend_common_metrics::http::metrics_incr_http_response_errors_count; -use fastrace::func_path; use fastrace::prelude::*; use http::HeaderMap; use http::HeaderValue; @@ -295,7 +294,7 @@ async fn query_final_handler( Path(query_id): Path, ) -> PoemResult { ctx.check_node_id(&query_id)?; - let root = get_http_tracing_span(func_path!(), ctx, &query_id); + let root = get_http_tracing_span("http::query_final_handler", ctx, &query_id); let _t = SlowRequestLogTracker::new(ctx); async { info!( @@ -336,7 +335,7 @@ async fn query_cancel_handler( Path(query_id): Path, ) -> PoemResult { ctx.check_node_id(&query_id)?; - let root = get_http_tracing_span(func_path!(), ctx, &query_id); + let root = get_http_tracing_span("http::query_cancel_handler", ctx, &query_id); let _t = SlowRequestLogTracker::new(ctx); async { info!( @@ -368,7 +367,7 @@ async fn query_state_handler( Path(query_id): Path, ) -> PoemResult { ctx.check_node_id(&query_id)?; - let root = get_http_tracing_span(func_path!(), ctx, &query_id); + let root = get_http_tracing_span("http::query_state_handler", ctx, &query_id); async { let http_query_manager = HttpQueryManager::instance(); @@ -455,7 +454,7 @@ async fn query_page_handler( }; let query_page_handle = { - let root = get_http_tracing_span(func_path!(), ctx, &query_id); + let root = get_http_tracing_span("http::query_page_handler", ctx, &query_id); let _t = SlowRequestLogTracker::new(ctx); query_page_handle.in_span(root) }; @@ -554,7 +553,7 @@ pub(crate) async fn query_handler( }; let query_handle = { - let root = get_http_tracing_span(func_path!(), ctx, &ctx.query_id); + let root = get_http_tracing_span("http::query_handler", ctx, &ctx.query_id); let _t = SlowRequestLogTracker::new(ctx); query_handle.in_span(root) }; diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index c3202657427ee..1edbb96db8369 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -397,7 +397,7 @@ impl ExecuteState { let ctx_clone = ctx.clone(); let block_sender_closer = block_sender.closer(); - let res = execute( + let res = Self::execute( interpreter, plan.schema(), ctx_clone, @@ -418,49 +418,50 @@ impl ExecuteState { Ok(()) } -} -async fn execute( - interpreter: Arc, - schema: DataSchemaRef, - ctx: Arc, - block_sender: SizedChannelSender, - executor: Arc>, -) -> Result<(), ExecutionError> { - let make_error = || format!("failed to execute {}", interpreter.name()); - - let mut data_stream = interpreter - .execute(ctx.clone()) - .await - .with_context(make_error)?; - match data_stream.next().await { - None => { - let block = DataBlock::empty_with_schema(schema); - block_sender.send(block, 0).await; - Executor::stop::<()>(&executor, Ok(())); - block_sender.close(); - } - Some(Err(err)) => { - Executor::stop(&executor, Err(err)); - block_sender.close(); - } - Some(Ok(block)) => { - let size = block.num_rows(); - block_sender.send(block, size).await; - while let Some(block_r) = data_stream.next().await { - match block_r { - Ok(block) => { - block_sender.send(block.clone(), block.num_rows()).await; - } - Err(err) => { - block_sender.close(); - return Err(err.with_context(make_error())); - } - }; + #[fastrace::trace(name = "ExecuteState::execute")] + async fn execute( + interpreter: Arc, + schema: DataSchemaRef, + ctx: Arc, + block_sender: SizedChannelSender, + executor: Arc>, + ) -> Result<(), ExecutionError> { + let make_error = || format!("failed to execute {}", interpreter.name()); + + let mut data_stream = interpreter + .execute(ctx.clone()) + .await + .with_context(make_error)?; + match data_stream.next().await { + None => { + let block = DataBlock::empty_with_schema(schema); + block_sender.send(block, 0).await; + Executor::stop::<()>(&executor, Ok(())); + block_sender.close(); + } + Some(Err(err)) => { + Executor::stop(&executor, Err(err)); + block_sender.close(); + } + Some(Ok(block)) => { + let size = block.num_rows(); + block_sender.send(block, size).await; + while let Some(block_r) = data_stream.next().await { + match block_r { + Ok(block) => { + block_sender.send(block.clone(), block.num_rows()).await; + } + Err(err) => { + block_sender.close(); + return Err(err.with_context(make_error())); + } + }; + } + Executor::stop::<()>(&executor, Ok(())); + block_sender.close(); } - Executor::stop::<()>(&executor, Ok(())); - block_sender.close(); } + Ok(()) } - Ok(()) } diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 8cfccce77673a..8bd6b38663f76 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -651,7 +651,7 @@ impl HttpQuery { } #[async_backtrace::framed] - #[fastrace::trace] + #[fastrace::trace(name = "HttpQuery::get_response_page",properties = {"page_no": "{page_no}"})] pub async fn get_response_page(&self, page_no: usize) -> Result { let data = Some(self.get_page(page_no).await?); let state = self.get_state(); @@ -922,6 +922,7 @@ impl HttpQuery { } #[async_backtrace::framed] + #[fastrace::trace(name = "HttpQuery::on_heartbeat")] pub fn on_heartbeat(&self) -> bool { let mut expire_state = self.state.lock(); match *expire_state { diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index 1a4fa4d9f45ba..44e1e9aa8b3a0 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -24,6 +24,8 @@ use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; +use fastrace::future::FutureExt; +use fastrace::Span; use itertools::Itertools; use log::debug; use log::info; @@ -88,6 +90,7 @@ impl PageManager { } #[async_backtrace::framed] + #[fastrace::trace(name = "PageManager::get_a_page")] pub async fn get_a_page(&mut self, page_no: usize, tp: &Wait) -> Result { let next_no = self.total_pages; if page_no == next_no { @@ -95,6 +98,7 @@ impl PageManager { if !self.end { let end = self.collect_new_page(&mut serializer, tp).await?; let num_row = serializer.num_rows(); + log::debug!(num_row, wait_type:? = tp; "collect_new_page"); self.total_rows += num_row; let page = Page { data: Arc::new(serializer), @@ -226,7 +230,10 @@ impl PageManager { // timeout() will return Ok if the future completes immediately break; } - match tokio::time::timeout(d, self.block_receiver.recv()).await { + match tokio::time::timeout(d, self.block_receiver.recv()) + .in_span(Span::enter_with_local_parent("PageManager::recv_block")) + .await + { Ok(Some(block)) => { debug!( "[HTTP-QUERY] Received new data block with {} rows", diff --git a/src/query/service/src/servers/http/v1/streaming_load.rs b/src/query/service/src/servers/http/v1/streaming_load.rs index 076404639956e..ca774e703d7ad 100644 --- a/src/query/service/src/servers/http/v1/streaming_load.rs +++ b/src/query/service/src/servers/http/v1/streaming_load.rs @@ -33,7 +33,6 @@ use databend_common_sql::plans::Plan; use databend_common_sql::Planner; use databend_common_storages_stage::BytesBatch; use databend_storages_common_session::TxnState; -use fastrace::func_path; use fastrace::future::FutureExt; use futures::StreamExt; use http::StatusCode; @@ -61,6 +60,7 @@ use crate::servers::http::error::JsonErrorOnly; use crate::servers::http::error::QueryError; use crate::servers::http::middleware::json_header::encode_json_header; use crate::servers::http::middleware::sanitize_request_headers; +use crate::servers::http::v1::http_query_handlers::get_http_tracing_span; use crate::sessions::QueriesQueueManager; use crate::sessions::QueryContext; use crate::sessions::QueryEntry; @@ -93,11 +93,7 @@ fn execute_query( tracking_payload.query_id = Some(id.clone()); tracking_payload.mem_stat = Some(mem_stat); let _tracking_guard = ThreadTracker::tracking(tracking_payload); - let root = crate::servers::http::v1::http_query_handlers::get_http_tracing_span( - func_path!(), - &http_query_context, - &id, - ); + let root = get_http_tracing_span("http::execute_query", &http_query_context, &id); ThreadTracker::tracking_future(fut.in_span(root)) } @@ -113,11 +109,7 @@ pub async fn streaming_load_handler( tracking_payload.query_id = Some(ctx.query_id.clone()); tracking_payload.mem_stat = Some(query_mem_stat.clone()); let _tracking_guard = ThreadTracker::tracking(tracking_payload); - let root = crate::servers::http::v1::http_query_handlers::get_http_tracing_span( - func_path!(), - ctx, - &ctx.query_id, - ); + let root = get_http_tracing_span("http::streaming_load_handler", ctx, &ctx.query_id); let mut session_conf: Option = match req.headers().get(HEADER_QUERY_CONTEXT) { Some(v) => { let s = v.to_str().unwrap().to_string(); From 65c792e901689263ae09d59fd6d0ffe7b2d44d92 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 2 Sep 2025 11:59:22 +0800 Subject: [PATCH 02/33] cleanup --- .../servers/http/v1/query/execute_state.rs | 7 +- .../src/servers/http/v1/query/sized_spsc.rs | 68 ++++++++++--------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index 1edbb96db8369..eea4ce6844f36 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -436,7 +436,7 @@ impl ExecuteState { match data_stream.next().await { None => { let block = DataBlock::empty_with_schema(schema); - block_sender.send(block, 0).await; + block_sender.send(block).await; Executor::stop::<()>(&executor, Ok(())); block_sender.close(); } @@ -445,12 +445,11 @@ impl ExecuteState { block_sender.close(); } Some(Ok(block)) => { - let size = block.num_rows(); - block_sender.send(block, size).await; + block_sender.send(block).await; while let Some(block_r) = data_stream.next().await { match block_r { Ok(block) => { - block_sender.send(block.clone(), block.num_rows()).await; + block_sender.send(block.clone()).await; } Err(err) => { block_sender.close(); diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 25bf6d6e55c37..e3a29b8cde3dd 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -23,17 +23,18 @@ use std::sync::Arc; use std::sync::Mutex; use databend_common_base::base::tokio::sync::Notify; +use databend_common_expression::DataBlock; -struct SizedChannelInner { +struct SizedChannelInner { max_size: usize, - values: VecDeque<(T, usize)>, + values: VecDeque, is_recv_stopped: bool, is_send_stopped: bool, } struct Stopped {} -pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) { +pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) { let chan = Arc::new(SizedChannel::create(max_size)); let cloned = chan.clone(); (SizedChannelSender { chan }, SizedChannelReceiver { @@ -41,7 +42,7 @@ pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelRec }) } -impl SizedChannelInner { +impl SizedChannelInner { pub fn create(max_size: usize) -> Self { SizedChannelInner { max_size, @@ -52,23 +53,24 @@ impl SizedChannelInner { } pub fn size(&self) -> usize { - self.values.iter().map(|x| x.1).sum::() + self.values.iter().map(|x| x.num_rows()).sum::() } - pub fn try_send(&mut self, value: T, size: usize) -> Result, Stopped> { + pub fn try_send(&mut self, value: DataBlock) -> Result, Stopped> { let current_size = self.size(); + let value_size = value.num_rows(); if self.is_recv_stopped || self.is_send_stopped { Err(Stopped {}) - } else if current_size + size <= self.max_size || current_size == 0 { - self.values.push_back((value, size)); + } else if current_size + value_size <= self.max_size || current_size == 0 { + self.values.push_back(value); Ok(None) } else { Ok(Some(value)) } } - pub fn try_recv(&mut self) -> Result, Stopped> { - let v = self.values.pop_front().map(|x| x.0); + pub fn try_recv(&mut self) -> Result, Stopped> { + let v = self.values.pop_front(); if v.is_none() && self.is_send_stopped { Err(Stopped {}) } else { @@ -89,13 +91,13 @@ impl SizedChannelInner { } } -struct SizedChannel { - inner: Mutex>, +struct SizedChannel { + inner: Mutex, notify_on_sent: Notify, notify_on_recv: Notify, } -impl SizedChannel { +impl SizedChannel { fn create(max_size: usize) -> Self { SizedChannel { inner: Mutex::new(SizedChannelInner::create(max_size)), @@ -104,21 +106,21 @@ impl SizedChannel { } } - fn try_send(&self, value: T, size: usize) -> Result, Stopped> { + fn try_send(&self, value: DataBlock) -> Result, Stopped> { let mut guard = self.inner.lock().unwrap(); - guard.try_send(value, size) + guard.try_send(value) } - pub fn try_recv(&self) -> Result, Stopped> { + pub fn try_recv(&self) -> Result, Stopped> { let mut guard = self.inner.lock().unwrap(); guard.try_recv() } #[async_backtrace::framed] - pub async fn send(&self, value: T, size: usize) -> bool { + pub async fn send(&self, value: DataBlock) -> bool { let mut to_send = value; loop { - match self.try_send(to_send, size) { + match self.try_send(to_send) { Ok(Some(v)) => { to_send = v; self.notify_on_recv.notified().await; @@ -133,7 +135,7 @@ impl SizedChannel { } #[async_backtrace::framed] - pub async fn recv(&self) -> Option { + pub async fn recv(&self) -> Option { loop { match self.try_recv() { Ok(Some(v)) => { @@ -170,17 +172,17 @@ impl SizedChannel { } } -pub struct SizedChannelReceiver { - chan: Arc>, +pub struct SizedChannelReceiver { + chan: Arc, } -impl SizedChannelReceiver { +impl SizedChannelReceiver { #[async_backtrace::framed] - pub async fn recv(&self) -> Option { + pub async fn recv(&self) -> Option { self.chan.recv().await } - pub fn try_recv(&self) -> Option { + pub fn try_recv(&self) -> Option { self.chan.try_recv().unwrap_or_default() } @@ -194,32 +196,32 @@ impl SizedChannelReceiver { } #[derive(Clone)] -pub struct SizedChannelSender { - chan: Arc>, +pub struct SizedChannelSender { + chan: Arc, } -impl SizedChannelSender { +impl SizedChannelSender { #[async_backtrace::framed] - pub async fn send(&self, value: T, size: usize) -> bool { - self.chan.send(value, size).await + pub async fn send(&self, value: DataBlock) -> bool { + self.chan.send(value).await } pub fn close(&self) { self.chan.stop_send() } - pub fn closer(&self) -> SizedChannelSenderCloser { + pub fn closer(&self) -> SizedChannelSenderCloser { SizedChannelSenderCloser { chan: self.chan.clone(), } } } -pub struct SizedChannelSenderCloser { - chan: Arc>, +pub struct SizedChannelSenderCloser { + chan: Arc, } -impl SizedChannelSenderCloser { +impl SizedChannelSenderCloser { pub fn close(&self) { self.chan.stop_send() } From 956d94304cc1b7017325c00ea0c19c77bb9b3f00 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 2 Sep 2025 13:10:58 +0800 Subject: [PATCH 03/33] x --- src/query/service/src/servers/http/v1/mod.rs | 1 + .../http/v1/query/blocks_serializer.rs | 55 ++-- .../servers/http/v1/query/execute_state.rs | 17 +- .../src/servers/http/v1/query/http_query.rs | 9 - .../service/src/servers/http/v1/query/mod.rs | 2 +- .../src/servers/http/v1/query/page_manager.rs | 173 +------------ .../src/servers/http/v1/query/sized_spsc.rs | 238 ++++++++++++++---- .../tests/it/servers/http/json_block.rs | 10 +- 8 files changed, 244 insertions(+), 261 deletions(-) diff --git a/src/query/service/src/servers/http/v1/mod.rs b/src/query/service/src/servers/http/v1/mod.rs index f099b9d993b2a..5ad6d628319b9 100644 --- a/src/query/service/src/servers/http/v1/mod.rs +++ b/src/query/service/src/servers/http/v1/mod.rs @@ -31,6 +31,7 @@ pub use http_query_handlers::query_route; pub use http_query_handlers::QueryResponse; pub use http_query_handlers::QueryResponseField; pub use http_query_handlers::QueryStats; +pub use query::blocks_serializer::BlocksCollector; pub use query::blocks_serializer::BlocksSerializer; pub use query::ExecuteStateKind; pub use query::ExpiringMap; diff --git a/src/query/service/src/servers/http/v1/query/blocks_serializer.rs b/src/query/service/src/servers/http/v1/query/blocks_serializer.rs index 458945f833cc8..b6b9056a4ed84 100644 --- a/src/query/service/src/servers/http/v1/query/blocks_serializer.rs +++ b/src/query/service/src/servers/http/v1/query/blocks_serializer.rs @@ -18,7 +18,9 @@ use std::ops::DerefMut; use databend_common_expression::types::date::date_to_string; use databend_common_expression::types::interval::interval_to_string; use databend_common_expression::types::timestamp::timestamp_to_string; +use databend_common_expression::BlockEntry; use databend_common_expression::Column; +use databend_common_expression::DataBlock; use databend_common_formats::field_encoder::FieldEncoderValues; use databend_common_io::ewkb_to_geo; use databend_common_io::geo_to_ewkb; @@ -40,38 +42,51 @@ fn data_is_null(column: &Column, row_index: usize) -> bool { } } -#[derive(Debug, Clone)] -pub struct BlocksSerializer { +#[derive(Debug, Clone, Default)] +pub struct BlocksCollector { // Vec for a Block columns: Vec<(Vec, usize)>, - pub(crate) format: Option, } -impl BlocksSerializer { - pub fn empty() -> Self { - Self { - columns: vec![], - format: None, - } +impl BlocksCollector { + pub fn new() -> Self { + Self { columns: vec![] } } - pub fn new(format: Option) -> Self { - Self { - columns: vec![], - format, - } + pub fn append_columns(&mut self, columns: Vec, num_rows: usize) { + self.columns.push((columns, num_rows)); } - pub fn has_format(&self) -> bool { - self.format.is_some() + pub fn append_block(&mut self, block: DataBlock) { + if block.is_empty() { + return; + } + let columns = block.columns().iter().map(BlockEntry::to_column).collect(); + let num_rows = block.num_rows(); + self.append_columns(columns, num_rows); } - pub fn set_format(&mut self, format: FormatSettings) { - self.format = Some(format); + pub fn into_serializer(self, format: FormatSettings) -> BlocksSerializer { + BlocksSerializer { + columns: self.columns, + format: Some(format), + } } +} - pub fn append(&mut self, columns: Vec, num_rows: usize) { - self.columns.push((columns, num_rows)); +#[derive(Debug, Clone)] +pub struct BlocksSerializer { + // Vec for a Block + columns: Vec<(Vec, usize)>, + format: Option, +} + +impl BlocksSerializer { + pub fn empty() -> Self { + Self { + columns: vec![], + format: None, + } } pub fn is_empty(&self) -> bool { diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index eea4ce6844f36..22a114148372b 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -25,7 +25,6 @@ use databend_common_exception::ResultExt; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::Scalar; -use databend_common_io::prelude::FormatSettings; use databend_common_settings::Settings; use databend_storages_common_session::TxnManagerRef; use futures::StreamExt; @@ -114,7 +113,7 @@ impl ExecuteState { pub struct ExecuteStarting { pub(crate) ctx: Arc, - pub(crate) sender: SizedChannelSender, + pub(crate) sender: SizedChannelSender, } pub struct ExecuteRunning { @@ -355,8 +354,7 @@ impl ExecuteState { sql: String, session: Arc, ctx: Arc, - block_sender: SizedChannelSender, - format_settings: Arc>>, + block_sender: SizedChannelSender, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to start query: {sql}"); @@ -367,11 +365,10 @@ impl ExecuteState { .await .map_err(|err| err.display_with_sql(&sql)) .with_context(make_error)?; - { - // set_var may change settings - let mut guard = format_settings.write(); - *guard = Some(ctx.get_format_settings().with_context(make_error)?); - } + + // set_var may change settings + let format_settings = ctx.get_format_settings().with_context(make_error)?; + block_sender.plan_ready(format_settings); let interpreter = InterpreterFactory::get(ctx.clone(), &plan) .await @@ -424,7 +421,7 @@ impl ExecuteState { interpreter: Arc, schema: DataSchemaRef, ctx: Arc, - block_sender: SizedChannelSender, + block_sender: SizedChannelSender, executor: Arc>, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to execute {}", interpreter.name()); diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 8bd6b38663f76..58bbd8e0a8ba3 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -33,7 +33,6 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_exception::ResultExt; use databend_common_expression::Scalar; -use databend_common_io::prelude::FormatSettings; use databend_common_metrics::http::metrics_incr_http_response_errors_count; use databend_common_settings::ScopeLevel; use databend_storages_common_session::TempTblMgrRef; @@ -624,12 +623,10 @@ impl HttpQuery { let settings = session.get_settings(); let result_timeout_secs = settings.get_http_handler_result_timeout_secs()?; - let format_settings: Arc>> = Default::default(); let data = Arc::new(TokioMutex::new(PageManager::new( req.pagination.max_rows_per_page, block_receiver, - format_settings, ))); Ok(HttpQuery { @@ -802,11 +799,6 @@ impl HttpQuery { let query_state = self.executor.clone(); - let query_format_settings = { - let page_manager = self.page_manager.lock().await; - page_manager.format_settings.clone() - }; - GlobalQueryRuntime::instance().runtime().try_spawn( async move { if let Err(e) = CatchUnwindFuture::create(ExecuteState::try_start_query( @@ -815,7 +807,6 @@ impl HttpQuery { query_session, query_context.clone(), block_sender.clone(), - query_format_settings, )) .await .with_context(|| "failed to start query") diff --git a/src/query/service/src/servers/http/v1/query/mod.rs b/src/query/service/src/servers/http/v1/query/mod.rs index 215def1e4bc95..1d7756c297361 100644 --- a/src/query/service/src/servers/http/v1/query/mod.rs +++ b/src/query/service/src/servers/http/v1/query/mod.rs @@ -38,4 +38,4 @@ pub use http_query_manager::HttpQueryManager; pub(crate) use http_query_manager::RemoveReason; pub use page_manager::PageManager; pub use page_manager::ResponseData; -pub use page_manager::Wait; +pub use sized_spsc::Wait; diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index 44e1e9aa8b3a0..d7c68b72b6b6a 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -12,33 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp::min; -use std::ops::Range; use std::sync::Arc; -use std::time::Instant; -use databend_common_base::base::tokio; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::BlockEntry; -use databend_common_expression::Column; -use databend_common_expression::DataBlock; -use databend_common_io::prelude::FormatSettings; -use fastrace::future::FutureExt; -use fastrace::Span; -use itertools::Itertools; -use log::debug; -use log::info; -use parking_lot::RwLock; use super::blocks_serializer::BlocksSerializer; use crate::servers::http::v1::query::sized_spsc::SizedChannelReceiver; - -#[derive(Debug, PartialEq, Eq)] -pub enum Wait { - Async, - Deadline(Instant), -} +use crate::servers::http::v1::query::sized_spsc::Wait; #[derive(Clone)] pub struct Page { @@ -55,29 +36,19 @@ pub struct PageManager { total_rows: usize, total_pages: usize, end: bool, - block_end: bool, last_page: Option, - row_buffer: Option>, - block_receiver: SizedChannelReceiver, - pub(crate) format_settings: Arc>>, + block_receiver: SizedChannelReceiver, } impl PageManager { - pub fn new( - max_rows_per_page: usize, - block_receiver: SizedChannelReceiver, - format_settings: Arc>>, - ) -> PageManager { + pub fn new(max_rows_per_page: usize, block_receiver: SizedChannelReceiver) -> PageManager { PageManager { total_rows: 0, last_page: None, total_pages: 0, end: false, - block_end: false, - row_buffer: Default::default(), block_receiver, max_rows_per_page, - format_settings, } } @@ -94,9 +65,11 @@ impl PageManager { pub async fn get_a_page(&mut self, page_no: usize, tp: &Wait) -> Result { let next_no = self.total_pages; if page_no == next_no { - let mut serializer = BlocksSerializer::new(self.format_settings.read().clone()); if !self.end { - let end = self.collect_new_page(&mut serializer, tp).await?; + let (serializer, end) = self + .block_receiver + .collect_new_page(self.max_rows_per_page, tp) + .await?; let num_row = serializer.num_rows(); log::debug!(num_row, wait_type:? = tp; "collect_new_page"); self.total_rows += num_row; @@ -114,7 +87,7 @@ impl PageManager { // but the response may be lost and client will retry, // we simply return an empty page. let page = Page { - data: Arc::new(serializer), + data: Arc::new(BlocksSerializer::empty()), }; Ok(page) } @@ -137,139 +110,9 @@ impl PageManager { } } - fn append_block( - &mut self, - serializer: &mut BlocksSerializer, - block: DataBlock, - remain_rows: &mut usize, - remain_size: &mut usize, - ) -> Result<()> { - assert!(self.row_buffer.is_none()); - if block.is_empty() { - return Ok(()); - } - if !serializer.has_format() { - let guard = self.format_settings.read(); - serializer.set_format(guard.as_ref().unwrap().clone()); - } - - let columns = block - .columns() - .iter() - .map(BlockEntry::to_column) - .collect_vec(); - - let block_memory_size = block.memory_size(); - let mut take_rows = min( - *remain_rows, - if block_memory_size > *remain_size { - (*remain_size * block.num_rows()) / block_memory_size - } else { - block.num_rows() - }, - ); - // this means that the data in remaining_size cannot satisfy even one row. - if take_rows == 0 { - take_rows = 1; - } - - if take_rows == block.num_rows() { - // theoretically, it should always be smaller than the memory_size of the block. - *remain_size -= min(*remain_size, block_memory_size); - *remain_rows -= take_rows; - serializer.append(columns, block.num_rows()); - } else { - // Since not all rows of the block are used, either the size limit or the row limit must have been exceeded. - // simply set any of remain_xxx to end the page. - *remain_rows = 0; - let fn_slice = |columns: &[Column], range: Range| { - columns - .iter() - .map(|column| column.slice(range.clone())) - .collect_vec() - }; - - serializer.append(fn_slice(&columns, 0..take_rows), take_rows); - self.row_buffer = Some(fn_slice(&columns, take_rows..block.num_rows())); - } - Ok(()) - } - - #[async_backtrace::framed] - async fn collect_new_page( - &mut self, - serializer: &mut BlocksSerializer, - tp: &Wait, - ) -> Result { - let mut remain_size = 10 * 1024 * 1024; - let mut remain_rows = self.max_rows_per_page; - while remain_rows > 0 && remain_size > 0 { - let Some(block) = self.row_buffer.take() else { - break; - }; - self.append_block( - serializer, - DataBlock::new_from_columns(block), - &mut remain_rows, - &mut remain_size, - )?; - } - - while remain_rows > 0 && remain_size > 0 { - match tp { - Wait::Async => match self.block_receiver.try_recv() { - Some(block) => { - self.append_block(serializer, block, &mut remain_rows, &mut remain_size)? - } - None => break, - }, - Wait::Deadline(t) => { - let now = Instant::now(); - let d = *t - now; - if d.is_zero() { - // timeout() will return Ok if the future completes immediately - break; - } - match tokio::time::timeout(d, self.block_receiver.recv()) - .in_span(Span::enter_with_local_parent("PageManager::recv_block")) - .await - { - Ok(Some(block)) => { - debug!( - "[HTTP-QUERY] Received new data block with {} rows", - block.num_rows() - ); - self.append_block( - serializer, - block, - &mut remain_rows, - &mut remain_size, - )? - } - Ok(None) => { - info!("[HTTP-QUERY] Reached end of data blocks"); - break; - } - Err(_) => { - debug!("[HTTP-QUERY] Long polling timeout reached"); - break; - } - } - } - } - } - - // try to report 'no more data' earlier to client to avoid unnecessary http call - if !self.block_end { - self.block_end = self.block_receiver.is_empty(); - } - Ok(self.block_end && self.row_buffer.is_none()) - } - #[async_backtrace::framed] pub async fn detach(&mut self) { self.block_receiver.close(); self.last_page = None; - self.row_buffer = None; } } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index e3a29b8cde3dd..bf04dedfae7db 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -18,12 +18,77 @@ //! 1. it is SPSC, enough for now. //! 2. receive can check status of channel with fn is_empty(). +use std::cmp::min; use std::collections::VecDeque; use std::sync::Arc; use std::sync::Mutex; +use std::time::Instant; +use databend_common_base::base::tokio; use databend_common_base::base::tokio::sync::Notify; +use databend_common_base::base::WatchNotify; +use databend_common_exception::ErrorCode; use databend_common_expression::DataBlock; +use databend_common_io::prelude::FormatSettings; +use fastrace::future::FutureExt; +use fastrace::Span; +use log::debug; +use log::info; + +use super::blocks_serializer::BlocksCollector; +use super::blocks_serializer::BlocksSerializer; + +pub struct PageBuilder { + pub collector: BlocksCollector, + pub remain_rows: usize, + pub remain_size: usize, +} + +impl PageBuilder { + fn new(max_rows: usize) -> Self { + Self { + collector: BlocksCollector::new(), + remain_size: 10 * 1024 * 1024, + remain_rows: max_rows, + } + } + + fn has_capacity(&self) -> bool { + self.remain_rows > 0 && self.remain_size > 0 + } + + fn append_full_block(&mut self, block: DataBlock) { + let memory_size = block.memory_size(); + let num_rows = block.num_rows(); + + self.remain_size -= min(self.remain_size, memory_size); + self.remain_rows -= num_rows; + + self.collector.append_block(block); + } + + fn append_partial_block(&mut self, block: DataBlock, take_rows: usize) -> DataBlock { + self.collector.append_block(block.slice(0..take_rows)); + + block.slice(take_rows..block.num_rows()) + } + + fn calculate_take_rows(&self, block_rows: usize, memory_size: usize) -> usize { + min( + self.remain_rows, + if memory_size > self.remain_size { + (self.remain_size * block_rows) / memory_size + } else { + block_rows + }, + ) + .max(1) + } + + fn into_serializer(self, format_settings: FormatSettings) -> BlocksSerializer { + self.collector.into_serializer(format_settings) + } +} struct SizedChannelInner { max_size: usize, @@ -32,8 +97,6 @@ struct SizedChannelInner { is_send_stopped: bool, } -struct Stopped {} - pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) { let chan = Arc::new(SizedChannel::create(max_size)); let cloned = chan.clone(); @@ -43,7 +106,7 @@ pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) } impl SizedChannelInner { - pub fn create(max_size: usize) -> Self { + fn create(max_size: usize) -> Self { SizedChannelInner { max_size, values: Default::default(), @@ -52,49 +115,69 @@ impl SizedChannelInner { } } - pub fn size(&self) -> usize { + fn size(&self) -> usize { self.values.iter().map(|x| x.num_rows()).sum::() } - pub fn try_send(&mut self, value: DataBlock) -> Result, Stopped> { + fn try_send(&mut self, value: DataBlock) -> Result<(), Option> { let current_size = self.size(); let value_size = value.num_rows(); if self.is_recv_stopped || self.is_send_stopped { - Err(Stopped {}) + Err(None) } else if current_size + value_size <= self.max_size || current_size == 0 { self.values.push_back(value); - Ok(None) + Ok(()) } else { - Ok(Some(value)) + Err(Some(value)) } } - pub fn try_recv(&mut self) -> Result, Stopped> { - let v = self.values.pop_front(); - if v.is_none() && self.is_send_stopped { - Err(Stopped {}) - } else { - Ok(v) - } - } - - pub fn is_empty(&self) -> bool { + fn is_close(&self) -> bool { self.values.is_empty() && self.is_send_stopped } - pub fn stop_send(&mut self) { + fn stop_send(&mut self) { self.is_send_stopped = true } - pub fn stop_recv(&mut self) { + fn stop_recv(&mut self) { self.is_recv_stopped = true } + + fn append_block_once(&mut self, builder: &mut PageBuilder) { + let Some(block) = self.values.pop_front() else { + return; + }; + + if block.is_empty() { + return; + } + + let take_rows = builder.calculate_take_rows(block.num_rows(), block.memory_size()); + if take_rows < block.num_rows() { + builder.remain_rows = 0; + let remaining_block = builder.append_partial_block(block, take_rows); + self.values.push_front(remaining_block); + } else { + builder.append_full_block(block); + } + } + + fn append_block(&mut self, builder: &mut PageBuilder) -> bool { + while builder.has_capacity() && !self.values.is_empty() { + self.append_block_once(builder) + } + !self.values.is_empty() + } } struct SizedChannel { inner: Mutex, notify_on_sent: Notify, notify_on_recv: Notify, + + is_plan_ready: WatchNotify, + format_settings: Mutex>, } impl SizedChannel { @@ -103,56 +186,60 @@ impl SizedChannel { inner: Mutex::new(SizedChannelInner::create(max_size)), notify_on_sent: Default::default(), notify_on_recv: Default::default(), + is_plan_ready: WatchNotify::new(), + format_settings: Mutex::new(None), } } - fn try_send(&self, value: DataBlock) -> Result, Stopped> { + fn try_send(&self, value: DataBlock) -> Result<(), Option> { let mut guard = self.inner.lock().unwrap(); guard.try_send(value) } - pub fn try_recv(&self) -> Result, Stopped> { - let mut guard = self.inner.lock().unwrap(); - guard.try_recv() + #[fastrace::trace(name = "SizedChannel::try_append_block")] + fn try_append_block(&self, builder: &mut PageBuilder) -> bool { + let has_more = self.inner.lock().unwrap().append_block(builder); + self.notify_on_recv.notify_one(); + has_more } #[async_backtrace::framed] - pub async fn send(&self, value: DataBlock) -> bool { + async fn send(&self, value: DataBlock) -> bool { let mut to_send = value; loop { match self.try_send(to_send) { - Ok(Some(v)) => { - to_send = v; - self.notify_on_recv.notified().await; - } - Ok(None) => { + Ok(_) => { self.notify_on_sent.notify_one(); return true; } - Err(_) => return false, + Err(None) => return false, + Err(Some(v)) => { + to_send = v; + self.notify_on_recv.notified().await; + } } } } #[async_backtrace::framed] - pub async fn recv(&self) -> Option { + async fn recv(&self) -> bool { loop { - match self.try_recv() { - Ok(Some(v)) => { - self.notify_on_recv.notify_one(); - return Some(v); + { + let g = self.inner.lock().unwrap(); + if !g.values.is_empty() { + return true; } - Ok(None) => { - self.notify_on_sent.notified().await; + if g.is_send_stopped { + return false; } - Err(_) => return None, } + self.notify_on_sent.notified().await; } } - pub fn is_empty(&self) -> bool { + pub fn is_close(&self) -> bool { let guard = self.inner.lock().unwrap(); - guard.is_empty() + guard.is_close() } pub fn stop_send(&self) { @@ -172,26 +259,67 @@ impl SizedChannel { } } +#[derive(Debug, PartialEq, Eq)] +pub enum Wait { + Async, + Deadline(Instant), +} + pub struct SizedChannelReceiver { chan: Arc, } impl SizedChannelReceiver { - #[async_backtrace::framed] - pub async fn recv(&self) -> Option { - self.chan.recv().await - } - - pub fn try_recv(&self) -> Option { - self.chan.try_recv().unwrap_or_default() - } - pub fn close(&self) { self.chan.stop_recv() } - pub fn is_empty(&self) -> bool { - self.chan.is_empty() + #[async_backtrace::framed] + pub async fn collect_new_page( + &mut self, + max_rows_per_page: usize, + tp: &Wait, + ) -> Result<(BlocksSerializer, bool), ErrorCode> { + let mut builder = PageBuilder::new(max_rows_per_page); + + while builder.has_capacity() { + match tp { + Wait::Async => match self.chan.try_append_block(&mut builder) { + true => (), + false => break, + }, + Wait::Deadline(t) => { + let d = match t.checked_duration_since(Instant::now()) { + Some(d) if !d.is_zero() => d, + _ => { + // timeout() will return Ok if the future completes immediately + break; + } + }; + match tokio::time::timeout(d, self.chan.recv()).await { + Ok(true) => { + self.chan.try_append_block(&mut builder); + debug!("[HTTP-QUERY] Appended new data block"); + } + Ok(false) => { + info!("[HTTP-QUERY] Reached end of data blocks"); + break; + } + Err(_) => { + debug!("[HTTP-QUERY] Long polling timeout reached"); + break; + } + } + } + } + } + + // try to report 'no more data' earlier to client to avoid unnecessary http call + let block_end = self.chan.is_close(); + Ok(( + builder.into_serializer(self.chan.format_settings.lock().unwrap().clone().unwrap()), + block_end, + )) } } @@ -215,6 +343,12 @@ impl SizedChannelSender { chan: self.chan.clone(), } } + + pub fn plan_ready(&self, format_settings: FormatSettings) { + assert!(!self.chan.is_plan_ready.has_notified()); + *self.chan.format_settings.lock().unwrap() = Some(format_settings); + self.chan.is_plan_ready.notify_waiters(); + } } pub struct SizedChannelSenderCloser { diff --git a/src/query/service/tests/it/servers/http/json_block.rs b/src/query/service/tests/it/servers/http/json_block.rs index af6ca5821f2bd..ae6ddc4f61b95 100644 --- a/src/query/service/tests/it/servers/http/json_block.rs +++ b/src/query/service/tests/it/servers/http/json_block.rs @@ -22,7 +22,7 @@ use databend_common_expression::types::DateType; use databend_common_expression::types::StringType; use databend_common_expression::FromData; use databend_common_io::prelude::FormatSettings; -use databend_query::servers::http::v1::BlocksSerializer; +use databend_query::servers::http::v1::BlocksCollector; use pretty_assertions::assert_eq; fn test_data_block(is_nullable: bool) -> Result<()> { @@ -42,8 +42,9 @@ fn test_data_block(is_nullable: bool) -> Result<()> { } let format = FormatSettings::default(); - let mut serializer = BlocksSerializer::new(Some(format)); - serializer.append(columns, 3); + let mut collector = BlocksCollector::new(); + collector.append_columns(columns, 3); + let serializer = collector.into_serializer(format); let expect = [ vec!["1", "a", "1", "1.1", "1970-01-02"], vec!["2", "b", "1", "2.2", "1970-01-03"], @@ -73,7 +74,8 @@ fn test_data_block_not_nullable() -> Result<()> { #[test] fn test_empty_block() -> Result<()> { let format = FormatSettings::default(); - let serializer = BlocksSerializer::new(Some(format)); + let collector = BlocksCollector::new(); + let serializer = collector.into_serializer(format); assert!(serializer.is_empty()); Ok(()) } From 4c39dce2445e120681ffcafc96f3916526a785fb Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 3 Sep 2025 10:55:05 +0800 Subject: [PATCH 04/33] x --- .../src/servers/http/v1/query/sized_spsc.rs | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index bf04dedfae7db..73c094d0f13e9 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -144,7 +144,7 @@ impl SizedChannelInner { self.is_recv_stopped = true } - fn append_block_once(&mut self, builder: &mut PageBuilder) { + fn take_block_once(&mut self, builder: &mut PageBuilder) { let Some(block) = self.values.pop_front() else { return; }; @@ -163,9 +163,9 @@ impl SizedChannelInner { } } - fn append_block(&mut self, builder: &mut PageBuilder) -> bool { + fn take_block(&mut self, builder: &mut PageBuilder) -> bool { while builder.has_capacity() && !self.values.is_empty() { - self.append_block_once(builder) + self.take_block_once(builder) } !self.values.is_empty() } @@ -192,13 +192,12 @@ impl SizedChannel { } fn try_send(&self, value: DataBlock) -> Result<(), Option> { - let mut guard = self.inner.lock().unwrap(); - guard.try_send(value) + self.inner.lock().unwrap().try_send(value) } - #[fastrace::trace(name = "SizedChannel::try_append_block")] - fn try_append_block(&self, builder: &mut PageBuilder) -> bool { - let has_more = self.inner.lock().unwrap().append_block(builder); + #[fastrace::trace(name = "SizedChannel::try_take_block")] + fn try_take_block(&self, builder: &mut PageBuilder) -> bool { + let has_more = self.inner.lock().unwrap().take_block(builder); self.notify_on_recv.notify_one(); has_more } @@ -284,10 +283,7 @@ impl SizedChannelReceiver { while builder.has_capacity() { match tp { - Wait::Async => match self.chan.try_append_block(&mut builder) { - true => (), - false => break, - }, + Wait::Async => self.chan.try_take_block(&mut builder), Wait::Deadline(t) => { let d = match t.checked_duration_since(Instant::now()) { Some(d) if !d.is_zero() => d, @@ -298,7 +294,7 @@ impl SizedChannelReceiver { }; match tokio::time::timeout(d, self.chan.recv()).await { Ok(true) => { - self.chan.try_append_block(&mut builder); + self.chan.try_take_block(&mut builder); debug!("[HTTP-QUERY] Appended new data block"); } Ok(false) => { From 3d25d5050c659edd687e90cbdc1af07651db30b7 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 3 Sep 2025 11:40:04 +0800 Subject: [PATCH 05/33] SpillableBlock 1 --- .../src/servers/http/v1/query/sized_spsc.rs | 93 ++++++++++++++++--- 1 file changed, 79 insertions(+), 14 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 73c094d0f13e9..192ba2ea770b8 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -12,14 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! A channel bounded with the sum of sizes associate with items instead of count of items. -//! -//! other features: -//! 1. it is SPSC, enough for now. -//! 2. receive can check status of channel with fn is_empty(). - use std::cmp::min; use std::collections::VecDeque; +use std::fmt; +use std::fmt::Debug; +use std::fmt::Formatter; use std::sync::Arc; use std::sync::Mutex; use std::time::Instant; @@ -30,13 +27,12 @@ use databend_common_base::base::WatchNotify; use databend_common_exception::ErrorCode; use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; -use fastrace::future::FutureExt; -use fastrace::Span; use log::debug; use log::info; use super::blocks_serializer::BlocksCollector; use super::blocks_serializer::BlocksSerializer; +use crate::spillers::Location; pub struct PageBuilder { pub collector: BlocksCollector, @@ -90,7 +86,7 @@ impl PageBuilder { } } -struct SizedChannelInner { +struct SizedChannelBuffer { max_size: usize, values: VecDeque, is_recv_stopped: bool, @@ -105,9 +101,9 @@ pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) }) } -impl SizedChannelInner { +impl SizedChannelBuffer { fn create(max_size: usize) -> Self { - SizedChannelInner { + SizedChannelBuffer { max_size, values: Default::default(), is_recv_stopped: false, @@ -172,7 +168,7 @@ impl SizedChannelInner { } struct SizedChannel { - inner: Mutex, + inner: Mutex, notify_on_sent: Notify, notify_on_recv: Notify, @@ -183,7 +179,7 @@ struct SizedChannel { impl SizedChannel { fn create(max_size: usize) -> Self { SizedChannel { - inner: Mutex::new(SizedChannelInner::create(max_size)), + inner: Mutex::new(SizedChannelBuffer::create(max_size)), notify_on_sent: Default::default(), notify_on_recv: Default::default(), is_plan_ready: WatchNotify::new(), @@ -283,7 +279,9 @@ impl SizedChannelReceiver { while builder.has_capacity() { match tp { - Wait::Async => self.chan.try_take_block(&mut builder), + Wait::Async => { + self.chan.try_take_block(&mut builder); + } Wait::Deadline(t) => { let d = match t.checked_duration_since(Instant::now()) { Some(d) if !d.is_zero() => d, @@ -356,3 +354,70 @@ impl SizedChannelSenderCloser { self.chan.stop_send() } } + +struct SpillableBlock { + data: Option, + /// [SpillableBlock::slice] does not reallocate memory, so memorysize remains unchanged + memory_size: usize, + rows: usize, + location: Option, + processed: usize, +} + +#[async_trait::async_trait] +pub trait Spill: Send { + async fn spill(&self, data_block: DataBlock) -> Result; + async fn restore(&self, location: &Location) -> Result; +} + +impl SpillableBlock { + fn new(data: DataBlock) -> Self { + Self { + location: None, + processed: 0, + rows: data.num_rows(), + memory_size: data.memory_size(), + data: Some(data), + } + } + + fn slice(&mut self, pos: usize) -> DataBlock { + let data = self.data.as_ref().unwrap(); + + let left = data.slice(0..pos); + let right = data.slice(pos..data.num_rows()); + + self.rows = right.num_rows(); + self.data = Some(right); + if self.location.is_some() { + self.processed += pos; + } + left + } + + async fn spill(&mut self, spiller: &impl Spill) -> Result<(), ErrorCode> { + let data = self.data.take().unwrap(); + if self.location.is_none() { + let location = spiller.spill(data).await?; + self.location = Some(location); + } + Ok(()) + } + + fn restore(&mut self, location: &Location, data: DataBlock) { + assert_eq!(self.location.as_ref(), Some(location)); + self.data = Some(data) + } +} + +impl Debug for SpillableBlock { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("SpillableBlock") + .field("has_data", &self.data.is_some()) + .field("memory_size", &self.memory_size) + .field("rows", &self.rows) + .field("location", &self.location) + .field("processed", &self.processed) + .finish() + } +} From 0f3fceb457322b32b25b49b5ab068ecf4a1d6876 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 3 Sep 2025 16:22:57 +0800 Subject: [PATCH 06/33] x --- .../src/servers/http/v1/query/sized_spsc.rs | 121 ++++++++++++------ 1 file changed, 84 insertions(+), 37 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 192ba2ea770b8..e4f3e126e2809 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -88,7 +88,7 @@ impl PageBuilder { struct SizedChannelBuffer { max_size: usize, - values: VecDeque, + values: VecDeque, is_recv_stopped: bool, is_send_stopped: bool, } @@ -112,19 +112,32 @@ impl SizedChannelBuffer { } fn size(&self) -> usize { - self.values.iter().map(|x| x.num_rows()).sum::() + self.values + .iter() + .map(|x| x.data.map(DataBlock::num_rows).unwrap_or_default()) + .sum::() } - fn try_send(&mut self, value: DataBlock) -> Result<(), Option> { - let current_size = self.size(); - let value_size = value.num_rows(); + fn try_send(&mut self, value: SpillableBlock) -> Result<(), Option> { if self.is_recv_stopped || self.is_send_stopped { - Err(None) - } else if current_size + value_size <= self.max_size || current_size == 0 { - self.values.push_back(value); - Ok(()) - } else { - Err(Some(value)) + return Err(None); + } + + match &value.data { + Some(data) => { + let value_size = data.num_rows(); + let current_size = self.size(); + if current_size + value_size <= self.max_size || current_size == 0 { + self.values.push_back(value); + Ok(()) + } else { + Err(Some(value)) + } + } + None => { + self.values.push_back(value); + Ok(()) + } } } @@ -140,69 +153,88 @@ impl SizedChannelBuffer { self.is_recv_stopped = true } - fn take_block_once(&mut self, builder: &mut PageBuilder) { - let Some(block) = self.values.pop_front() else { - return; + fn take_block_once(&mut self, builder: &mut PageBuilder) -> Result<(), &Location> { + let Some(block) = self.values.front_mut() else { + return Ok(()); }; - if block.is_empty() { - return; - } + let Some(data) = &block.data else { + return Err(block.location.as_ref().unwrap()); + }; - let take_rows = builder.calculate_take_rows(block.num_rows(), block.memory_size()); - if take_rows < block.num_rows() { + let take_rows = builder.calculate_take_rows(data.num_rows(), data.memory_size()); + if take_rows < data.num_rows() { builder.remain_rows = 0; - let remaining_block = builder.append_partial_block(block, take_rows); - self.values.push_front(remaining_block); + builder.collector.append_block(block.slice(take_rows)); } else { - builder.append_full_block(block); + builder.append_full_block(self.values.pop_front().unwrap().data.unwrap()); } + Ok(()) } - fn take_block(&mut self, builder: &mut PageBuilder) -> bool { + fn take_block(&mut self, builder: &mut PageBuilder) -> Result<(), &Location> { while builder.has_capacity() && !self.values.is_empty() { - self.take_block_once(builder) + self.take_block_once(builder)?; } - !self.values.is_empty() + Ok(()) + } + + fn restore_first(&mut self, location: &Location, data: DataBlock) { + self.values.front_mut().unwrap().restore(location, data); } } -struct SizedChannel { +struct SizedChannel { inner: Mutex, notify_on_sent: Notify, notify_on_recv: Notify, is_plan_ready: WatchNotify, format_settings: Mutex>, + spiller: S, } -impl SizedChannel { - fn create(max_size: usize) -> Self { +impl SizedChannel { + fn create(max_size: usize, spiller: S) -> Self { SizedChannel { inner: Mutex::new(SizedChannelBuffer::create(max_size)), notify_on_sent: Default::default(), notify_on_recv: Default::default(), is_plan_ready: WatchNotify::new(), format_settings: Mutex::new(None), + spiller, } } fn try_send(&self, value: DataBlock) -> Result<(), Option> { - self.inner.lock().unwrap().try_send(value) + self.inner + .lock() + .unwrap() + .try_send(SpillableBlock::new(value)) } #[fastrace::trace(name = "SizedChannel::try_take_block")] - fn try_take_block(&self, builder: &mut PageBuilder) -> bool { - let has_more = self.inner.lock().unwrap().take_block(builder); - self.notify_on_recv.notify_one(); - has_more + async fn try_take_block( + &self, + builder: &mut PageBuilder, + ) -> Result<(), ErrorCode> { + let location = match self.inner.lock().unwrap().take_block(builder) { + Err(location) => location.clone(), + Ok(_) => { + return Ok(()); + } + }; + let data = self.spiller.restore(&location).await?; + self.inner.lock().unwrap().restore_first(&location, data); + Ok(()) } #[async_backtrace::framed] async fn send(&self, value: DataBlock) -> bool { - let mut to_send = value; + let mut to_send = SpillableBlock::new(value); loop { - match self.try_send(to_send) { + let result = self.inner.lock().unwrap().try_send(to_send); + match result { Ok(_) => { self.notify_on_sent.notify_one(); return true; @@ -210,6 +242,7 @@ impl SizedChannel { Err(None) => return false, Err(Some(v)) => { to_send = v; + // todo self.notify_on_recv.notified().await; } } @@ -277,10 +310,12 @@ impl SizedChannelReceiver { ) -> Result<(BlocksSerializer, bool), ErrorCode> { let mut builder = PageBuilder::new(max_rows_per_page); + let spiller = todo!(); + while builder.has_capacity() { match tp { Wait::Async => { - self.chan.try_take_block(&mut builder); + self.chan.try_take_block(&mut builder, &spiller).await?; } Wait::Deadline(t) => { let d = match t.checked_duration_since(Instant::now()) { @@ -292,7 +327,7 @@ impl SizedChannelReceiver { }; match tokio::time::timeout(d, self.chan.recv()).await { Ok(true) => { - self.chan.try_take_block(&mut builder); + self.chan.try_take_block(&mut builder, &spiller); debug!("[HTTP-QUERY] Appended new data block"); } Ok(false) => { @@ -381,6 +416,14 @@ impl SpillableBlock { } } + fn memory_size(&self) -> usize { + self.memory_size + } + + fn is_empty(&self) -> bool { + self.rows == 0 + } + fn slice(&mut self, pos: usize) -> DataBlock { let data = self.data.as_ref().unwrap(); @@ -395,6 +438,10 @@ impl SpillableBlock { left } + fn take_data(&mut self) -> Option { + self.data.take() + } + async fn spill(&mut self, spiller: &impl Spill) -> Result<(), ErrorCode> { let data = self.data.take().unwrap(); if self.location.is_none() { From 6d9f26c0d802c4147ff3c88bba27a819f9488eee Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 3 Sep 2025 17:04:29 +0800 Subject: [PATCH 07/33] x --- .../servers/http/v1/query/execute_state.rs | 7 +- .../src/servers/http/v1/query/page_manager.rs | 8 +- .../src/servers/http/v1/query/sized_spsc.rs | 78 ++++++++++++------- 3 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index 22a114148372b..b7da612a8815d 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -42,6 +42,7 @@ use crate::interpreters::InterpreterFactory; use crate::interpreters::InterpreterQueryLog; use crate::servers::http::v1::http_query_handlers::QueryResponseField; use crate::servers::http::v1::query::http_query::ResponseState; +use crate::servers::http::v1::query::sized_spsc::PlaceholderSpill; use crate::servers::http::v1::query::sized_spsc::SizedChannelSender; use crate::sessions::AcquireQueueGuard; use crate::sessions::QueryAffect; @@ -113,7 +114,7 @@ impl ExecuteState { pub struct ExecuteStarting { pub(crate) ctx: Arc, - pub(crate) sender: SizedChannelSender, + pub(crate) sender: SizedChannelSender, } pub struct ExecuteRunning { @@ -354,7 +355,7 @@ impl ExecuteState { sql: String, session: Arc, ctx: Arc, - block_sender: SizedChannelSender, + block_sender: SizedChannelSender, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to start query: {sql}"); @@ -421,7 +422,7 @@ impl ExecuteState { interpreter: Arc, schema: DataSchemaRef, ctx: Arc, - block_sender: SizedChannelSender, + block_sender: SizedChannelSender, executor: Arc>, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to execute {}", interpreter.name()); diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index d7c68b72b6b6a..5f505a49911d1 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -18,6 +18,7 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use super::blocks_serializer::BlocksSerializer; +use crate::servers::http::v1::query::sized_spsc::PlaceholderSpill; use crate::servers::http::v1::query::sized_spsc::SizedChannelReceiver; use crate::servers::http::v1::query::sized_spsc::Wait; @@ -37,11 +38,14 @@ pub struct PageManager { total_pages: usize, end: bool, last_page: Option, - block_receiver: SizedChannelReceiver, + block_receiver: SizedChannelReceiver, } impl PageManager { - pub fn new(max_rows_per_page: usize, block_receiver: SizedChannelReceiver) -> PageManager { + pub fn new( + max_rows_per_page: usize, + block_receiver: SizedChannelReceiver, + ) -> PageManager { PageManager { total_rows: 0, last_page: None, diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index e4f3e126e2809..3625490e5ef17 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -93,8 +93,20 @@ struct SizedChannelBuffer { is_send_stopped: bool, } -pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) { - let chan = Arc::new(SizedChannel::create(max_size)); +pub fn sized_spsc( + max_size: usize, +) -> ( + SizedChannelSender, + SizedChannelReceiver, +) { + sized_spsc_with_spiller(max_size, PlaceholderSpill) +} + +pub fn sized_spsc_with_spiller( + max_size: usize, + spiller: S, +) -> (SizedChannelSender, SizedChannelReceiver) { + let chan = Arc::new(SizedChannel::create(max_size, spiller)); let cloned = chan.clone(); (SizedChannelSender { chan }, SizedChannelReceiver { chan: cloned, @@ -114,7 +126,7 @@ impl SizedChannelBuffer { fn size(&self) -> usize { self.values .iter() - .map(|x| x.data.map(DataBlock::num_rows).unwrap_or_default()) + .map(|x| x.data.as_ref().map(DataBlock::num_rows).unwrap_or_default()) .sum::() } @@ -153,13 +165,12 @@ impl SizedChannelBuffer { self.is_recv_stopped = true } - fn take_block_once(&mut self, builder: &mut PageBuilder) -> Result<(), &Location> { + fn take_block_once(&mut self, builder: &mut PageBuilder) -> Result<(), Location> { let Some(block) = self.values.front_mut() else { return Ok(()); }; - let Some(data) = &block.data else { - return Err(block.location.as_ref().unwrap()); + return Err(block.location.clone().unwrap()); }; let take_rows = builder.calculate_take_rows(data.num_rows(), data.memory_size()); @@ -167,12 +178,14 @@ impl SizedChannelBuffer { builder.remain_rows = 0; builder.collector.append_block(block.slice(take_rows)); } else { - builder.append_full_block(self.values.pop_front().unwrap().data.unwrap()); + let data = block.data.take().unwrap(); + self.values.pop_front(); + builder.append_full_block(data); } Ok(()) } - fn take_block(&mut self, builder: &mut PageBuilder) -> Result<(), &Location> { + fn take_block(&mut self, builder: &mut PageBuilder) -> Result<(), Location> { while builder.has_capacity() && !self.values.is_empty() { self.take_block_once(builder)?; } @@ -211,13 +224,11 @@ impl SizedChannel { .lock() .unwrap() .try_send(SpillableBlock::new(value)) + .map_err(|block| Some(block?.data.unwrap())) } #[fastrace::trace(name = "SizedChannel::try_take_block")] - async fn try_take_block( - &self, - builder: &mut PageBuilder, - ) -> Result<(), ErrorCode> { + async fn try_take_block(&self, builder: &mut PageBuilder) -> Result<(), ErrorCode> { let location = match self.inner.lock().unwrap().take_block(builder) { Err(location) => location.clone(), Ok(_) => { @@ -293,11 +304,11 @@ pub enum Wait { Deadline(Instant), } -pub struct SizedChannelReceiver { - chan: Arc, +pub struct SizedChannelReceiver { + chan: Arc>, } -impl SizedChannelReceiver { +impl SizedChannelReceiver { pub fn close(&self) { self.chan.stop_recv() } @@ -310,12 +321,10 @@ impl SizedChannelReceiver { ) -> Result<(BlocksSerializer, bool), ErrorCode> { let mut builder = PageBuilder::new(max_rows_per_page); - let spiller = todo!(); - while builder.has_capacity() { match tp { Wait::Async => { - self.chan.try_take_block(&mut builder, &spiller).await?; + self.chan.try_take_block(&mut builder).await?; } Wait::Deadline(t) => { let d = match t.checked_duration_since(Instant::now()) { @@ -327,7 +336,7 @@ impl SizedChannelReceiver { }; match tokio::time::timeout(d, self.chan.recv()).await { Ok(true) => { - self.chan.try_take_block(&mut builder, &spiller); + self.chan.try_take_block(&mut builder).await?; debug!("[HTTP-QUERY] Appended new data block"); } Ok(false) => { @@ -353,11 +362,11 @@ impl SizedChannelReceiver { } #[derive(Clone)] -pub struct SizedChannelSender { - chan: Arc, +pub struct SizedChannelSender { + chan: Arc>, } -impl SizedChannelSender { +impl SizedChannelSender { #[async_backtrace::framed] pub async fn send(&self, value: DataBlock) -> bool { self.chan.send(value).await @@ -367,7 +376,7 @@ impl SizedChannelSender { self.chan.stop_send() } - pub fn closer(&self) -> SizedChannelSenderCloser { + pub fn closer(&self) -> SizedChannelSenderCloser { SizedChannelSenderCloser { chan: self.chan.clone(), } @@ -380,11 +389,11 @@ impl SizedChannelSender { } } -pub struct SizedChannelSenderCloser { - chan: Arc, +pub struct SizedChannelSenderCloser { + chan: Arc>, } -impl SizedChannelSenderCloser { +impl SizedChannelSenderCloser { pub fn close(&self) { self.chan.stop_send() } @@ -405,6 +414,23 @@ pub trait Spill: Send { async fn restore(&self, location: &Location) -> Result; } +/// Placeholder implementation of Spill trait +#[derive(Clone)] +pub struct PlaceholderSpill; + +#[async_trait::async_trait] +impl Spill for PlaceholderSpill { + async fn spill(&self, _data_block: DataBlock) -> Result { + // TODO: Implement actual spill logic + todo!("PlaceholderSpill::spill not implemented") + } + + async fn restore(&self, _location: &Location) -> Result { + // TODO: Implement actual restore logic + todo!("PlaceholderSpill::restore not implemented") + } +} + impl SpillableBlock { fn new(data: DataBlock) -> Self { Self { From 3e8c274d5518add6626b5199c25ba121b092f652 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 3 Sep 2025 20:37:43 +0800 Subject: [PATCH 08/33] SpillerRef --- Cargo.lock | 11 ++++ Cargo.toml | 2 + src/query/service/Cargo.toml | 1 + .../src/servers/http/v1/query/sized_spsc.rs | 51 +++++++++---------- src/query/traits/Cargo.toml | 18 +++++++ src/query/traits/src/lib.rs | 13 +++++ 6 files changed, 68 insertions(+), 28 deletions(-) create mode 100644 src/query/traits/Cargo.toml create mode 100644 src/query/traits/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 4c5c6163b7a81..1c59f57714ea9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4687,6 +4687,16 @@ dependencies = [ "tonic", ] +[[package]] +name = "databend-common-traits" +version = "0.1.0" +dependencies = [ + "async-trait", + "databend-common-exception", + "databend-common-expression", + "databend-storages-common-cache", +] + [[package]] name = "databend-common-users" version = "0.1.0" @@ -5225,6 +5235,7 @@ dependencies = [ "databend-common-storages-system", "databend-common-telemetry", "databend-common-tracing", + "databend-common-traits", "databend-common-users", "databend-common-version", "databend-enterprise-aggregating-index", diff --git a/Cargo.toml b/Cargo.toml index 660dc658b4ea2..d26e30e72bf86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ members = [ "src/query/script", "src/query/settings", "src/query/sql", + "src/query/traits", "src/query/storages/common/blocks", "src/query/storages/common/cache", "src/query/storages/common/index", @@ -182,6 +183,7 @@ databend-common-storages-stream = { path = "src/query/storages/stream" } databend-common-storages-system = { path = "src/query/storages/system" } databend-common-telemetry = { path = "src/common/telemetry" } databend-common-tracing = { path = "src/common/tracing" } +databend-common-traits = { path = "src/query/traits" } databend-common-users = { path = "src/query/users" } databend-common-vector = { path = "src/common/vector" } databend-common-version = { path = "src/common/version" } diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 0016398385d49..a63a4ec623ba4 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -96,6 +96,7 @@ databend-common-storages-stream = { workspace = true } databend-common-storages-system = { workspace = true } databend-common-telemetry = { workspace = true } databend-common-tracing = { workspace = true } +databend-common-traits = { workspace = true } databend-common-users = { workspace = true } databend-common-version = { workspace = true } databend-enterprise-aggregating-index = { workspace = true } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 3625490e5ef17..42e1f9ffb5528 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -27,6 +27,7 @@ use databend_common_base::base::WatchNotify; use databend_common_exception::ErrorCode; use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; +use databend_common_pipeline_transforms::traits::DataBlockSpill; use log::debug; use log::info; @@ -99,14 +100,7 @@ pub fn sized_spsc( SizedChannelSender, SizedChannelReceiver, ) { - sized_spsc_with_spiller(max_size, PlaceholderSpill) -} - -pub fn sized_spsc_with_spiller( - max_size: usize, - spiller: S, -) -> (SizedChannelSender, SizedChannelReceiver) { - let chan = Arc::new(SizedChannel::create(max_size, spiller)); + let chan = Arc::new(SizedChannel::create(max_size, PlaceholderSpill)); let cloned = chan.clone(); (SizedChannelSender { chan }, SizedChannelReceiver { chan: cloned, @@ -197,7 +191,7 @@ impl SizedChannelBuffer { } } -struct SizedChannel { +struct SizedChannel { inner: Mutex, notify_on_sent: Notify, notify_on_recv: Notify, @@ -207,7 +201,9 @@ struct SizedChannel { spiller: S, } -impl SizedChannel { +impl SizedChannel +where S: DataBlockSpill +{ fn create(max_size: usize, spiller: S) -> Self { SizedChannel { inner: Mutex::new(SizedChannelBuffer::create(max_size)), @@ -304,11 +300,13 @@ pub enum Wait { Deadline(Instant), } -pub struct SizedChannelReceiver { +pub struct SizedChannelReceiver { chan: Arc>, } -impl SizedChannelReceiver { +impl SizedChannelReceiver +where S: DataBlockSpill +{ pub fn close(&self) { self.chan.stop_recv() } @@ -362,11 +360,13 @@ impl SizedChannelReceiver { } #[derive(Clone)] -pub struct SizedChannelSender { +pub struct SizedChannelSender { chan: Arc>, } -impl SizedChannelSender { +impl SizedChannelSender +where S: DataBlockSpill +{ #[async_backtrace::framed] pub async fn send(&self, value: DataBlock) -> bool { self.chan.send(value).await @@ -389,11 +389,13 @@ impl SizedChannelSender { } } -pub struct SizedChannelSenderCloser { +pub struct SizedChannelSenderCloser { chan: Arc>, } -impl SizedChannelSenderCloser { +impl SizedChannelSenderCloser +where S: DataBlockSpill +{ pub fn close(&self) { self.chan.stop_send() } @@ -408,25 +410,17 @@ struct SpillableBlock { processed: usize, } -#[async_trait::async_trait] -pub trait Spill: Send { - async fn spill(&self, data_block: DataBlock) -> Result; - async fn restore(&self, location: &Location) -> Result; -} - /// Placeholder implementation of Spill trait #[derive(Clone)] pub struct PlaceholderSpill; #[async_trait::async_trait] -impl Spill for PlaceholderSpill { - async fn spill(&self, _data_block: DataBlock) -> Result { - // TODO: Implement actual spill logic - todo!("PlaceholderSpill::spill not implemented") +impl DataBlockSpill for PlaceholderSpill { + async fn merge_and_spill(&self, data_block: Vec) -> Result { + todo!("PlaceholderSpill::merge_and_spill not implemented") } async fn restore(&self, _location: &Location) -> Result { - // TODO: Implement actual restore logic todo!("PlaceholderSpill::restore not implemented") } } @@ -468,7 +462,8 @@ impl SpillableBlock { self.data.take() } - async fn spill(&mut self, spiller: &impl Spill) -> Result<(), ErrorCode> { + async fn spill(&mut self, spiller: &S) -> Result<(), ErrorCode> + where S: DataBlockSpill { let data = self.data.take().unwrap(); if self.location.is_none() { let location = spiller.spill(data).await?; diff --git a/src/query/traits/Cargo.toml b/src/query/traits/Cargo.toml new file mode 100644 index 0000000000000..6672c849699ce --- /dev/null +++ b/src/query/traits/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "databend-common-traits" +version = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +publish = { workspace = true } +edition = { workspace = true } + +[dependencies] +async-trait = { workspace = true } +databend-common-exception = { workspace = true } +databend-common-expression = { workspace = true } +databend-storages-common-cache = { workspace = true } + +[dev-dependencies] + +[lints] +workspace = true diff --git a/src/query/traits/src/lib.rs b/src/query/traits/src/lib.rs new file mode 100644 index 0000000000000..4ca0419659671 --- /dev/null +++ b/src/query/traits/src/lib.rs @@ -0,0 +1,13 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. From b381fd94a3c8ba9d5bc59dc71614f57f73fee4dd Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 3 Sep 2025 21:47:59 +0800 Subject: [PATCH 09/33] fix --- src/query/service/src/servers/http/v1/query/sized_spsc.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 42e1f9ffb5528..73005e54830e1 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -64,6 +64,7 @@ impl PageBuilder { self.collector.append_block(block); } + #[allow(dead_code)] fn append_partial_block(&mut self, block: DataBlock, take_rows: usize) -> DataBlock { self.collector.append_block(block.slice(0..take_rows)); @@ -215,6 +216,7 @@ where S: DataBlockSpill } } + #[allow(dead_code)] fn try_send(&self, value: DataBlock) -> Result<(), Option> { self.inner .lock() @@ -436,10 +438,12 @@ impl SpillableBlock { } } + #[allow(dead_code)] fn memory_size(&self) -> usize { self.memory_size } + #[allow(dead_code)] fn is_empty(&self) -> bool { self.rows == 0 } @@ -458,10 +462,12 @@ impl SpillableBlock { left } + #[allow(dead_code)] fn take_data(&mut self) -> Option { self.data.take() } + #[allow(dead_code)] async fn spill(&mut self, spiller: &S) -> Result<(), ErrorCode> where S: DataBlockSpill { let data = self.data.take().unwrap(); From f0666e1ac35439aaaa68b21c8ebc3c4c7b225506 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 4 Sep 2025 10:18:15 +0800 Subject: [PATCH 10/33] move --- Cargo.lock | 11 ----------- Cargo.toml | 2 -- src/query/service/Cargo.toml | 1 - .../src/servers/http/v1/query/sized_spsc.rs | 4 ++-- src/query/traits/Cargo.toml | 18 ------------------ src/query/traits/src/lib.rs | 13 ------------- 6 files changed, 2 insertions(+), 47 deletions(-) delete mode 100644 src/query/traits/Cargo.toml delete mode 100644 src/query/traits/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 1c59f57714ea9..4c5c6163b7a81 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4687,16 +4687,6 @@ dependencies = [ "tonic", ] -[[package]] -name = "databend-common-traits" -version = "0.1.0" -dependencies = [ - "async-trait", - "databend-common-exception", - "databend-common-expression", - "databend-storages-common-cache", -] - [[package]] name = "databend-common-users" version = "0.1.0" @@ -5235,7 +5225,6 @@ dependencies = [ "databend-common-storages-system", "databend-common-telemetry", "databend-common-tracing", - "databend-common-traits", "databend-common-users", "databend-common-version", "databend-enterprise-aggregating-index", diff --git a/Cargo.toml b/Cargo.toml index d26e30e72bf86..660dc658b4ea2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,7 +52,6 @@ members = [ "src/query/script", "src/query/settings", "src/query/sql", - "src/query/traits", "src/query/storages/common/blocks", "src/query/storages/common/cache", "src/query/storages/common/index", @@ -183,7 +182,6 @@ databend-common-storages-stream = { path = "src/query/storages/stream" } databend-common-storages-system = { path = "src/query/storages/system" } databend-common-telemetry = { path = "src/common/telemetry" } databend-common-tracing = { path = "src/common/tracing" } -databend-common-traits = { path = "src/query/traits" } databend-common-users = { path = "src/query/users" } databend-common-vector = { path = "src/common/vector" } databend-common-version = { path = "src/common/version" } diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index a63a4ec623ba4..0016398385d49 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -96,7 +96,6 @@ databend-common-storages-stream = { workspace = true } databend-common-storages-system = { workspace = true } databend-common-telemetry = { workspace = true } databend-common-tracing = { workspace = true } -databend-common-traits = { workspace = true } databend-common-users = { workspace = true } databend-common-version = { workspace = true } databend-enterprise-aggregating-index = { workspace = true } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 73005e54830e1..c222ee45013c2 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -28,12 +28,12 @@ use databend_common_exception::ErrorCode; use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; use databend_common_pipeline_transforms::traits::DataBlockSpill; +use databend_common_pipeline_transforms::traits::Location; use log::debug; use log::info; use super::blocks_serializer::BlocksCollector; use super::blocks_serializer::BlocksSerializer; -use crate::spillers::Location; pub struct PageBuilder { pub collector: BlocksCollector, @@ -418,7 +418,7 @@ pub struct PlaceholderSpill; #[async_trait::async_trait] impl DataBlockSpill for PlaceholderSpill { - async fn merge_and_spill(&self, data_block: Vec) -> Result { + async fn merge_and_spill(&self, _data_block: Vec) -> Result { todo!("PlaceholderSpill::merge_and_spill not implemented") } diff --git a/src/query/traits/Cargo.toml b/src/query/traits/Cargo.toml deleted file mode 100644 index 6672c849699ce..0000000000000 --- a/src/query/traits/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "databend-common-traits" -version = { workspace = true } -authors = { workspace = true } -license = { workspace = true } -publish = { workspace = true } -edition = { workspace = true } - -[dependencies] -async-trait = { workspace = true } -databend-common-exception = { workspace = true } -databend-common-expression = { workspace = true } -databend-storages-common-cache = { workspace = true } - -[dev-dependencies] - -[lints] -workspace = true diff --git a/src/query/traits/src/lib.rs b/src/query/traits/src/lib.rs deleted file mode 100644 index 4ca0419659671..0000000000000 --- a/src/query/traits/src/lib.rs +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. From 0aa3c6b198d2532ed6c9421aa6a6007cffe6b96b Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 4 Sep 2025 12:31:18 +0800 Subject: [PATCH 11/33] apply_settings --- .../servers/http/v1/query/execute_state.rs | 48 +++++++-- .../src/servers/http/v1/query/http_query.rs | 9 +- .../src/servers/http/v1/query/page_manager.rs | 6 +- .../src/servers/http/v1/query/sized_spsc.rs | 99 ++++++++----------- 4 files changed, 88 insertions(+), 74 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index b7da612a8815d..7f3ba08df2d63 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -26,6 +26,8 @@ use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::Scalar; use databend_common_settings::Settings; +use databend_common_storage::DataOperator; +use databend_storages_common_cache::TempDirManager; use databend_storages_common_session::TxnManagerRef; use futures::StreamExt; use log::debug; @@ -42,13 +44,17 @@ use crate::interpreters::InterpreterFactory; use crate::interpreters::InterpreterQueryLog; use crate::servers::http::v1::http_query_handlers::QueryResponseField; use crate::servers::http::v1::query::http_query::ResponseState; -use crate::servers::http::v1::query::sized_spsc::PlaceholderSpill; use crate::servers::http::v1::query::sized_spsc::SizedChannelSender; use crate::sessions::AcquireQueueGuard; use crate::sessions::QueryAffect; use crate::sessions::QueryContext; use crate::sessions::Session; use crate::sessions::TableContext; +use crate::spillers::Spiller; +use crate::spillers::SpillerConfig; +use crate::spillers::SpillerDiskConfig; +use crate::spillers::SpillerRef; +use crate::spillers::SpillerType; pub struct ExecutionError; @@ -114,7 +120,7 @@ impl ExecuteState { pub struct ExecuteStarting { pub(crate) ctx: Arc, - pub(crate) sender: SizedChannelSender, + pub(crate) sender: SizedChannelSender, } pub struct ExecuteRunning { @@ -355,7 +361,7 @@ impl ExecuteState { sql: String, session: Arc, ctx: Arc, - block_sender: SizedChannelSender, + mut block_sender: SizedChannelSender, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to start query: {sql}"); @@ -367,9 +373,7 @@ impl ExecuteState { .map_err(|err| err.display_with_sql(&sql)) .with_context(make_error)?; - // set_var may change settings - let format_settings = ctx.get_format_settings().with_context(make_error)?; - block_sender.plan_ready(format_settings); + Self::apply_settings(&ctx, &mut block_sender).with_context(make_error)?; let interpreter = InterpreterFactory::get(ctx.clone(), &plan) .await @@ -422,7 +426,7 @@ impl ExecuteState { interpreter: Arc, schema: DataSchemaRef, ctx: Arc, - block_sender: SizedChannelSender, + block_sender: SizedChannelSender, executor: Arc>, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to execute {}", interpreter.name()); @@ -461,4 +465,34 @@ impl ExecuteState { } Ok(()) } + + fn apply_settings( + ctx: &Arc, + block_sender: &mut SizedChannelSender, + ) -> Result<()> { + let settings = ctx.get_settings(); + let temp_dir_manager = TempDirManager::instance(); + let disk_bytes_limit = settings.get_sort_spilling_to_disk_bytes_limit()?; + let enable_dio = settings.get_enable_dio()?; + let disk_spill = temp_dir_manager + .get_disk_spill_dir(disk_bytes_limit, &ctx.get_id()) + .map(|temp_dir| SpillerDiskConfig::new(temp_dir, enable_dio)) + .transpose()?; + + let location_prefix = ctx.query_id_spill_prefix(); + let config = SpillerConfig { + spiller_type: SpillerType::OrderBy, + location_prefix, + disk_spill, + use_parquet: settings.get_spilling_file_format()?.is_parquet(), + }; + let op = DataOperator::instance().spill_operator(); + + let spiller = Spiller::create(ctx.clone(), op, config)?.into(); + + // set_var may change settings + let format_settings = ctx.get_format_settings()?; + block_sender.plan_ready(format_settings, Some(spiller)); + Ok(()) + } } diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 58bbd8e0a8ba3..36c484e889b76 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -74,6 +74,7 @@ use crate::servers::http::v1::QueryStats; use crate::sessions::QueryAffect; use crate::sessions::Session; use crate::sessions::TableContext; +use crate::spillers::SpillerRef; fn default_as_true() -> bool { true @@ -611,7 +612,7 @@ impl HttpQuery { }) }; - let (sender, block_receiver) = sized_spsc(req.pagination.max_rows_in_buffer); + let (sender, receiver) = sized_spsc::(req.pagination.max_rows_in_buffer); let executor = Arc::new(Mutex::new(Executor { query_id: query_id.clone(), @@ -624,9 +625,9 @@ impl HttpQuery { let settings = session.get_settings(); let result_timeout_secs = settings.get_http_handler_result_timeout_secs()?; - let data = Arc::new(TokioMutex::new(PageManager::new( + let page_manager = Arc::new(TokioMutex::new(PageManager::new( req.pagination.max_rows_per_page, - block_receiver, + receiver, ))); Ok(HttpQuery { @@ -636,7 +637,7 @@ impl HttpQuery { node_id, request: req, executor, - page_manager: data, + page_manager, result_timeout_secs, state: Arc::new(Mutex::new(HttpQueryState::Working)), diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index 5f505a49911d1..fd5d25a0b0a8b 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -18,9 +18,9 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use super::blocks_serializer::BlocksSerializer; -use crate::servers::http::v1::query::sized_spsc::PlaceholderSpill; use crate::servers::http::v1::query::sized_spsc::SizedChannelReceiver; use crate::servers::http::v1::query::sized_spsc::Wait; +use crate::spillers::SpillerRef; #[derive(Clone)] pub struct Page { @@ -38,13 +38,13 @@ pub struct PageManager { total_pages: usize, end: bool, last_page: Option, - block_receiver: SizedChannelReceiver, + block_receiver: SizedChannelReceiver, } impl PageManager { pub fn new( max_rows_per_page: usize, - block_receiver: SizedChannelReceiver, + block_receiver: SizedChannelReceiver, ) -> PageManager { PageManager { total_rows: 0, diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index c222ee45013c2..fa7248a56569d 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -64,13 +64,6 @@ impl PageBuilder { self.collector.append_block(block); } - #[allow(dead_code)] - fn append_partial_block(&mut self, block: DataBlock, take_rows: usize) -> DataBlock { - self.collector.append_block(block.slice(0..take_rows)); - - block.slice(take_rows..block.num_rows()) - } - fn calculate_take_rows(&self, block_rows: usize, memory_size: usize) -> usize { min( self.remain_rows, @@ -95,17 +88,12 @@ struct SizedChannelBuffer { is_send_stopped: bool, } -pub fn sized_spsc( - max_size: usize, -) -> ( - SizedChannelSender, - SizedChannelReceiver, -) { - let chan = Arc::new(SizedChannel::create(max_size, PlaceholderSpill)); - let cloned = chan.clone(); - (SizedChannelSender { chan }, SizedChannelReceiver { - chan: cloned, - }) +pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) +where S: DataBlockSpill { + let chan = Arc::new(SizedChannel::create(max_size)); + let sender = SizedChannelSender { chan: chan.clone() }; + let receiver = SizedChannelReceiver { chan }; + (sender, receiver) } impl SizedChannelBuffer { @@ -180,11 +168,11 @@ impl SizedChannelBuffer { Ok(()) } - fn take_block(&mut self, builder: &mut PageBuilder) -> Result<(), Location> { + fn take_block(&mut self, builder: &mut PageBuilder) -> Result { while builder.has_capacity() && !self.values.is_empty() { self.take_block_once(builder)?; } - Ok(()) + Ok(!self.values.is_empty()) } fn restore_first(&mut self, location: &Location, data: DataBlock) { @@ -199,59 +187,59 @@ struct SizedChannel { is_plan_ready: WatchNotify, format_settings: Mutex>, - spiller: S, + spiller: Mutex>, } impl SizedChannel where S: DataBlockSpill { - fn create(max_size: usize, spiller: S) -> Self { + fn create(max_size: usize) -> Self { SizedChannel { inner: Mutex::new(SizedChannelBuffer::create(max_size)), notify_on_sent: Default::default(), notify_on_recv: Default::default(), is_plan_ready: WatchNotify::new(), format_settings: Mutex::new(None), - spiller, + spiller: Mutex::new(None), } } - #[allow(dead_code)] - fn try_send(&self, value: DataBlock) -> Result<(), Option> { - self.inner - .lock() - .unwrap() - .try_send(SpillableBlock::new(value)) - .map_err(|block| Some(block?.data.unwrap())) - } - #[fastrace::trace(name = "SizedChannel::try_take_block")] - async fn try_take_block(&self, builder: &mut PageBuilder) -> Result<(), ErrorCode> { + async fn try_take_block(&self, builder: &mut PageBuilder) -> Result { let location = match self.inner.lock().unwrap().take_block(builder) { Err(location) => location.clone(), - Ok(_) => { - return Ok(()); + Ok(has_more) => { + return Ok(has_more); } }; - let data = self.spiller.restore(&location).await?; + + let spiller = self.spiller.lock().unwrap().clone().unwrap(); + let data = spiller.restore(&location).await?; + self.inner.lock().unwrap().restore_first(&location, data); - Ok(()) + Ok(true) } #[async_backtrace::framed] - async fn send(&self, value: DataBlock) -> bool { + async fn send(&self, value: DataBlock) -> Result { let mut to_send = SpillableBlock::new(value); loop { let result = self.inner.lock().unwrap().try_send(to_send); match result { Ok(_) => { self.notify_on_sent.notify_one(); - return true; + return Ok(true); } - Err(None) => return false, + Err(None) => return Ok(false), Err(Some(v)) => { to_send = v; - // todo + if self.should_spill() { + let spiller = self.spiller.lock().unwrap().clone(); + if let Some(spiller) = spiller.as_ref() { + to_send.spill(spiller).await?; + continue; + } + } self.notify_on_recv.notified().await; } } @@ -294,6 +282,10 @@ where S: DataBlockSpill } self.notify_on_recv.notify_one(); } + + fn should_spill(&self) -> bool { + true + } } #[derive(Debug, PartialEq, Eq)] @@ -324,7 +316,9 @@ where S: DataBlockSpill while builder.has_capacity() { match tp { Wait::Async => { - self.chan.try_take_block(&mut builder).await?; + if !self.chan.try_take_block(&mut builder).await? { + break; + } } Wait::Deadline(t) => { let d = match t.checked_duration_since(Instant::now()) { @@ -371,7 +365,7 @@ where S: DataBlockSpill { #[async_backtrace::framed] pub async fn send(&self, value: DataBlock) -> bool { - self.chan.send(value).await + self.chan.send(value).await.unwrap() } pub fn close(&self) { @@ -384,9 +378,10 @@ where S: DataBlockSpill } } - pub fn plan_ready(&self, format_settings: FormatSettings) { + pub fn plan_ready(&mut self, format_settings: FormatSettings, spiller: Option) { assert!(!self.chan.is_plan_ready.has_notified()); *self.chan.format_settings.lock().unwrap() = Some(format_settings); + *self.chan.spiller.lock().unwrap() = spiller; self.chan.is_plan_ready.notify_waiters(); } } @@ -438,16 +433,6 @@ impl SpillableBlock { } } - #[allow(dead_code)] - fn memory_size(&self) -> usize { - self.memory_size - } - - #[allow(dead_code)] - fn is_empty(&self) -> bool { - self.rows == 0 - } - fn slice(&mut self, pos: usize) -> DataBlock { let data = self.data.as_ref().unwrap(); @@ -462,12 +447,6 @@ impl SpillableBlock { left } - #[allow(dead_code)] - fn take_data(&mut self) -> Option { - self.data.take() - } - - #[allow(dead_code)] async fn spill(&mut self, spiller: &S) -> Result<(), ErrorCode> where S: DataBlockSpill { let data = self.data.take().unwrap(); From e57c659234d0b39d7c02bd152e210aeae9f45ca8 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 4 Sep 2025 13:04:40 +0800 Subject: [PATCH 12/33] settings --- .../servers/http/v1/query/execute_state.rs | 51 ++++++++++--------- .../src/servers/http/v1/query/http_query.rs | 4 +- .../src/servers/http/v1/query/page_manager.rs | 6 +-- src/query/service/src/spillers/adapter.rs | 30 ++++++++--- src/query/service/src/spillers/test_memory.rs | 6 +-- src/query/settings/src/settings_default.rs | 14 +++++ .../settings/src/settings_getter_setter.rs | 8 +++ 7 files changed, 79 insertions(+), 40 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index 7f3ba08df2d63..ea69dc03c45d7 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -50,10 +50,9 @@ use crate::sessions::QueryAffect; use crate::sessions::QueryContext; use crate::sessions::Session; use crate::sessions::TableContext; -use crate::spillers::Spiller; +use crate::spillers::LiteSpiller; use crate::spillers::SpillerConfig; use crate::spillers::SpillerDiskConfig; -use crate::spillers::SpillerRef; use crate::spillers::SpillerType; pub struct ExecutionError; @@ -120,7 +119,7 @@ impl ExecuteState { pub struct ExecuteStarting { pub(crate) ctx: Arc, - pub(crate) sender: SizedChannelSender, + pub(crate) sender: SizedChannelSender, } pub struct ExecuteRunning { @@ -361,7 +360,7 @@ impl ExecuteState { sql: String, session: Arc, ctx: Arc, - mut block_sender: SizedChannelSender, + mut block_sender: SizedChannelSender, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to start query: {sql}"); @@ -426,7 +425,7 @@ impl ExecuteState { interpreter: Arc, schema: DataSchemaRef, ctx: Arc, - block_sender: SizedChannelSender, + block_sender: SizedChannelSender, executor: Arc>, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to execute {}", interpreter.name()); @@ -468,31 +467,35 @@ impl ExecuteState { fn apply_settings( ctx: &Arc, - block_sender: &mut SizedChannelSender, + block_sender: &mut SizedChannelSender, ) -> Result<()> { let settings = ctx.get_settings(); - let temp_dir_manager = TempDirManager::instance(); - let disk_bytes_limit = settings.get_sort_spilling_to_disk_bytes_limit()?; - let enable_dio = settings.get_enable_dio()?; - let disk_spill = temp_dir_manager - .get_disk_spill_dir(disk_bytes_limit, &ctx.get_id()) - .map(|temp_dir| SpillerDiskConfig::new(temp_dir, enable_dio)) - .transpose()?; - - let location_prefix = ctx.query_id_spill_prefix(); - let config = SpillerConfig { - spiller_type: SpillerType::OrderBy, - location_prefix, - disk_spill, - use_parquet: settings.get_spilling_file_format()?.is_parquet(), - }; - let op = DataOperator::instance().spill_operator(); - let spiller = Spiller::create(ctx.clone(), op, config)?.into(); + let spiller = if settings.get_enable_result_set_spilling()? { + let temp_dir_manager = TempDirManager::instance(); + let disk_bytes_limit = settings.get_result_set_spilling_to_disk_bytes_limit()?; + let enable_dio = settings.get_enable_dio()?; + let disk_spill = temp_dir_manager + .get_disk_spill_dir(disk_bytes_limit, &ctx.get_id()) + .map(|temp_dir| SpillerDiskConfig::new(temp_dir, enable_dio)) + .transpose()?; + + let location_prefix = ctx.query_id_spill_prefix(); + let config = SpillerConfig { + spiller_type: SpillerType::ResultSet, + location_prefix, + disk_spill, + use_parquet: settings.get_spilling_file_format()?.is_parquet(), + }; + let op = DataOperator::instance().spill_operator(); + Some(LiteSpiller::new(op, config)?) + } else { + None + }; // set_var may change settings let format_settings = ctx.get_format_settings()?; - block_sender.plan_ready(format_settings, Some(spiller)); + block_sender.plan_ready(format_settings, spiller); Ok(()) } } diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 36c484e889b76..813aeef489ac0 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -74,7 +74,7 @@ use crate::servers::http::v1::QueryStats; use crate::sessions::QueryAffect; use crate::sessions::Session; use crate::sessions::TableContext; -use crate::spillers::SpillerRef; +use crate::spillers::LiteSpiller; fn default_as_true() -> bool { true @@ -612,7 +612,7 @@ impl HttpQuery { }) }; - let (sender, receiver) = sized_spsc::(req.pagination.max_rows_in_buffer); + let (sender, receiver) = sized_spsc::(req.pagination.max_rows_in_buffer); let executor = Arc::new(Mutex::new(Executor { query_id: query_id.clone(), diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index fd5d25a0b0a8b..476f245e9db64 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -20,7 +20,7 @@ use databend_common_exception::Result; use super::blocks_serializer::BlocksSerializer; use crate::servers::http::v1::query::sized_spsc::SizedChannelReceiver; use crate::servers::http::v1::query::sized_spsc::Wait; -use crate::spillers::SpillerRef; +use crate::spillers::LiteSpiller; #[derive(Clone)] pub struct Page { @@ -38,13 +38,13 @@ pub struct PageManager { total_pages: usize, end: bool, last_page: Option, - block_receiver: SizedChannelReceiver, + block_receiver: SizedChannelReceiver, } impl PageManager { pub fn new( max_rows_per_page: usize, - block_receiver: SizedChannelReceiver, + block_receiver: SizedChannelReceiver, ) -> PageManager { PageManager { total_rows: 0, diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 0a31e2f355fb0..c0910b6082a92 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -385,12 +385,26 @@ impl SpillAdapter for LiteAdapter { } } -pub type LiteSpiller = Arc>; - -pub fn new_lite_spiller(operator: Operator, config: SpillerConfig) -> Result { - Ok(Arc::new(SpillerInner::new( - Default::default(), - operator, - config, - )?)) +#[derive(Clone)] +pub struct LiteSpiller(Arc>); + +impl LiteSpiller { + pub fn new(operator: Operator, config: SpillerConfig) -> Result { + Ok(LiteSpiller(Arc::new(SpillerInner::new( + Default::default(), + operator, + config, + )?))) + } +} + +#[async_trait::async_trait] +impl DataBlockSpill for LiteSpiller { + async fn merge_and_spill(&self, data_block: Vec) -> Result { + self.0.spill(data_block).await + } + + async fn restore(&self, location: &Location) -> Result { + self.0.read_spilled_file(location).await + } } diff --git a/src/query/service/src/spillers/test_memory.rs b/src/query/service/src/spillers/test_memory.rs index 3fbf35af9735d..bacf0bdf9f5a8 100644 --- a/src/query/service/src/spillers/test_memory.rs +++ b/src/query/service/src/spillers/test_memory.rs @@ -28,12 +28,12 @@ use databend_common_expression::types::StringType; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::FromData; +use databend_common_pipeline_transforms::traits::DataBlockSpill; use databend_common_pipeline_transforms::traits::Location; use databend_common_storage::DataOperator; use databend_storages_common_cache::TempDirManager; use rand::Rng; -use crate::spillers::new_lite_spiller; use crate::spillers::LiteSpiller; use crate::spillers::SpillerConfig; use crate::spillers::SpillerDiskConfig; @@ -209,7 +209,7 @@ async fn init() -> Result { use_parquet: true, }; let operator = DataOperator::instance().spill_operator(); - new_lite_spiller(operator, spiller_config) + LiteSpiller::new(operator, spiller_config) } #[derive(Debug)] @@ -225,7 +225,7 @@ async fn run_spill_test(spiller: &LiteSpiller, rows: usize) -> Result Result { + Ok(self.try_get_u64("result_set_spilling_to_disk_bytes_limit")? as usize) + } + + pub fn get_enable_result_set_spilling(&self) -> Result { + Ok(self.try_get_u64("enable_result_set_spilling")? == 1) + } + pub fn get_enable_shuffle_sort(&self) -> Result { Ok(self.try_get_u64("enable_shuffle_sort")? == 1) } From db1216f938f0e087aacc9b16c6bace6b4c8e31b4 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 4 Sep 2025 15:20:50 +0800 Subject: [PATCH 13/33] fix --- .../http/v1/query/blocks_serializer.rs | 4 + .../servers/http/v1/query/execute_state.rs | 30 +++- .../src/servers/http/v1/query/http_query.rs | 5 +- .../src/servers/http/v1/query/sized_spsc.rs | 148 ++++++++++++++---- 4 files changed, 149 insertions(+), 38 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/blocks_serializer.rs b/src/query/service/src/servers/http/v1/query/blocks_serializer.rs index b6b9056a4ed84..b9b51a336ce15 100644 --- a/src/query/service/src/servers/http/v1/query/blocks_serializer.rs +++ b/src/query/service/src/servers/http/v1/query/blocks_serializer.rs @@ -66,6 +66,10 @@ impl BlocksCollector { self.append_columns(columns, num_rows); } + pub fn num_rows(&self) -> usize { + self.columns.iter().map(|(_, num_rows)| *num_rows).sum() + } + pub fn into_serializer(self, format: FormatSettings) -> BlocksSerializer { BlocksSerializer { columns: self.columns, diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index ea69dc03c45d7..f5c6afe475a3a 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -425,7 +425,7 @@ impl ExecuteState { interpreter: Arc, schema: DataSchemaRef, ctx: Arc, - block_sender: SizedChannelSender, + mut block_sender: SizedChannelSender, executor: Arc>, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to execute {}", interpreter.name()); @@ -437,7 +437,7 @@ impl ExecuteState { match data_stream.next().await { None => { let block = DataBlock::empty_with_schema(schema); - block_sender.send(block).await; + let _ = block_sender.send(block).await; Executor::stop::<()>(&executor, Ok(())); block_sender.close(); } @@ -446,11 +446,15 @@ impl ExecuteState { block_sender.close(); } Some(Ok(block)) => { - block_sender.send(block).await; - while let Some(block_r) = data_stream.next().await { - match block_r { + Self::send_data_block(&mut block_sender, &executor, block) + .await + .with_context(make_error)?; + while let Some(block) = data_stream.next().await { + match block { Ok(block) => { - block_sender.send(block.clone()).await; + Self::send_data_block(&mut block_sender, &executor, block) + .await + .with_context(make_error)?; } Err(err) => { block_sender.close(); @@ -465,6 +469,20 @@ impl ExecuteState { Ok(()) } + async fn send_data_block( + sender: &mut SizedChannelSender, + executor: &Arc>, + block: DataBlock, + ) -> Result<()> { + match sender.send(block).await { + Ok(_) => Ok(()), + Err(err) => { + Executor::stop(executor, Err(err.clone())); + Err(err) + } + } + } + fn apply_settings( ctx: &Arc, block_sender: &mut SizedChannelSender, diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 813aeef489ac0..320eb14ff7be4 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -802,12 +802,13 @@ impl HttpQuery { GlobalQueryRuntime::instance().runtime().try_spawn( async move { + let block_sender_closer = block_sender.closer(); if let Err(e) = CatchUnwindFuture::create(ExecuteState::try_start_query( query_state.clone(), sql, query_session, query_context.clone(), - block_sender.clone(), + block_sender, )) .await .with_context(|| "failed to start query") @@ -834,7 +835,7 @@ impl HttpQuery { e ); Executor::start_to_stop(&query_state, ExecuteState::Stopped(Box::new(state))); - block_sender.close(); + block_sender_closer.close(); } } .in_span(span), diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index fa7248a56569d..aca4e1627c70e 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -17,6 +17,7 @@ use std::collections::VecDeque; use std::fmt; use std::fmt::Debug; use std::fmt::Formatter; +use std::result; use std::sync::Arc; use std::sync::Mutex; use std::time::Instant; @@ -24,7 +25,7 @@ use std::time::Instant; use databend_common_base::base::tokio; use databend_common_base::base::tokio::sync::Notify; use databend_common_base::base::WatchNotify; -use databend_common_exception::ErrorCode; +use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; use databend_common_pipeline_transforms::traits::DataBlockSpill; @@ -113,7 +114,7 @@ impl SizedChannelBuffer { .sum::() } - fn try_send(&mut self, value: SpillableBlock) -> Result<(), Option> { + fn try_send(&mut self, value: SpillableBlock) -> result::Result<(), Option> { if self.is_recv_stopped || self.is_send_stopped { return Err(None); } @@ -148,7 +149,7 @@ impl SizedChannelBuffer { self.is_recv_stopped = true } - fn take_block_once(&mut self, builder: &mut PageBuilder) -> Result<(), Location> { + fn take_block_once(&mut self, builder: &mut PageBuilder) -> result::Result<(), Location> { let Some(block) = self.values.front_mut() else { return Ok(()); }; @@ -168,7 +169,7 @@ impl SizedChannelBuffer { Ok(()) } - fn take_block(&mut self, builder: &mut PageBuilder) -> Result { + fn take_block(&mut self, builder: &mut PageBuilder) -> result::Result { while builder.has_capacity() && !self.values.is_empty() { self.take_block_once(builder)?; } @@ -205,7 +206,7 @@ where S: DataBlockSpill } #[fastrace::trace(name = "SizedChannel::try_take_block")] - async fn try_take_block(&self, builder: &mut PageBuilder) -> Result { + async fn try_take_block(&self, builder: &mut PageBuilder) -> Result { let location = match self.inner.lock().unwrap().take_block(builder) { Err(location) => location.clone(), Ok(has_more) => { @@ -221,7 +222,7 @@ where S: DataBlockSpill } #[async_backtrace::framed] - async fn send(&self, value: DataBlock) -> Result { + async fn send(&self, value: DataBlock) -> Result { let mut to_send = SpillableBlock::new(value); loop { let result = self.inner.lock().unwrap().try_send(to_send); @@ -310,7 +311,7 @@ where S: DataBlockSpill &mut self, max_rows_per_page: usize, tp: &Wait, - ) -> Result<(BlocksSerializer, bool), ErrorCode> { + ) -> Result<(BlocksSerializer, bool)> { let mut builder = PageBuilder::new(max_rows_per_page); while builder.has_capacity() { @@ -346,12 +347,15 @@ where S: DataBlockSpill } } + let format = self.chan.format_settings.lock().unwrap().clone(); + let format = format.unwrap_or_else(|| { + assert!(builder.collector.num_rows() == 0); + FormatSettings::default() + }); + // try to report 'no more data' earlier to client to avoid unnecessary http call let block_end = self.chan.is_close(); - Ok(( - builder.into_serializer(self.chan.format_settings.lock().unwrap().clone().unwrap()), - block_end, - )) + Ok((builder.into_serializer(format), block_end)) } } @@ -364,8 +368,8 @@ impl SizedChannelSender where S: DataBlockSpill { #[async_backtrace::framed] - pub async fn send(&self, value: DataBlock) -> bool { - self.chan.send(value).await.unwrap() + pub async fn send(&self, value: DataBlock) -> Result { + self.chan.send(value).await } pub fn close(&self) { @@ -393,7 +397,7 @@ pub struct SizedChannelSenderCloser { impl SizedChannelSenderCloser where S: DataBlockSpill { - pub fn close(&self) { + pub fn close(self) { self.chan.stop_send() } } @@ -407,21 +411,6 @@ struct SpillableBlock { processed: usize, } -/// Placeholder implementation of Spill trait -#[derive(Clone)] -pub struct PlaceholderSpill; - -#[async_trait::async_trait] -impl DataBlockSpill for PlaceholderSpill { - async fn merge_and_spill(&self, _data_block: Vec) -> Result { - todo!("PlaceholderSpill::merge_and_spill not implemented") - } - - async fn restore(&self, _location: &Location) -> Result { - todo!("PlaceholderSpill::restore not implemented") - } -} - impl SpillableBlock { fn new(data: DataBlock) -> Self { Self { @@ -447,7 +436,7 @@ impl SpillableBlock { left } - async fn spill(&mut self, spiller: &S) -> Result<(), ErrorCode> + async fn spill(&mut self, spiller: &S) -> Result<()> where S: DataBlockSpill { let data = self.data.take().unwrap(); if self.location.is_none() { @@ -474,3 +463,102 @@ impl Debug for SpillableBlock { .finish() } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + use std::sync::Mutex; + + use databend_common_exception::ErrorCode; + use databend_common_expression::types::Int32Type; + use databend_common_expression::FromData; + use databend_common_io::prelude::FormatSettings; + use databend_common_pipeline_transforms::traits::DataBlockSpill; + use databend_common_pipeline_transforms::traits::Location; + + use super::*; + + #[derive(Clone)] + struct MockSpiller { + storage: Arc>>, + } + + impl MockSpiller { + fn new() -> Self { + Self { + storage: Arc::new(Mutex::new(HashMap::new())), + } + } + } + + #[async_trait::async_trait] + impl DataBlockSpill for MockSpiller { + async fn spill(&self, data_block: DataBlock) -> Result { + let key = format!("block_{}", rand::random::()); + self.storage.lock().unwrap().insert(key.clone(), data_block); + Ok(Location::Remote(key)) + } + + async fn merge_and_spill(&self, _data_block: Vec) -> Result { + unimplemented!() + } + + async fn restore(&self, location: &Location) -> Result { + match location { + Location::Remote(key) => { + let storage = self.storage.lock().unwrap(); + storage + .get(key) + .cloned() + .ok_or_else(|| ErrorCode::Internal("Block not found in mock spiller")) + } + _ => Err(ErrorCode::Internal("Unsupported location type")), + } + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_sized_spsc_channel() { + let (mut sender, mut receiver) = sized_spsc::(2); + + let test_data = vec![ + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![1, 2, 3])]), + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![4, 5, 6])]), + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![7, 8, 9])]), + ]; + + let sender_data = test_data.clone(); + let send_task = databend_common_base::runtime::spawn(async move { + let format_settings = FormatSettings::default(); + let spiller = MockSpiller::new(); + sender.plan_ready(format_settings, Some(spiller)); + + for block in sender_data { + sender.send(block).await.unwrap(); + } + sender.close(); + }); + + let mut received_blocks = Vec::new(); + loop { + let (serializer, is_end) = receiver.collect_new_page(10, &Wait::Async).await.unwrap(); + + if serializer.num_rows() > 0 { + received_blocks.push(serializer.num_rows()); + } + + if is_end { + break; + } + } + + send_task.await.unwrap(); + + let expected_total_rows: usize = test_data.iter().map(|b| b.num_rows()).sum(); + let received_total_rows: usize = received_blocks.iter().sum(); + + assert_eq!(expected_total_rows, received_total_rows); + assert!(!received_blocks.is_empty()); + } +} From 4014c07cd662a661ff88e682d72314427e87bb80 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 4 Sep 2025 21:02:24 +0800 Subject: [PATCH 14/33] fix --- .../src/servers/http/v1/query/sized_spsc.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index aca4e1627c70e..79a5ca468ea17 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -26,6 +26,7 @@ use databend_common_base::base::tokio; use databend_common_base::base::tokio::sync::Notify; use databend_common_base::base::WatchNotify; use databend_common_exception::Result; +use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; use databend_common_pipeline_transforms::traits::DataBlockSpill; @@ -65,7 +66,17 @@ impl PageBuilder { self.collector.append_block(block); } - fn calculate_take_rows(&self, block_rows: usize, memory_size: usize) -> usize { + fn calculate_take_rows(&self, block: &DataBlock) -> usize { + let block_rows = block.num_rows(); + let memory_size = block + .columns() + .iter() + .map(|entry| match entry { + BlockEntry::Const(scalar, _, n) => *n * scalar.as_ref().memory_size(), + BlockEntry::Column(column) => column.memory_size(), + }) + .sum::(); + min( self.remain_rows, if memory_size > self.remain_size { @@ -157,7 +168,7 @@ impl SizedChannelBuffer { return Err(block.location.clone().unwrap()); }; - let take_rows = builder.calculate_take_rows(data.num_rows(), data.memory_size()); + let take_rows = builder.calculate_take_rows(&data); if take_rows < data.num_rows() { builder.remain_rows = 0; builder.collector.append_block(block.slice(take_rows)); @@ -303,7 +314,8 @@ impl SizedChannelReceiver where S: DataBlockSpill { pub fn close(&self) { - self.chan.stop_recv() + self.chan.stop_recv(); + self.chan.spiller.lock().unwrap().take(); // release session } #[async_backtrace::framed] From 12ea7923b110b2b942afd8f1d1d3e89416322ee6 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 5 Sep 2025 07:15:54 +0800 Subject: [PATCH 15/33] fix --- src/query/service/src/servers/http/v1/query/sized_spsc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 79a5ca468ea17..7253606ca7d78 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -168,7 +168,7 @@ impl SizedChannelBuffer { return Err(block.location.clone().unwrap()); }; - let take_rows = builder.calculate_take_rows(&data); + let take_rows = builder.calculate_take_rows(data); if take_rows < data.num_rows() { builder.remain_rows = 0; builder.collector.append_block(block.slice(take_rows)); From 44b8ac677639cabd9e9b0ef9fa89dd8f8f0c8823 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 5 Sep 2025 15:43:34 +0800 Subject: [PATCH 16/33] fix --- Cargo.lock | 1 + src/query/service/Cargo.toml | 1 + .../src/servers/http/v1/query/sized_spsc.rs | 21 +++++++++++-------- .../it/servers/http/http_query_handlers.rs | 17 ++++++++------- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c5c6163b7a81..b9b462ca9cf0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5246,6 +5246,7 @@ dependencies = [ "databend-storages-common-table-meta", "derive-visitor", "dyn-clone", + "either", "enum-as-inner", "ethnum", "fastrace", diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 0016398385d49..8a5715357df1e 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -117,6 +117,7 @@ databend-storages-common-stage = { workspace = true } databend-storages-common-table-meta = { workspace = true } derive-visitor = { workspace = true } dyn-clone = { workspace = true } +either = { workspace = true } enum-as-inner = { workspace = true } ethnum = { workspace = true } fastrace = { workspace = true } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 7253606ca7d78..803d9855961be 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -31,6 +31,7 @@ use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; use databend_common_pipeline_transforms::traits::DataBlockSpill; use databend_common_pipeline_transforms::traits::Location; +use either::Either; use log::debug; use log::info; @@ -160,12 +161,12 @@ impl SizedChannelBuffer { self.is_recv_stopped = true } - fn take_block_once(&mut self, builder: &mut PageBuilder) -> result::Result<(), Location> { + fn take_block_once(&mut self, builder: &mut PageBuilder) -> Either<(), Location> { let Some(block) = self.values.front_mut() else { - return Ok(()); + return Either::Left(()); }; let Some(data) = &block.data else { - return Err(block.location.clone().unwrap()); + return Either::Right(block.location.clone().unwrap()); }; let take_rows = builder.calculate_take_rows(data); @@ -177,14 +178,16 @@ impl SizedChannelBuffer { self.values.pop_front(); builder.append_full_block(data); } - Ok(()) + Either::Left(()) } - fn take_block(&mut self, builder: &mut PageBuilder) -> result::Result { + fn take_block(&mut self, builder: &mut PageBuilder) -> Either { while builder.has_capacity() && !self.values.is_empty() { - self.take_block_once(builder)?; + if let Either::Right(location) = self.take_block_once(builder) { + return Either::Right(location); + } } - Ok(!self.values.is_empty()) + Either::Left(!self.values.is_empty()) } fn restore_first(&mut self, location: &Location, data: DataBlock) { @@ -219,8 +222,8 @@ where S: DataBlockSpill #[fastrace::trace(name = "SizedChannel::try_take_block")] async fn try_take_block(&self, builder: &mut PageBuilder) -> Result { let location = match self.inner.lock().unwrap().take_block(builder) { - Err(location) => location.clone(), - Ok(has_more) => { + Either::Right(location) => location, + Either::Left(has_more) => { return Ok(has_more); } }; diff --git a/src/query/service/tests/it/servers/http/http_query_handlers.rs b/src/query/service/tests/it/servers/http/http_query_handlers.rs index 79e588fb669b1..eb5a5366e379c 100644 --- a/src/query/service/tests/it/servers/http/http_query_handlers.rs +++ b/src/query/service/tests/it/servers/http/http_query_handlers.rs @@ -524,9 +524,9 @@ async fn test_client_compatible_query_id() -> Result<()> { #[tokio::test(flavor = "current_thread")] async fn test_active_sessions() -> Result<()> { - let max_active_sessions = 2; let conf = ConfigBuilder::create() - .max_active_sessions(max_active_sessions) + .max_active_sessions(2) + .off_log() .build(); let _fixture = TestFixture::setup_with_config(&conf).await?; let ep = create_endpoint()?; @@ -542,6 +542,7 @@ async fn test_active_sessions() -> Result<()> { .into_iter() .map(|(_status, resp)| (resp.error.map(|e| e.message).unwrap_or_default())) .collect::>(); + _fixture.keep_alive(); results.sort(); let msg = "[HTTP-QUERY] Failed to upgrade session: Current active sessions (2) has exceeded the max_active_sessions limit (2)"; let expect = vec!["", "", msg]; @@ -1764,12 +1765,12 @@ async fn test_max_size_per_page() -> Result<()> { let json = serde_json::json!({"sql": sql.to_string(), "pagination": {"wait_time_secs": wait_time_secs}}); let (_, reply, body) = TestHttpQueryRequest::new(json).fetch_begin().await?; assert!(reply.error.is_none(), "{:?}", reply.error); - let len = body.len() as i32; - let target = 20_080_000; - assert!(len > target); - assert!(len < target + 2000); - assert_eq!(reply.data.len(), 10_000); - assert_eq!(reply.data[0].len(), 2); + let target = (10_usize * 1024 * 1024) as f64; + assert!( + (0.9..1.1).contains(&(body.len() as f64 / target)), + "body len {}", + body.len() + ); Ok(()) } From 0242a0a4b93637e6e625f08952dfb71dff5abb71 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 5 Sep 2025 18:24:57 +0800 Subject: [PATCH 17/33] PortableSpiller --- .../src/servers/http/v1/query/execute_state.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index f5c6afe475a3a..d466d64f5c686 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -55,6 +55,8 @@ use crate::spillers::SpillerConfig; use crate::spillers::SpillerDiskConfig; use crate::spillers::SpillerType; +type Sender = SizedChannelSender; + pub struct ExecutionError; #[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq)] @@ -119,7 +121,7 @@ impl ExecuteState { pub struct ExecuteStarting { pub(crate) ctx: Arc, - pub(crate) sender: SizedChannelSender, + pub(crate) sender: Sender, } pub struct ExecuteRunning { @@ -360,7 +362,7 @@ impl ExecuteState { sql: String, session: Arc, ctx: Arc, - mut block_sender: SizedChannelSender, + mut block_sender: Sender, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to start query: {sql}"); @@ -425,7 +427,7 @@ impl ExecuteState { interpreter: Arc, schema: DataSchemaRef, ctx: Arc, - mut block_sender: SizedChannelSender, + mut block_sender: Sender, executor: Arc>, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to execute {}", interpreter.name()); @@ -470,7 +472,7 @@ impl ExecuteState { } async fn send_data_block( - sender: &mut SizedChannelSender, + sender: &mut Sender, executor: &Arc>, block: DataBlock, ) -> Result<()> { @@ -483,10 +485,7 @@ impl ExecuteState { } } - fn apply_settings( - ctx: &Arc, - block_sender: &mut SizedChannelSender, - ) -> Result<()> { + fn apply_settings(ctx: &Arc, block_sender: &mut Sender) -> Result<()> { let settings = ctx.get_settings(); let spiller = if settings.get_enable_result_set_spilling()? { From 19b90b72f24bf07dda93682c132046d3dc0e3137 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 8 Sep 2025 12:14:01 +0800 Subject: [PATCH 18/33] x --- .../src/servers/http/v1/query/sized_spsc.rs | 124 +++++++++--------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 803d9855961be..e35640c053ba9 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -38,60 +38,12 @@ use log::info; use super::blocks_serializer::BlocksCollector; use super::blocks_serializer::BlocksSerializer; -pub struct PageBuilder { - pub collector: BlocksCollector, - pub remain_rows: usize, - pub remain_size: usize, -} - -impl PageBuilder { - fn new(max_rows: usize) -> Self { - Self { - collector: BlocksCollector::new(), - remain_size: 10 * 1024 * 1024, - remain_rows: max_rows, - } - } - - fn has_capacity(&self) -> bool { - self.remain_rows > 0 && self.remain_size > 0 - } - - fn append_full_block(&mut self, block: DataBlock) { - let memory_size = block.memory_size(); - let num_rows = block.num_rows(); - - self.remain_size -= min(self.remain_size, memory_size); - self.remain_rows -= num_rows; - - self.collector.append_block(block); - } - - fn calculate_take_rows(&self, block: &DataBlock) -> usize { - let block_rows = block.num_rows(); - let memory_size = block - .columns() - .iter() - .map(|entry| match entry { - BlockEntry::Const(scalar, _, n) => *n * scalar.as_ref().memory_size(), - BlockEntry::Column(column) => column.memory_size(), - }) - .sum::(); - - min( - self.remain_rows, - if memory_size > self.remain_size { - (self.remain_size * block_rows) / memory_size - } else { - block_rows - }, - ) - .max(1) - } - - fn into_serializer(self, format_settings: FormatSettings) -> BlocksSerializer { - self.collector.into_serializer(format_settings) - } +pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) +where S: DataBlockSpill { + let chan = Arc::new(SizedChannel::create(max_size)); + let sender = SizedChannelSender { chan: chan.clone() }; + let receiver = SizedChannelReceiver { chan }; + (sender, receiver) } struct SizedChannelBuffer { @@ -101,14 +53,6 @@ struct SizedChannelBuffer { is_send_stopped: bool, } -pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) -where S: DataBlockSpill { - let chan = Arc::new(SizedChannel::create(max_size)); - let sender = SizedChannelSender { chan: chan.clone() }; - let receiver = SizedChannelReceiver { chan }; - (sender, receiver) -} - impl SizedChannelBuffer { fn create(max_size: usize) -> Self { SizedChannelBuffer { @@ -195,6 +139,62 @@ impl SizedChannelBuffer { } } +pub struct PageBuilder { + pub collector: BlocksCollector, + pub remain_rows: usize, + pub remain_size: usize, +} + +impl PageBuilder { + fn new(max_rows: usize) -> Self { + Self { + collector: BlocksCollector::new(), + remain_size: 10 * 1024 * 1024, + remain_rows: max_rows, + } + } + + fn has_capacity(&self) -> bool { + self.remain_rows > 0 && self.remain_size > 0 + } + + fn append_full_block(&mut self, block: DataBlock) { + let memory_size = block.memory_size(); + let num_rows = block.num_rows(); + + self.remain_size -= min(self.remain_size, memory_size); + self.remain_rows -= num_rows; + + self.collector.append_block(block); + } + + fn calculate_take_rows(&self, block: &DataBlock) -> usize { + let block_rows = block.num_rows(); + let memory_size = block + .columns() + .iter() + .map(|entry| match entry { + BlockEntry::Const(scalar, _, n) => *n * scalar.as_ref().memory_size(), + BlockEntry::Column(column) => column.memory_size(), + }) + .sum::(); + + min( + self.remain_rows, + if memory_size > self.remain_size { + (self.remain_size * block_rows) / memory_size + } else { + block_rows + }, + ) + .max(1) + } + + fn into_serializer(self, format_settings: FormatSettings) -> BlocksSerializer { + self.collector.into_serializer(format_settings) + } +} + struct SizedChannel { inner: Mutex, notify_on_sent: Notify, From a9ab260dc248a7a02132b514dcf9b6921bc8774a Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 8 Sep 2025 17:39:41 +0800 Subject: [PATCH 19/33] x --- .../src/servers/http/v1/query/http_query.rs | 5 +- .../src/servers/http/v1/query/sized_spsc.rs | 365 +++++++++++------- 2 files changed, 227 insertions(+), 143 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 320eb14ff7be4..d0432b7b18840 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -612,7 +612,10 @@ impl HttpQuery { }) }; - let (sender, receiver) = sized_spsc::(req.pagination.max_rows_in_buffer); + let (sender, receiver) = sized_spsc::( + req.pagination.max_rows_in_buffer, + req.pagination.max_rows_per_page, + ); let executor = Arc::new(Mutex::new(Executor { query_id: query_id.clone(), diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index e35640c053ba9..8787aa0b9eb98 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -31,70 +31,117 @@ use databend_common_expression::DataBlock; use databend_common_io::prelude::FormatSettings; use databend_common_pipeline_transforms::traits::DataBlockSpill; use databend_common_pipeline_transforms::traits::Location; -use either::Either; use log::debug; use log::info; use super::blocks_serializer::BlocksCollector; use super::blocks_serializer::BlocksSerializer; -pub fn sized_spsc(max_size: usize) -> (SizedChannelSender, SizedChannelReceiver) -where S: DataBlockSpill { - let chan = Arc::new(SizedChannel::create(max_size)); +pub fn sized_spsc( + max_size: usize, + page_size: usize, +) -> (SizedChannelSender, SizedChannelReceiver) +where + S: DataBlockSpill, +{ + let chan = Arc::new(SizedChannel::create(max_size, page_size)); let sender = SizedChannelSender { chan: chan.clone() }; let receiver = SizedChannelReceiver { chan }; (sender, receiver) } +enum Page { + Memory(Vec), + Spilled(SpillableBlock), +} + struct SizedChannelBuffer { max_size: usize, - values: VecDeque, + page_size: usize, + pages: VecDeque, + current_page: Option, is_recv_stopped: bool, is_send_stopped: bool, } impl SizedChannelBuffer { - fn create(max_size: usize) -> Self { + fn create(max_size: usize, page_size: usize) -> Self { SizedChannelBuffer { max_size, - values: Default::default(), + page_size, + pages: Default::default(), + current_page: None, is_recv_stopped: false, is_send_stopped: false, } } - fn size(&self) -> usize { - self.values + fn pages_rows(&self) -> usize { + self.pages .iter() - .map(|x| x.data.as_ref().map(DataBlock::num_rows).unwrap_or_default()) + .map(|page| match page { + Page::Memory(blocks) => blocks.iter().map(DataBlock::num_rows).sum::(), + Page::Spilled(_) => 0, + }) .sum::() } - fn try_send(&mut self, value: SpillableBlock) -> result::Result<(), Option> { + fn try_add_block(&mut self, mut block: DataBlock) -> result::Result<(), SendFail> { if self.is_recv_stopped || self.is_send_stopped { - return Err(None); + return Err(SendFail::Closed); } - match &value.data { - Some(data) => { - let value_size = data.num_rows(); - let current_size = self.size(); - if current_size + value_size <= self.max_size || current_size == 0 { - self.values.push_back(value); - Ok(()) - } else { - Err(Some(value)) + loop { + let page_builder = self + .current_page + .get_or_insert_with(|| PageBuilder::new(self.page_size)); + + let remain = page_builder.try_append_block(block); + if !page_builder.has_capacity() { + let rows = page_builder.num_rows(); + let page = self.current_page.take().unwrap().into_page(); + if self.pages_rows() + rows > self.max_size { + return Err(SendFail::Full { page, remain }); } + self.pages.push_back(Page::Memory(page)); } - None => { - self.values.push_back(value); - Ok(()) + match remain { + Some(remain) => block = remain, + None => return Ok(()), } } } + fn try_add_page(&mut self, page: Page) -> result::Result<(), SendFail> { + assert!(self.current_page.is_none()); + + if self.is_recv_stopped || self.is_send_stopped { + return Err(SendFail::Closed); + } + + match page { + Page::Memory(blocks) => { + let rows = blocks.iter().map(DataBlock::num_rows).sum::(); + if self.pages_rows() + rows > self.max_size { + return Err(SendFail::Full { + page: blocks, + remain: None, + }); + }; + self.pages.push_back(Page::Memory(blocks)) + } + page @ Page::Spilled(_) => self.pages.push_back(page), + }; + Ok(()) + } + fn is_close(&self) -> bool { - self.values.is_empty() && self.is_send_stopped + self.current_page + .as_ref() + .map(PageBuilder::is_empty) + .unwrap_or(true) + && self.pages.is_empty() + && self.is_send_stopped } fn stop_send(&mut self) { @@ -105,42 +152,30 @@ impl SizedChannelBuffer { self.is_recv_stopped = true } - fn take_block_once(&mut self, builder: &mut PageBuilder) -> Either<(), Location> { - let Some(block) = self.values.front_mut() else { - return Either::Left(()); - }; - let Some(data) = &block.data else { - return Either::Right(block.location.clone().unwrap()); - }; - - let take_rows = builder.calculate_take_rows(data); - if take_rows < data.num_rows() { - builder.remain_rows = 0; - builder.collector.append_block(block.slice(take_rows)); - } else { - let data = block.data.take().unwrap(); - self.values.pop_front(); - builder.append_full_block(data); - } - Either::Left(()) - } - - fn take_block(&mut self, builder: &mut PageBuilder) -> Either { - while builder.has_capacity() && !self.values.is_empty() { - if let Either::Right(location) = self.take_block_once(builder) { - return Either::Right(location); + fn flush_current_page(&mut self) { + if let Some(page_builder) = self.current_page.take() { + let blocks = page_builder.into_page(); + if !blocks.is_empty() { + self.pages.push_back(Page::Memory(blocks)); } } - Either::Left(!self.values.is_empty()) } - fn restore_first(&mut self, location: &Location, data: DataBlock) { - self.values.front_mut().unwrap().restore(location, data); + fn take_page(&mut self) -> Option { + self.pages.pop_front() } } +enum SendFail { + Closed, + Full { + page: Vec, + remain: Option, + }, +} + pub struct PageBuilder { - pub collector: BlocksCollector, + pub blocks: Vec, pub remain_rows: usize, pub remain_size: usize, } @@ -148,7 +183,7 @@ pub struct PageBuilder { impl PageBuilder { fn new(max_rows: usize) -> Self { Self { - collector: BlocksCollector::new(), + blocks: Vec::new(), remain_size: 10 * 1024 * 1024, remain_rows: max_rows, } @@ -158,14 +193,12 @@ impl PageBuilder { self.remain_rows > 0 && self.remain_size > 0 } - fn append_full_block(&mut self, block: DataBlock) { - let memory_size = block.memory_size(); - let num_rows = block.num_rows(); - - self.remain_size -= min(self.remain_size, memory_size); - self.remain_rows -= num_rows; + fn is_empty(&self) -> bool { + self.blocks.is_empty() + } - self.collector.append_block(block); + fn num_rows(&self) -> usize { + self.blocks.iter().map(DataBlock::num_rows).sum() } fn calculate_take_rows(&self, block: &DataBlock) -> usize { @@ -190,13 +223,36 @@ impl PageBuilder { .max(1) } - fn into_serializer(self, format_settings: FormatSettings) -> BlocksSerializer { - self.collector.into_serializer(format_settings) + fn append_full_block(&mut self, block: DataBlock) { + let memory_size = block.memory_size(); + let num_rows = block.num_rows(); + + self.remain_size -= min(self.remain_size, memory_size); + self.remain_rows -= num_rows; + + self.blocks.push(block); + } + + fn try_append_block(&mut self, block: DataBlock) -> Option { + let take_rows = self.calculate_take_rows(&block); + let total = block.num_rows(); + if take_rows < block.num_rows() { + self.remain_rows = 0; + self.append_full_block(block.slice(0..take_rows)); + Some(block.slice(take_rows..total)) + } else { + self.append_full_block(block); + None + } + } + + fn into_page(self) -> Vec { + self.blocks } } struct SizedChannel { - inner: Mutex, + buffer: Mutex, notify_on_sent: Notify, notify_on_recv: Notify, @@ -208,9 +264,9 @@ struct SizedChannel { impl SizedChannel where S: DataBlockSpill { - fn create(max_size: usize) -> Self { + fn create(max_size: usize, page_size: usize) -> Self { SizedChannel { - inner: Mutex::new(SizedChannelBuffer::create(max_size)), + buffer: Mutex::new(SizedChannelBuffer::create(max_size, page_size)), notify_on_sent: Default::default(), notify_on_recv: Default::default(), is_plan_ready: WatchNotify::new(), @@ -219,54 +275,88 @@ where S: DataBlockSpill } } - #[fastrace::trace(name = "SizedChannel::try_take_block")] - async fn try_take_block(&self, builder: &mut PageBuilder) -> Result { - let location = match self.inner.lock().unwrap().take_block(builder) { - Either::Right(location) => location, - Either::Left(has_more) => { - return Ok(has_more); + #[fastrace::trace(name = "SizedChannel::try_take_page")] + async fn try_take_page(&self) -> Result> { + let format_settings = self + .format_settings + .lock() + .unwrap() + .as_ref() + .cloned() + .unwrap_or_default(); + let page = self.buffer.lock().unwrap().take_page(); + match page { + Some(Page::Memory(blocks)) => { + let mut collector = BlocksCollector::new(); + for block in blocks { + collector.append_block(block); + } + Ok(Some(collector.into_serializer(format_settings))) } - }; - - let spiller = self.spiller.lock().unwrap().clone().unwrap(); - let data = spiller.restore(&location).await?; - - self.inner.lock().unwrap().restore_first(&location, data); - Ok(true) + Some(Page::Spilled(_)) => { + // TODO: Implement restoration from spilled page + unimplemented!("Restoration from spilled page not implemented yet") + } + None => Ok(None), + } } #[async_backtrace::framed] - async fn send(&self, value: DataBlock) -> Result { - let mut to_send = SpillableBlock::new(value); + async fn send(&self, mut block: DataBlock) -> Result { loop { - let result = self.inner.lock().unwrap().try_send(to_send); + let result = self.buffer.lock().unwrap().try_add_block(block); match result { Ok(_) => { self.notify_on_sent.notify_one(); return Ok(true); } - Err(None) => return Ok(false), - Err(Some(v)) => { - to_send = v; - if self.should_spill() { - let spiller = self.spiller.lock().unwrap().clone(); - if let Some(spiller) = spiller.as_ref() { - to_send.spill(spiller).await?; - continue; + Err(SendFail::Closed) => { + self.notify_on_sent.notify_one(); + return Ok(false); + } + Err(SendFail::Full { page, remain }) => { + let mut to_add = self.try_spill_page(page).await?; + loop { + let result = self.buffer.lock().unwrap().try_add_page(to_add); + match result { + Ok(_) => break, + Err(SendFail::Closed) => return Ok(false), + Err(SendFail::Full { page, .. }) => { + to_add = Page::Memory(page); + self.notify_on_recv.notified().await; + } + } + } + + match remain { + Some(remain) => { + block = remain; } + None => return Ok(true), } - self.notify_on_recv.notified().await; } } } } + async fn try_spill_page(&self, page: Vec) -> Result { + if !self.should_spill() { + return Ok(Page::Memory(page)); + } + let spiller = self.spiller.lock().unwrap().clone(); + let Some(spiller) = spiller.as_ref() else { + return Ok(Page::Memory(page)); + }; + let spilled = SpillableBlock::spill_page(page, spiller).await?; + Ok(Page::Spilled(spilled)) + } + #[async_backtrace::framed] async fn recv(&self) -> bool { loop { { - let g = self.inner.lock().unwrap(); - if !g.values.is_empty() { + let g = self.buffer.lock().unwrap(); + if !g.pages.is_empty() { return true; } if g.is_send_stopped { @@ -278,13 +368,13 @@ where S: DataBlockSpill } pub fn is_close(&self) -> bool { - let guard = self.inner.lock().unwrap(); - guard.is_close() + self.buffer.lock().unwrap().is_close() } pub fn stop_send(&self) { { - let mut guard = self.inner.lock().unwrap(); + let mut guard = self.buffer.lock().unwrap(); + guard.flush_current_page(); guard.stop_send() } self.notify_on_sent.notify_one(); @@ -292,7 +382,7 @@ where S: DataBlockSpill pub fn stop_recv(&self) { { - let mut guard = self.inner.lock().unwrap(); + let mut guard = self.buffer.lock().unwrap(); guard.stop_recv() } self.notify_on_recv.notify_one(); @@ -324,53 +414,41 @@ where S: DataBlockSpill #[async_backtrace::framed] pub async fn collect_new_page( &mut self, - max_rows_per_page: usize, + _max_rows_per_page: usize, // Now ignored since pages are pre-built tp: &Wait, ) -> Result<(BlocksSerializer, bool)> { - let mut builder = PageBuilder::new(max_rows_per_page); - - while builder.has_capacity() { - match tp { - Wait::Async => { - if !self.chan.try_take_block(&mut builder).await? { - break; + let page = match tp { + Wait::Async => self.chan.try_take_page().await?, + Wait::Deadline(t) => { + let d = match t.checked_duration_since(Instant::now()) { + Some(d) if !d.is_zero() => d, + _ => { + // timeout() will return Ok if the future completes immediately + return Ok((BlocksSerializer::empty(), self.chan.is_close())); } - } - Wait::Deadline(t) => { - let d = match t.checked_duration_since(Instant::now()) { - Some(d) if !d.is_zero() => d, - _ => { - // timeout() will return Ok if the future completes immediately - break; - } - }; - match tokio::time::timeout(d, self.chan.recv()).await { - Ok(true) => { - self.chan.try_take_block(&mut builder).await?; - debug!("[HTTP-QUERY] Appended new data block"); - } - Ok(false) => { - info!("[HTTP-QUERY] Reached end of data blocks"); - break; - } - Err(_) => { - debug!("[HTTP-QUERY] Long polling timeout reached"); - break; - } + }; + match tokio::time::timeout(d, self.chan.recv()).await { + Ok(true) => self.chan.try_take_page().await?, + Ok(false) => { + info!("[HTTP-QUERY] Reached end of data blocks"); + return Ok((BlocksSerializer::empty(), true)); + } + Err(_) => { + debug!("[HTTP-QUERY] Long polling timeout reached"); + return Ok((BlocksSerializer::empty(), self.chan.is_close())); } } } - } + }; - let format = self.chan.format_settings.lock().unwrap().clone(); - let format = format.unwrap_or_else(|| { - assert!(builder.collector.num_rows() == 0); - FormatSettings::default() - }); + let page = match page { + Some(page) => page, + None => BlocksSerializer::empty(), + }; // try to report 'no more data' earlier to client to avoid unnecessary http call let block_end = self.chan.is_close(); - Ok((builder.into_serializer(format), block_end)) + Ok((page, block_end)) } } @@ -427,14 +505,17 @@ struct SpillableBlock { } impl SpillableBlock { - fn new(data: DataBlock) -> Self { - Self { - location: None, + async fn spill_page(page: Vec, spiller: &impl DataBlockSpill) -> Result { + let rows = page.iter().map(DataBlock::num_rows).sum(); + let memory_size = page.iter().map(DataBlock::memory_size).sum(); + let location = spiller.merge_and_spill(page).await?; + Ok(Self { + location: Some(location), processed: 0, - rows: data.num_rows(), - memory_size: data.memory_size(), - data: Some(data), - } + rows, + memory_size, + data: None, + }) } fn slice(&mut self, pos: usize) -> DataBlock { @@ -535,7 +616,7 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_sized_spsc_channel() { - let (mut sender, mut receiver) = sized_spsc::(2); + let (mut sender, mut receiver) = sized_spsc::(2, 10); let test_data = vec![ DataBlock::new_from_columns(vec![Int32Type::from_data(vec![1, 2, 3])]), From 35ee1532cf63d3ffff822470b760658e1b02d4f0 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 9 Sep 2025 10:48:59 +0800 Subject: [PATCH 20/33] x --- .../servers/http/v1/query/execute_state.rs | 12 +- .../src/servers/http/v1/query/http_query.rs | 2 +- .../src/servers/http/v1/query/page_manager.rs | 5 +- .../src/servers/http/v1/query/sized_spsc.rs | 287 +++++++----------- 4 files changed, 113 insertions(+), 193 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index d466d64f5c686..c718a13cf50a0 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -410,11 +410,11 @@ impl ExecuteState { match CatchUnwindFuture::create(res).await { Ok(Err(err)) => { Executor::stop(&executor_clone, Err(err.clone())); - block_sender_closer.close(); + block_sender_closer.abort(); } Err(e) => { Executor::stop(&executor_clone, Err(e)); - block_sender_closer.close(); + block_sender_closer.abort(); } _ => {} } @@ -441,11 +441,11 @@ impl ExecuteState { let block = DataBlock::empty_with_schema(schema); let _ = block_sender.send(block).await; Executor::stop::<()>(&executor, Ok(())); - block_sender.close(); + block_sender.finish(); } Some(Err(err)) => { Executor::stop(&executor, Err(err)); - block_sender.close(); + block_sender.finish(); } Some(Ok(block)) => { Self::send_data_block(&mut block_sender, &executor, block) @@ -459,13 +459,13 @@ impl ExecuteState { .with_context(make_error)?; } Err(err) => { - block_sender.close(); + block_sender.finish(); return Err(err.with_context(make_error())); } }; } Executor::stop::<()>(&executor, Ok(())); - block_sender.close(); + block_sender.finish(); } } Ok(()) diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index d0432b7b18840..6f5dcb393fb51 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -838,7 +838,7 @@ impl HttpQuery { e ); Executor::start_to_stop(&query_state, ExecuteState::Stopped(Box::new(state))); - block_sender_closer.close(); + block_sender_closer.abort(); } } .in_span(span), diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index 476f245e9db64..2bda3ef1f95b3 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -70,10 +70,7 @@ impl PageManager { let next_no = self.total_pages; if page_no == next_no { if !self.end { - let (serializer, end) = self - .block_receiver - .collect_new_page(self.max_rows_per_page, tp) - .await?; + let (serializer, end) = self.block_receiver.collect_new_page(tp).await?; let num_row = serializer.num_rows(); log::debug!(num_row, wait_type:? = tp; "collect_new_page"); self.total_rows += num_row; diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 8787aa0b9eb98..a03e8ce94bd1a 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -14,9 +14,7 @@ use std::cmp::min; use std::collections::VecDeque; -use std::fmt; use std::fmt::Debug; -use std::fmt::Formatter; use std::result; use std::sync::Arc; use std::sync::Mutex; @@ -70,7 +68,7 @@ impl SizedChannelBuffer { max_size, page_size, pages: Default::default(), - current_page: None, + current_page: Some(PageBuilder::new(page_size)), is_recv_stopped: false, is_send_stopped: false, } @@ -92,18 +90,24 @@ impl SizedChannelBuffer { } loop { - let page_builder = self - .current_page - .get_or_insert_with(|| PageBuilder::new(self.page_size)); + let page_builder = self.current_page.as_mut().unwrap(); let remain = page_builder.try_append_block(block); if !page_builder.has_capacity() { let rows = page_builder.num_rows(); - let page = self.current_page.take().unwrap().into_page(); + if self.pages_rows() + rows > self.max_size { - return Err(SendFail::Full { page, remain }); + return Err(SendFail::Full { + page: self.current_page.take().unwrap().into_page(), + remain, + }); } - self.pages.push_back(Page::Memory(page)); + self.pages.push_back(Page::Memory( + self.current_page + .replace(PageBuilder::new(self.page_size)) + .unwrap() + .into_page(), + )); } match remain { Some(remain) => block = remain, @@ -135,15 +139,6 @@ impl SizedChannelBuffer { Ok(()) } - fn is_close(&self) -> bool { - self.current_page - .as_ref() - .map(PageBuilder::is_empty) - .unwrap_or(true) - && self.pages.is_empty() - && self.is_send_stopped - } - fn stop_send(&mut self) { self.is_send_stopped = true } @@ -152,15 +147,6 @@ impl SizedChannelBuffer { self.is_recv_stopped = true } - fn flush_current_page(&mut self) { - if let Some(page_builder) = self.current_page.take() { - let blocks = page_builder.into_page(); - if !blocks.is_empty() { - self.pages.push_back(Page::Memory(blocks)); - } - } - } - fn take_page(&mut self) -> Option { self.pages.pop_front() } @@ -275,70 +261,6 @@ where S: DataBlockSpill } } - #[fastrace::trace(name = "SizedChannel::try_take_page")] - async fn try_take_page(&self) -> Result> { - let format_settings = self - .format_settings - .lock() - .unwrap() - .as_ref() - .cloned() - .unwrap_or_default(); - let page = self.buffer.lock().unwrap().take_page(); - match page { - Some(Page::Memory(blocks)) => { - let mut collector = BlocksCollector::new(); - for block in blocks { - collector.append_block(block); - } - Ok(Some(collector.into_serializer(format_settings))) - } - Some(Page::Spilled(_)) => { - // TODO: Implement restoration from spilled page - unimplemented!("Restoration from spilled page not implemented yet") - } - None => Ok(None), - } - } - - #[async_backtrace::framed] - async fn send(&self, mut block: DataBlock) -> Result { - loop { - let result = self.buffer.lock().unwrap().try_add_block(block); - match result { - Ok(_) => { - self.notify_on_sent.notify_one(); - return Ok(true); - } - Err(SendFail::Closed) => { - self.notify_on_sent.notify_one(); - return Ok(false); - } - Err(SendFail::Full { page, remain }) => { - let mut to_add = self.try_spill_page(page).await?; - loop { - let result = self.buffer.lock().unwrap().try_add_page(to_add); - match result { - Ok(_) => break, - Err(SendFail::Closed) => return Ok(false), - Err(SendFail::Full { page, .. }) => { - to_add = Page::Memory(page); - self.notify_on_recv.notified().await; - } - } - } - - match remain { - Some(remain) => { - block = remain; - } - None => return Ok(true), - } - } - } - } - } - async fn try_spill_page(&self, page: Vec) -> Result { if !self.should_spill() { return Ok(Page::Memory(page)); @@ -355,11 +277,11 @@ where S: DataBlockSpill async fn recv(&self) -> bool { loop { { - let g = self.buffer.lock().unwrap(); - if !g.pages.is_empty() { + let buffer = self.buffer.lock().unwrap(); + if !buffer.pages.is_empty() { return true; } - if g.is_send_stopped { + if buffer.is_send_stopped { return false; } } @@ -367,24 +289,13 @@ where S: DataBlockSpill } } - pub fn is_close(&self) -> bool { - self.buffer.lock().unwrap().is_close() - } - - pub fn stop_send(&self) { - { - let mut guard = self.buffer.lock().unwrap(); - guard.flush_current_page(); - guard.stop_send() - } - self.notify_on_sent.notify_one(); + fn is_close(&self) -> bool { + let buffer = self.buffer.lock().unwrap(); + buffer.is_send_stopped && buffer.pages.is_empty() } - pub fn stop_recv(&self) { - { - let mut guard = self.buffer.lock().unwrap(); - guard.stop_recv() - } + fn stop_recv(&self) { + self.buffer.lock().unwrap().stop_recv(); self.notify_on_recv.notify_one(); } @@ -408,17 +319,12 @@ where S: DataBlockSpill { pub fn close(&self) { self.chan.stop_recv(); - self.chan.spiller.lock().unwrap().take(); // release session } #[async_backtrace::framed] - pub async fn collect_new_page( - &mut self, - _max_rows_per_page: usize, // Now ignored since pages are pre-built - tp: &Wait, - ) -> Result<(BlocksSerializer, bool)> { + pub async fn collect_new_page(&mut self, tp: &Wait) -> Result<(BlocksSerializer, bool)> { let page = match tp { - Wait::Async => self.chan.try_take_page().await?, + Wait::Async => self.try_take_page().await?, Wait::Deadline(t) => { let d = match t.checked_duration_since(Instant::now()) { Some(d) if !d.is_zero() => d, @@ -428,7 +334,7 @@ where S: DataBlockSpill } }; match tokio::time::timeout(d, self.chan.recv()).await { - Ok(true) => self.chan.try_take_page().await?, + Ok(true) => self.try_take_page().await?, Ok(false) => { info!("[HTTP-QUERY] Reached end of data blocks"); return Ok((BlocksSerializer::empty(), true)); @@ -450,6 +356,37 @@ where S: DataBlockSpill let block_end = self.chan.is_close(); Ok((page, block_end)) } + + #[fastrace::trace(name = "SizedChannelReceiver::try_take_page")] + async fn try_take_page(&mut self) -> Result> { + let page = self.chan.buffer.lock().unwrap().take_page(); + let collector = match page { + None => return Ok(None), + Some(Page::Memory(blocks)) => { + let mut collector = BlocksCollector::new(); + for block in blocks { + collector.append_block(block); + } + collector + } + Some(Page::Spilled(block)) => { + let spiller = self.chan.spiller.lock().unwrap().clone().unwrap(); + let block = spiller.restore(&block.location).await?; + let mut collector = BlocksCollector::new(); + collector.append_block(block); + collector + } + }; + + let format_settings = self + .chan + .format_settings + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + Ok(Some(collector.into_serializer(format_settings))) + } } #[derive(Clone)] @@ -461,12 +398,51 @@ impl SizedChannelSender where S: DataBlockSpill { #[async_backtrace::framed] - pub async fn send(&self, value: DataBlock) -> Result { - self.chan.send(value).await + pub async fn send(&mut self, mut block: DataBlock) -> Result { + loop { + let result = self.chan.buffer.lock().unwrap().try_add_block(block); + match result { + Ok(_) => { + self.chan.notify_on_sent.notify_one(); + return Ok(true); + } + Err(SendFail::Closed) => { + self.chan.notify_on_sent.notify_one(); + return Ok(false); + } + Err(SendFail::Full { page, remain }) => { + let mut to_add = self.chan.try_spill_page(page).await?; + loop { + let result = self.chan.buffer.lock().unwrap().try_add_page(to_add); + match result { + Ok(_) => break, + Err(SendFail::Closed) => return Ok(false), + Err(SendFail::Full { page, .. }) => { + to_add = Page::Memory(page); + self.chan.notify_on_recv.notified().await; + } + } + } + + match remain { + Some(remain) => { + block = remain; + } + None => return Ok(true), + } + } + } + } } - pub fn close(&self) { - self.chan.stop_send() + pub fn finish(&mut self) { + { + let mut buffer = self.chan.buffer.lock().unwrap(); + let builder = buffer.current_page.take().unwrap(); + buffer.pages.push_back(Page::Memory(builder.into_page())); + buffer.stop_send(); + } + self.chan.notify_on_sent.notify_one(); } pub fn closer(&self) -> SizedChannelSenderCloser { @@ -490,73 +466,20 @@ pub struct SizedChannelSenderCloser { impl SizedChannelSenderCloser where S: DataBlockSpill { - pub fn close(self) { - self.chan.stop_send() + pub fn abort(self) { + self.chan.buffer.lock().unwrap().stop_send(); + self.chan.notify_on_sent.notify_one(); } } struct SpillableBlock { - data: Option, - /// [SpillableBlock::slice] does not reallocate memory, so memorysize remains unchanged - memory_size: usize, - rows: usize, - location: Option, - processed: usize, + location: Location, } impl SpillableBlock { async fn spill_page(page: Vec, spiller: &impl DataBlockSpill) -> Result { - let rows = page.iter().map(DataBlock::num_rows).sum(); - let memory_size = page.iter().map(DataBlock::memory_size).sum(); let location = spiller.merge_and_spill(page).await?; - Ok(Self { - location: Some(location), - processed: 0, - rows, - memory_size, - data: None, - }) - } - - fn slice(&mut self, pos: usize) -> DataBlock { - let data = self.data.as_ref().unwrap(); - - let left = data.slice(0..pos); - let right = data.slice(pos..data.num_rows()); - - self.rows = right.num_rows(); - self.data = Some(right); - if self.location.is_some() { - self.processed += pos; - } - left - } - - async fn spill(&mut self, spiller: &S) -> Result<()> - where S: DataBlockSpill { - let data = self.data.take().unwrap(); - if self.location.is_none() { - let location = spiller.spill(data).await?; - self.location = Some(location); - } - Ok(()) - } - - fn restore(&mut self, location: &Location, data: DataBlock) { - assert_eq!(self.location.as_ref(), Some(location)); - self.data = Some(data) - } -} - -impl Debug for SpillableBlock { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - f.debug_struct("SpillableBlock") - .field("has_data", &self.data.is_some()) - .field("memory_size", &self.memory_size) - .field("rows", &self.rows) - .field("location", &self.location) - .field("processed", &self.processed) - .finish() + Ok(Self { location }) } } @@ -633,12 +556,12 @@ mod tests { for block in sender_data { sender.send(block).await.unwrap(); } - sender.close(); + sender.finish(); }); let mut received_blocks = Vec::new(); loop { - let (serializer, is_end) = receiver.collect_new_page(10, &Wait::Async).await.unwrap(); + let (serializer, is_end) = receiver.collect_new_page(&Wait::Async).await.unwrap(); if serializer.num_rows() > 0 { received_blocks.push(serializer.num_rows()); From c0abd0487fb9127949de02f840ddc2f16dd8a168 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 9 Sep 2025 14:14:55 +0800 Subject: [PATCH 21/33] x --- .../servers/http/v1/query/execute_state.rs | 26 +++++----- .../src/servers/http/v1/query/http_query.rs | 21 +++----- .../src/servers/http/v1/query/page_manager.rs | 20 ++++---- .../src/servers/http/v1/query/sized_spsc.rs | 49 +++++++++---------- src/query/service/src/spillers/adapter.rs | 16 ++++++ 5 files changed, 68 insertions(+), 64 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/execute_state.rs b/src/query/service/src/servers/http/v1/query/execute_state.rs index c718a13cf50a0..3673a6028c559 100644 --- a/src/query/service/src/servers/http/v1/query/execute_state.rs +++ b/src/query/service/src/servers/http/v1/query/execute_state.rs @@ -121,7 +121,7 @@ impl ExecuteState { pub struct ExecuteStarting { pub(crate) ctx: Arc, - pub(crate) sender: Sender, + pub(crate) sender: Option, } pub struct ExecuteRunning { @@ -427,7 +427,7 @@ impl ExecuteState { interpreter: Arc, schema: DataSchemaRef, ctx: Arc, - mut block_sender: Sender, + mut sender: Sender, executor: Arc>, ) -> Result<(), ExecutionError> { let make_error = || format!("failed to execute {}", interpreter.name()); @@ -438,34 +438,35 @@ impl ExecuteState { .with_context(make_error)?; match data_stream.next().await { None => { - let block = DataBlock::empty_with_schema(schema); - let _ = block_sender.send(block).await; + Self::send_data_block(&mut sender, &executor, DataBlock::empty_with_schema(schema)) + .await + .with_context(make_error)?; Executor::stop::<()>(&executor, Ok(())); - block_sender.finish(); + sender.finish(); } Some(Err(err)) => { Executor::stop(&executor, Err(err)); - block_sender.finish(); + sender.abort(); } Some(Ok(block)) => { - Self::send_data_block(&mut block_sender, &executor, block) + Self::send_data_block(&mut sender, &executor, block) .await .with_context(make_error)?; while let Some(block) = data_stream.next().await { match block { Ok(block) => { - Self::send_data_block(&mut block_sender, &executor, block) + Self::send_data_block(&mut sender, &executor, block) .await .with_context(make_error)?; } Err(err) => { - block_sender.finish(); + sender.abort(); return Err(err.with_context(make_error())); } }; } Executor::stop::<()>(&executor, Ok(())); - block_sender.finish(); + sender.finish(); } } Ok(()) @@ -475,11 +476,12 @@ impl ExecuteState { sender: &mut Sender, executor: &Arc>, block: DataBlock, - ) -> Result<()> { + ) -> Result { match sender.send(block).await { - Ok(_) => Ok(()), + Ok(ok) => Ok(ok), Err(err) => { Executor::stop(executor, Err(err.clone())); + sender.abort(); Err(err) } } diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 6f5dcb393fb51..5fab50c44e3c7 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -621,17 +621,14 @@ impl HttpQuery { query_id: query_id.clone(), state: ExecuteState::Starting(ExecuteStarting { ctx: ctx.clone(), - sender, + sender: Some(sender), }), })); let settings = session.get_settings(); let result_timeout_secs = settings.get_http_handler_result_timeout_secs()?; - let page_manager = Arc::new(TokioMutex::new(PageManager::new( - req.pagination.max_rows_per_page, - receiver, - ))); + let page_manager = Arc::new(TokioMutex::new(PageManager::new(receiver))); Ok(HttpQuery { id: query_id, @@ -789,14 +786,14 @@ impl HttpQuery { pub async fn start_query(&mut self, sql: String) -> Result<()> { let span = fastrace::Span::enter_with_local_parent("HttpQuery::start_query"); let (block_sender, query_context) = { - let state = self.executor.lock(); - let ExecuteState::Starting(state) = &state.state else { + let state = &mut self.executor.lock().state; + let ExecuteState::Starting(state) = state else { return Err(ErrorCode::Internal( "[HTTP-QUERY] Invalid query state: expected Starting state", )); }; - (state.sender.clone(), state.ctx.clone()) + (state.sender.take().unwrap(), state.ctx.clone()) }; let query_session = query_context.get_current_session(); @@ -851,17 +848,11 @@ impl HttpQuery { #[async_backtrace::framed] pub async fn kill(&self, reason: ErrorCode) { // the query will be removed from the query manager before the session is dropped. - self.detach().await; + self.page_manager.lock().await.detach().await; Executor::stop(&self.executor, Err(reason)); } - #[async_backtrace::framed] - async fn detach(&self) { - let mut data = self.page_manager.lock().await; - data.detach().await - } - #[async_backtrace::framed] pub async fn update_expire_time(&self, before_wait: bool) { let duration = Duration::from_secs(self.result_timeout_secs) diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index 2bda3ef1f95b3..dd472d5fc7c78 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -33,26 +33,21 @@ pub struct ResponseData { } pub struct PageManager { - max_rows_per_page: usize, total_rows: usize, total_pages: usize, end: bool, last_page: Option, - block_receiver: SizedChannelReceiver, + receiver: SizedChannelReceiver, } impl PageManager { - pub fn new( - max_rows_per_page: usize, - block_receiver: SizedChannelReceiver, - ) -> PageManager { + pub fn new(receiver: SizedChannelReceiver) -> PageManager { PageManager { total_rows: 0, last_page: None, total_pages: 0, end: false, - block_receiver, - max_rows_per_page, + receiver, } } @@ -70,7 +65,7 @@ impl PageManager { let next_no = self.total_pages; if page_no == next_no { if !self.end { - let (serializer, end) = self.block_receiver.collect_new_page(tp).await?; + let (serializer, end) = self.receiver.collect_new_page(tp).await?; let num_row = serializer.num_rows(); log::debug!(num_row, wait_type:? = tp; "collect_new_page"); self.total_rows += num_row; @@ -113,7 +108,12 @@ impl PageManager { #[async_backtrace::framed] pub async fn detach(&mut self) { - self.block_receiver.close(); self.last_page = None; + if let Some(spiller) = self.receiver.close() { + let _ = spiller.cleanup().await; + }; } + + #[async_backtrace::framed] + pub async fn cleanup(&mut self) {} } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index a03e8ce94bd1a..468c19809b571 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -50,7 +50,7 @@ where enum Page { Memory(Vec), - Spilled(SpillableBlock), + Spilled(Location), } struct SizedChannelBuffer { @@ -179,10 +179,6 @@ impl PageBuilder { self.remain_rows > 0 && self.remain_size > 0 } - fn is_empty(&self) -> bool { - self.blocks.is_empty() - } - fn num_rows(&self) -> usize { self.blocks.iter().map(DataBlock::num_rows).sum() } @@ -269,8 +265,8 @@ where S: DataBlockSpill let Some(spiller) = spiller.as_ref() else { return Ok(Page::Memory(page)); }; - let spilled = SpillableBlock::spill_page(page, spiller).await?; - Ok(Page::Spilled(spilled)) + let location = spiller.merge_and_spill(page).await?; + Ok(Page::Spilled(location)) } #[async_backtrace::framed] @@ -317,8 +313,14 @@ pub struct SizedChannelReceiver { impl SizedChannelReceiver where S: DataBlockSpill { - pub fn close(&self) { + pub fn close(&mut self) -> Option { self.chan.stop_recv(); + { + let mut buffer = self.chan.buffer.lock().unwrap(); + buffer.current_page = None; + buffer.pages.clear(); + } + self.chan.spiller.lock().unwrap().take() } #[async_backtrace::framed] @@ -362,16 +364,16 @@ where S: DataBlockSpill let page = self.chan.buffer.lock().unwrap().take_page(); let collector = match page { None => return Ok(None), - Some(Page::Memory(blocks)) => { + Some(Page::Memory(page)) => { let mut collector = BlocksCollector::new(); - for block in blocks { + for block in page { collector.append_block(block); } collector } - Some(Page::Spilled(block)) => { + Some(Page::Spilled(location)) => { let spiller = self.chan.spiller.lock().unwrap().clone().unwrap(); - let block = spiller.restore(&block.location).await?; + let block = spiller.restore(&location).await?; let mut collector = BlocksCollector::new(); collector.append_block(block); collector @@ -389,7 +391,6 @@ where S: DataBlockSpill } } -#[derive(Clone)] pub struct SizedChannelSender { chan: Arc>, } @@ -435,7 +436,12 @@ where S: DataBlockSpill } } - pub fn finish(&mut self) { + pub fn abort(&mut self) { + self.chan.buffer.lock().unwrap().stop_send(); + self.chan.notify_on_sent.notify_one(); + } + + pub fn finish(self) { { let mut buffer = self.chan.buffer.lock().unwrap(); let builder = buffer.current_page.take().unwrap(); @@ -472,17 +478,6 @@ where S: DataBlockSpill } } -struct SpillableBlock { - location: Location, -} - -impl SpillableBlock { - async fn spill_page(page: Vec, spiller: &impl DataBlockSpill) -> Result { - let location = spiller.merge_and_spill(page).await?; - Ok(Self { location }) - } -} - #[cfg(test)] mod tests { use std::collections::HashMap; @@ -519,8 +514,8 @@ mod tests { Ok(Location::Remote(key)) } - async fn merge_and_spill(&self, _data_block: Vec) -> Result { - unimplemented!() + async fn merge_and_spill(&self, data_block: Vec) -> Result { + self.spill(DataBlock::concat(&data_block)?).await } async fn restore(&self, location: &Location) -> Result { diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index c0910b6082a92..13ac4d7755519 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; use std::collections::HashSet; +use std::ops::DerefMut; use std::ops::Range; use std::sync::Arc; use std::sync::RwLock; @@ -396,6 +397,21 @@ impl LiteSpiller { config, )?))) } + + pub async fn cleanup(self) -> Result<()> { + let files = std::mem::take(self.0.adapter.files.write().unwrap().deref_mut()); + let files: Vec<_> = files + .into_keys() + .filter_map(|location| match location { + Location::Remote(path) => Some(path), + Location::Local(_) => None, + }) + .collect(); + let op = self.0.local_operator.as_ref().unwrap_or(&self.0.operator); + + op.delete_iter(files).await?; + Ok(()) + } } #[async_trait::async_trait] From a25beca6e0e864d94de7c35d6ad86500b462d184 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 9 Sep 2025 14:54:24 +0800 Subject: [PATCH 22/33] fix --- .../src/servers/http/v1/query/http_query.rs | 12 +----- .../src/servers/http/v1/query/page_manager.rs | 36 +++++++++++------- .../src/servers/http/v1/query/sized_spsc.rs | 38 ++++++++----------- 3 files changed, 40 insertions(+), 46 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 5fab50c44e3c7..24d74c338c2c1 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -60,7 +60,6 @@ use crate::servers::http::v1::query::execute_state::ExecuteStarting; use crate::servers::http::v1::query::execute_state::ExecuteStopped; use crate::servers::http::v1::query::execute_state::ExecutorSessionState; use crate::servers::http::v1::query::execute_state::Progresses; -use crate::servers::http::v1::query::sized_spsc::sized_spsc; use crate::servers::http::v1::query::ExecuteState; use crate::servers::http::v1::query::ExecuteStateKind; use crate::servers::http::v1::query::Executor; @@ -74,7 +73,6 @@ use crate::servers::http::v1::QueryStats; use crate::sessions::QueryAffect; use crate::sessions::Session; use crate::sessions::TableContext; -use crate::spillers::LiteSpiller; fn default_as_true() -> bool { true @@ -612,11 +610,7 @@ impl HttpQuery { }) }; - let (sender, receiver) = sized_spsc::( - req.pagination.max_rows_in_buffer, - req.pagination.max_rows_per_page, - ); - + let (page_manager, sender) = PageManager::create(&req.pagination); let executor = Arc::new(Mutex::new(Executor { query_id: query_id.clone(), state: ExecuteState::Starting(ExecuteStarting { @@ -628,8 +622,6 @@ impl HttpQuery { let settings = session.get_settings(); let result_timeout_secs = settings.get_http_handler_result_timeout_secs()?; - let page_manager = Arc::new(TokioMutex::new(PageManager::new(receiver))); - Ok(HttpQuery { id: query_id, user_name: http_ctx.user_name.clone(), @@ -637,7 +629,7 @@ impl HttpQuery { node_id, request: req, executor, - page_manager, + page_manager: Arc::new(TokioMutex::new(page_manager)), result_timeout_secs, state: Arc::new(Mutex::new(HttpQueryState::Working)), diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index dd472d5fc7c78..aa2a44e33eb2c 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -18,8 +18,11 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use super::blocks_serializer::BlocksSerializer; -use crate::servers::http::v1::query::sized_spsc::SizedChannelReceiver; -use crate::servers::http::v1::query::sized_spsc::Wait; +use super::http_query::PaginationConf; +use super::sized_spsc::sized_spsc; +use super::sized_spsc::SizedChannelReceiver; +use super::sized_spsc::SizedChannelSender; +use super::Wait; use crate::spillers::LiteSpiller; #[derive(Clone)] @@ -41,14 +44,20 @@ pub struct PageManager { } impl PageManager { - pub fn new(receiver: SizedChannelReceiver) -> PageManager { - PageManager { - total_rows: 0, - last_page: None, - total_pages: 0, - end: false, - receiver, - } + pub fn create(conf: &PaginationConf) -> (PageManager, SizedChannelSender) { + let (sender, receiver) = + sized_spsc::(conf.max_rows_in_buffer, conf.max_rows_per_page); + + ( + PageManager { + total_rows: 0, + last_page: None, + total_pages: 0, + end: false, + receiver, + }, + sender, + ) } pub fn next_page_no(&mut self) -> Option { @@ -110,10 +119,9 @@ impl PageManager { pub async fn detach(&mut self) { self.last_page = None; if let Some(spiller) = self.receiver.close() { - let _ = spiller.cleanup().await; + if let Err(error) = spiller.cleanup().await { + log::error!(error:?; "clean up spilled result set file fail"); + } }; } - - #[async_backtrace::framed] - pub async fn cleanup(&mut self) {} } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 468c19809b571..3807875391f6f 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -117,12 +117,16 @@ impl SizedChannelBuffer { } fn try_add_page(&mut self, page: Page) -> result::Result<(), SendFail> { - assert!(self.current_page.is_none()); - if self.is_recv_stopped || self.is_send_stopped { return Err(SendFail::Closed); } + let is_none = self + .current_page + .replace(PageBuilder::new(self.page_size)) + .is_none(); + assert!(is_none); + match page { Page::Memory(blocks) => { let rows = blocks.iter().map(DataBlock::num_rows).sum::(); @@ -216,10 +220,10 @@ impl PageBuilder { } fn try_append_block(&mut self, block: DataBlock) -> Option { + assert!(self.has_capacity()); let take_rows = self.calculate_take_rows(&block); let total = block.num_rows(); if take_rows < block.num_rows() { - self.remain_rows = 0; self.append_full_block(block.slice(0..take_rows)); Some(block.slice(take_rows..total)) } else { @@ -493,19 +497,11 @@ mod tests { use super::*; - #[derive(Clone)] + #[derive(Clone, Default)] struct MockSpiller { storage: Arc>>, } - impl MockSpiller { - fn new() -> Self { - Self { - storage: Arc::new(Mutex::new(HashMap::new())), - } - } - } - #[async_trait::async_trait] impl DataBlockSpill for MockSpiller { async fn spill(&self, data_block: DataBlock) -> Result { @@ -534,7 +530,7 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_sized_spsc_channel() { - let (mut sender, mut receiver) = sized_spsc::(2, 10); + let (mut sender, mut receiver) = sized_spsc::(5, 4); let test_data = vec![ DataBlock::new_from_columns(vec![Int32Type::from_data(vec![1, 2, 3])]), @@ -545,7 +541,7 @@ mod tests { let sender_data = test_data.clone(); let send_task = databend_common_base::runtime::spawn(async move { let format_settings = FormatSettings::default(); - let spiller = MockSpiller::new(); + let spiller = MockSpiller::default(); sender.plan_ready(format_settings, Some(spiller)); for block in sender_data { @@ -554,12 +550,12 @@ mod tests { sender.finish(); }); - let mut received_blocks = Vec::new(); + let mut received_blocks_size = Vec::new(); loop { let (serializer, is_end) = receiver.collect_new_page(&Wait::Async).await.unwrap(); if serializer.num_rows() > 0 { - received_blocks.push(serializer.num_rows()); + received_blocks_size.push(serializer.num_rows()); } if is_end { @@ -567,12 +563,10 @@ mod tests { } } - send_task.await.unwrap(); - - let expected_total_rows: usize = test_data.iter().map(|b| b.num_rows()).sum(); - let received_total_rows: usize = received_blocks.iter().sum(); + let storage = receiver.close().unwrap().storage; + assert_eq!(storage.lock().unwrap().len(), 1); - assert_eq!(expected_total_rows, received_total_rows); - assert!(!received_blocks.is_empty()); + send_task.await.unwrap(); + assert_eq!(received_blocks_size, vec![4, 4, 1]); } } From c38d7853214dec238e5d3a866771ae33589e1d41 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 9 Sep 2025 17:29:56 +0800 Subject: [PATCH 23/33] test --- Cargo.lock | 1 + src/query/service/Cargo.toml | 1 + .../src/servers/http/v1/query/page_manager.rs | 6 +- .../src/servers/http/v1/query/sized_spsc.rs | 145 ++++++++++++++---- .../it/servers/http/http_query_handlers.rs | 8 +- 5 files changed, 124 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b9b462ca9cf0c..eb94c5977cf41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5285,6 +5285,7 @@ dependencies = [ "poem", "pretty_assertions", "prometheus-client 0.22.3", + "proptest", "prost", "rand 0.8.5", "recursive", diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 8a5715357df1e..f9dfe289ee43f 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -197,6 +197,7 @@ tempfile = { workspace = true } tower = { workspace = true } url = { workspace = true } wiremock = { workspace = true } +proptest = { workspace = true } [build-dependencies] databend-common-building = { workspace = true } diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index aa2a44e33eb2c..e0efac301dd83 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -70,13 +70,13 @@ impl PageManager { #[async_backtrace::framed] #[fastrace::trace(name = "PageManager::get_a_page")] - pub async fn get_a_page(&mut self, page_no: usize, tp: &Wait) -> Result { + pub async fn get_a_page(&mut self, page_no: usize, wait: &Wait) -> Result { let next_no = self.total_pages; if page_no == next_no { if !self.end { - let (serializer, end) = self.receiver.collect_new_page(tp).await?; + let (serializer, end) = self.receiver.next_page(wait).await?; let num_row = serializer.num_rows(); - log::debug!(num_row, wait_type:? = tp; "collect_new_page"); + log::debug!(num_row, wait_type:? = wait; "collect_new_page"); self.total_rows += num_row; let page = Page { data: Arc::new(serializer), diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 3807875391f6f..fceed0ceeca1e 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -121,13 +121,10 @@ impl SizedChannelBuffer { return Err(SendFail::Closed); } - let is_none = self - .current_page - .replace(PageBuilder::new(self.page_size)) - .is_none(); - assert!(is_none); + assert!(self.current_page.is_none()); match page { + page @ Page::Spilled(_) => self.pages.push_back(page), Page::Memory(blocks) => { let rows = blocks.iter().map(DataBlock::num_rows).sum::(); if self.pages_rows() + rows > self.max_size { @@ -138,8 +135,9 @@ impl SizedChannelBuffer { }; self.pages.push_back(Page::Memory(blocks)) } - page @ Page::Spilled(_) => self.pages.push_back(page), }; + + self.current_page = Some(PageBuilder::new(self.page_size)); Ok(()) } @@ -187,7 +185,7 @@ impl PageBuilder { self.blocks.iter().map(DataBlock::num_rows).sum() } - fn calculate_take_rows(&self, block: &DataBlock) -> usize { + fn calculate_take_rows(&self, block: &DataBlock) -> (usize, usize) { let block_rows = block.num_rows(); let memory_size = block .columns() @@ -198,36 +196,34 @@ impl PageBuilder { }) .sum::(); - min( - self.remain_rows, - if memory_size > self.remain_size { - (self.remain_size * block_rows) / memory_size - } else { - block_rows - }, + ( + min( + self.remain_rows, + if memory_size > self.remain_size { + (self.remain_size * block_rows) / memory_size + } else { + block_rows + }, + ) + .max(1), + memory_size, ) - .max(1) - } - - fn append_full_block(&mut self, block: DataBlock) { - let memory_size = block.memory_size(); - let num_rows = block.num_rows(); - - self.remain_size -= min(self.remain_size, memory_size); - self.remain_rows -= num_rows; - - self.blocks.push(block); } fn try_append_block(&mut self, block: DataBlock) -> Option { assert!(self.has_capacity()); - let take_rows = self.calculate_take_rows(&block); + let (take_rows, memory_size) = self.calculate_take_rows(&block); let total = block.num_rows(); - if take_rows < block.num_rows() { - self.append_full_block(block.slice(0..take_rows)); + if take_rows < total { + self.remain_size = 0; + self.remain_rows -= total; + self.blocks.push(block.slice(0..take_rows)); Some(block.slice(take_rows..total)) } else { - self.append_full_block(block); + self.remain_size -= min(self.remain_size, memory_size); + self.remain_rows -= total; + + self.blocks.push(block); None } } @@ -328,7 +324,7 @@ where S: DataBlockSpill } #[async_backtrace::framed] - pub async fn collect_new_page(&mut self, tp: &Wait) -> Result<(BlocksSerializer, bool)> { + pub async fn next_page(&mut self, tp: &Wait) -> Result<(BlocksSerializer, bool)> { let page = match tp { Wait::Async => self.try_take_page().await?, Wait::Deadline(t) => { @@ -383,6 +379,7 @@ where S: DataBlockSpill collector } }; + self.chan.notify_on_recv.notify_one(); let format_settings = self .chan @@ -487,13 +484,19 @@ mod tests { use std::collections::HashMap; use std::sync::Arc; use std::sync::Mutex; + use std::time::Duration; use databend_common_exception::ErrorCode; use databend_common_expression::types::Int32Type; + use databend_common_expression::types::Number; + use databend_common_expression::types::NumberType; use databend_common_expression::FromData; use databend_common_io::prelude::FormatSettings; use databend_common_pipeline_transforms::traits::DataBlockSpill; use databend_common_pipeline_transforms::traits::Location; + use proptest::prelude::*; + use proptest::strategy::ValueTree; + use proptest::test_runner::TestRunner; use super::*; @@ -552,7 +555,7 @@ mod tests { let mut received_blocks_size = Vec::new(); loop { - let (serializer, is_end) = receiver.collect_new_page(&Wait::Async).await.unwrap(); + let (serializer, is_end) = receiver.next_page(&Wait::Async).await.unwrap(); if serializer.num_rows() > 0 { received_blocks_size.push(serializer.num_rows()); @@ -569,4 +572,84 @@ mod tests { send_task.await.unwrap(); assert_eq!(received_blocks_size, vec![4, 4, 1]); } + + fn data_block_strategy(len: usize) -> impl Strategy + where + N: Number + Arbitrary + 'static, + NumberType: FromData, + { + prop::collection::vec(any::(), len) + .prop_map(|data| DataBlock::new_from_columns(vec![NumberType::::from_data(data)])) + } + + fn data_block_vec_strategy() -> impl Strategy> { + let num_rows_strategy = 0..30_usize; + let num_blocks_strategy = 0..100_usize; + (num_blocks_strategy, num_rows_strategy).prop_flat_map(|(num_blocks, num_rows)| { + prop::collection::vec(data_block_strategy::(num_rows), num_blocks) + }) + } + + async fn delay() { + if rand::thread_rng().gen_bool(0.7) { + let delay = rand::thread_rng().gen_range(0..1000); + tokio::time::sleep(Duration::from_micros(delay)).await; + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_sized_spsc_channel_fuzz() { + let mut runner = TestRunner::default(); + for _ in 0..100 { + let (has_spiller, max_size, page_size, test_data) = ( + any::(), + 10_usize..20, + 5_usize..8, + data_block_vec_strategy(), + ) + .new_tree(&mut runner) + .unwrap() + .current(); + + let (mut sender, mut receiver) = sized_spsc::(max_size, page_size); + + let sender_data = test_data.clone(); + let send_task = databend_common_base::runtime::spawn(async move { + let format_settings = FormatSettings::default(); + sender.plan_ready(format_settings, has_spiller.then(MockSpiller::default)); + + for block in sender_data { + delay().await; + sender.send(block).await.unwrap(); + delay().await; + } + sender.finish(); + }); + + let mut received_blocks_size = Vec::new(); + loop { + delay().await; + let wait = Wait::Deadline(Instant::now() + Duration::from_millis(50)); + let (serializer, is_end) = receiver.next_page(&wait).await.unwrap(); + if !serializer.is_empty() { + received_blocks_size.push(serializer.num_rows()); + } + if is_end { + break; + } + } + + send_task.await.unwrap(); + + if has_spiller { + let _ = receiver.close().unwrap().storage; + } else { + assert!(receiver.close().is_none()); + } + + let total = test_data.iter().map(|data| data.num_rows()).sum::(); + + assert_eq!(received_blocks_size.iter().sum::(), total); + } + } } diff --git a/src/query/service/tests/it/servers/http/http_query_handlers.rs b/src/query/service/tests/it/servers/http/http_query_handlers.rs index eb5a5366e379c..b870b15a74642 100644 --- a/src/query/service/tests/it/servers/http/http_query_handlers.rs +++ b/src/query/service/tests/it/servers/http/http_query_handlers.rs @@ -1758,7 +1758,8 @@ async fn test_has_result_set() -> Result<()> { #[tokio::test(flavor = "current_thread")] async fn test_max_size_per_page() -> Result<()> { - let _fixture = TestFixture::setup().await?; + let _fixture = + TestFixture::setup_with_config(&ConfigBuilder::create().off_log().config()).await?; let sql = "select repeat('1', 1000) as a, repeat('2', 1000) from numbers(10000)"; let wait_time_secs = 5; @@ -1768,8 +1769,9 @@ async fn test_max_size_per_page() -> Result<()> { let target = (10_usize * 1024 * 1024) as f64; assert!( (0.9..1.1).contains(&(body.len() as f64 / target)), - "body len {}", - body.len() + "body len {} rows {}", + body.len(), + reply.data.len() ); Ok(()) } From 5e9e739f601b422d8000e1e9d86bffbb8c220f72 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 9 Sep 2025 17:46:55 +0800 Subject: [PATCH 24/33] disable --- src/query/settings/src/settings_default.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 4ff98e48eec3b..5205204fbe2d9 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -697,7 +697,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(0..=u64::MAX)), }), ("enable_result_set_spilling", DefaultSettingValue { - value: UserSettingValue::UInt64(1), + value: UserSettingValue::UInt64(0), desc: "Enable spilling result set data to storage when memory usage exceeds the threshold.", mode: SettingMode::Both, scope: SettingScope::Both, From 0da4d9046ae4050d402043dd3c150627f4ccd029 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 9 Sep 2025 18:00:35 +0800 Subject: [PATCH 25/33] fix --- src/query/service/Cargo.toml | 2 +- .../service/src/servers/http/v1/query/sized_spsc.rs | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index f9dfe289ee43f..66a8a9f07d023 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -189,6 +189,7 @@ maplit = { workspace = true } mysql_async = { workspace = true } p256 = { workspace = true } pretty_assertions = { workspace = true } +proptest = { workspace = true } reqwest = { workspace = true } serde_json.workspace = true serde_yaml = { workspace = true } @@ -197,7 +198,6 @@ tempfile = { workspace = true } tower = { workspace = true } url = { workspace = true } wiremock = { workspace = true } -proptest = { workspace = true } [build-dependencies] databend-common-building = { workspace = true } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index fceed0ceeca1e..f8b58c31330d4 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -213,16 +213,15 @@ impl PageBuilder { fn try_append_block(&mut self, block: DataBlock) -> Option { assert!(self.has_capacity()); let (take_rows, memory_size) = self.calculate_take_rows(&block); - let total = block.num_rows(); - if take_rows < total { + let total_rows = block.num_rows(); + if take_rows < total_rows { self.remain_size = 0; - self.remain_rows -= total; + self.remain_rows -= take_rows; self.blocks.push(block.slice(0..take_rows)); - Some(block.slice(take_rows..total)) + Some(block.slice(take_rows..total_rows)) } else { self.remain_size -= min(self.remain_size, memory_size); - self.remain_rows -= total; - + self.remain_rows -= total_rows; self.blocks.push(block); None } From 8e380641f0bb290355a8cd415c6cdee258153544 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 9 Sep 2025 20:58:39 +0800 Subject: [PATCH 26/33] fix --- scripts/test-bend-tests/main.sh | 2 +- .../src/servers/http/v1/query/sized_spsc.rs | 99 ++++++++++--------- tests/nox/noxfile.py | 6 +- .../09_http_handler/09_0007_session.py | 13 ++- 4 files changed, 67 insertions(+), 53 deletions(-) diff --git a/scripts/test-bend-tests/main.sh b/scripts/test-bend-tests/main.sh index cc3410951627e..8977897e6023a 100755 --- a/scripts/test-bend-tests/main.sh +++ b/scripts/test-bend-tests/main.sh @@ -10,4 +10,4 @@ echo "🔧 Installing databend_test_helper..." pip install -e "$PROJECT_DIR/databend_test_helper" echo "🚀 Running test cluster..." -python "$SCRIPT_DIR/test_cluster.py" \ No newline at end of file +python "$SCRIPT_DIR/test_cluster.py" diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index f8b58c31330d4..c6a3e02b6d835 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -42,7 +42,7 @@ pub fn sized_spsc( where S: DataBlockSpill, { - let chan = Arc::new(SizedChannel::create(max_size, page_size)); + let chan = Arc::new(SizedChannel::new(max_size, page_size)); let sender = SizedChannelSender { chan: chan.clone() }; let receiver = SizedChannelReceiver { chan }; (sender, receiver) @@ -54,8 +54,8 @@ enum Page { } struct SizedChannelBuffer { - max_size: usize, - page_size: usize, + max_rows: usize, + page_rows: usize, pages: VecDeque, current_page: Option, is_recv_stopped: bool, @@ -63,12 +63,12 @@ struct SizedChannelBuffer { } impl SizedChannelBuffer { - fn create(max_size: usize, page_size: usize) -> Self { + fn new(max_rows: usize, page_rows: usize) -> Self { SizedChannelBuffer { - max_size, - page_size, + max_rows: max_rows.max(page_rows), + page_rows, pages: Default::default(), - current_page: Some(PageBuilder::new(page_size)), + current_page: Some(PageBuilder::new(page_rows)), is_recv_stopped: false, is_send_stopped: false, } @@ -81,7 +81,11 @@ impl SizedChannelBuffer { Page::Memory(blocks) => blocks.iter().map(DataBlock::num_rows).sum::(), Page::Spilled(_) => 0, }) - .sum::() + .sum() + } + + fn is_pages_full(&self, reserve: usize) -> bool { + self.pages_rows() + reserve > self.max_rows } fn try_add_block(&mut self, mut block: DataBlock) -> result::Result<(), SendFail> { @@ -90,24 +94,27 @@ impl SizedChannelBuffer { } loop { - let page_builder = self.current_page.as_mut().unwrap(); + let page_builder = self.current_page.as_mut().expect("current_page has taken"); let remain = page_builder.try_append_block(block); if !page_builder.has_capacity() { let rows = page_builder.num_rows(); - - if self.pages_rows() + rows > self.max_size { + if self.is_pages_full(rows) { return Err(SendFail::Full { - page: self.current_page.take().unwrap().into_page(), + page: self + .current_page + .take() + .expect("current_page has taken") + .into_page(), remain, }); } - self.pages.push_back(Page::Memory( - self.current_page - .replace(PageBuilder::new(self.page_size)) - .unwrap() - .into_page(), - )); + let page = self + .current_page + .replace(PageBuilder::new(self.page_rows)) + .expect("current_page has taken") + .into_page(); + self.pages.push_back(Page::Memory(page)); } match remain { Some(remain) => block = remain, @@ -125,19 +132,16 @@ impl SizedChannelBuffer { match page { page @ Page::Spilled(_) => self.pages.push_back(page), - Page::Memory(blocks) => { - let rows = blocks.iter().map(DataBlock::num_rows).sum::(); - if self.pages_rows() + rows > self.max_size { - return Err(SendFail::Full { - page: blocks, - remain: None, - }); + Page::Memory(page) => { + let rows = page.iter().map(DataBlock::num_rows).sum(); + if self.is_pages_full(rows) { + return Err(SendFail::Full { page, remain: None }); }; - self.pages.push_back(Page::Memory(blocks)) + self.pages.push_back(Page::Memory(page)) } }; - self.current_page = Some(PageBuilder::new(self.page_size)); + self.current_page = Some(PageBuilder::new(self.page_rows)); Ok(()) } @@ -162,10 +166,10 @@ enum SendFail { }, } -pub struct PageBuilder { - pub blocks: Vec, - pub remain_rows: usize, - pub remain_size: usize, +struct PageBuilder { + blocks: Vec, + remain_rows: usize, + remain_size: usize, } impl PageBuilder { @@ -245,14 +249,14 @@ struct SizedChannel { impl SizedChannel where S: DataBlockSpill { - fn create(max_size: usize, page_size: usize) -> Self { + fn new(max_rows: usize, page_rows: usize) -> Self { SizedChannel { - buffer: Mutex::new(SizedChannelBuffer::create(max_size, page_size)), + buffer: Mutex::new(SizedChannelBuffer::new(max_rows, page_rows)), notify_on_sent: Default::default(), notify_on_recv: Default::default(), is_plan_ready: WatchNotify::new(), - format_settings: Mutex::new(None), - spiller: Mutex::new(None), + format_settings: Default::default(), + spiller: Default::default(), } } @@ -289,12 +293,8 @@ where S: DataBlockSpill buffer.is_send_stopped && buffer.pages.is_empty() } - fn stop_recv(&self) { - self.buffer.lock().unwrap().stop_recv(); - self.notify_on_recv.notify_one(); - } - fn should_spill(&self) -> bool { + // todo: expected to be controlled externally true } } @@ -313,13 +313,15 @@ impl SizedChannelReceiver where S: DataBlockSpill { pub fn close(&mut self) -> Option { - self.chan.stop_recv(); { let mut buffer = self.chan.buffer.lock().unwrap(); + buffer.stop_recv(); buffer.current_page = None; buffer.pages.clear(); } - self.chan.spiller.lock().unwrap().take() + let spiller = self.chan.spiller.lock().unwrap().take(); + self.chan.notify_on_recv.notify_one(); + spiller } #[async_backtrace::framed] @@ -444,8 +446,14 @@ where S: DataBlockSpill pub fn finish(self) { { let mut buffer = self.chan.buffer.lock().unwrap(); - let builder = buffer.current_page.take().unwrap(); - buffer.pages.push_back(Page::Memory(builder.into_page())); + if !buffer.is_recv_stopped && !buffer.is_send_stopped { + let page = buffer + .current_page + .take() + .expect("current_page has taken") + .into_page(); + buffer.pages.push_back(Page::Memory(page)); + } buffer.stop_send(); } self.chan.notify_on_sent.notify_one(); @@ -565,8 +573,7 @@ mod tests { } } - let storage = receiver.close().unwrap().storage; - assert_eq!(storage.lock().unwrap().len(), 1); + let _ = receiver.close().unwrap().storage; send_task.await.unwrap(); assert_eq!(received_blocks_size, vec![4, 4, 1]); diff --git a/tests/nox/noxfile.py b/tests/nox/noxfile.py index d7c654bc65288..3ed6e4066f8ed 100644 --- a/tests/nox/noxfile.py +++ b/tests/nox/noxfile.py @@ -17,9 +17,9 @@ def python_client(session, driver_version): env = { "DRIVER_VERSION": driver_version, } - session.run("behave","tests/asyncio", env=env) - session.run("behave","tests/blocking", env=env) - session.run("behave","tests/cursor", env=env) + session.run("behave", "tests/asyncio", env=env) + session.run("behave", "tests/blocking", env=env) + session.run("behave", "tests/cursor", env=env) JDBC_DRIVER = ["0.4.0", "main"] diff --git a/tests/suites/1_stateful/09_http_handler/09_0007_session.py b/tests/suites/1_stateful/09_http_handler/09_0007_session.py index a2c00a07c5bd7..878c48512e786 100755 --- a/tests/suites/1_stateful/09_http_handler/09_0007_session.py +++ b/tests/suites/1_stateful/09_http_handler/09_0007_session.py @@ -168,8 +168,12 @@ def test_no_session(): def do_query_from_worksheet(client, sql, sid=HEADER_SESSION_ID_V, new_session=False): - internal = None if new_session else "{}"; - payload = {"sql": sql, "pagination": {"max_rows_per_page": 2, "wait_time_secs": 10}, "session": {"internal": internal}} + internal = None if new_session else "{}" + payload = { + "sql": sql, + "pagination": {"max_rows_per_page": 2, "wait_time_secs": 10}, + "session": {"internal": internal}, + } resp = client.post( query_url, auth=auth, @@ -208,9 +212,12 @@ def test_worksheet_session(): resp = do_query_from_worksheet(client, "select * from t09_0007", new_session=True) assert "Unknown table" in resp["error"]["message"], resp - resp = do_query_from_worksheet(client, "select * from numbers(10) ", new_session=True) + resp = do_query_from_worksheet( + client, "select * from numbers(10) ", new_session=True + ) assert len(resp["data"]) == 2, resp + def main(): test_no_session() test_session() From 12556150ecadaa26f7497240e7cdf9a9ae7d54c2 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 11 Sep 2025 14:09:31 +0800 Subject: [PATCH 27/33] test --- .../test_09_0013_result_spill.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 tests/nox/suites/1_stateful/09_http_handler/test_09_0013_result_spill.py diff --git a/tests/nox/suites/1_stateful/09_http_handler/test_09_0013_result_spill.py b/tests/nox/suites/1_stateful/09_http_handler/test_09_0013_result_spill.py new file mode 100644 index 0000000000000..248cd186bcd7c --- /dev/null +++ b/tests/nox/suites/1_stateful/09_http_handler/test_09_0013_result_spill.py @@ -0,0 +1,101 @@ +import requests +import time + +auth = ("root", "") + + +def do_query(query, session, pagination): + url = f"http://localhost:8000/v1/query" + payload = { + "sql": query, + } + if session: + payload["session"] = session + if pagination: + payload["pagination"] = pagination + headers = { + "Content-Type": "application/json", + } + + response = requests.post(url, headers=headers, json=payload, auth=auth) + return response.json() + + +def test_disable_result_spill(): + session = { + "settings": { + "enable_result_set_spilling": "0", + "max_block_size": "10", + "max_threads": "2", + } + } + pagination = { + "wait_time_secs": 2, + "max_rows_in_buffer": 4, + "max_rows_per_page": 4, + } + resp = do_query("select * from numbers(97)", session=session, pagination=pagination) + assert resp["error"] == None + + # print(resp["state"]) + # print(resp["stats"]) + + rows = len(resp["data"]) + for _ in range(30): + if resp.get("next_uri") == None: + break + + uri = f"http://localhost:8000/{resp['next_uri']}" + resp = requests.get(uri, auth=auth).json() + cur_rows = len(resp["data"]) + assert resp["error"] == None, resp + if rows < 96: + assert cur_rows == 4 + rows += cur_rows + + # print(resp["state"]) + # print(resp["stats"]) + + # get same page again + assert requests.get(uri, auth=auth).json()["error"] == None, resp + + assert rows == 97 + + +def test_enable_result_spill(): + session = { + "settings": { + "enable_result_set_spilling": "1", + "max_block_size": "10", + "max_threads": "2", + } + } + pagination = { + "wait_time_secs": 2, + "max_rows_in_buffer": 4, + "max_rows_per_page": 4, + } + resp = do_query("select * from numbers(97)", session=session, pagination=pagination) + assert resp["error"] == None + + time.sleep(1) + + rows = len(resp["data"]) + for _ in range(30): + if resp.get("next_uri") == None: + break + + uri = f"http://localhost:8000/{resp['next_uri']}" + resp = requests.get(uri, auth=auth).json() + cur_rows = len(resp["data"]) + assert resp["error"] == None, resp + if rows < 96: + assert cur_rows == 4 + rows += cur_rows + + assert resp["state"] == "Succeeded" + + # get same page again + assert requests.get(uri, auth=auth).json()["error"] == None, resp + + assert rows == 97 From e786b5e7f1775a97e673a4c40cc561b26e12ddb4 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 12 Sep 2025 10:16:40 +0800 Subject: [PATCH 28/33] test --- .../src/servers/http/v1/query/sized_spsc.rs | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index c6a3e02b6d835..c5240d5df149f 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -539,7 +539,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread")] - async fn test_sized_spsc_channel() { + async fn test_spsc_channel() { let (mut sender, mut receiver) = sized_spsc::(5, 4); let test_data = vec![ @@ -579,6 +579,48 @@ mod tests { assert_eq!(received_blocks_size, vec![4, 4, 1]); } + #[tokio::test(flavor = "multi_thread")] + async fn test_spsc_slow() { + let (mut sender, mut receiver) = sized_spsc::(5, 4); + + let test_data = vec![DataBlock::new_from_columns(vec![Int32Type::from_data( + vec![1, 2, 3], + )])]; + + let wait = Arc::new(Notify::new()); + + let sender_wait = wait.clone(); + let sender_data = test_data.clone(); + let send_task = databend_common_base::runtime::spawn(async move { + let format_settings = FormatSettings::default(); + sender.plan_ready(format_settings, None); + + sender_wait.notified().await; + + for block in sender_data { + sender.send(block).await.unwrap(); + } + sender.finish(); + }); + + for _ in 0..10 { + let deadline = Instant::now() + Duration::from_millis(1); + let (serializer, _) = receiver.next_page(&Wait::Deadline(deadline)).await.unwrap(); + assert_eq!(serializer.num_rows(), 0); + } + + wait.notify_one(); + + let (serializer, _) = receiver + .next_page(&Wait::Deadline(Instant::now() + Duration::from_secs(1))) + .await + .unwrap(); + let _ = receiver.close(); + send_task.await.unwrap(); + + assert_eq!(serializer.num_rows(), 3); + } + fn data_block_strategy(len: usize) -> impl Strategy where N: Number + Arbitrary + 'static, @@ -604,7 +646,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread")] - async fn test_sized_spsc_channel_fuzz() { + async fn test_spsc_channel_fuzz() { let mut runner = TestRunner::default(); for _ in 0..100 { let (has_spiller, max_size, page_size, test_data) = ( From 759717116ab882198f6f92d716530b4049174af0 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 12 Sep 2025 10:30:59 +0800 Subject: [PATCH 29/33] update --- .../src/servers/http/v1/query/http_query.rs | 5 ++- .../src/servers/http/v1/query/sized_spsc.rs | 41 +++++++------------ 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 4b2851b32e1aa..d5298abe6dd5d 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -777,7 +777,6 @@ impl HttpQuery { } pub async fn start_query(&mut self, sql: String) -> Result<()> { - let span = fastrace::Span::enter_with_local_parent("HttpQuery::start_query"); let (block_sender, query_context) = { let state = &mut self.executor.lock().state; let ExecuteState::Starting(state) = state else { @@ -831,7 +830,9 @@ impl HttpQuery { block_sender_closer.abort(); } } - .in_span(span), + .in_span(fastrace::Span::enter_with_local_parent( + "HttpQuery::start_query", + )), None, )?; diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index c5240d5df149f..afe8b9bc22b60 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -329,14 +329,7 @@ where S: DataBlockSpill let page = match tp { Wait::Async => self.try_take_page().await?, Wait::Deadline(t) => { - let d = match t.checked_duration_since(Instant::now()) { - Some(d) if !d.is_zero() => d, - _ => { - // timeout() will return Ok if the future completes immediately - return Ok((BlocksSerializer::empty(), self.chan.is_close())); - } - }; - match tokio::time::timeout(d, self.chan.recv()).await { + match tokio::time::timeout_at((*t).into(), self.chan.recv()).await { Ok(true) => self.try_take_page().await?, Ok(false) => { info!("[HTTP-QUERY] Reached end of data blocks"); @@ -350,14 +343,11 @@ where S: DataBlockSpill } }; - let page = match page { - Some(page) => page, - None => BlocksSerializer::empty(), - }; - // try to report 'no more data' earlier to client to avoid unnecessary http call - let block_end = self.chan.is_close(); - Ok((page, block_end)) + Ok(( + page.unwrap_or_else(BlocksSerializer::empty), + self.chan.is_close(), + )) } #[fastrace::trace(name = "SizedChannelReceiver::try_take_page")] @@ -405,21 +395,22 @@ where S: DataBlockSpill loop { let result = self.chan.buffer.lock().unwrap().try_add_block(block); match result { + Err(SendFail::Closed) => return Ok(false), Ok(_) => { self.chan.notify_on_sent.notify_one(); return Ok(true); } - Err(SendFail::Closed) => { - self.chan.notify_on_sent.notify_one(); - return Ok(false); - } Err(SendFail::Full { page, remain }) => { + self.chan.notify_on_sent.notify_one(); let mut to_add = self.chan.try_spill_page(page).await?; loop { let result = self.chan.buffer.lock().unwrap().try_add_page(to_add); match result { - Ok(_) => break, Err(SendFail::Closed) => return Ok(false), + Ok(_) => { + self.chan.notify_on_sent.notify_one(); + break; + } Err(SendFail::Full { page, .. }) => { to_add = Page::Memory(page); self.chan.notify_on_recv.notified().await; @@ -427,12 +418,10 @@ where S: DataBlockSpill } } - match remain { - Some(remain) => { - block = remain; - } - None => return Ok(true), - } + let Some(remain) = remain else { + return Ok(true); + }; + block = remain; } } } From 8827b71a6e748f14bd55f56439c5e31b5656ba54 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 12 Sep 2025 11:12:27 +0800 Subject: [PATCH 30/33] format --- .../0_stateless/01_transaction/01_03_issue_18705.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/suites/0_stateless/01_transaction/01_03_issue_18705.py b/tests/suites/0_stateless/01_transaction/01_03_issue_18705.py index c7f8bcf08606f..1646e234cab25 100755 --- a/tests/suites/0_stateless/01_transaction/01_03_issue_18705.py +++ b/tests/suites/0_stateless/01_transaction/01_03_issue_18705.py @@ -6,7 +6,9 @@ if __name__ == "__main__": # Session 1: # Insert into empty table - mdb = mysql.connector.connect(host="127.0.0.1", user="root", passwd="root", port="3307") + mdb = mysql.connector.connect( + host="127.0.0.1", user="root", passwd="root", port="3307" + ) mycursor = mdb.cursor() mycursor.execute("create or replace table t_18705(c int)") mycursor.fetchall() @@ -17,9 +19,13 @@ # Session 2: # Alter table in another session, so that the new table after alter operation will still be empty - mydb_alter_tbl = mysql.connector.connect(host="127.0.0.1", user="root", passwd="root", port="3307") + mydb_alter_tbl = mysql.connector.connect( + host="127.0.0.1", user="root", passwd="root", port="3307" + ) mycursor_alter_tbl = mydb_alter_tbl.cursor() - mycursor_alter_tbl.execute("alter table t_18705 SET OPTIONS (block_per_segment = 500)") + mycursor_alter_tbl.execute( + "alter table t_18705 SET OPTIONS (block_per_segment = 500)" + ) mycursor_alter_tbl.fetchall() # Session 1: From 088a91a83f45be97f758896dbea4946749cad1e4 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 12 Sep 2025 13:41:53 +0800 Subject: [PATCH 31/33] doc --- src/query/service/src/servers/http/v1/query/sized_spsc.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index afe8b9bc22b60..81478998381b3 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -57,6 +57,9 @@ struct SizedChannelBuffer { max_rows: usize, page_rows: usize, pages: VecDeque, + + /// The current_page gets moved outside the lock to spilling and then moved back in, or cleared on close. + /// There's a lot of unwrap here to make sure there's no unintended behavior that's not by design. current_page: Option, is_recv_stopped: bool, is_send_stopped: bool, From d8217a5408e9fe337d4ff89a1467f170e9e405e8 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 12 Sep 2025 16:33:00 +0800 Subject: [PATCH 32/33] test --- .../src/servers/http/v1/query/sized_spsc.rs | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 81478998381b3..40683a26f01cb 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -57,7 +57,7 @@ struct SizedChannelBuffer { max_rows: usize, page_rows: usize, pages: VecDeque, - + /// The current_page gets moved outside the lock to spilling and then moved back in, or cleared on close. /// There's a lot of unwrap here to make sure there's no unintended behavior that's not by design. current_page: Option, @@ -613,6 +613,53 @@ mod tests { assert_eq!(serializer.num_rows(), 3); } + #[tokio::test(flavor = "multi_thread")] + async fn test_spsc_abort() { + let (mut sender, mut receiver) = sized_spsc::(5, 4); + + let test_data = vec![ + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![1, 2, 3])]), + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![1, 2])]), + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![])]), + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![4, 5, 6, 7, 8])]), + DataBlock::new_from_columns(vec![Int32Type::from_data(vec![7, 8, 9])]), + ]; + + let sender_data = test_data.clone(); + let send_task = databend_common_base::runtime::spawn(async move { + let format_settings = FormatSettings::default(); + sender.plan_ready(format_settings, None); + + for (i, block) in sender_data.into_iter().enumerate() { + sender.send(block).await.unwrap(); + if i == 3 { + sender.abort(); + return; + } + } + }); + + let mut received_blocks_size = Vec::new(); + loop { + let (serializer, is_end) = receiver + .next_page(&Wait::Deadline(Instant::now() + Duration::from_secs(1))) + .await + .unwrap(); + + if serializer.num_rows() > 0 { + received_blocks_size.push(serializer.num_rows()); + } + + if is_end { + break; + } + } + + send_task.await.unwrap(); + + assert_eq!(received_blocks_size, vec![4, 4]) + } + fn data_block_strategy(len: usize) -> impl Strategy where N: Number + Arbitrary + 'static, From 4f398ee2a47c566f270031d22a305f36075cef06 Mon Sep 17 00:00:00 2001 From: BohuTANG Date: Sat, 13 Sep 2025 21:52:38 +0800 Subject: [PATCH 33/33] feat(query): add comprehensive logging for result set spilling operations --- .../src/servers/http/v1/query/page_manager.rs | 41 ++++++++++- .../src/servers/http/v1/query/sized_spsc.rs | 68 ++++++++++++++++++- 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/servers/http/v1/query/page_manager.rs b/src/query/service/src/servers/http/v1/query/page_manager.rs index e0efac301dd83..bba28b16444f4 100644 --- a/src/query/service/src/servers/http/v1/query/page_manager.rs +++ b/src/query/service/src/servers/http/v1/query/page_manager.rs @@ -74,9 +74,26 @@ impl PageManager { let next_no = self.total_pages; if page_no == next_no { if !self.end { + let start_time = std::time::Instant::now(); let (serializer, end) = self.receiver.next_page(wait).await?; let num_row = serializer.num_rows(); + let duration_ms = start_time.elapsed().as_millis(); + log::debug!(num_row, wait_type:? = wait; "collect_new_page"); + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Page received page_no={}, rows={}, total_rows={}, end={}, duration_ms={}", + self.total_pages, num_row, self.total_rows + num_row, end, duration_ms + ); + + if num_row == 0 { + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Empty page page_no={}, end={}", + self.total_pages, end + ); + } + self.total_rows += num_row; let page = Page { data: Arc::new(serializer), @@ -117,10 +134,30 @@ impl PageManager { #[async_backtrace::framed] pub async fn detach(&mut self) { + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Query completed total_pages={}, total_rows={}", + self.total_pages, self.total_rows + ); + self.last_page = None; if let Some(spiller) = self.receiver.close() { - if let Err(error) = spiller.cleanup().await { - log::error!(error:?; "clean up spilled result set file fail"); + let start_time = std::time::Instant::now(); + match spiller.cleanup().await { + Ok(_) => { + let duration_ms = start_time.elapsed().as_millis(); + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Cleanup completed duration_ms={}", + duration_ms + ); + } + Err(error) => { + log::error!( + target: "result-set-spill", + error:?; "[RESULT-SET-SPILL] Failed to cleanup spilled result set files" + ); + } } }; } diff --git a/src/query/service/src/servers/http/v1/query/sized_spsc.rs b/src/query/service/src/servers/http/v1/query/sized_spsc.rs index 40683a26f01cb..1f695116b5b64 100644 --- a/src/query/service/src/servers/http/v1/query/sized_spsc.rs +++ b/src/query/service/src/servers/http/v1/query/sized_spsc.rs @@ -264,14 +264,54 @@ where S: DataBlockSpill } async fn try_spill_page(&self, page: Vec) -> Result { + let rows_count = page.iter().map(|b| b.num_rows()).sum::(); + let memory_bytes = page.iter().map(|b| b.memory_size()).sum::(); + if !self.should_spill() { + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Spill disabled, page kept in memory blocks={}, rows={}, memory_bytes={}", + page.len(), rows_count, memory_bytes + ); return Ok(Page::Memory(page)); } let spiller = self.spiller.lock().unwrap().clone(); let Some(spiller) = spiller.as_ref() else { + log::warn!( + target: "result-set-spill", + "[RESULT-SET-SPILL] No spiller configured, page kept in memory blocks={}, rows={}, memory_bytes={}", + page.len(), rows_count, memory_bytes + ); return Ok(Page::Memory(page)); }; - let location = spiller.merge_and_spill(page).await?; + + let start_time = std::time::Instant::now(); + + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Starting spill to disk blocks={}, rows={}, memory_bytes={}", + page.len(), rows_count, memory_bytes + ); + + let location = match spiller.merge_and_spill(page).await { + Ok(location) => location, + Err(e) => { + log::error!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Spill failed error={:?}", + e + ); + return Err(e); + } + }; + let duration_ms = start_time.elapsed().as_millis(); + + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Spill completed rows={}, duration_ms={}, location={:?}", + rows_count, duration_ms, location + ); + Ok(Page::Spilled(location)) } @@ -366,8 +406,26 @@ where S: DataBlockSpill collector } Some(Page::Spilled(location)) => { + let start_time = std::time::Instant::now(); + + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Restoring from disk location={:?}", + location + ); + let spiller = self.chan.spiller.lock().unwrap().clone().unwrap(); let block = spiller.restore(&location).await?; + let rows_count = block.num_rows(); + let memory_bytes = block.memory_size(); + let duration_ms = start_time.elapsed().as_millis(); + + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Restore completed rows={}, memory_bytes={}, duration_ms={}", + rows_count, memory_bytes, duration_ms + ); + let mut collector = BlocksCollector::new(); collector.append_block(block); collector @@ -404,6 +462,14 @@ where S: DataBlockSpill return Ok(true); } Err(SendFail::Full { page, remain }) => { + let page_rows = page.iter().map(|b| b.num_rows()).sum::(); + + log::info!( + target: "result-set-spill", + "[RESULT-SET-SPILL] Buffer full page_blocks={}, page_rows={}, has_remain={}", + page.len(), page_rows, remain.is_some() + ); + self.chan.notify_on_sent.notify_one(); let mut to_add = self.chan.try_spill_page(page).await?; loop {