From 55f7acba4cee7bc0af673ce203a80187f108e925 Mon Sep 17 00:00:00 2001
From: Anton Borisov
Date: Mon, 12 Jan 2026 02:59:40 +0000
Subject: [PATCH] [ISSUE-153] Add blocking poll into python bindings

Add LogScanner.poll(timeout_ms) to the Python bindings. The call waits up
to timeout_ms milliseconds for records and returns them as a PyArrow
Table. When no records are returned, an empty table built from the
table's row type is returned instead of an error. A negative timeout_ms
is rejected with a FlussError. The example script is extended to
exercise the new method.

---
 bindings/python/example/example.py | 21 +++++++++++
 bindings/python/src/table.rs       | 56 +++++++++++++++++++++++++++++-
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/bindings/python/example/example.py b/bindings/python/example/example.py
index 0523f943..1111078c 100644
--- a/bindings/python/example/example.py
+++ b/bindings/python/example/example.py
@@ -175,6 +175,27 @@ async def main():
 
         # TODO: support to_duckdb()
 
+        # Test the new poll() method for incremental reading
+        print("\n--- Testing poll() method ---")
+        log_scanner.subscribe(None, None)
+
+        # Poll with a timeout of 5000ms (5 seconds)
+        # Note: poll() returns an empty table (not an error) on timeout
+        try:
+            poll_result = log_scanner.poll(5000)
+            print(f"Number of rows: {poll_result.num_rows}")
+
+            if poll_result.num_rows > 0:
+                poll_df = poll_result.to_pandas()
+                print(f"Polled data:\n{poll_df}")
+            else:
+                print("Empty result (no records available)")
+                # Empty table still has schema
+                print(f"Schema: {poll_result.schema}")
+
+        except Exception as e:
+            print(f"Error during poll: {e}")
+
     except Exception as e:
         print(f"Error during scanning: {e}")
 
diff --git a/bindings/python/src/table.rs b/bindings/python/src/table.rs
index 8a116485..aeba9f08 100644
--- a/bindings/python/src/table.rs
+++ b/bindings/python/src/table.rs
@@ -18,8 +18,9 @@
 use crate::TOKIO_RUNTIME;
 use crate::*;
 use arrow::array::RecordBatch;
-use arrow_pyarrow::FromPyArrow;
+use arrow_pyarrow::{FromPyArrow, ToPyArrow};
 use fluss::client::EARLIEST_OFFSET;
+use fluss::record::to_arrow_schema;
 use fluss::rpc::message::OffsetSpec;
 use pyo3_async_runtimes::tokio::future_into_py;
 use std::sync::Arc;
@@ -321,6 +322,59 @@ impl LogScanner {
         Ok(df)
     }
 
+    /// Poll for new records with the specified timeout
+    ///
+    /// Args:
+    ///     timeout_ms: Timeout in milliseconds to wait for records
+    ///
+    /// Returns:
+    ///     PyArrow Table containing the polled records
+    ///
+    /// Note:
+    ///     - Returns an empty table (with correct schema) if no records are available
+    ///     - When timeout expires, returns an empty table (NOT an error)
+    fn poll(&self, py: Python, timeout_ms: i64) -> PyResult<Py<PyAny>> {
+        use std::time::Duration;
+
+        if timeout_ms < 0 {
+            return Err(FlussError::new_err(format!(
+                "timeout_ms must be non-negative, got: {timeout_ms}"
+            )));
+        }
+
+        let timeout = Duration::from_millis(timeout_ms as u64);
+        let scan_records = py
+            .detach(|| TOKIO_RUNTIME.block_on(async { self.inner.poll(timeout).await }))
+            .map_err(|e| FlussError::new_err(e.to_string()))?;
+
+        // Convert records to Arrow batches per bucket
+        let mut arrow_batches = Vec::new();
+        for (_bucket, records) in scan_records.into_records_by_buckets() {
+            let mut batches = Utils::convert_scan_records_to_arrow(records);
+            arrow_batches.append(&mut batches);
+        }
+        if arrow_batches.is_empty() {
+            return self.create_empty_table(py);
+        }
+
+        Utils::combine_batches_to_table(py, arrow_batches)
+    }
+
+    /// Create an empty PyArrow table with the correct schema
+    fn create_empty_table(&self, py: Python) -> PyResult<Py<PyAny>> {
+        let arrow_schema = to_arrow_schema(self.table_info.get_row_type());
+        let py_schema = arrow_schema
+            .to_pyarrow(py)
+            .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?;
+
+        let pyarrow = py.import("pyarrow")?;
+        let empty_table = pyarrow
+            .getattr("Table")?
+            .call_method1("from_batches", (vec![] as Vec<Py<PyAny>>, py_schema))?;
+
+        Ok(empty_table.into())
+    }
+
     fn __repr__(&self) -> String {
         format!("LogScanner(table={})", self.table_info.table_path)
     }