feat(7181): cascading loser tree merges #7379

Closed · wants to merge 30 commits

Commits (30)
5eaaeec
refactor(7181): move the management of input data, and output data, t…
wiedld Aug 15, 2023
a3870d0
feat(7181): add cursor.seek()
wiedld Aug 15, 2023
e54e92c
refactor(7181): move streaming_merge() into separate mod from the mer…
wiedld Aug 15, 2023
3d43e97
feat(7181): streaming_merge() consumes SortPreservingCascadeStream
wiedld Aug 15, 2023
1a6a364
feat(7181): change BatchBuilder to be a SortOrder Builder, with the S…
wiedld Aug 15, 2023
28454c5
feat(7181): add slice() to Cursor trait
wiedld Aug 17, 2023
b766712
feat(7181): make SortOrderBuilder yield in a stream.
wiedld Aug 17, 2023
9ff37f3
feat(7181): enable the ability to create a multi-layer merge cascade.
wiedld Aug 22, 2023
eb647ea
feat(7181): build multiple-level cascade tree.
wiedld Aug 22, 2023
8cd22a0
feat(7181): use RecordBatch tracking to avoid expensive slicing of ba…
wiedld Aug 28, 2023
173577b
fix(7181): improve performance by using hasher on tuple (unique slice…
wiedld Aug 29, 2023
b0f1402
chore(7181): make a zero-cost BatchId type, for more explicit code
wiedld Aug 31, 2023
9ea3a65
refactor: comment the major streaming structures, and how they are in…
wiedld Aug 31, 2023
7be30c2
refactor: use u64 as batch_id in cascading merge sort
wiedld Sep 1, 2023
0e9573d
feat(7181): convert into generic ReceiverStream, such that can be reu…
wiedld Sep 2, 2023
c439138
feat(7181): add buffered multithreading to merge streams
wiedld Sep 2, 2023
fca522b
test(7181): have sort preserving merge tests, run in both single thre…
wiedld Sep 3, 2023
50c8636
chore: TMP COMMIT pointing at arrow-rs branch, for CI pipeline
wiedld Sep 3, 2023
cfa32fa
Merge branch 'main' into 7181/cascading-loser-tree-merges
wiedld Sep 13, 2023
d520496
chore: clippy and linter
wiedld Sep 13, 2023
a324ef8
fix(7181): have RowCursor slicing be within a single arc-refed Rows
wiedld Sep 15, 2023
d3613bd
feat(7181): have BatchCursor be the primary struct passed around
wiedld Sep 15, 2023
3786021
feat(7181): update documentation for the cascaded merge
wiedld Sep 15, 2023
2932bd5
Merge branch 'main' into 7181/cascading-loser-tree-merges
wiedld Sep 15, 2023
8701220
fix: add apache license header to new mods
wiedld Sep 15, 2023
828a5d1
Merge branch 'main' into 7181/cascading-loser-tree-merges
wiedld Sep 19, 2023
0dfc60c
Merge branch 'main' into 7181/cascading-loser-tree-merges
wiedld Sep 19, 2023
e642420
Merge branch 'main' into 7181/cascading-loser-tree-merges
wiedld Oct 5, 2023
f97cc4d
feat(7181): remove mutex around polled stream.
wiedld Sep 19, 2023
9b10198
Merge branch 'main' into 7181/cascading-loser-tree-merges
wiedld Oct 10, 2023
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/physical_plan/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ impl ExecutionPlan for ParquetExec {
let stream =
FileStream::new(&self.base_config, partition_index, opener, &self.metrics)?;

Ok(Box::pin(stream))
Ok(Box::pin(stream) as SendableRecordBatchStream)
}

fn metrics(&self) -> Option<MetricsSet> {
Expand Down
1 change: 1 addition & 0 deletions datafusion/physical-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ tokio = { version = "1.28", features = ["sync", "fs", "parking_lot"] }
uuid = { version = "^1.2", features = ["v4"] }

[dev-dependencies]
paste = "1.0.14"
rstest = "0.18.0"
termtree = "0.4.1"
tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
10 changes: 6 additions & 4 deletions datafusion/physical-plan/src/analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ use datafusion_common::{internal_err, DataFusionError, Result};
use futures::StreamExt;

use super::expressions::PhysicalSortExpr;
use super::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter};
use super::stream::{
ReceiverStream, RecordBatchReceiverStreamAdaptor, RecordBatchStreamAdapter,
};
use super::{DisplayAs, Distribution, SendableRecordBatchStream};
use datafusion_execution::TaskContext;

Expand Down Expand Up @@ -155,11 +157,11 @@ impl ExecutionPlan for AnalyzeExec {
// parallel (on a separate tokio task) using a JoinSet to
// cancel outstanding futures on drop
let num_input_partitions = self.input.output_partitioning().partition_count();
let mut builder =
RecordBatchReceiverStream::builder(self.schema(), num_input_partitions);
let mut builder = ReceiverStream::builder(self.schema(), num_input_partitions);
let input = Arc::new(RecordBatchReceiverStreamAdaptor::new(self.input.clone()));

for input_partition in 0..num_input_partitions {
builder.run_input(self.input.clone(), input_partition, context.clone());
builder.run_input(input.clone(), input_partition, context.clone());
}

// Create future that computes the final output
Expand Down
8 changes: 5 additions & 3 deletions datafusion/physical-plan/src/coalesce_partitions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use std::sync::Arc;

use super::expressions::PhysicalSortExpr;
use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
use super::stream::{ObservedStream, RecordBatchReceiverStream};
use super::stream::{ObservedStream, ReceiverStream, RecordBatchReceiverStreamAdaptor};
use super::{DisplayAs, SendableRecordBatchStream, Statistics};

use crate::{DisplayFormatType, EquivalenceProperties, ExecutionPlan, Partitioning};
Expand Down Expand Up @@ -145,12 +145,14 @@ impl ExecutionPlan for CoalescePartitionsExec {
// least one result in an attempt to maximize
// parallelism.
let mut builder =
RecordBatchReceiverStream::builder(self.schema(), input_partitions);
ReceiverStream::builder(self.schema(), input_partitions);
let input =
Arc::new(RecordBatchReceiverStreamAdaptor::new(self.input.clone()));
@wiedld (Contributor, Author) commented on Oct 10, 2023:

RecordBatchReceiverStream was made generic in this commit, such that it can handle either a buffered stream of record_batches or the sort_orders (yielded per merge node).

To make it generic, the following was done:

  • created a StreamAdapter trait, whose StreamAdapter::call() is used by ReceiverStream::run_input().
  • implemented a RecordBatchReceiverStreamAdaptor for record batches.

Please let me know if I should have structured this differently.


// spawn independent tasks whose resulting streams (of batches)
// are sent to the channel for consumption.
for part_i in 0..input_partitions {
builder.run_input(self.input.clone(), part_i, context.clone());
builder.run_input(input.clone(), part_i, context.clone());
}

let stream = builder.build();
Expand Down
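The StreamAdapter refactor described in the review comment above can be illustrated with a small, std-only Rust sketch. The names (`StreamAdapter`, `call`, the adaptor type) mirror the PR, but the real code operates on async streams of `RecordBatch`es via tokio channels; everything below is a simplified, hypothetical stand-in rather than the actual DataFusion API.

```rust
/// Hypothetical stand-in for the trait that lets a generic receiver stream
/// accept different item producers (record batches, sort orders, ...).
trait StreamAdapter {
    type Item;
    /// Produce the items for one input partition
    /// (the real `call()` drives an async stream for `run_input()`).
    fn call(&self, partition: usize) -> Vec<Self::Item>;
}

/// Stand-in for RecordBatchReceiverStreamAdaptor: adapts fixed per-partition
/// data (here just `Vec<i32>` in place of record batches).
struct VecAdapter {
    partitions: Vec<Vec<i32>>,
}

impl StreamAdapter for VecAdapter {
    type Item = i32;
    fn call(&self, partition: usize) -> Vec<i32> {
        self.partitions[partition].clone()
    }
}

/// Stand-in for the generic ReceiverStream builder: drains every partition
/// of any StreamAdapter into one output buffer, regardless of item type.
fn run_all<A: StreamAdapter>(adapter: &A, num_partitions: usize) -> Vec<A::Item> {
    (0..num_partitions).flat_map(|p| adapter.call(p)).collect()
}

fn main() {
    let adapter = VecAdapter {
        partitions: vec![vec![1, 2], vec![3]],
    };
    // The caller is generic over the adapter, mirroring how run_input()
    // no longer cares whether items are batches or sort orders.
    let out = run_all(&adapter, 2);
    assert_eq!(out, vec![1, 2, 3]);
}
```

The design point is that the builder loop (`builder.run_input(input.clone(), ...)` in the diff above) only needs the trait, so one receiver-stream implementation serves both the batch and sort-order cases.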
4 changes: 2 additions & 2 deletions datafusion/physical-plan/src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
//! Defines common code used in execution plans

use super::SendableRecordBatchStream;
use crate::stream::RecordBatchReceiverStream;
use crate::stream::ReceiverStream;
use crate::{ColumnStatistics, ExecutionPlan, Statistics};
use arrow::datatypes::Schema;
use arrow::ipc::writer::{FileWriter, IpcWriteOptions};
Expand Down Expand Up @@ -102,7 +102,7 @@ pub(crate) fn spawn_buffered(
Ok(handle)
if handle.runtime_flavor() == tokio::runtime::RuntimeFlavor::MultiThread =>
{
let mut builder = RecordBatchReceiverStream::builder(input.schema(), buffer);
let mut builder = ReceiverStream::builder(input.schema(), buffer);

let sender = builder.tx();

Expand Down
89 changes: 89 additions & 0 deletions datafusion/physical-plan/src/sorts/batch_cursor.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use datafusion_common::Result;

use super::cursor::Cursor;

pub type BatchId = u64;

#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub struct BatchOffset(pub usize);

pub type SlicedBatchCursorIdentifier = (BatchId, BatchOffset);

/// The [`BatchCursor`] represents a complete, or partial, [`Cursor`] for a given record batch ([`BatchId`]).
///
/// A record batch (represented by its [`Cursor`]) can be sliced for the following reasons:
/// 1. a merge node takes in 10 streams
/// 2. at any given time, this means up to 10 cursors (record batches) are being merged (e.g. in the loser tree)
/// 3. a merge node will yield once it hits a size limit
/// 4. at the moment of yielding, some cursors may be only partially yielded
///
/// The unique identity of a sliced cursor is denoted by its [`SlicedBatchCursorIdentifier`].
#[derive(Debug)]
pub struct BatchCursor<C: Cursor> {
@wiedld (Contributor, Author) commented on Sep 19, 2023:

This is used in the CursorStream (not the BatchCursorStream which included the actual record batches). Therefore, I think this could have a better name.

It wraps the cursor, and maps it to the original (tracked) batch -- as well as tracking the sliced offset. Naming ideas?

/// The index into BatchTrackingStream::batches
batch: BatchId,
/// The row offset within the given batch, reflecting how far the sliced cursor has advanced.
/// When a batch is partially yielded, the offset->end range determines how much was yielded.
row_offset: BatchOffset,

/// The cursor for the given batch.
pub cursor: C,
}

impl<C: Cursor> BatchCursor<C> {
/// Create a new [`BatchCursor`] from a [`Cursor`] and a [`BatchId`].
///
/// New [`BatchCursor`]s will have a [`BatchOffset`] of 0.
/// Subsequent batch_cursors can be created by slicing.
pub fn new(batch: BatchId, cursor: C) -> Self {
Self {
batch,
row_offset: BatchOffset(0),
cursor,
}
}

/// A unique identifier for a [`BatchCursor`]
pub fn identifier(&self) -> SlicedBatchCursorIdentifier {
(self.batch, self.row_offset)
}

/// Slicing of a batch cursor is done by slicing the underlying cursor
/// and adjusting the [`BatchOffset`]
pub fn slice(&self, offset: usize, length: usize) -> Result<Self> {
Ok(Self {
batch: self.batch,
row_offset: BatchOffset(self.row_offset.0 + offset),
cursor: self.cursor.slice(offset, length)?,
})
}
}

impl<C: Cursor> std::fmt::Display for BatchCursor<C> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"BatchCursor(batch: {}, offset: {}, num_rows: {})",
self.batch,
self.row_offset.0,
self.cursor.num_rows()
)
}
}
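The slicing scheme above — a stable `BatchId` plus an accumulated `BatchOffset` — can be demonstrated with a self-contained sketch. `MiniCursor` and `MiniBatchCursor` are hypothetical stand-ins for the crate's `Cursor` implementations and the `BatchCursor` shown in this diff; only the identifier arithmetic is meant to match.

```rust
type BatchId = u64;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BatchOffset(usize);

/// Hypothetical minimal cursor over rows (stand-in for the Cursor trait).
struct MiniCursor {
    rows: Vec<i64>,
}

impl MiniCursor {
    fn slice(&self, offset: usize, length: usize) -> MiniCursor {
        MiniCursor {
            rows: self.rows[offset..offset + length].to_vec(),
        }
    }
    fn num_rows(&self) -> usize {
        self.rows.len()
    }
}

/// Stand-in for BatchCursor: slicing keeps the BatchId stable and
/// accumulates the offset, so each slice has a unique identifier.
struct MiniBatchCursor {
    batch: BatchId,
    row_offset: BatchOffset,
    cursor: MiniCursor,
}

impl MiniBatchCursor {
    fn identifier(&self) -> (BatchId, BatchOffset) {
        (self.batch, self.row_offset)
    }
    fn slice(&self, offset: usize, length: usize) -> MiniBatchCursor {
        MiniBatchCursor {
            batch: self.batch,
            row_offset: BatchOffset(self.row_offset.0 + offset),
            cursor: self.cursor.slice(offset, length),
        }
    }
}

fn main() {
    let full = MiniBatchCursor {
        batch: 7,
        row_offset: BatchOffset(0),
        cursor: MiniCursor { rows: vec![10, 20, 30, 40] },
    };
    // Suppose a merge node yielded the first two rows; the remainder
    // is a new cursor starting at offset 2 of the same batch.
    let rest = full.slice(2, 2);
    assert_eq!(rest.identifier(), (7, BatchOffset(2)));
    assert_eq!(rest.cursor.num_rows(), 2);
}
```

This is why `(BatchId, BatchOffset)` suffices as the `SlicedBatchCursorIdentifier`: two distinct slices of the same batch can never share an offset.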