Don't optimize AnalyzeExec (#6379) (try 2) #6494

Merged 6 commits on May 31, 2023
Changes from 2 commits
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_optimizer/pipeline_checker.rs
@@ -387,7 +387,7 @@ mod sql_tests {
         };
         let test2 = UnaryTestCase {
             source_type: SourceType::Unbounded,
-            expect_fail: true,
+            expect_fail: false,

Review comment (Contributor Author): it works now!

         };
         let case = QueryCase {
             sql: "EXPLAIN ANALYZE SELECT * FROM test".to_string(),
158 changes: 83 additions & 75 deletions datafusion/core/src/physical_plan/analyze.rs
@@ -28,10 +28,12 @@ use crate::{
     },
 };
 use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch};
-use futures::StreamExt;
+use futures::{FutureExt, StreamExt, TryFutureExt};
+use tokio::task::JoinSet;
 
 use super::expressions::PhysicalSortExpr;
-use super::{stream::RecordBatchReceiverStream, Distribution, SendableRecordBatchStream};
+use super::stream::RecordBatchStreamAdapter;
+use super::{Distribution, SendableRecordBatchStream};
 use crate::execution::context::TaskContext;
 
 /// `EXPLAIN ANALYZE` execution plan operator. This operator runs its input,
@@ -73,7 +75,7 @@ impl ExecutionPlan for AnalyzeExec {

     /// Specifies we want the input as a single stream
     fn required_input_distribution(&self) -> Vec<Distribution> {
-        vec![Distribution::SinglePartition]
+        vec![Distribution::UnspecifiedDistribution]
     }
 
     /// Specifies whether this plan generates an infinite stream of records.
@@ -121,96 +123,102 @@ impl ExecutionPlan for AnalyzeExec {
             )));
         }

-        // should be ensured by `SinglePartition` above
-        let input_partitions = self.input.output_partitioning().partition_count();
-        if input_partitions != 1 {
-            return Err(DataFusionError::Internal(format!(
-                "AnalyzeExec invalid number of input partitions. Expected 1, got {input_partitions}"
-            )));
+        // Gather futures that will run each input partition using a
+        // JoinSet to cancel outstanding futures on drop
+        let mut set = JoinSet::new();

Review comment (Contributor Author): This uses the cool JoinSet I learned about from @nvartolomei and @Darksonn on #6449 ❤️
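
For readers who have not used it, tokio::task::JoinSet both collects task results in completion order and aborts any still-running tasks when it is dropped, which is what gives this operator its cancel-on-drop behavior. A minimal self-contained sketch of the pattern, with made-up task bodies rather than code from this diff:

```rust
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    // One task per "partition"; dropping the JoinSet aborts any task
    // that is still running, giving cancel-on-drop semantics.
    let mut set = JoinSet::new();
    for partition in 0..4u64 {
        set.spawn(async move {
            // stand-in for draining one partition's record batches
            partition * 100
        });
    }

    // join_next() yields finished tasks in completion order and
    // returns None once the set is empty.
    let mut total = 0;
    while let Some(result) = set.join_next().await {
        total += result.expect("task panicked");
    }
    assert_eq!(total, 600);
}
```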

Review comment (Contributor Author): this logic was just extracted into its own function.

+        let num_input_partitions = self.input.output_partitioning().partition_count();
+
+        for input_partition in 0..num_input_partitions {
+            let input_stream = self.input.execute(input_partition, context.clone());
+
+            set.spawn(async move {
+                let mut total_rows = 0;
+                let mut input_stream = input_stream?;
+                while let Some(batch) = input_stream.next().await {
+                    let batch = batch?;
+                    total_rows += batch.num_rows();
+                }
+                Ok(total_rows)
+            });
         }

-        let (tx, rx) = tokio::sync::mpsc::channel(input_partitions);
+        // Turn the tasks in the JoinSet into a stream of
+        // Result<usize> representing the counts of each output
+        // partition.
+        let counts_stream = futures::stream::unfold(set, |mut set| async {

Review comment (Contributor Author): I think looking at https://github.com/apache/arrow-datafusion/pull/6494/files?w=1 (the whitespace-blind view) makes it clearer what I did, which was to change the plumbing to use futures and stream combinators rather than channels.
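
For contrast, the channel-based plumbing being replaced looked roughly like this miniature, assuming tokio and the tokio-stream crate as dependencies; the task body, channel size, and values are illustrative, not the PR's actual code:

```rust
use tokio_stream::{wrappers::ReceiverStream, StreamExt};

#[tokio::main]
async fn main() {
    // A spawned task pushes results into an mpsc channel ...
    let (tx, rx) = tokio::sync::mpsc::channel(4);
    tokio::spawn(async move {
        for count in [10usize, 20, 30] {
            // ignore send errors: a closed receiver means the plan
            // was torn down and no one is listening
            if tx.send(count).await.is_err() {
                return;
            }
        }
    });

    // ... and the receiving side is wrapped up as a stream.
    let total: usize = ReceiverStream::new(rx).fold(0, |acc, n| acc + n).await;
    assert_eq!(total, 60);
}
```

The stream-combinator version removes the channel and the detached task, so errors and cancellation propagate through the stream itself.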

Review comment (Contributor): I confirmed that the tokio stream adapters don't appear to have a JoinSet impl: https://docs.rs/tokio-stream/latest/tokio_stream/wrappers/index.html
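
Hence the hand-rolled adapter in the diff just below: futures::stream::unfold threads the JoinSet through as the stream's state, and the None that join_next() returns on an empty set ends the stream. The same pattern in isolation, with illustrative task bodies and the error translation elided, might look like:

```rust
use futures::StreamExt;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut set = JoinSet::new();
    for i in 0..3u32 {
        set.spawn(async move { i * 10 });
    }

    // The JoinSet is the unfold state; `?` on the Option returned by
    // join_next() terminates the stream once the set is empty.
    let results = futures::stream::unfold(set, |mut set| async {
        let next = set.join_next().await?;
        Some((next, set))
    });

    let collected: Vec<u32> = results
        .map(|res| res.expect("task panicked"))
        .collect()
        .await;
    assert_eq!(collected.iter().sum::<u32>(), 30);
}
```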

+            let next = set.join_next().await?; // returns None when empty, ending the stream
+            // translate join errors (aka task panics) into ExecutionErrors
+            let next = match next {
+                Ok(res) => res,
+                Err(e) => Err(DataFusionError::Execution(format!(
+                    "Join error in AnalyzeExec: {e}"
+                ))),
+            };
+            Some((next, set))
+        });

+        let start = Instant::now();
         let captured_input = self.input.clone();
-        let mut input_stream = captured_input.execute(0, context)?;
         let captured_schema = self.schema.clone();
         let verbose = self.verbose;

-        // Task reads batches the input and when complete produce a
-        // RecordBatch with a report that is written to `tx` when done
-        let join_handle = tokio::task::spawn(async move {
-            let start = Instant::now();
-            let mut total_rows = 0;
-
-            // Note the code below ignores errors sending on tx. An
-            // error sending means the plan is being torn down and
-            // nothing is left that will handle the error (aka no one
-            // will hear us scream)
-            while let Some(b) = input_stream.next().await {
-                match b {
-                    Ok(batch) => {
-                        total_rows += batch.num_rows();
-                    }
-                    b @ Err(_) => {
-                        // try and pass on errors from input
-                        if tx.send(b).await.is_err() {
-                            // receiver hung up, stop executing (no
-                            // one will look at any further results we
-                            // send)
-                            return;
-                        }
-                    }
-                }
-            }
-            let end = Instant::now();
-
-            let mut type_builder = StringBuilder::with_capacity(1, 1024);
-            let mut plan_builder = StringBuilder::with_capacity(1, 1024);
-
-            // TODO use some sort of enum rather than strings?
-            type_builder.append_value("Plan with Metrics");
+        // future that gathers the input counts into an overall output
+        // count, and makes an output batch
+        let output = counts_stream

Review comment (Contributor): FWIW you could just use a regular async move here, instead of needing the futures adapters.

Review comment (Contributor Author): That is an excellent point -- I got so carried away with being clever I lost sight of that. I rewrote this logic to use a single future and also moved the output record batch creation into its own function, to separate the business logic from the async orchestration: 5521a70
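
The single-future shape described there can be sketched in isolation as follows. This is a self-contained toy under simplified assumptions (String as the error type, a plain count standing in for the report RecordBatch), not the code from 5521a70:

```rust
use tokio::task::JoinSet;

#[tokio::main]
async fn main() -> Result<(), String> {
    // Tasks return Result, mirroring the per-partition row counters.
    let mut set: JoinSet<Result<usize, String>> = JoinSet::new();
    for partition in 0..4usize {
        set.spawn(async move { Ok(partition * 25) });
    }

    // One async move block replaces the fold/map stream adapters:
    // await the JoinSet directly, then build the final output.
    let output = async move {
        let mut total_rows = 0;
        while let Some(res) = set.join_next().await {
            // outer Err is a task panic (JoinError); inner Err is an
            // execution error from the task itself
            total_rows += res.map_err(|e| format!("join error: {e}"))??;
        }
        // the real code builds the report RecordBatch here
        Ok::<usize, String>(total_rows)
    };

    assert_eq!(output.await?, 150);
    Ok(())
}
```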

+            // combine results from all input streams into a total count
+            .fold(Ok(0), |total_rows: Result<usize>, input_rows| async move {
+                Ok(total_rows? + input_rows?)
+            })
+            // convert the total to a RecordBatch
+            .map(move |total_rows| {
+                let total_rows = total_rows?;
+                let end = Instant::now();

-            let annotated_plan =
-                DisplayableExecutionPlan::with_metrics(captured_input.as_ref())
-                    .indent()
-                    .to_string();
-            plan_builder.append_value(annotated_plan);
+                let mut type_builder = StringBuilder::with_capacity(1, 1024);
+                let mut plan_builder = StringBuilder::with_capacity(1, 1024);

-            // Verbose output
-            // TODO make this more sophisticated
-            if verbose {
-                type_builder.append_value("Plan with Full Metrics");
+                // TODO use some sort of enum rather than strings?
+                type_builder.append_value("Plan with Metrics");

                 let annotated_plan =
-                    DisplayableExecutionPlan::with_full_metrics(captured_input.as_ref())
+                    DisplayableExecutionPlan::with_metrics(captured_input.as_ref())
                         .indent()
                         .to_string();
                 plan_builder.append_value(annotated_plan);

-                type_builder.append_value("Output Rows");
-                plan_builder.append_value(total_rows.to_string());
+                // Verbose output
+                // TODO make this more sophisticated
+                if verbose {
+                    type_builder.append_value("Plan with Full Metrics");

-                type_builder.append_value("Duration");
-                plan_builder.append_value(format!("{:?}", end - start));
-            }
+                    let annotated_plan = DisplayableExecutionPlan::with_full_metrics(
+                        captured_input.as_ref(),
+                    )
+                    .indent()
+                    .to_string();
+                    plan_builder.append_value(annotated_plan);

-            let maybe_batch = RecordBatch::try_new(
-                captured_schema,
-                vec![
-                    Arc::new(type_builder.finish()),
-                    Arc::new(plan_builder.finish()),
-                ],
-            )
-            .map_err(Into::into);
-            // again ignore error
-            tx.send(maybe_batch).await.ok();
-        });
+                    type_builder.append_value("Output Rows");
+                    plan_builder.append_value(total_rows.to_string());
+
+                    type_builder.append_value("Duration");
+                    plan_builder.append_value(format!("{:?}", end - start));
+                }

-        Ok(RecordBatchReceiverStream::create(
-            &self.schema,
-            rx,
-            join_handle,
-        ))
+                RecordBatch::try_new(
+                    captured_schema,
+                    vec![
+                        Arc::new(type_builder.finish()),
+                        Arc::new(plan_builder.finish()),
+                    ],
+                )
+            })
+            .map_err(DataFusionError::from);
+
+        Ok(Box::pin(RecordBatchStreamAdapter::new(
+            self.schema.clone(),
+            futures::stream::once(output),
+        )))
     }
 
     fn fmt_as(
12 changes: 7 additions & 5 deletions datafusion/core/src/physical_plan/planner.rs
@@ -1192,11 +1192,9 @@ impl DefaultPhysicalPlanner {
"Unsupported logical plan: Distinct should be replaced to Aggregate".to_string(),
))
}
LogicalPlan::Analyze(a) => {
let input = self.create_initial_plan(&a.input, session_state).await?;
let schema = SchemaRef::new((*a.schema).clone().into());
Ok(Arc::new(AnalyzeExec::new(a.verbose, input, schema)))
}
LogicalPlan::Analyze(_) => Err(DataFusionError::Internal(
"Unsupported logical plan: Analyze must be root of the plan".to_string(),
)),
LogicalPlan::Extension(e) => {
let physical_inputs = self.create_initial_plan_multi(e.node.inputs(), session_state).await?;

@@ -1851,6 +1849,10 @@ impl DefaultPhysicalPlanner {
                 stringified_plans,
                 e.verbose,
             ))))
+        } else if let LogicalPlan::Analyze(a) = logical_plan {
+            let input = self.create_physical_plan(&a.input, session_state).await?;
+            let schema = SchemaRef::new((*a.schema).clone().into());
+            Ok(Some(Arc::new(AnalyzeExec::new(a.verbose, input, schema))))
         } else {
             Ok(None)
         }
20 changes: 19 additions & 1 deletion datafusion/core/tests/sql/explain_analyze.rs
@@ -687,13 +687,31 @@ async fn csv_explain_analyze() {
     // Only test basic plumbing and try to avoid having to change too
     // many things. explain_analyze_baseline_metrics covers the values
     // in greater depth
-    let needle = "CoalescePartitionsExec, metrics=[output_rows=5, elapsed_compute=";
+    let needle = "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[COUNT(UInt8(1))], metrics=[output_rows=5";
     assert_contains!(&formatted, needle);
 
     let verbose_needle = "Output Rows";
     assert_not_contains!(formatted, verbose_needle);
 }

+#[tokio::test]
+#[cfg_attr(tarpaulin, ignore)]
+async fn csv_explain_analyze_order_by() {
+    let ctx = SessionContext::new();
+    register_aggregate_csv_by_sql(&ctx).await;

Review comment (Contributor Author): here is the new test.

let sql = "EXPLAIN ANALYZE SELECT c1 FROM aggregate_test_100 order by c1";
let actual = execute_to_batches(&ctx, sql).await;
let formatted = arrow::util::pretty::pretty_format_batches(&actual)
.unwrap()
.to_string();

// Ensure that the ordering is not optimized away from the plan
// https://github.com/apache/arrow-datafusion/issues/6379
let needle =
"SortExec: expr=[c1@0 ASC NULLS LAST], metrics=[output_rows=100, elapsed_compute";
assert_contains!(&formatted, needle);
}

 #[tokio::test]
 #[cfg_attr(tarpaulin, ignore)]
 async fn parquet_explain_analyze() {