From a0f93f786e8908c7bc36e25918dc3a68824c1b68 Mon Sep 17 00:00:00 2001
From: blaginin
Date: Wed, 19 Feb 2025 18:22:52 +0000
Subject: [PATCH 1/7] Reuse alias if possible

---
 datafusion/expr/src/expr.rs                   | 27 +++++++++++++++++--
 .../sqllogictest/test_files/group_by.slt      |  2 +-
 datafusion/sqllogictest/test_files/unnest.slt |  2 +-
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index df79b3568ce6..3e08d05e62f1 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -408,6 +408,19 @@ impl Alias {
             name: name.into(),
         }
     }
+
+    /// Create an alias with an optional schema/field qualifier.
+    pub fn new_boxed(
+        expr: Box<Expr>,
+        relation: Option<impl Into<TableReference>>,
+        name: impl Into<String>,
+    ) -> Self {
+        Self {
+            expr,
+            relation: relation.map(|r| r.into()),
+            name: name.into(),
+        }
+    }
 }
 
 /// Binary expression
@@ -1276,7 +1289,12 @@ impl Expr {
 
     /// Return `self AS name` alias expression
     pub fn alias(self, name: impl Into<String>) -> Expr {
-        Expr::Alias(Alias::new(self, None::<&str>, name.into()))
+        if let Expr::Alias(Alias { expr, .. }) = self {
+            // reuse the existing layer if possible
+            Expr::Alias(Alias::new_boxed(expr, None::<&str>, name.into()))
+        } else {
+            Expr::Alias(Alias::new(self, None::<&str>, name.into()))
+        }
     }
 
     /// Return `self AS name` alias expression with a specific qualifier
@@ -1285,7 +1303,12 @@
         relation: Option<impl Into<TableReference>>,
         name: impl Into<String>,
     ) -> Expr {
-        Expr::Alias(Alias::new(self, relation, name.into()))
+        if let Expr::Alias(Alias { expr, .. }) = self {
+            // reuse the existing layer if possible
+            Expr::Alias(Alias::new_boxed(expr, relation, name.into()))
+        } else {
+            Expr::Alias(Alias::new(self, relation, name.into()))
+        }
     }
 
     /// Remove an alias from an expression if one exists.
diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt
index 2b3ebcda1520..df468a85af82 100644
--- a/datafusion/sqllogictest/test_files/group_by.slt
+++ b/datafusion/sqllogictest/test_files/group_by.slt
@@ -4188,7 +4188,7 @@ EXPLAIN SELECT SUM(DISTINCT CAST(x AS DOUBLE)), MAX(DISTINCT CAST(x AS DOUBLE))
 logical_plan
 01)Projection: sum(alias1) AS sum(DISTINCT t1.x), max(alias1) AS max(DISTINCT t1.x)
 02)--Aggregate: groupBy=[[t1.y]], aggr=[[sum(alias1), max(alias1)]]
-03)----Aggregate: groupBy=[[t1.y, __common_expr_1 AS t1.x AS alias1]], aggr=[[]]
+03)----Aggregate: groupBy=[[t1.y, __common_expr_1 AS alias1]], aggr=[[]]
 04)------Projection: CAST(t1.x AS Float64) AS __common_expr_1, t1.y
 05)--------TableScan: t1 projection=[x, y]
 physical_plan
diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt
index 9c46410c4909..f68fd993ddd0 100644
--- a/datafusion/sqllogictest/test_files/unnest.slt
+++ b/datafusion/sqllogictest/test_files/unnest.slt
@@ -595,7 +595,7 @@ explain select unnest(unnest(column3)), column3 from recursive_unnest_table;
 ----
 logical_plan
 01)Unnest: lists[] structs[__unnest_placeholder(UNNEST(recursive_unnest_table.column3))]
-02)--Projection: __unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)), recursive_unnest_table.column3
+02)--Projection: __unnest_placeholder(recursive_unnest_table.column3,depth=1) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)), recursive_unnest_table.column3
 03)----Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[]
 04)------Projection: 
recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3 05)--------TableScan: recursive_unnest_table projection=[column3] From 9368c57d3742e1df8bfb3312a873bf13e5ef6630 Mon Sep 17 00:00:00 2001 From: blaginin Date: Wed, 19 Feb 2025 21:18:26 +0000 Subject: [PATCH 2/7] Extend doc --- datafusion/expr/src/expr.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 3e08d05e62f1..5e9308577561 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1288,6 +1288,16 @@ impl Expr { } /// Return `self AS name` alias expression + /// + /// # Example + /// ``` + /// # use datafusion_expr::col; + /// let expr = col("foo").alias("bar"); + /// assert_eq!(expr.to_string(), "foo AS bar"); + /// + /// // when aliasing over the exising alias, the previous one is removed + /// let expr = col("foo").alias("bar").alias("baz"); + /// assert_eq!(expr.to_string(), "foo AS baz"); pub fn alias(self, name: impl Into) -> Expr { if let Expr::Alias(Alias { expr, .. }) = self { // reuse the existing layer if possible From 583ad25437c23d1e0e3b5fb4185c3219306497c7 Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Tue, 11 Mar 2025 18:47:55 +0000 Subject: [PATCH 3/7] Add aliases to unnests --- datafusion/sql/src/unparser/plan.rs | 39 +++++++++++++++++------ datafusion/sql/tests/cases/plan_to_sql.rs | 10 +++--- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index b14fbdff236f..aa5d52212945 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -370,20 +370,30 @@ impl Unparser<'_> { // Projection can be top-level plan for unnest relation // The projection generated by the `RecursiveUnnestRewriter` from a UNNEST relation will have // only one expression, which is the placeholder column generated by the rewriter. - let unnest_input_type = if p.expr.len() == 1 { + let unnest_params = if p.expr.len() == 1 { Self::check_unnest_placeholder_with_outer_ref(&p.expr[0]) } else { None }; - if self.dialect.unnest_as_table_factor() && unnest_input_type.is_some() { - if let LogicalPlan::Unnest(unnest) = &p.input.as_ref() { - return self - .unnest_to_table_factor_sql(unnest, query, select, relation); + + if self.dialect.unnest_as_table_factor() { + if let Some((_, unnest_alias)) = &unnest_params { + if let LogicalPlan::Unnest(unnest) = &p.input.as_ref() { + return self.unnest_to_table_factor_sql( + unnest, + query, + select, + relation, + unnest_alias, + ); + } } } // Projection can be top-level plan for derived table if select.already_projected() { + let unnest_input_type = unnest_params.map(|(t, _)| t); + return self.derive_with_dialect_alias( "derived_projection", plan, @@ -839,15 +849,22 @@ impl Unparser<'_> { /// - If the column is not a placeholder column, return [None]. /// /// `outer_ref` is the display result of [Expr::OuterReferenceColumn] - fn check_unnest_placeholder_with_outer_ref(expr: &Expr) -> Option { - if let Expr::Alias(Alias { expr, .. }) = expr { + fn check_unnest_placeholder_with_outer_ref( + expr: &Expr, + ) -> Option<(UnnestInputType, &str)> { + if let Expr::Alias(Alias { + expr, + name: alias_name, + .. + }) = expr + { if let Expr::Column(Column { name, .. 
}) = expr.as_ref() { if let Some(prefix) = name.strip_prefix(UNNEST_PLACEHOLDER) { if prefix.starts_with(&format!("({}(", OUTER_REFERENCE_COLUMN_PREFIX)) { - return Some(UnnestInputType::OuterReference); + return Some((UnnestInputType::OuterReference, alias_name)); } - return Some(UnnestInputType::Scalar); + return Some((UnnestInputType::Scalar, alias_name)); } } } @@ -860,8 +877,12 @@ impl Unparser<'_> { query: &mut Option, select: &mut SelectBuilder, relation: &mut RelationBuilder, + alias: &str, ) -> Result<()> { let mut unnest_relation = UnnestRelationBuilder::default(); + + unnest_relation.alias(Some(self.new_table_alias(alias.to_string(), vec![]))); + let LogicalPlan::Projection(p) = unnest.input.as_ref() else { return internal_err!("Unnest input is not a Projection: {unnest:?}"); }; diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index fc0b7a26baaf..08155381c7cd 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -575,7 +575,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3])", - expected: r#"SELECT * FROM UNNEST([1, 2, 3])"#, + expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3)))"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, @@ -593,7 +593,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]), j1", - expected: r#"SELECT * FROM UNNEST([1, 2, 3]) CROSS JOIN j1"#, + expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3))) CROSS JOIN j1"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, @@ -611,13 +611,13 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT UNNEST([1,2,3])", - expected: r#"SELECT * FROM UNNEST([1, 2, 3])"#, + expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3)))"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT UNNEST([1,2,3]) as c1", - expected: r#"SELECT UNNEST([1, 2, 3]) AS c1"#, + expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS c1"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, @@ -629,7 +629,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT * FROM unnest_table u, UNNEST(u.array_col)", - expected: r#"SELECT * FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col)"#, + expected: r#"SELECT * FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col) AS UNNEST(outer_ref(u.array_col))"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, From 3cdbdec433980e7a6abec6b0ea2de6841b8a9257 Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Tue, 11 Mar 2025 18:52:07 +0000 Subject: [PATCH 4/7] Update slt --- datafusion/sqllogictest/test_files/count_star_rule.slt | 4 ++-- datafusion/sqllogictest/test_files/insert.slt | 2 +- datafusion/sqllogictest/test_files/union.slt | 4 
++-- datafusion/sqllogictest/test_files/window.slt | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/count_star_rule.slt b/datafusion/sqllogictest/test_files/count_star_rule.slt index d38d3490fed4..c638107da531 100644 --- a/datafusion/sqllogictest/test_files/count_star_rule.slt +++ b/datafusion/sqllogictest/test_files/count_star_rule.slt @@ -59,7 +59,7 @@ query TT EXPLAIN SELECT t1.a, COUNT() AS cnt FROM t1 GROUP BY t1.a HAVING COUNT() > 0; ---- logical_plan -01)Projection: t1.a, count(Int64(1)) AS count() AS cnt +01)Projection: t1.a, count(Int64(1)) AS cnt 02)--Filter: count(Int64(1)) > Int64(0) 03)----Aggregate: groupBy=[[t1.a]], aggr=[[count(Int64(1))]] 04)------TableScan: t1 projection=[a] @@ -83,7 +83,7 @@ query TT EXPLAIN SELECT a, COUNT() OVER (PARTITION BY a) AS count_a FROM t1; ---- logical_plan -01)Projection: t1.a, count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count() PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count_a +01)Projection: t1.a, count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS count_a 02)--WindowAggr: windowExpr=[[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] 03)----TableScan: t1 projection=[a] physical_plan diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 8a9c01d36308..c2654ba9a815 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -171,7 +171,7 @@ logical_plan 01)Dml: op=[Insert Into] table=[table_without_values] 02)--Projection: a1 AS a1, a2 AS a2 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST -04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1 +04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1 05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] 06)----------TableScan: aggregate_test_100 projection=[c1, c4, c9] physical_plan diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 918c6e281173..9ab732c65533 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -491,7 +491,7 @@ EXPLAIN logical_plan 01)Limit: skip=0, fetch=3 02)--Union -03)----Projection: count(Int64(1)) AS count(*) AS cnt +03)----Projection: count(Int64(1)) AS cnt 04)------Limit: skip=0, fetch=3 
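The expected-plan updates in these sqllogictest files follow from [PATCH 1/7]: calling `.alias()` on an expression that is already an `Expr::Alias` now replaces the existing alias instead of stacking a second one, which is why projections such as `count(Int64(1)) AS count(*) AS cnt` collapse to `count(Int64(1)) AS cnt`. A minimal sketch of the new behavior, not part of the patch itself and assuming only the public `datafusion_expr::col` and `Expr::alias` APIs (it mirrors the doc example added in [PATCH 2/7]):

```rust
use datafusion_expr::col;

fn main() {
    // A single alias renders exactly as before.
    let expr = col("foo").alias("bar");
    assert_eq!(expr.to_string(), "foo AS bar");

    // Re-aliasing now drops the inner alias instead of nesting it, so the
    // display is "foo AS baz" rather than "foo AS bar AS baz".
    let expr = col("foo").alias("bar").alias("baz");
    assert_eq!(expr.to_string(), "foo AS baz");
}
```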
05)--------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] 06)----------SubqueryAlias: a @@ -650,7 +650,7 @@ select x, y from (select 1 as x , max(10) as y) b ---- logical_plan 01)Union -02)--Projection: count(Int64(1)) AS count(*) AS count, a.n +02)--Projection: count(Int64(1)) AS count, a.n 03)----Aggregate: groupBy=[[a.n]], aggr=[[count(Int64(1))]] 04)------SubqueryAlias: a 05)--------Projection: Int64(5) AS n diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 1a9acc0f531a..6c0e69a467e1 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -1764,7 +1764,7 @@ EXPLAIN SELECT count(*) as global_count FROM ORDER BY c1 ) AS a ---- logical_plan -01)Projection: count(Int64(1)) AS count(*) AS global_count +01)Projection: count(Int64(1)) AS global_count 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] 03)----SubqueryAlias: a 04)------Projection: From 211bd5efa615cde6327e6bdd86e8a2fa18d5c997 Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Tue, 11 Mar 2025 19:08:58 +0000 Subject: [PATCH 5/7] Update `test_count_wildcard_on_sort`? --- datafusion/core/tests/dataframe/mod.rs | 38 ++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index a902cf8ae65b..dadec5b2be5d 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -2469,26 +2469,24 @@ async fn test_count_wildcard_on_sort() -> Result<()> { .collect() .await?; - let expected_sql_result = "+---------------+------------------------------------------------------------------------------------------------------------+\ - \n| plan_type | plan |\ - \n+---------------+------------------------------------------------------------------------------------------------------------+\ - \n| logical_plan | Projection: t1.b, count(*) |\ - \n| | Sort: count(Int64(1)) AS count(*) AS count(*) ASC NULLS LAST |\ - \n| | Projection: t1.b, count(Int64(1)) AS count(*), count(Int64(1)) |\ - \n| | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] |\ - \n| | TableScan: t1 projection=[b] |\ - \n| physical_plan | ProjectionExec: expr=[b@0 as b, count(*)@1 as count(*)] |\ - \n| | SortPreservingMergeExec: [count(Int64(1))@2 ASC NULLS LAST] |\ - \n| | SortExec: expr=[count(Int64(1))@2 ASC NULLS LAST], preserve_partitioning=[true] |\ - \n| | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] |\ - \n| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] |\ - \n| | CoalesceBatchesExec: target_batch_size=8192 |\ - \n| | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 |\ - \n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ - \n| | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] |\ - \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ - \n| | |\ - \n+---------------+------------------------------------------------------------------------------------------------------------+"; + let expected_sql_result = "+---------------+------------------------------------------------------------------------------------+\ + \n| plan_type | plan |\ + \n+---------------+------------------------------------------------------------------------------------+\ + \n| logical_plan | Sort: count(*) ASC NULLS LAST |\ + \n| | Projection: t1.b, count(Int64(1)) AS count(*) |\ + 
\n| | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] |\ + \n| | TableScan: t1 projection=[b] |\ + \n| physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] |\ + \n| | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] |\ + \n| | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*)] |\ + \n| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] |\ + \n| | CoalesceBatchesExec: target_batch_size=8192 |\ + \n| | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 |\ + \n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ + \n| | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] |\ + \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ + \n| | |\ + \n+---------------+------------------------------------------------------------------------------------+"; assert_eq!( expected_sql_result, From 6bc770d6656fac269769200a45135e415114820e Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 24 Mar 2025 15:44:35 +0000 Subject: [PATCH 6/7] Merge main --- .asf.yaml | 2 + .github/workflows/extended.yml | 4 +- .github/workflows/rust.yml | 264 +- Cargo.lock | 977 +++-- Cargo.toml | 79 +- benchmarks/Cargo.toml | 4 +- benchmarks/README.md | 3 +- benchmarks/src/clickbench.rs | 4 + benchmarks/src/imdb/convert.rs | 4 +- benchmarks/src/tpch/convert.rs | 4 +- datafusion-cli/CONTRIBUTING.md | 75 + datafusion-cli/Cargo.toml | 6 +- datafusion-cli/src/main.rs | 42 +- datafusion-cli/tests/cli_integration.rs | 123 +- .../tests/snapshots/aws_options.snap | 25 + .../snapshots/cli@load_local_csv.sql.snap | 26 + .../tests/snapshots/cli@load_s3_csv.sql.snap | 26 + .../tests/snapshots/cli@select.sql.snap | 23 + .../tests/snapshots/cli_format@automatic.snap | 21 + .../tests/snapshots/cli_format@csv.snap | 18 + .../tests/snapshots/cli_format@json.snap | 17 + .../tests/snapshots/cli_format@nd-json.snap | 17 + .../tests/snapshots/cli_format@table.snap | 21 + .../tests/snapshots/cli_format@tsv.snap | 18 + .../snapshots/cli_quick_test@backslash.snap | 17 + .../snapshots/cli_quick_test@batch_size.snap | 21 + .../tests/snapshots/cli_quick_test@files.snap | 19 + .../snapshots/cli_quick_test@statements.snap | 24 + .../{data/backslash.txt => sql/backslash.sql} | 0 .../tests/sql/integration/load_local_csv.sql | 6 + .../tests/sql/integration/load_s3_csv.sql | 5 + .../tests/{data/sql.txt => sql/select.sql} | 0 datafusion-examples/Cargo.toml | 4 +- datafusion-examples/examples/planner_api.rs | 28 +- datafusion-examples/examples/tracing.rs | 139 + datafusion-testing | 2 +- datafusion/catalog-listing/src/helpers.rs | 4 + datafusion/catalog-listing/src/mod.rs | 3 + datafusion/catalog/Cargo.toml | 5 +- .../src}/cte_worktable.rs | 13 +- .../src}/default_table_source.rs | 4 +- datafusion/catalog/src/lib.rs | 19 +- .../src}/listing_schema.rs | 6 +- .../src/{memory.rs => memory/catalog.rs} | 69 +- .../src/memory}/mod.rs | 10 +- datafusion/catalog/src/memory/schema.rs | 89 + datafusion/catalog/src/session.rs | 5 +- .../src/datasource => catalog/src}/stream.rs | 12 +- datafusion/catalog/src/view.rs | 155 + datafusion/common-runtime/Cargo.toml | 3 +- datafusion/common-runtime/src/common.rs | 3 +- datafusion/common-runtime/src/join_set.rs | 172 + datafusion/common-runtime/src/lib.rs | 7 +- datafusion/common-runtime/src/trace_utils.rs | 188 + datafusion/common/Cargo.toml | 3 +- datafusion/common/src/config.rs | 5 + datafusion/common/src/dfschema.rs | 58 +- datafusion/common/src/lib.rs | 3 +- 
datafusion/common/src/scalar/mod.rs | 73 +- datafusion/common/src/test_util.rs | 28 + datafusion/core/Cargo.toml | 14 +- datafusion/core/src/dataframe/mod.rs | 21 +- .../core/src/datasource/file_format/arrow.rs | 7 +- .../src/datasource/file_format/parquet.rs | 3 + .../core/src/datasource/listing/table.rs | 1 + datafusion/core/src/datasource/memory.rs | 7 +- datafusion/core/src/datasource/mod.rs | 109 +- .../src/datasource/physical_plan/parquet.rs | 321 ++ .../src/datasource/{view.rs => view_test.rs} | 148 +- datafusion/core/src/execution/context/csv.rs | 18 +- datafusion/core/src/execution/context/mod.rs | 128 +- datafusion/core/src/execution/mod.rs | 2 - .../core/src/execution/session_state.rs | 19 +- .../src/execution/session_state_defaults.rs | 2 +- datafusion/core/src/lib.rs | 4 +- datafusion/core/src/physical_planner.rs | 414 +- datafusion/core/tests/catalog/memory.rs | 32 +- datafusion/core/tests/core_integration.rs | 5 + .../tests/dataframe/dataframe_functions.rs | 1320 ++++--- datafusion/core/tests/dataframe/describe.rs | 99 +- datafusion/core/tests/dataframe/mod.rs | 3466 +++++++++-------- .../core/tests/fuzz_cases/aggregate_fuzz.rs | 2 +- .../fuzz_cases/aggregation_fuzzer/fuzzer.rs | 2 +- .../fuzz_cases/distinct_count_string_fuzz.rs | 2 +- .../core/tests/parquet/custom_reader.rs | 23 +- datafusion/core/tests/parquet/schema.rs | 74 +- .../core/tests/parquet/schema_coercion.rs | 53 +- .../physical_optimizer/projection_pushdown.rs | 9 +- datafusion/core/tests/serde/mod.rs | 34 + datafusion/core/tests/sql/aggregates.rs | 214 +- datafusion/core/tests/sql/mod.rs | 3 +- datafusion/core/tests/sql/path_partition.rs | 166 +- datafusion/core/tests/sql/select.rs | 199 +- .../core/tests/user_defined/expr_planner.rs | 57 +- .../user_defined/user_defined_aggregates.rs | 249 +- .../tests/user_defined/user_defined_plan.rs | 100 +- .../user_defined_scalar_functions.rs | 166 +- .../user_defined_table_functions.rs | 59 +- .../user_defined_window_functions.rs | 395 +- datafusion/datasource-avro/src/mod.rs | 3 + datafusion/datasource-csv/Cargo.toml | 4 - datafusion/datasource-csv/src/file_format.rs | 4 +- datafusion/datasource-csv/src/mod.rs | 4 + datafusion/datasource-csv/src/source.rs | 2 +- datafusion/datasource-json/src/file_format.rs | 4 +- datafusion/datasource-json/src/mod.rs | 4 + datafusion/datasource-json/src/source.rs | 2 +- datafusion/datasource-parquet/Cargo.toml | 2 +- .../datasource-parquet/src/file_format.rs | 154 +- datafusion/datasource-parquet/src/mod.rs | 4 + datafusion/datasource-parquet/src/opener.rs | 18 +- .../datasource-parquet/src/row_filter.rs | 416 +- datafusion/datasource-parquet/src/source.rs | 3 +- datafusion/datasource-parquet/src/writer.rs | 2 +- datafusion/datasource/Cargo.toml | 4 +- datafusion/datasource/src/file_scan_config.rs | 15 +- datafusion/datasource/src/file_sink_config.rs | 2 + datafusion/datasource/src/mod.rs | 3 + datafusion/datasource/src/schema_adapter.rs | 98 +- datafusion/datasource/src/statistics.rs | 6 +- datafusion/datasource/src/url.rs | 7 +- .../datasource/src/write/orchestration.rs | 3 +- datafusion/execution/src/lib.rs | 3 +- datafusion/expr-common/src/accumulator.rs | 1 + datafusion/expr-common/src/lib.rs | 3 +- .../expr-common/src/type_coercion/binary.rs | 148 +- datafusion/expr/src/expr.rs | 379 +- datafusion/expr/src/expr_fn.rs | 40 +- datafusion/expr/src/expr_schema.rs | 14 +- datafusion/expr/src/lib.rs | 4 +- datafusion/expr/src/logical_plan/builder.rs | 131 +- .../expr/src/logical_plan/invariants.rs | 6 +- 
datafusion/expr/src/logical_plan/mod.rs | 6 +- datafusion/expr/src/logical_plan/plan.rs | 195 +- datafusion/expr/src/logical_plan/tree_node.rs | 4 + datafusion/expr/src/select_expr.rs | 101 + datafusion/expr/src/tree_node.rs | 5 +- datafusion/expr/src/type_coercion/mod.rs | 9 +- datafusion/expr/src/udaf.rs | 48 +- datafusion/expr/src/udf.rs | 138 +- datafusion/expr/src/utils.rs | 163 +- datafusion/expr/src/window_frame.rs | 27 +- datafusion/ffi/src/catalog_provider.rs | 338 ++ datafusion/ffi/src/lib.rs | 5 +- datafusion/ffi/src/schema_provider.rs | 385 ++ datafusion/ffi/src/table_provider.rs | 2 +- datafusion/ffi/src/tests/catalog.rs | 183 + datafusion/ffi/src/tests/mod.rs | 8 + datafusion/ffi/tests/ffi_integration.rs | 27 + .../functions-aggregate-common/src/lib.rs | 3 +- .../src/approx_distinct.rs | 39 +- datafusion/functions-aggregate/src/lib.rs | 3 +- datafusion/functions-aggregate/src/min_max.rs | 69 +- datafusion/functions-nested/src/array_has.rs | 7 + datafusion/functions-nested/src/extract.rs | 32 +- datafusion/functions-nested/src/lib.rs | 3 +- datafusion/functions-nested/src/max.rs | 11 +- .../functions-table/src/generate_series.rs | 11 +- datafusion/functions-table/src/lib.rs | 3 + datafusion/functions-window-common/src/lib.rs | 3 + datafusion/functions-window/src/lib.rs | 3 + datafusion/functions/Cargo.toml | 4 +- datafusion/functions/benches/date_trunc.rs | 7 +- .../functions/src/core/union_extract.rs | 4 +- datafusion/functions/src/lib.rs | 3 +- datafusion/macros/Cargo.toml | 4 +- .../src/analyzer/expand_wildcard_rule.rs | 333 -- .../src/analyzer/inline_table_scan.rs | 205 - datafusion/optimizer/src/analyzer/mod.rs | 8 - .../optimizer/src/analyzer/type_coercion.rs | 75 +- .../optimizer/src/common_subexpr_eliminate.rs | 10 +- datafusion/optimizer/src/eliminate_limit.rs | 1 + datafusion/optimizer/src/lib.rs | 3 +- .../optimizer/src/optimize_projections/mod.rs | 5 +- datafusion/optimizer/src/optimizer.rs | 31 +- .../optimizer/src/propagate_empty_relation.rs | 4 +- datafusion/optimizer/src/push_down_filter.rs | 86 + datafusion/optimizer/src/push_down_limit.rs | 1 + .../optimizer/src/scalar_subquery_to_join.rs | 8 +- .../simplify_expressions/expr_simplifier.rs | 4 + .../src/simplify_expressions/regex.rs | 21 + .../simplify_expressions/simplify_exprs.rs | 47 + .../optimizer/tests/optimizer_integration.rs | 40 +- datafusion/physical-expr-common/src/lib.rs | 3 +- .../physical-expr-common/src/physical_expr.rs | 69 +- .../physical-expr-common/src/sort_expr.rs | 15 +- datafusion/physical-expr/src/aggregate.rs | 18 + .../src/equivalence/properties/mod.rs | 6 +- .../physical-expr/src/expressions/binary.rs | 161 +- .../physical-expr/src/expressions/case.rs | 55 + .../physical-expr/src/expressions/cast.rs | 31 + .../physical-expr/src/expressions/column.rs | 4 + .../physical-expr/src/expressions/in_list.rs | 57 + .../src/expressions/is_not_null.rs | 31 + .../physical-expr/src/expressions/is_null.rs | 20 + .../physical-expr/src/expressions/like.rs | 33 + .../physical-expr/src/expressions/literal.rs | 17 + .../physical-expr/src/expressions/negative.rs | 18 + .../physical-expr/src/expressions/no_op.rs | 4 + .../physical-expr/src/expressions/not.rs | 21 + .../physical-expr/src/expressions/try_cast.rs | 29 + .../src/expressions/unknown_column.rs | 4 + datafusion/physical-expr/src/lib.rs | 8 +- datafusion/physical-expr/src/physical_expr.rs | 124 +- .../physical-expr/src/scalar_function.rs | 45 +- datafusion/physical-optimizer/Cargo.toml | 1 + .../src/aggregate_statistics.rs | 6 +- 
datafusion/physical-optimizer/src/lib.rs | 3 +- datafusion/physical-optimizer/src/pruning.rs | 56 +- .../src/topk_aggregation.rs | 6 +- datafusion/physical-plan/Cargo.toml | 1 + .../physical-plan/src/aggregates/mod.rs | 473 ++- .../src/aggregates/topk/hash_table.rs | 72 +- .../src/aggregates/topk/priority_map.rs | 125 +- .../physical-plan/src/coalesce_batches.rs | 7 +- .../physical-plan/src/coalesce_partitions.rs | 10 +- datafusion/physical-plan/src/display.rs | 25 +- .../physical-plan/src/execution_plan.rs | 25 +- datafusion/physical-plan/src/filter.rs | 3 +- datafusion/physical-plan/src/insert.rs | 5 +- .../physical-plan/src/joins/cross_join.rs | 30 +- .../physical-plan/src/joins/hash_join.rs | 889 +++-- .../src/joins/nested_loop_join.rs | 8 +- .../src/joins/sort_merge_join.rs | 5 +- .../src/joins/symmetric_hash_join.rs | 5 +- datafusion/physical-plan/src/lib.rs | 5 +- datafusion/physical-plan/src/limit.rs | 3 +- datafusion/physical-plan/src/memory.rs | 13 +- .../physical-plan/src/placeholder_row.rs | 5 +- datafusion/physical-plan/src/projection.rs | 10 +- .../physical-plan/src/repartition/mod.rs | 15 +- .../src/sorts/sort_preserving_merge.rs | 13 +- datafusion/physical-plan/src/stream.rs | 6 +- datafusion/physical-plan/src/union.rs | 10 +- datafusion/physical-plan/src/unnest.rs | 57 +- datafusion/physical-plan/src/work_table.rs | 3 +- datafusion/proto-common/src/lib.rs | 3 +- datafusion/proto/Cargo.toml | 2 +- datafusion/proto/proto/datafusion.proto | 8 +- datafusion/proto/src/generated/pbjson.rs | 147 + datafusion/proto/src/generated/prost.rs | 16 +- datafusion/proto/src/lib.rs | 3 +- datafusion/proto/src/logical_plan/mod.rs | 2 +- datafusion/proto/src/logical_plan/to_proto.rs | 4 + .../proto/src/physical_plan/from_proto.rs | 4 +- datafusion/proto/src/physical_plan/mod.rs | 31 +- .../proto/src/physical_plan/to_proto.rs | 1 + datafusion/proto/tests/cases/mod.rs | 18 +- .../tests/cases/roundtrip_logical_plan.rs | 14 +- .../tests/cases/roundtrip_physical_plan.rs | 18 +- datafusion/sql/src/expr/function.rs | 48 +- datafusion/sql/src/expr/identifier.rs | 36 +- datafusion/sql/src/expr/mod.rs | 144 +- datafusion/sql/src/expr/order_by.rs | 12 +- datafusion/sql/src/expr/subquery.rs | 96 +- datafusion/sql/src/expr/unary_op.rs | 22 +- datafusion/sql/src/expr/value.rs | 14 +- datafusion/sql/src/lib.rs | 3 +- datafusion/sql/src/parser.rs | 87 +- datafusion/sql/src/planner.rs | 158 +- datafusion/sql/src/query.rs | 26 +- datafusion/sql/src/relation/join.rs | 17 +- datafusion/sql/src/relation/mod.rs | 7 +- datafusion/sql/src/resolve.rs | 4 +- datafusion/sql/src/select.rs | 67 +- datafusion/sql/src/set_expr.rs | 2 +- datafusion/sql/src/statement.rs | 138 +- datafusion/sql/src/unparser/ast.rs | 68 +- datafusion/sql/src/unparser/dialect.rs | 10 +- datafusion/sql/src/unparser/expr.rs | 232 +- datafusion/sql/src/unparser/plan.rs | 157 +- datafusion/sql/src/unparser/rewrite.rs | 1 + datafusion/sql/src/unparser/utils.rs | 4 +- datafusion/sql/src/utils.rs | 54 +- datafusion/sql/tests/cases/diagnostic.rs | 108 +- datafusion/sql/tests/cases/plan_to_sql.rs | 258 +- datafusion/sql/tests/sql_integration.rs | 549 +-- datafusion/sqllogictest/Cargo.toml | 4 +- datafusion/sqllogictest/README.md | 4 +- datafusion/sqllogictest/src/lib.rs | 3 + .../sqllogictest/test_files/aggregate.slt | 46 + .../test_files/aggregate_skip_partial.slt | 21 + .../test_files/aggregates_topk.slt | 58 +- datafusion/sqllogictest/test_files/alias.slt | 59 + datafusion/sqllogictest/test_files/array.slt | 9 + 
datafusion/sqllogictest/test_files/copy.slt | 17 + datafusion/sqllogictest/test_files/cte.slt | 146 + datafusion/sqllogictest/test_files/ddl.slt | 54 + .../sqllogictest/test_files/explain.slt | 134 +- .../sqllogictest/test_files/explain_tree.slt | 1697 +++++--- datafusion/sqllogictest/test_files/expr.slt | 8 +- .../sqllogictest/test_files/group_by.slt | 39 +- .../test_files/information_schema.slt | 2 + datafusion/sqllogictest/test_files/joins.slt | 266 ++ datafusion/sqllogictest/test_files/limit.slt | 2 +- datafusion/sqllogictest/test_files/order.slt | 4 +- .../sqllogictest/test_files/prepare.slt | 15 + datafusion/sqllogictest/test_files/select.slt | 29 + .../sqllogictest/test_files/simplify_expr.slt | 39 +- .../test_files/string/string_view.slt | 8 +- datafusion/sqllogictest/test_files/struct.slt | 47 +- .../sqllogictest/test_files/subquery.slt | 22 +- .../sqllogictest/test_files/subquery_sort.slt | 29 + .../sqllogictest/test_files/type_coercion.slt | 2 +- datafusion/sqllogictest/test_files/union.slt | 110 +- .../sqllogictest/test_files/union_by_name.slt | 12 +- datafusion/sqllogictest/test_files/unnest.slt | 14 +- datafusion/sqllogictest/test_files/update.slt | 4 +- .../sqllogictest/test_files/wildcard.slt | 15 + datafusion/sqllogictest/test_files/window.slt | 59 + datafusion/substrait/Cargo.toml | 2 +- datafusion/substrait/src/lib.rs | 3 + .../substrait/src/logical_plan/consumer.rs | 96 +- .../substrait/src/logical_plan/producer.rs | 10 +- .../tests/cases/consumer_integration.rs | 66 +- .../substrait/tests/cases/logical_plans.rs | 66 +- .../test_plans/double_window.substrait.json | 126 + ...ble_window_distinct_windows.substrait.json | 138 + .../select_count_from_select_1.substrait.json | 92 + .../select_window_count.substrait.json | 137 + datafusion/substrait/tests/utils.rs | 1 + datafusion/wasmtest/Cargo.toml | 2 +- datafusion/wasmtest/src/lib.rs | 25 + dev/changelog/46.0.1.md | 38 + docs/source/contributor-guide/testing.md | 12 + docs/source/library-user-guide/upgrading.md | 95 + .../library-user-guide/working-with-exprs.md | 19 + docs/source/user-guide/configs.md | 3 +- docs/source/user-guide/sql/explain.md | 232 +- test-utils/Cargo.toml | 2 +- 330 files changed, 16131 insertions(+), 8453 deletions(-) create mode 100644 datafusion-cli/CONTRIBUTING.md create mode 100644 datafusion-cli/tests/snapshots/aws_options.snap create mode 100644 datafusion-cli/tests/snapshots/cli@load_local_csv.sql.snap create mode 100644 datafusion-cli/tests/snapshots/cli@load_s3_csv.sql.snap create mode 100644 datafusion-cli/tests/snapshots/cli@select.sql.snap create mode 100644 datafusion-cli/tests/snapshots/cli_format@automatic.snap create mode 100644 datafusion-cli/tests/snapshots/cli_format@csv.snap create mode 100644 datafusion-cli/tests/snapshots/cli_format@json.snap create mode 100644 datafusion-cli/tests/snapshots/cli_format@nd-json.snap create mode 100644 datafusion-cli/tests/snapshots/cli_format@table.snap create mode 100644 datafusion-cli/tests/snapshots/cli_format@tsv.snap create mode 100644 datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap create mode 100644 datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap create mode 100644 datafusion-cli/tests/snapshots/cli_quick_test@files.snap create mode 100644 datafusion-cli/tests/snapshots/cli_quick_test@statements.snap rename datafusion-cli/tests/{data/backslash.txt => sql/backslash.sql} (100%) create mode 100644 datafusion-cli/tests/sql/integration/load_local_csv.sql create mode 100644 
datafusion-cli/tests/sql/integration/load_s3_csv.sql rename datafusion-cli/tests/{data/sql.txt => sql/select.sql} (100%) create mode 100644 datafusion-examples/examples/tracing.rs rename datafusion/{core/src/datasource => catalog/src}/cte_worktable.rs (93%) rename datafusion/{core/src/datasource => catalog/src}/default_table_source.rs (98%) rename datafusion/{core/src/catalog_common => catalog/src}/listing_schema.rs (97%) rename datafusion/catalog/src/{memory.rs => memory/catalog.rs} (70%) rename datafusion/{core/src/catalog_common => catalog/src/memory}/mod.rs (77%) create mode 100644 datafusion/catalog/src/memory/schema.rs rename datafusion/{core/src/datasource => catalog/src}/stream.rs (97%) create mode 100644 datafusion/catalog/src/view.rs create mode 100644 datafusion/common-runtime/src/join_set.rs create mode 100644 datafusion/common-runtime/src/trace_utils.rs rename datafusion/core/src/datasource/{view.rs => view_test.rs} (77%) create mode 100644 datafusion/core/tests/serde/mod.rs create mode 100644 datafusion/expr/src/select_expr.rs create mode 100644 datafusion/ffi/src/catalog_provider.rs create mode 100644 datafusion/ffi/src/schema_provider.rs create mode 100644 datafusion/ffi/src/tests/catalog.rs delete mode 100644 datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs delete mode 100644 datafusion/optimizer/src/analyzer/inline_table_scan.rs create mode 100644 datafusion/sqllogictest/test_files/alias.slt create mode 100644 datafusion/substrait/tests/testdata/test_plans/double_window.substrait.json create mode 100644 datafusion/substrait/tests/testdata/test_plans/double_window_distinct_windows.substrait.json create mode 100644 datafusion/substrait/tests/testdata/test_plans/select_count_from_select_1.substrait.json create mode 100644 datafusion/substrait/tests/testdata/test_plans/select_window_count.substrait.json create mode 100644 dev/changelog/46.0.1.md diff --git a/.asf.yaml b/.asf.yaml index 8e939c695d57..5fe94dc04af5 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -24,6 +24,7 @@ notifications: commits: commits@datafusion.apache.org issues: github@datafusion.apache.org pullrequests: github@datafusion.apache.org + discussions: github@datafusion.apache.org jira_options: link label worklog github: description: "Apache DataFusion SQL Query Engine" @@ -44,6 +45,7 @@ github: rebase: false features: issues: true + discussions: true protected_branches: main: required_pull_request_reviews: diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 1ad1c36e1aa7..3942e752579a 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -81,7 +81,7 @@ jobs: - name: Run tests (excluding doctests) env: RUST_BACKTRACE: 1 - run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests + run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests,recursive_protection - name: Verify Working Directory Clean run: git diff --exit-code - name: Cleanup @@ -124,7 +124,7 @@ jobs: rust-version: stable - name: Run sqllogictest run: | - cargo test --profile release-nonlto --test sqllogictests -- --include-sqlite + cargo test --features backtrace --profile release-nonlto --test sqllogictests -- --include-sqlite cargo clean diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f37e5c9f7ef1..599e6e3cc3eb 100644 --- 
a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -66,9 +66,12 @@ jobs: # the changes to `Cargo.lock` after building with the updated manifest. cargo check --profile ci --workspace --all-targets --features integration-tests --locked - # cargo check common, functions and substrait with no default features - linux-cargo-check-no-default-features: - name: cargo check no default features + # Check datafusion-common features + # + # Ensure via `cargo check` that the crate can be built with a + # subset of the features packages enabled. + linux-datafusion-common-features: + name: cargo check datafusion-common features needs: linux-build-lib runs-on: ubuntu-latest container: @@ -79,30 +82,84 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: stable - - name: Check datafusion without default features - # Some of the test binaries require the parquet feature still - #run: cargo check --all-targets --no-default-features -p datafusion - run: cargo check --profile ci --no-default-features -p datafusion - - - name: Check datafusion-common without default features - run: cargo check --profile ci --all-targets --no-default-features -p datafusion-common - - - name: Check datafusion-functions without default features - run: cargo check --profile ci --all-targets --no-default-features -p datafusion-functions - - - name: Check datafusion-substrait without default features - run: cargo check --profile ci --all-targets --no-default-features -p datafusion-substrait - - - name: Check workspace in debug mode - run: cargo check --profile ci --all-targets --workspace - - - name: Check workspace with additional features - run: cargo check --profile ci --workspace --benches --features avro,json,integration-tests + - name: Check datafusion-common (default features) + run: cargo check --profile ci --all-targets -p datafusion-common + # + # Note: Only check libraries (not --all-targets) to cover end user APIs + # + - name: Check datafusion-common (no-default-features) + run: cargo check --profile ci --no-default-features -p datafusion-common + # Note: don't check other feature flags as datafusion-common is not typically used standalone + + # Check datafusion-substrait features + # + # Ensure via `cargo check` that the crate can be built with a + # subset of the features packages enabled. + linux-datafusion-substrait-features: + name: cargo check datafusion-substrait features + needs: linux-build-lib + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v4 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: stable + - name: Check datafusion-substrait (default features) + run: cargo check --profile ci --all-targets -p datafusion-substrait + # + # Note: Only check libraries (not --all-targets) to cover end user APIs + # + - name: Check datafusion-substrait (no-default-features) + run: cargo check --profile ci --no-default-features -p datafusion-substrait + - name: Check datafusion-substrait (physical) + run: cargo check --profile ci --no-default-features -p datafusion-substrait --features=physical + - name: Install cmake + run: | + # note the builder setup runs apt-get update / installs protobuf compiler + apt-get install -y cmake + - name: Check datafusion-substrait (protoc) + run: cargo check --profile ci --no-default-features -p datafusion-substrait --features=protoc - # cargo check datafusion to ensure that the datafusion crate can be built with only a - # subset of the function packages enabled. 
+ # Check datafusion-proto features + # + # Ensure via `cargo check` that the crate can be built with a + # subset of the features packages enabled. + linux-datafusion-proto-features: + name: cargo check datafusion-proto features + needs: linux-build-lib + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v4 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: stable + - name: Check datafusion-proto (default features) + run: cargo check --profile ci --all-targets -p datafusion-proto + # + # Note: Only check libraries (not --all-targets) to cover end user APIs + # + - name: Check datafusion-proto (no-default-features) + run: cargo check --profile ci --no-default-features -p datafusion-proto + - name: Check datafusion-proto (json) + run: cargo check --profile ci --no-default-features -p datafusion-proto --features=json + - name: Check datafusion-proto (parquet) + run: cargo check --profile ci --no-default-features -p datafusion-proto --features=parquet + - name: Check datafusion-proto (avro) + run: cargo check --profile ci --no-default-features -p datafusion-proto --features=avro + + + # Check datafusion crate features + # + # Ensure via `cargo check` that the crate can be built with a + # subset of the features packages enabled. linux-cargo-check-datafusion: - name: cargo check datafusion + name: cargo check datafusion features needs: linux-build-lib runs-on: ubuntu-latest container: @@ -113,31 +170,54 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: stable + - name: Check datafusion (default features) + run: cargo check --profile ci --all-targets -p datafusion + # + # Note: Only check libraries (not --all-targets) to cover end user APIs + # + - name: Check datafusion (no-default-features) + run: cargo check --profile ci --no-default-features -p datafusion - name: Check datafusion (nested_expressions) - run: cargo check --profile ci --no-default-features --features=nested_expressions -p datafusion - - - name: Check datafusion (crypto) - run: cargo check --profile ci --no-default-features --features=crypto_expressions -p datafusion - + run: cargo check --profile ci --no-default-features -p datafusion --features=nested_expressions + - name: Check datafusion (array_expressions) + run: cargo check --profile ci --no-default-features -p datafusion --features=array_expressions + - name: Check datafusion (avro) + run: cargo check --profile ci --no-default-features -p datafusion --features=avro + - name: Check datafusion (backtrace) + run: cargo check --profile ci --no-default-features -p datafusion --features=backtrace + - name: Check datafusion (compression) + run: cargo check --profile ci --no-default-features -p datafusion --features=compression + - name: Check datafusion (crypto_expressions) + run: cargo check --profile ci --no-default-features -p datafusion --features=crypto_expressions - name: Check datafusion (datetime_expressions) - run: cargo check --profile ci --no-default-features --features=datetime_expressions -p datafusion - + run: cargo check --profile ci --no-default-features -p datafusion --features=datetime_expressions - name: Check datafusion (encoding_expressions) - run: cargo check --profile ci --no-default-features --features=encoding_expressions -p datafusion - + run: cargo check --profile ci --no-default-features -p datafusion --features=encoding_expressions + - name: Check datafusion (force_hash_collisions) + run: cargo check --profile ci --no-default-features -p datafusion 
--features=force_hash_collisions - name: Check datafusion (math_expressions) - run: cargo check --profile ci --no-default-features --features=math_expressions -p datafusion - + run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions + - name: Check datafusion (parquet) + run: cargo check --profile ci --no-default-features -p datafusion --features=parquet + - name: Check datafusion (pyarrow) + run: cargo check --profile ci --no-default-features -p datafusion --features=pyarrow - name: Check datafusion (regex_expressions) - run: cargo check --profile ci --no-default-features --features=regex_expressions -p datafusion - + run: cargo check --profile ci --no-default-features -p datafusion --features=regex_expressions + - name: Check datafusion (recursive_protection) + run: cargo check --profile ci --no-default-features -p datafusion --features=recursive_protection + - name: Check datafusion (serde) + run: cargo check --profile ci --no-default-features -p datafusion --features=serde - name: Check datafusion (string_expressions) - run: cargo check --profile ci --no-default-features --features=string_expressions -p datafusion + run: cargo check --profile ci --no-default-features -p datafusion --features=string_expressions + - name: Check datafusion (unicode_expressions) + run: cargo check --profile ci --no-default-features -p datafusion --features=unicode_expressions - # cargo check datafusion-functions to ensure that the datafusion-functions crate can be built with - # only a subset of the function packages enabled. + # Check datafusion-functions crate features + # + # Ensure via `cargo check` that the crate can be built with a + # subset of the features packages enabled. linux-cargo-check-datafusion-functions: - name: cargo check functions + name: cargo check datafusion-functions features needs: linux-build-lib runs-on: ubuntu-latest container: @@ -148,44 +228,102 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: stable - - name: Check datafusion-functions (crypto) - run: cargo check --profile ci --all-targets --no-default-features --features=crypto_expressions -p datafusion-functions - + - name: Check datafusion-functions (default features) + run: cargo check --profile ci --all-targets -p datafusion-functions + # + # Note: Only check libraries (not --all-targets) to cover end user APIs + # + - name: Check datafusion-functions (no-default-features) + run: cargo check --profile ci --no-default-features -p datafusion-functions + # Fails due https://github.com/apache/datafusion/issues/15207 + #- name: Check datafusion-functions (core_expressions) + # run: cargo check --profile ci --no-default-features -p datafusion-functions --features=core_expressions + - name: Check datafusion-functions (crypto_expressions) + run: cargo check --profile ci --no-default-features -p datafusion-functions --features=crypto_expressions - name: Check datafusion-functions (datetime_expressions) - run: cargo check --profile ci --all-targets --no-default-features --features=datetime_expressions -p datafusion-functions - + run: cargo check --profile ci --no-default-features -p datafusion-functions --features=datetime_expressions - name: Check datafusion-functions (encoding_expressions) - run: cargo check --profile ci --all-targets --no-default-features --features=encoding_expressions -p datafusion-functions - + run: cargo check --profile ci --no-default-features -p datafusion-functions --features=encoding_expressions - name: Check datafusion-functions (math_expressions) 
- run: cargo check --profile ci --all-targets --no-default-features --features=math_expressions -p datafusion-functions - + run: cargo check --profile ci --no-default-features -p datafusion-functions --features=math_expressions - name: Check datafusion-functions (regex_expressions) - run: cargo check --profile ci --all-targets --no-default-features --features=regex_expressions -p datafusion-functions - + run: cargo check --profile ci --no-default-features -p datafusion-functions --features=regex_expressions - name: Check datafusion-functions (string_expressions) - run: cargo check --profile ci --all-targets --no-default-features --features=string_expressions -p datafusion-functions + run: cargo check --profile ci --no-default-features -p datafusion-functions --features=string_expressions + - name: Check datafusion-functions (unicode_expressions) + run: cargo check --profile ci --no-default-features -p datafusion-functions --features=unicode_expressions - # Run tests + # Library and integration tests linux-test: name: cargo test (amd64) needs: linux-build-lib runs-on: ubuntu-latest - container: - image: amd64/rust steps: - uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder + run: rustup toolchain install stable + - name: Install Protobuf Compiler + run: sudo apt-get install -y protobuf-compiler + - name: Run tests (excluding doctests and datafusion-cli) + env: + RUST_BACKTRACE: 1 + run: | + cargo test \ + --profile ci \ + --exclude datafusion-examples \ + --exclude ffi_example_table_provider \ + --exclude datafusion-benchmarks \ + --exclude datafusion-cli \ + --workspace \ + --lib \ + --tests \ + --bins \ + --features serde,avro,json,backtrace,integration-tests + - name: Verify Working Directory Clean + run: git diff --exit-code + + # datafusion-cli tests + linux-test-datafusion-cli: + name: cargo test datafusion-cli (amd64) + needs: linux-build-lib + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 with: - rust-version: stable + submodules: true + fetch-depth: 1 + - name: Setup Rust toolchain + run: rustup toolchain install stable + - name: Setup Minio - S3-compatible storage + run: | + docker run -d --name minio-container \ + -p 9000:9000 \ + -e MINIO_ROOT_USER=TEST-DataFusionLogin -e MINIO_ROOT_PASSWORD=TEST-DataFusionPassword \ + -v $(pwd)/datafusion/core/tests/data:/source quay.io/minio/minio \ + server /data + docker exec minio-container /bin/sh -c "\ + mc ready local + mc alias set localminio http://localhost:9000 TEST-DataFusionLogin TEST-DataFusionPassword && \ + mc mb localminio/data && \ + mc cp -r /source/* localminio/data" - name: Run tests (excluding doctests) - run: cargo test --profile ci --exclude datafusion-examples --exclude ffi_example_table_provider --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests + env: + RUST_BACKTRACE: 1 + AWS_ENDPOINT: http://127.0.0.1:9000 + AWS_ACCESS_KEY_ID: TEST-DataFusionLogin + AWS_SECRET_ACCESS_KEY: TEST-DataFusionPassword + TEST_STORAGE_INTEGRATION: 1 + AWS_ALLOW_HTTP: true + run: cargo test --profile ci -p datafusion-cli --lib --tests --bins - name: Verify Working Directory Clean run: git diff --exit-code + - name: Minio Output + if: ${{ !cancelled() }} + run: docker logs minio-container + linux-test-example: name: cargo examples (amd64) @@ -259,6 +397,10 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: stable + - name: Install dependencies + run: | + apt-get update 
-qq + apt-get install -y -qq clang - name: Install wasm-pack run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh - name: Build with wasm-pack @@ -384,7 +526,7 @@ jobs: uses: ./.github/actions/setup-macos-aarch64-builder - name: Run tests (excluding doctests) shell: bash - run: cargo test --profile ci --lib --tests --bins --features avro,json,backtrace,integration-tests + run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests test-datafusion-pyarrow: name: cargo test pyarrow (amd64) diff --git a/Cargo.lock b/Cargo.lock index 2dc3698e36d9..61f313023b90 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14,7 +14,7 @@ dependencies = [ "core_extensions", "crossbeam-channel", "generational-arena", - "libloading", + "libloading 0.7.4", "lock_api", "parking_lot", "paste", @@ -413,7 +413,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.7.1", + "indexmap 2.8.0", "lexical-core", "num", "serde", @@ -453,6 +453,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85934a9d0261e0fa5d4e2a5295107d743b543a6e0484a835d4b8db2da15306f9" dependencies = [ "bitflags 2.8.0", + "serde", ] [[package]] @@ -520,7 +521,7 @@ version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" dependencies = [ - "bzip2 0.5.1", + "bzip2 0.5.2", "flate2", "futures-core", "memchr", @@ -548,7 +549,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -570,18 +571,18 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "async-trait" -version = "0.1.87" +version = "0.1.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d556ec1359574147ec0c4fc5eb525f3f23263a592b1a9c07e0a75b427de55c97" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -607,9 +608,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.18" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90aff65e86db5fe300752551c1b015ef72b708ac54bded8ef43d0d53cb7cb0b1" +checksum = "6a84fe2c5e9965fba0fbc2001db252f1d57527d82a905cca85127df227bca748" dependencies = [ "aws-credential-types", "aws-runtime", @@ -617,7 +618,7 @@ dependencies = [ "aws-sdk-ssooidc", "aws-sdk-sts", "aws-smithy-async", - "aws-smithy-http 0.61.1", + "aws-smithy-http", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -626,7 +627,7 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 0.2.12", + "http 1.2.0", "ring", "time", "tokio", @@ -637,9 +638,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" +checksum = "4471bef4c22a06d2c7a1b6492493d3fdf24a805323109d6874f9c94d5906ac14" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -647,16 +648,39 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-lc-rs" +version = "1.12.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dabb68eb3a7aa08b46fddfd59a3d55c978243557a90ab804769f7e20e67d2b01" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bbe221bbf523b625a4dd8585c7f38166e31167ec2ca98051dbcb4c3b6e825d2" +dependencies = [ + "bindgen", + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "aws-runtime" -version = "1.5.5" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76dd04d39cc12844c0994f2c9c5a6f5184c22e9188ec1ff723de41910a21dcad" +checksum = "0aff45ffe35196e593ea3b9dd65b320e51e2dda95aff4390bc459e461d09c6ad" dependencies = [ "aws-credential-types", "aws-sigv4", "aws-smithy-async", - "aws-smithy-http 0.60.12", + "aws-smithy-http", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -674,14 +698,14 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.61.0" +version = "1.62.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e65ff295979977039a25f5a0bf067a64bc5e6aa38f3cef4037cf42516265553c" +checksum = "1d5330ad4e8a1ff49e9f26b738611caa72b105c41d41733801d1a36e8f9de936" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.61.1", + "aws-smithy-http", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -696,14 +720,14 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.62.0" +version = "1.63.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91430a60f754f235688387b75ee798ef00cfd09709a582be2b7525ebb5306d4f" +checksum = "7956b1a85d49082347a7d17daa2e32df191f3e23c03d47294b99f95413026a78" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.61.1", + "aws-smithy-http", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -718,14 +742,14 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.62.0" +version = "1.63.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9276e139d39fff5a0b0c984fc2d30f970f9a202da67234f948fda02e5bea1dbe" +checksum = "065c533fbe6f84962af33fcf02b0350b7c1f79285baab5924615d2be3b232855" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.61.1", + "aws-smithy-http", "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", @@ -741,12 +765,12 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.9" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051" +checksum = "69d03c3c05ff80d54ff860fe38c726f6f494c639ae975203a101335f223386db" dependencies = [ "aws-credential-types", - "aws-smithy-http 0.60.12", + "aws-smithy-http", "aws-smithy-runtime-api", "aws-smithy-types", "bytes", @@ -764,9 +788,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.4" +version = "1.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" +checksum = "1e190749ea56f8c42bf15dd76c65e14f8f765233e6df9b0506d9d934ebef867c" dependencies = [ "futures-util", "pin-project-lite", @@ -775,9 +799,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.12" +version = "0.62.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" +checksum = "c5949124d11e538ca21142d1fba61ab0a2a2c1bc3ed323cdb3e4b878bfb83166" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", @@ -785,6 +809,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.12", + "http 1.2.0", "http-body 0.4.6", "once_cell", "percent-encoding", @@ -794,30 +819,33 @@ dependencies = [ ] [[package]] -name = "aws-smithy-http" -version = "0.61.1" +name = "aws-smithy-http-client" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6f276f21c7921fe902826618d1423ae5bf74cf8c1b8472aee8434f3dfd31824" +checksum = "0497ef5d53065b7cd6a35e9c1654bd1fefeae5c52900d91d1b188b0af0f29324" dependencies = [ + "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "bytes", - "bytes-utils", - "futures-core", - "http 0.2.12", - "http-body 0.4.6", - "once_cell", - "percent-encoding", + "h2", + "http 1.2.0", + "hyper", + "hyper-rustls", + "hyper-util", "pin-project-lite", - "pin-utils", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tower 0.5.2", "tracing", ] [[package]] name = "aws-smithy-json" -version = "0.61.2" +version = "0.61.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" +checksum = "92144e45819cae7dc62af23eac5a038a58aa544432d2102609654376a900bd07" dependencies = [ "aws-smithy-types", ] @@ -834,36 +862,33 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.8" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92" +checksum = "f6328865e36c6fd970094ead6b05efd047d3a80ec5fc3be5e743910da9f2ebf8" dependencies = [ "aws-smithy-async", - "aws-smithy-http 0.60.12", + "aws-smithy-http", + "aws-smithy-http-client", "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "fastrand", - "h2 0.3.26", "http 0.2.12", + "http 1.2.0", "http-body 0.4.6", "http-body 1.0.1", - "httparse", - "hyper 0.14.32", - "hyper-rustls 0.24.2", "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.12", "tokio", "tracing", ] [[package]] name = "aws-smithy-runtime-api" -version = "1.7.3" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" +checksum = "3da37cf5d57011cb1753456518ec76e31691f1f474b73934a284eb2a1c76510f" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -878,9 +903,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.13" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" +checksum = "836155caafba616c0ff9b07944324785de2ab016141c3550bd1c07882f8cee8f" dependencies = [ "base64-simd", "bytes", @@ -910,9 +935,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" +checksum = "3873f8deed8927ce8d04487630dc9ff73193bab64742a61d050e57a68dec4125" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -1020,6 +1045,29 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.69.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags 2.8.0", + "cexpr", + "clang-sys", + "itertools 0.10.5", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 2.0.100", + "which", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1055,16 +1103,15 @@ dependencies = [ [[package]] name = "blake3" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1230237285e3e10cde447185e8975408ae24deaa67205ce684805c25bc0c7937" +checksum = "b17679a8d69b6d7fd9cd9801a536cec9fa5e5970b69f9d4747f70b39b031f5e7" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "memmap2", ] [[package]] @@ -1091,16 +1138,16 @@ dependencies = [ "home", "http 1.2.0", "http-body-util", - "hyper 1.6.0", + "hyper", "hyper-named-pipe", - "hyper-rustls 0.27.5", + "hyper-rustls", "hyper-util", "hyperlocal", "log", "pin-project-lite", - "rustls 0.23.23", - "rustls-native-certs 0.8.1", - "rustls-pemfile 2.2.0", + "rustls", + "rustls-native-certs", + "rustls-pemfile", "rustls-pki-types", "serde", "serde_derive", @@ -1146,7 +1193,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1243,21 +1290,20 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", ] [[package]] name = "bzip2-sys" -version = "0.1.12+1.0.8" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" dependencies = [ "cc", - "libc", "pkg-config", ] @@ -1278,6 +1324,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -1307,9 +1362,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" +checksum = "350e47081e7261af42fc634dfea5be88662523cc5acd9fe51da3fe44ba058669" dependencies = [ "chrono", "chrono-tz-build", @@ -1353,6 +1408,17 @@ dependencies = [ "half", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading 0.8.6", +] + [[package]] name = "clap" version = "2.34.0" @@ -1366,9 +1432,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.31" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" +checksum = "6088f3ae8c3608d19260cd7445411865a485688711b78b5be70d78cd96136f83" dependencies = [ "clap_builder", "clap_derive", @@ 
-1376,9 +1442,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.31" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" +checksum = "22a7ef7f676155edfb82daa97f99441f3ebf4a58d5e32f295a56259f1b6facc8" dependencies = [ "anstream", "anstyle", @@ -1388,14 +1454,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.28" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1493,16 +1559,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "core-foundation" version = "0.10.0" @@ -1570,7 +1626,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.31", + "clap 4.5.32", "criterion-plot", "futures", "is-terminal", @@ -1677,7 +1733,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a2785755761f3ddc1492979ce1e48d2c00d09311c39e4466429188f3dd6501" dependencies = [ "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1701,7 +1757,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1712,7 +1768,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1737,14 +1793,14 @@ dependencies = [ [[package]] name = "datafusion" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "arrow-ipc", "arrow-schema", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2 0.5.2", "chrono", "criterion", "ctor", @@ -1779,6 +1835,7 @@ dependencies = [ "env_logger", "flate2", "futures", + "insta", "itertools 0.14.0", "log", "nix", @@ -1805,7 +1862,7 @@ dependencies = [ [[package]] name = "datafusion-benchmarks" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "datafusion", @@ -1829,26 +1886,29 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", "dashmap", "datafusion-common", + "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-sql", "futures", "itertools 0.14.0", "log", + "object_store", "parking_lot", "tokio", ] [[package]] name = "datafusion-catalog-listing" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", @@ -1869,19 +1929,21 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "assert_cmd", "async-trait", "aws-config", "aws-credential-types", - "clap 4.5.31", + "clap 4.5.32", "ctor", "datafusion", "dirs", "env_logger", "futures", + "insta", + "insta-cmd", "mimalloc", "object_store", "parking_lot", @@ -1896,7 +1958,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = 
"46.0.0" +version = "46.0.1" dependencies = [ "ahash 0.8.11", "apache-avro", @@ -1906,7 +1968,8 @@ dependencies = [ "chrono", "half", "hashbrown 0.14.5", - "indexmap 2.7.1", + "indexmap 2.8.0", + "insta", "libc", "log", "object_store", @@ -1922,21 +1985,22 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "46.0.0" +version = "46.0.1" dependencies = [ + "futures", "log", "tokio", ] [[package]] name = "datafusion-datasource" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2 0.5.2", "chrono", "datafusion-catalog", "datafusion-common", @@ -1964,7 +2028,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "46.0.0" +version = "46.0.1" dependencies = [ "apache-avro", "arrow", @@ -1988,7 +2052,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", @@ -2003,18 +2067,14 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "futures", - "itertools 0.14.0", - "log", "object_store", - "rand 0.8.5", "regex", "tokio", - "url", ] [[package]] name = "datafusion-datasource-json" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", @@ -2036,7 +2096,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", @@ -2065,11 +2125,11 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "46.0.0" +version = "46.0.1" [[package]] name = "datafusion-examples" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "arrow-flight", @@ -2090,13 +2150,15 @@ dependencies = [ "test-utils", "tokio", "tonic", + "tracing", + "tracing-subscriber", "url", "uuid", ] [[package]] name = "datafusion-execution" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "chrono", @@ -2114,7 +2176,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "chrono", @@ -2126,7 +2188,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "env_logger", - "indexmap 2.7.1", + "indexmap 2.8.0", "paste", "recursive", "serde_json", @@ -2135,18 +2197,18 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.7.1", + "indexmap 2.8.0", "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-ffi" -version = "46.0.0" +version = "46.0.1" dependencies = [ "abi_stable", "arrow", @@ -2164,7 +2226,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "arrow-buffer", @@ -2193,7 +2255,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "46.0.0" +version = "46.0.1" dependencies = [ "ahash 0.8.11", "arrow", @@ -2214,7 +2276,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "46.0.0" +version = "46.0.1" dependencies = [ "ahash 0.8.11", "arrow", @@ -2227,7 +2289,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "arrow-ord", @@ -2248,7 +2310,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", @@ -2262,7 
+2324,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "datafusion-common", @@ -2278,7 +2340,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "46.0.0" +version = "46.0.1" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2286,16 +2348,16 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "46.0.0" +version = "46.0.1" dependencies = [ "datafusion-expr", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "datafusion-optimizer" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", @@ -2309,7 +2371,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "env_logger", - "indexmap 2.7.1", + "indexmap 2.8.0", "itertools 0.14.0", "log", "recursive", @@ -2319,7 +2381,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "46.0.0" +version = "46.0.1" dependencies = [ "ahash 0.8.11", "arrow", @@ -2332,7 +2394,7 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "indexmap 2.7.1", + "indexmap 2.8.0", "itertools 0.14.0", "log", "paste", @@ -2343,7 +2405,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "46.0.0" +version = "46.0.1" dependencies = [ "ahash 0.8.11", "arrow", @@ -2355,7 +2417,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "datafusion-common", @@ -2366,6 +2428,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "insta", "itertools 0.14.0", "log", "recursive", @@ -2373,7 +2436,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "46.0.0" +version = "46.0.1" dependencies = [ "ahash 0.8.11", "arrow", @@ -2394,7 +2457,8 @@ dependencies = [ "futures", "half", "hashbrown 0.14.5", - "indexmap 2.7.1", + "indexmap 2.8.0", + "insta", "itertools 0.14.0", "log", "parking_lot", @@ -2407,7 +2471,7 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "chrono", @@ -2430,7 +2494,7 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "datafusion-common", @@ -2443,7 +2507,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "bigdecimal", @@ -2455,7 +2519,7 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-window", "env_logger", - "indexmap 2.7.1", + "indexmap 2.8.0", "log", "paste", "recursive", @@ -2466,14 +2530,14 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "46.0.0" +version = "46.0.1" dependencies = [ "arrow", "async-trait", "bigdecimal", "bytes", "chrono", - "clap 4.5.31", + "clap 4.5.32", "datafusion", "env_logger", "futures", @@ -2497,7 +2561,7 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "46.0.0" +version = "46.0.1" dependencies = [ "async-recursion", "async-trait", @@ -2516,7 +2580,7 @@ dependencies = [ [[package]] name = "datafusion-wasmtest" -version = "46.0.0" +version = "46.0.1" dependencies = [ "chrono", "console_error_panic_hook", @@ -2589,7 +2653,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] 
[[package]] @@ -2609,6 +2673,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.18" @@ -2624,7 +2694,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2662,7 +2732,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2677,14 +2747,14 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.6" +version = "0.11.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" +checksum = "c3716d7a920fb4fac5d84e9d4bce8ceb321e9414b4409da61b07b75c1e3d0697" dependencies = [ "anstream", "anstyle", "env_filter", - "humantime", + "jiff", "log", ] @@ -2746,7 +2816,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" dependencies = [ "cfg-if", - "rustix", + "rustix 0.38.44", "windows-sys 0.52.0", ] @@ -2857,6 +2927,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "funty" version = "2.0.0" @@ -2919,7 +2995,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3031,22 +3107,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] -name = "h2" -version = "0.3.26" +name = "globset" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http 0.2.12", - "indexmap 2.7.1", - "slab", - "tokio", - "tokio-util", - "tracing", + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", ] [[package]] @@ -3061,7 +3131,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.2.0", - "indexmap 2.7.1", + "indexmap 2.8.0", "slab", "tokio", "tokio-util", @@ -3070,9 +3140,9 @@ dependencies = [ [[package]] name = "half" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +checksum = "7db2ff139bba50379da6aa0766b52fdcb62cb5b263009b09ed58ba604e14bbd1" dependencies = [ "cfg-if", "crunchy", @@ -3228,30 +3298,6 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.26", - "http 0.2.12", - 
"http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - [[package]] name = "hyper" version = "1.6.0" @@ -3261,7 +3307,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.8", + "h2", "http 1.2.0", "http-body 1.0.1", "httparse", @@ -3280,7 +3326,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" dependencies = [ "hex", - "hyper 1.6.0", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3288,22 +3334,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper 0.14.32", - "log", - "rustls 0.21.12", - "rustls-native-certs 0.6.3", - "tokio", - "tokio-rustls 0.24.1", -] - [[package]] name = "hyper-rustls" version = "0.27.5" @@ -3312,13 +3342,13 @@ checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http 1.2.0", - "hyper 1.6.0", + "hyper", "hyper-util", - "rustls 0.23.23", - "rustls-native-certs 0.8.1", + "rustls", + "rustls-native-certs", "rustls-pki-types", "tokio", - "tokio-rustls 0.26.1", + "tokio-rustls", "tower-service", ] @@ -3328,7 +3358,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" dependencies = [ - "hyper 1.6.0", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3346,7 +3376,7 @@ dependencies = [ "futures-util", "http 1.2.0", "http-body 1.0.1", - "hyper 1.6.0", + "hyper", "pin-project-lite", "socket2", "tokio", @@ -3362,7 +3392,7 @@ checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" dependencies = [ "hex", "http-body-util", - "hyper 1.6.0", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3507,7 +3537,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3550,9 +3580,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +checksum = "3954d50fe15b02142bf25d3b8bdadb634ec3948f103d04ffe3031bc8fe9d7058" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -3578,6 +3608,34 @@ version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" +[[package]] +name = "insta" +version = "1.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50259abbaa67d11d2bcafc7ba1d094ed7a0c70e3ce893f0d0997f73558cb3084" +dependencies = [ + "console", + "globset", + "linked-hash-map", + "once_cell", + "pin-project", + "regex", + "serde", + "similar", + "walkdir", +] + +[[package]] +name = "insta-cmd" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffeeefa927925cced49ccb01bf3e57c9d4cd132df21e576eb9415baeab2d3de6" +dependencies = [ + "insta", + "serde", + "serde_json", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -3640,6 +3698,30 @@ version = "1.0.14" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jiff" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d699bc6dfc879fb1bf9bdff0d4c56f0884fc6f0d0eb0fba397a6d00cd9a6b85e" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d16e75759ee0aa64c57a56acbf43916987b20c77373cb7e808979e02b93c9f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3665,6 +3747,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "lexical-core" version = "1.0.5" @@ -3731,9 +3819,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.170" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "libflate" @@ -3769,6 +3857,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "libloading" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" +dependencies = [ + "cfg-if", + "windows-targets 0.52.6", +] + [[package]] name = "libm" version = "0.2.11" @@ -3777,9 +3875,9 @@ checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "libmimalloc-sys" -version = "0.1.39" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" +checksum = "07d0e07885d6a754b9c7993f2625187ad694ee985d60f23355ff0e7077261502" dependencies = [ "cc", "libc", @@ -3804,16 +3902,28 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.31", + "clap 4.5.32", "escape8259", ] +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "linux-raw-sys" version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" +[[package]] +name = "linux-raw-sys" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db9c683daf087dc577b7506e9695b3d556a9f3849903fa28186283afd6809e9" + [[package]] name = "litemap" version = "0.7.4" @@ -3878,15 +3988,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "memmap2" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" -dependencies = [ - "libc", -] - [[package]] name = "memoffset" version = "0.9.1" @@ -3898,9 +3999,9 @@ dependencies = [ [[package]] name = "mimalloc" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" +checksum = "99585191385958383e13f6b822e6b6d8d9cf928e7d286ceb092da92b43c87bc1" dependencies = [ "libmimalloc-sys", ] @@ -3921,6 +4022,12 @@ dependencies = [ "walkdir", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.4" @@ -3968,6 +4075,16 @@ dependencies = [ "libc", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -3983,6 +4100,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.3" @@ -4091,7 +4218,7 @@ dependencies = [ "chrono", "futures", "humantime", - "hyper 1.6.0", + "hyper", "itertools 0.13.0", "md-5", "parking_lot", @@ -4100,7 +4227,7 @@ dependencies = [ "rand 0.8.5", "reqwest", "ring", - "rustls-pemfile 2.2.0", + "rustls-pemfile", "serde", "serde_json", "snafu", @@ -4149,6 +4276,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "owo-colors" version = "4.1.0" @@ -4237,7 +4370,7 @@ dependencies = [ "regex", "regex-syntax", "structmeta", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4305,7 +4438,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap 2.7.1", + "indexmap 2.8.0", ] [[package]] @@ -4363,7 +4496,7 @@ checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4418,6 +4551,15 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postgres-derive" version = "0.4.6" @@ -4427,7 +4569,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4508,12 +4650,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.29" +version = "0.2.31" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" +checksum = "5316f57387668042f561aae71480de936257848f9c43ce528e311d89a07cadeb" dependencies = [ "proc-macro2", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4584,7 +4726,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.98", + "syn 2.0.100", "tempfile", ] @@ -4598,7 +4740,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4695,7 +4837,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4708,7 +4850,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4737,8 +4879,8 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", - "rustls 0.23.23", + "rustc-hash 2.1.1", + "rustls", "socket2", "thiserror 2.0.12", "tokio", @@ -4755,8 +4897,8 @@ dependencies = [ "getrandom 0.2.15", "rand 0.8.5", "ring", - "rustc-hash", - "rustls 0.23.23", + "rustc-hash 2.1.1", + "rustls", "rustls-pki-types", "slab", "thiserror 2.0.12", @@ -4781,9 +4923,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -4912,7 +5054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5023,12 +5165,12 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2 0.4.8", + "h2", "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.6.0", - "hyper-rustls 0.27.5", + "hyper", + "hyper-rustls", "hyper-util", "ipnet", "js-sys", @@ -5038,16 +5180,16 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.23", - "rustls-native-certs 0.8.1", - "rustls-pemfile 2.2.0", + "rustls", + "rustls-native-certs", + "rustls-pemfile", "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-rustls 0.26.1", + "tokio-rustls", "tokio-util", "tower 0.5.2", "tower-service", @@ -5134,7 +5276,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.98", + "syn 2.0.100", "unicode-ident", ] @@ -5146,14 +5288,14 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand 0.8.5", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "rust_decimal" -version = "1.36.0" +version = "1.37.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b082d80e3e3cc52b2ed634388d436fe1f4de6af5786cc2de9ba9737527bdf555" +checksum = "faa7de2ba56ac291bd90c6b9bece784a52ae1411f9506544b3eae36dd2356d50" dependencies = [ "arrayvec", "borsh", @@ -5172,6 +5314,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.1" 
@@ -5196,20 +5344,21 @@ dependencies = [ "bitflags 2.8.0", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.4.15", "windows-sys 0.59.0", ] [[package]] -name = "rustls" -version = "0.21.12" +name = "rustix" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +checksum = "f7178faa4b75a30e269c71e61c353ce2748cf3d76f0c44c393f4e60abf49b825" dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.7", - "sct", + "bitflags 2.8.0", + "errno", + "libc", + "linux-raw-sys 0.9.2", + "windows-sys 0.59.0", ] [[package]] @@ -5218,26 +5367,15 @@ version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ + "aws-lc-rs", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.8", + "rustls-webpki", "subtle", "zeroize", ] -[[package]] -name = "rustls-native-certs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" -dependencies = [ - "openssl-probe", - "rustls-pemfile 1.0.4", - "schannel", - "security-framework 2.11.1", -] - [[package]] name = "rustls-native-certs" version = "0.8.1" @@ -5247,16 +5385,7 @@ dependencies = [ "openssl-probe", "rustls-pki-types", "schannel", - "security-framework 3.2.0", -] - -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", + "security-framework", ] [[package]] @@ -5277,22 +5406,13 @@ dependencies = [ "web-time", ] -[[package]] -name = "rustls-webpki" -version = "0.101.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "rustls-webpki" version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -5352,9 +5472,9 @@ dependencies = [ [[package]] name = "schemars" -version = "0.8.21" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09c024468a378b7e36765cd36702b7a90cc3cba11654f6685c8f233408e89e92" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" dependencies = [ "dyn-clone", "schemars_derive", @@ -5364,14 +5484,14 @@ dependencies = [ [[package]] name = "schemars_derive" -version = "0.8.21" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1eee588578aff73f856ab961cd2f79e36bc45d7ded33a7562adba4667aecc0e" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5380,35 +5500,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - 
"ring", - "untrusted", -] - [[package]] name = "seahash" version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" -[[package]] -name = "security-framework" -version = "2.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" -dependencies = [ - "bitflags 2.8.0", - "core-foundation 0.9.4", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - [[package]] name = "security-framework" version = "3.2.0" @@ -5416,7 +5513,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ "bitflags 2.8.0", - "core-foundation 0.10.0", + "core-foundation", "core-foundation-sys", "libc", "security-framework-sys", @@ -5449,9 +5546,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.218" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] @@ -5467,13 +5564,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.218" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5484,14 +5581,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "serde_json" -version = "1.0.139" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44f86c3acccc9c65b153fe1b85a3be07fe5515274ec9f0653b4a0875731c72a6" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", @@ -5507,7 +5604,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5519,7 +5616,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5544,7 +5641,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.7.1", + "indexmap 2.8.0", "serde", "serde_derive", "serde_json", @@ -5561,7 +5658,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5570,7 +5667,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.7.1", + "indexmap 2.8.0", "itoa", "ryu", "serde", @@ -5588,6 +5685,15 @@ dependencies = [ "digest", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" @@ -5654,7 +5760,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", 
"quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5718,9 +5824,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.54.0" +version = "0.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" +checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" dependencies = [ "log", "recursive", @@ -5735,7 +5841,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5789,7 +5895,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5800,7 +5906,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5852,7 +5958,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5865,7 +5971,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5880,9 +5986,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.53.2" +version = "0.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac3d70185423235f37b889764e184b81a5af4bb7c95833396ee9bd92577e1b" +checksum = "b3a359aeb711c1e1944c0c4178bbb2d679d39237ac5bfe28f7e0506e522e5ce6" dependencies = [ "heck 0.5.0", "pbjson", @@ -5899,7 +6005,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.98", + "syn 2.0.100", "typify", "walkdir", ] @@ -5923,9 +6029,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.98" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -5949,7 +6055,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5980,15 +6086,14 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.17.1" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" +checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" dependencies = [ - "cfg-if", "fastrand", "getrandom 0.3.1", "once_cell", - "rustix", + "rustix 1.0.2", "windows-sys 0.59.0", ] @@ -6082,7 +6187,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -6093,7 +6198,17 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", +] + +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", ] [[package]] @@ -6184,9 +6299,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.43.0" +version = 
"1.44.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a" dependencies = [ "backtrace", "bytes", @@ -6208,7 +6323,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -6237,23 +6352,13 @@ dependencies = [ "whoami", ] -[[package]] -name = "tokio-rustls" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" -dependencies = [ - "rustls 0.21.12", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.23", + "rustls", "tokio", ] @@ -6285,9 +6390,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.13" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" +checksum = "6b9590b93e6fcc1739458317cccd391ad3955e2bde8913edf6f95f9e65a8f034" dependencies = [ "bytes", "futures-core", @@ -6308,7 +6413,7 @@ version = "0.22.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" dependencies = [ - "indexmap 2.7.1", + "indexmap 2.8.0", "toml_datetime", "winnow", ] @@ -6324,11 +6429,11 @@ dependencies = [ "axum", "base64 0.22.1", "bytes", - "h2 0.4.8", + "h2", "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.6.0", + "hyper", "hyper-timeout", "hyper-util", "percent-encoding", @@ -6409,7 +6514,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -6419,6 +6524,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" +dependencies = [ + "nu-ansi-term", + "sharded-slab", + "smallvec", + "thread_local", + "tracing-core", + "tracing-log", ] [[package]] @@ -6475,7 +6606,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -6509,7 +6640,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.98", + "syn 2.0.100", "thiserror 2.0.12", "unicode-ident", ] @@ -6527,7 +6658,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.98", + "syn 2.0.100", "typify-impl", ] @@ -6632,9 +6763,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.15.1" +version = "1.16.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" dependencies = [ "getrandom 0.3.1", "js-sys", @@ -6642,6 +6773,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "version_check" version = "0.9.5" @@ -6725,7 +6862,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -6760,7 +6897,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6795,7 +6932,7 @@ checksum = "17d5042cc5fa009658f9a7333ef24291b1291a25b6382dd68862a7f3b969f69b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -6831,6 +6968,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.44", +] + [[package]] name = "whoami" version = "1.5.2" @@ -6912,7 +7061,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -6923,7 +7072,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -7159,8 +7308,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e105d177a3871454f754b33bb0ee637ecaaac997446375fd3e5d43a2ed00c909" dependencies = [ "libc", - "linux-raw-sys", - "rustix", + "linux-raw-sys 0.4.15", + "rustix 0.38.44", ] [[package]] @@ -7198,7 +7347,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "synstructure", ] @@ -7229,7 +7378,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -7240,7 +7389,7 @@ checksum = "76331675d372f91bf8d17e13afbd5fe639200b73d01f0fc748bb059f9cca2db7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -7260,7 +7409,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "synstructure", ] @@ -7289,7 +7438,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 871377f8dfc0..1a7a915ee89c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ repository = "https://github.com/apache/datafusion" # Define Minimum Supported Rust Version (MSRV) rust-version = "1.82.0" # Define DataFusion version -version = "46.0.0" +version = "46.0.1" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -99,50 +99,50 @@ 
arrow-ipc = { version = "54.2.0", default-features = false, features = [ ] } arrow-ord = { version = "54.1.0", default-features = false } arrow-schema = { version = "54.1.0", default-features = false } -async-trait = "0.1.87" +async-trait = "0.1.88" bigdecimal = "0.4.7" bytes = "1.10" chrono = { version = "0.4.38", default-features = false } criterion = "0.5.1" ctor = "0.2.9" dashmap = "6.0.1" -datafusion = { path = "datafusion/core", version = "46.0.0", default-features = false } -datafusion-catalog = { path = "datafusion/catalog", version = "46.0.0" } -datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "46.0.0" } -datafusion-common = { path = "datafusion/common", version = "46.0.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "46.0.0" } -datafusion-datasource = { path = "datafusion/datasource", version = "46.0.0", default-features = false } -datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "46.0.0", default-features = false } -datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "46.0.0", default-features = false } -datafusion-datasource-json = { path = "datafusion/datasource-json", version = "46.0.0", default-features = false } -datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "46.0.0", default-features = false } -datafusion-doc = { path = "datafusion/doc", version = "46.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "46.0.0" } -datafusion-expr = { path = "datafusion/expr", version = "46.0.0" } -datafusion-expr-common = { path = "datafusion/expr-common", version = "46.0.0" } -datafusion-ffi = { path = "datafusion/ffi", version = "46.0.0" } -datafusion-functions = { path = "datafusion/functions", version = "46.0.0" } -datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "46.0.0" } -datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "46.0.0" } -datafusion-functions-nested = { path = "datafusion/functions-nested", version = "46.0.0" } -datafusion-functions-table = { path = "datafusion/functions-table", version = "46.0.0" } -datafusion-functions-window = { path = "datafusion/functions-window", version = "46.0.0" } -datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "46.0.0" } -datafusion-macros = { path = "datafusion/macros", version = "46.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "46.0.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "46.0.0", default-features = false } -datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "46.0.0", default-features = false } -datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "46.0.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "46.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "46.0.0" } -datafusion-proto-common = { path = "datafusion/proto-common", version = "46.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "46.0.0" } +datafusion = { path = "datafusion/core", version = "46.0.1", default-features = false } +datafusion-catalog = { path = "datafusion/catalog", version = "46.0.1" } +datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "46.0.1" } +datafusion-common = { path = 
"datafusion/common", version = "46.0.1", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "46.0.1" } +datafusion-datasource = { path = "datafusion/datasource", version = "46.0.1", default-features = false } +datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "46.0.1", default-features = false } +datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "46.0.1", default-features = false } +datafusion-datasource-json = { path = "datafusion/datasource-json", version = "46.0.1", default-features = false } +datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "46.0.1", default-features = false } +datafusion-doc = { path = "datafusion/doc", version = "46.0.1" } +datafusion-execution = { path = "datafusion/execution", version = "46.0.1" } +datafusion-expr = { path = "datafusion/expr", version = "46.0.1" } +datafusion-expr-common = { path = "datafusion/expr-common", version = "46.0.1" } +datafusion-ffi = { path = "datafusion/ffi", version = "46.0.1" } +datafusion-functions = { path = "datafusion/functions", version = "46.0.1" } +datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "46.0.1" } +datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "46.0.1" } +datafusion-functions-nested = { path = "datafusion/functions-nested", version = "46.0.1" } +datafusion-functions-table = { path = "datafusion/functions-table", version = "46.0.1" } +datafusion-functions-window = { path = "datafusion/functions-window", version = "46.0.1" } +datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "46.0.1" } +datafusion-macros = { path = "datafusion/macros", version = "46.0.1" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "46.0.1", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "46.0.1", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "46.0.1", default-features = false } +datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "46.0.1" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "46.0.1" } +datafusion-proto = { path = "datafusion/proto", version = "46.0.1" } +datafusion-proto-common = { path = "datafusion/proto-common", version = "46.0.1" } +datafusion-sql = { path = "datafusion/sql", version = "46.0.1" } doc-comment = "0.3" env_logger = "0.11" futures = "0.3" -half = { version = "2.2.1", default-features = false } +half = { version = "2.5.0", default-features = false } hashbrown = { version = "0.14.5", features = ["raw"] } -indexmap = "2.7.1" +indexmap = "2.8.0" itertools = "0.14" log = "^0.4" object_store = { version = "0.11.0", default-features = false } @@ -155,21 +155,22 @@ parquet = { version = "54.2.1", default-features = false, features = [ pbjson = { version = "0.7.0" } pbjson-types = "0.7" # Should match arrow-flight's version of prost. 
+insta = { version = "1.41.1", features = ["glob", "filters"] } prost = "0.13.1" rand = "0.8.5" recursive = "0.1.1" regex = "1.8" rstest = "0.24.0" serde_json = "1" -sqlparser = { version = "0.54.0", features = ["visitor"] } +sqlparser = { version = "0.55.0", features = ["visitor"] } tempfile = "3" -tokio = { version = "1.43", features = ["macros", "rt", "sync"] } +tokio = { version = "1.44", features = ["macros", "rt", "sync"] } url = "2.5.4" [profile.release] codegen-units = 1 lto = true -strip = true +strip = true # Eliminate debug information to minimize binary size # the release profile takes a long time to build so we can use this profile during development to save time # cargo build --profile release-nonlto @@ -182,6 +183,7 @@ lto = false opt-level = 3 overflow-checks = false rpath = false +strip = false # Retain debug info for flamegraphs [profile.ci] inherits = "dev" @@ -197,6 +199,7 @@ incremental = false [workspace.lints.clippy] # Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml) large_futures = "warn" +used_underscore_binding = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 18478de6c8b8..063f4dac22d8 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -45,13 +45,13 @@ mimalloc = { version = "0.1", optional = true, default-features = false } object_store = { workspace = true } parquet = { workspace = true, default-features = true } rand = { workspace = true } -serde = { version = "1.0.218", features = ["derive"] } +serde = { version = "1.0.219", features = ["derive"] } serde_json = { workspace = true } snmalloc-rs = { version = "0.3", optional = true } structopt = { version = "0.3", default-features = false } test-utils = { path = "../test-utils/", version = "0.1.0" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tokio-util = { version = "0.7.4" } +tokio-util = { version = "0.7.14" } [dev-dependencies] datafusion-proto = { workspace = true } diff --git a/benchmarks/README.md b/benchmarks/README.md index f17d6b5a07b6..39b4584bd202 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -333,7 +333,8 @@ The output of `dfbench` help includes a description of each benchmark, which is ## Cancellation -Test performance of cancelling queries +Test performance of cancelling queries. + Queries in DataFusion should stop executing "quickly" after they are cancelled (the output stream is dropped). 
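A minimal sketch of the behaviour described above (not part of this patch): cancellation is triggered by dropping the query's output stream. The table name `t` and the `data.csv` path below are placeholders for illustration only.

```rust
use datafusion::error::Result;
use datafusion::prelude::*;
use futures::StreamExt;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Placeholder table registration; any registered table would do here
    ctx.register_csv("t", "data.csv", CsvReadOptions::new()).await?;

    // Execute the query as a stream of RecordBatches rather than collecting it
    let mut stream = ctx.sql("SELECT * FROM t").await?.execute_stream().await?;

    // Consume only the first batch ...
    let _first_batch = stream.next().await;

    // ... then drop the stream; the remaining work should stop "quickly"
    drop(stream);
    Ok(())
}
```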
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs index a9750d9b4b84..e07cb4779cd1 100644 --- a/benchmarks/src/clickbench.rs +++ b/benchmarks/src/clickbench.rs @@ -129,6 +129,7 @@ impl RunOpt { self.register_hits(&ctx).await?; let iterations = self.common.iterations; + let mut millis = Vec::with_capacity(iterations); let mut benchmark_run = BenchmarkRun::new(); for query_id in query_range { benchmark_run.start_new_case(&format!("Query {query_id}")); @@ -140,6 +141,7 @@ impl RunOpt { let results = ctx.sql(sql).await?.collect().await?; let elapsed = start.elapsed(); let ms = elapsed.as_secs_f64() * 1000.0; + millis.push(ms); let row_count: usize = results.iter().map(|b| b.num_rows()).sum(); println!( "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows" @@ -149,6 +151,8 @@ impl RunOpt { if self.common.debug { ctx.sql(sql).await?.explain(false, false)?.show().await?; } + let avg = millis.iter().sum::<f64>() / millis.len() as f64; + println!("Query {query_id} avg time: {avg:.2} ms"); } benchmark_run.maybe_write_json(self.output_path.as_ref())?; Ok(()) diff --git a/benchmarks/src/imdb/convert.rs b/benchmarks/src/imdb/convert.rs index 4e470d711da5..e7949aa715c2 100644 --- a/benchmarks/src/imdb/convert.rs +++ b/benchmarks/src/imdb/convert.rs @@ -16,6 +16,7 @@ // under the License. use datafusion::dataframe::DataFrameWriteOptions; +use datafusion::logical_expr::select_expr::SelectExpr; use datafusion_common::instant::Instant; use std::path::PathBuf; @@ -74,7 +75,8 @@ impl ConvertOpt { .iter() .take(schema.fields.len()) .map(Expr::from) - .collect(); + .map(SelectExpr::from) + .collect::<Vec<_>>(); csv = csv.select(selection)?; diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs index 30178d17aa54..7f391d930045 100644 --- a/benchmarks/src/tpch/convert.rs +++ b/benchmarks/src/tpch/convert.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use datafusion::logical_expr::select_expr::SelectExpr; use datafusion_common::instant::Instant; use std::fs; use std::path::{Path, PathBuf}; @@ -89,7 +90,8 @@ impl ConvertOpt { .iter() .take(schema.fields.len() - 1) .map(Expr::from) - .collect(); + .map(SelectExpr::from) + .collect::<Vec<_>>(); csv = csv.select(selection)?; // optionally, repartition the file diff --git a/datafusion-cli/CONTRIBUTING.md b/datafusion-cli/CONTRIBUTING.md new file mode 100644 index 000000000000..4b464dffc57c --- /dev/null +++ b/datafusion-cli/CONTRIBUTING.md @@ -0,0 +1,75 @@ + + +# Development instructions + +## Running Tests + +Tests can be run using `cargo` + +```shell +cargo test +``` + +## Running Storage Integration Tests + +By default, storage integration tests are not run. To run them you will need to set `TEST_STORAGE_INTEGRATION=1` and +then provide the necessary configuration for that object store. + +For some of the tests, [snapshots](https://datafusion.apache.org/contributor-guide/testing.html#snapshot-testing) are used. + +### AWS + +To test the S3 integration against [Minio](https://github.com/minio/minio) + +First start up a container with Minio and load test files.
+ +```shell +docker run -d \ + --name datafusion-test-minio \ + -p 9000:9000 \ + -e MINIO_ROOT_USER=TEST-DataFusionLogin \ + -e MINIO_ROOT_PASSWORD=TEST-DataFusionPassword \ + -v $(pwd)/../datafusion/core/tests/data:/source \ + quay.io/minio/minio server /data + +docker exec datafusion-test-minio /bin/sh -c "\ + mc ready local + mc alias set localminio http://localhost:9000 TEST-DataFusionLogin TEST-DataFusionPassword && \ + mc mb localminio/data && \ + mc cp -r /source/* localminio/data" +``` + +Setup environment + +```shell +export TEST_STORAGE_INTEGRATION=1 +export AWS_ACCESS_KEY_ID=TEST-DataFusionLogin +export AWS_SECRET_ACCESS_KEY=TEST-DataFusionPassword +export AWS_ENDPOINT=http://127.0.0.1:9000 +export AWS_ALLOW_HTTP=true +``` + +Note that `AWS_ENDPOINT` is set without slash at the end. + +Run tests + +```shell +cargo test +``` diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 13246804cd64..eecbcbd48f25 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -37,9 +37,9 @@ backtrace = ["datafusion/backtrace"] [dependencies] arrow = { workspace = true } async-trait = { workspace = true } -aws-config = "1.5.18" +aws-config = "1.6.0" aws-credential-types = "1.2.0" -clap = { version = "4.5.31", features = ["derive", "cargo"] } +clap = { version = "4.5.32", features = ["derive", "cargo"] } datafusion = { workspace = true, features = [ "avro", "crypto_expressions", @@ -67,5 +67,7 @@ url = { workspace = true } [dev-dependencies] assert_cmd = "2.0" ctor = { workspace = true } +insta = { workspace = true } +insta-cmd = "0.6.0" predicates = "3.0" rstest = { workspace = true } diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 52665df3751e..e21006312d85 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -322,7 +322,8 @@ fn extract_memory_pool_size(size: &str) -> Result { #[cfg(test)] mod tests { use super::*; - use datafusion::assert_batches_eq; + use datafusion::common::test_util::batches_to_string; + use insta::assert_snapshot; fn assert_conversion(input: &str, expected: Result) { let result = extract_memory_pool_size(input); @@ -391,21 +392,26 @@ mod tests { let df = ctx.sql(sql).await?; let rbs = df.collect().await?; - let excepted = [ - "+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", - "| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |", - 
"+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", - "| ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | \"f0.list.item\" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 |", - "+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", - ]; - assert_batches_eq!(excepted, &rbs); + assert_snapshot!(batches_to_string(&rbs), @r#" + +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | + +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + "#); // input with double quote let sql = "SELECT * FROM parquet_metadata(\"../datafusion/core/tests/data/fixed_size_list_array.parquet\")"; let df = ctx.sql(sql).await?; let rbs = df.collect().await?; - assert_batches_eq!(excepted, &rbs); + assert_snapshot!(batches_to_string(&rbs), @r#" + 
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | + +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + "#); Ok(()) } @@ -421,15 +427,13 @@ mod tests { let df = ctx.sql(sql).await?; let rbs = df.collect().await?; - let excepted = [ - -"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", -"| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |", -"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", -"| 
../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | \"String\" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 |", -"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+" - ]; - assert_batches_eq!(excepted, &rbs); + assert_snapshot!(batches_to_string(&rbs),@r#" + +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | + +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 | + +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ + "#); Ok(()) } diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs index fa170ae19259..a54a920e97bb 100644 --- a/datafusion-cli/tests/cli_integration.rs +++ b/datafusion-cli/tests/cli_integration.rs @@ -17,10 +17,24 @@ use std::process::Command; -use assert_cmd::prelude::{CommandCargoExt, OutputAssertExt}; -use predicates::prelude::predicate; use rstest::rstest; +use insta::{glob, Settings}; +use insta_cmd::{assert_cmd_snapshot, get_cargo_bin}; +use std::{env, fs}; + +fn cli() -> Command { + Command::new(get_cargo_bin("datafusion-cli")) +} + +fn make_settings() -> Settings { + let mut settings = Settings::clone_current(); + 
settings.set_prepend_module_to_snapshot(false); + settings.add_filter(r"Elapsed .* seconds\.", "[ELAPSED]"); + settings.add_filter(r"DataFusion CLI v.*", "[CLI_VERSION]"); + settings +} + #[cfg(test)] #[ctor::ctor] fn init() { @@ -28,35 +42,106 @@ fn init() { let _ = env_logger::try_init(); } -// Disabled due to https://github.com/apache/datafusion/issues/10793 -#[cfg(not(target_family = "windows"))] #[rstest] -#[case::exec_from_commands( - ["--command", "select 1", "--format", "json", "-q"], - "[{\"Int64(1)\":1}]\n" -)] #[case::exec_multiple_statements( - ["--command", "select 1; select 2;", "--format", "json", "-q"], - "[{\"Int64(1)\":1}]\n[{\"Int64(2)\":2}]\n" + "statements", + ["--command", "select 1; select 2;", "-q"], )] #[case::exec_backslash( - ["--file", "tests/data/backslash.txt", "--format", "json", "-q"], - "[{\"Utf8(\\\"\\\\\\\")\":\"\\\\\",\"Utf8(\\\"\\\\\\\\\\\")\":\"\\\\\\\\\",\"Utf8(\\\"\\\\\\\\\\\\\\\\\\\\\\\")\":\"\\\\\\\\\\\\\\\\\\\\\",\"Utf8(\\\"dsdsds\\\\\\\\\\\\\\\\\\\")\":\"dsdsds\\\\\\\\\\\\\\\\\",\"Utf8(\\\"\\\\t\\\")\":\"\\\\t\",\"Utf8(\\\"\\\\0\\\")\":\"\\\\0\",\"Utf8(\\\"\\\\n\\\")\":\"\\\\n\"}]\n" + "backslash", + ["--file", "tests/sql/backslash.sql", "--format", "json", "-q"], )] #[case::exec_from_files( - ["--file", "tests/data/sql.txt", "--format", "json", "-q"], - "[{\"Int64(1)\":1}]\n" + "files", + ["--file", "tests/sql/select.sql", "-q"], )] #[case::set_batch_size( - ["--command", "show datafusion.execution.batch_size", "--format", "json", "-q", "-b", "1"], - "[{\"name\":\"datafusion.execution.batch_size\",\"value\":\"1\"}]\n" + "batch_size", + ["--command", "show datafusion.execution.batch_size", "-q", "-b", "1"], )] #[test] fn cli_quick_test<'a>( + #[case] snapshot_name: &'a str, #[case] args: impl IntoIterator, - #[case] expected: &str, ) { - let mut cmd = Command::cargo_bin("datafusion-cli").unwrap(); + let mut settings = make_settings(); + settings.set_snapshot_suffix(snapshot_name); + let _bound = settings.bind_to_scope(); + + let mut cmd = cli(); cmd.args(args); - cmd.assert().stdout(predicate::eq(expected)); + + assert_cmd_snapshot!(cmd); +} + +#[rstest] +#[case("csv")] +#[case("tsv")] +#[case("table")] +#[case("json")] +#[case("nd-json")] +#[case("automatic")] +#[test] +fn test_cli_format<'a>(#[case] format: &'a str) { + let mut settings = make_settings(); + settings.set_snapshot_suffix(format); + let _bound = settings.bind_to_scope(); + + let mut cmd = cli(); + cmd.args(["--command", "select 1", "-q", "--format", format]); + + assert_cmd_snapshot!(cmd); +} + +#[tokio::test] +async fn test_cli() { + if env::var("TEST_STORAGE_INTEGRATION").is_err() { + eprintln!("Skipping external storages integration tests"); + return; + } + + let settings = make_settings(); + let _bound = settings.bind_to_scope(); + + glob!("sql/integration/*.sql", |path| { + let input = fs::read_to_string(path).unwrap(); + assert_cmd_snapshot!(cli().pass_stdin(input)) + }); +} + +#[tokio::test] +async fn test_aws_options() { + // Separate test is needed to pass aws as options in sql and not via env + + if env::var("TEST_STORAGE_INTEGRATION").is_err() { + eprintln!("Skipping external storages integration tests"); + return; + } + + let settings = make_settings(); + let _bound = settings.bind_to_scope(); + + let access_key_id = + env::var("AWS_ACCESS_KEY_ID").expect("AWS_ACCESS_KEY_ID is not set"); + let secret_access_key = + env::var("AWS_SECRET_ACCESS_KEY").expect("AWS_SECRET_ACCESS_KEY is not set"); + let endpoint_url = env::var("AWS_ENDPOINT").expect("AWS_ENDPOINT is not 
set"); + + let input = format!( + r#"CREATE EXTERNAL TABLE CARS +STORED AS CSV +LOCATION 's3://data/cars.csv' +OPTIONS( + 'aws.access_key_id' '{}', + 'aws.secret_access_key' '{}', + 'aws.endpoint' '{}', + 'aws.allow_http' 'true' +); + +SELECT * FROM CARS limit 1; +"#, + access_key_id, secret_access_key, endpoint_url + ); + + assert_cmd_snapshot!(cli().env_clear().pass_stdin(input)); } diff --git a/datafusion-cli/tests/snapshots/aws_options.snap b/datafusion-cli/tests/snapshots/aws_options.snap new file mode 100644 index 000000000000..283cf57bc662 --- /dev/null +++ b/datafusion-cli/tests/snapshots/aws_options.snap @@ -0,0 +1,25 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: [] + stdin: "CREATE EXTERNAL TABLE CARS\nSTORED AS CSV\nLOCATION 's3://data/cars.csv'\nOPTIONS(\n 'aws.access_key_id' 'TEST-DataFusionLogin',\n 'aws.secret_access_key' 'TEST-DataFusionPassword',\n 'aws.endpoint' 'http://127.0.0.1:9000',\n 'aws.allow_http' 'true'\n);\n\nSELECT * FROM CARS limit 1;\n" +--- +success: true +exit_code: 0 +----- stdout ----- +[CLI_VERSION] +0 row(s) fetched. +[ELAPSED] + ++-----+-------+---------------------+ +| car | speed | time | ++-----+-------+---------------------+ +| red | 20.0 | 1996-04-12T12:05:03 | ++-----+-------+---------------------+ +1 row(s) fetched. +[ELAPSED] + +\q + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli@load_local_csv.sql.snap b/datafusion-cli/tests/snapshots/cli@load_local_csv.sql.snap new file mode 100644 index 000000000000..029d5f8d5b9f --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli@load_local_csv.sql.snap @@ -0,0 +1,26 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: [] + stdin: "CREATE EXTERNAL TABLE CARS\nSTORED AS CSV\nLOCATION '../datafusion/core/tests/data/cars.csv'\nOPTIONS ('has_header' 'TRUE');\n\nSELECT * FROM CARS limit 1;" +input_file: tests/sql/load_local_csv.sql +--- +success: true +exit_code: 0 +----- stdout ----- +[CLI_VERSION] +0 row(s) fetched. +[ELAPSED] + ++-----+-------+---------------------+ +| car | speed | time | ++-----+-------+---------------------+ +| red | 20.0 | 1996-04-12T12:05:03 | ++-----+-------+---------------------+ +1 row(s) fetched. +[ELAPSED] + +\q + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli@load_s3_csv.sql.snap b/datafusion-cli/tests/snapshots/cli@load_s3_csv.sql.snap new file mode 100644 index 000000000000..858989621a1f --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli@load_s3_csv.sql.snap @@ -0,0 +1,26 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: [] + stdin: "CREATE EXTERNAL TABLE CARS\nSTORED AS CSV\nLOCATION 's3://data/cars.csv';\n\nSELECT * FROM CARS limit 1;" +input_file: tests/sql/load_s3_csv.sql +--- +success: true +exit_code: 0 +----- stdout ----- +[CLI_VERSION] +0 row(s) fetched. +[ELAPSED] + ++-----+-------+---------------------+ +| car | speed | time | ++-----+-------+---------------------+ +| red | 20.0 | 1996-04-12T12:05:03 | ++-----+-------+---------------------+ +1 row(s) fetched. 
+[ELAPSED] + +\q + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli@select.sql.snap b/datafusion-cli/tests/snapshots/cli@select.sql.snap new file mode 100644 index 000000000000..c137d9fe2b13 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli@select.sql.snap @@ -0,0 +1,23 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: [] + stdin: select 1; +input_file: tests/sql/select.sql +--- +success: true +exit_code: 0 +----- stdout ----- +[CLI_VERSION] ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ +1 row(s) fetched. +[ELAPSED] + +\q + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_format@automatic.snap b/datafusion-cli/tests/snapshots/cli_format@automatic.snap new file mode 100644 index 000000000000..2591f493e90a --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_format@automatic.snap @@ -0,0 +1,21 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--command" + - select 1 + - "-q" + - "--format" + - automatic +--- +success: true +exit_code: 0 +----- stdout ----- ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_format@csv.snap b/datafusion-cli/tests/snapshots/cli_format@csv.snap new file mode 100644 index 000000000000..c41b042298eb --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_format@csv.snap @@ -0,0 +1,18 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--command" + - select 1 + - "-q" + - "--format" + - csv +--- +success: true +exit_code: 0 +----- stdout ----- +Int64(1) +1 + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_format@json.snap b/datafusion-cli/tests/snapshots/cli_format@json.snap new file mode 100644 index 000000000000..8f804a337cce --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_format@json.snap @@ -0,0 +1,17 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--command" + - select 1 + - "-q" + - "--format" + - json +--- +success: true +exit_code: 0 +----- stdout ----- +[{"Int64(1)":1}] + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_format@nd-json.snap b/datafusion-cli/tests/snapshots/cli_format@nd-json.snap new file mode 100644 index 000000000000..7b4ce1e2530c --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_format@nd-json.snap @@ -0,0 +1,17 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--command" + - select 1 + - "-q" + - "--format" + - nd-json +--- +success: true +exit_code: 0 +----- stdout ----- +{"Int64(1)":1} + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_format@table.snap b/datafusion-cli/tests/snapshots/cli_format@table.snap new file mode 100644 index 000000000000..99914182462a --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_format@table.snap @@ -0,0 +1,21 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--command" + - select 1 + - "-q" + - "--format" + - table +--- +success: true +exit_code: 0 +----- stdout ----- ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_format@tsv.snap b/datafusion-cli/tests/snapshots/cli_format@tsv.snap new file mode 100644 index 000000000000..968268c31dd5 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_format@tsv.snap @@ -0,0 +1,18 @@ +--- +source: tests/cli_integration.rs +info: + program: 
datafusion-cli + args: + - "--command" + - select 1 + - "-q" + - "--format" + - tsv +--- +success: true +exit_code: 0 +----- stdout ----- +Int64(1) +1 + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap b/datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap new file mode 100644 index 000000000000..c01699146aa8 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap @@ -0,0 +1,17 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--file" + - tests/sql/backslash.sql + - "--format" + - json + - "-q" +--- +success: true +exit_code: 0 +----- stdout ----- +[{"Utf8(\"\\\")":"\\","Utf8(\"\\\\\")":"\\\\","Utf8(\"\\\\\\\\\\\")":"\\\\\\\\\\","Utf8(\"dsdsds\\\\\\\\\")":"dsdsds\\\\\\\\","Utf8(\"\\t\")":"\\t","Utf8(\"\\0\")":"\\0","Utf8(\"\\n\")":"\\n"}] + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap b/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap new file mode 100644 index 000000000000..c27d527df0b6 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap @@ -0,0 +1,21 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--command" + - show datafusion.execution.batch_size + - "-q" + - "-b" + - "1" +--- +success: true +exit_code: 0 +----- stdout ----- ++---------------------------------+-------+ +| name | value | ++---------------------------------+-------+ +| datafusion.execution.batch_size | 1 | ++---------------------------------+-------+ + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@files.snap b/datafusion-cli/tests/snapshots/cli_quick_test@files.snap new file mode 100644 index 000000000000..7c44e41729a1 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_quick_test@files.snap @@ -0,0 +1,19 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--file" + - tests/sql/select.sql + - "-q" +--- +success: true +exit_code: 0 +----- stdout ----- ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap b/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap new file mode 100644 index 000000000000..3b975bb6a927 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap @@ -0,0 +1,24 @@ +--- +source: tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--command" + - select 1; select 2; + - "-q" +--- +success: true +exit_code: 0 +----- stdout ----- ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ ++----------+ +| Int64(2) | ++----------+ +| 2 | ++----------+ + +----- stderr ----- diff --git a/datafusion-cli/tests/data/backslash.txt b/datafusion-cli/tests/sql/backslash.sql similarity index 100% rename from datafusion-cli/tests/data/backslash.txt rename to datafusion-cli/tests/sql/backslash.sql diff --git a/datafusion-cli/tests/sql/integration/load_local_csv.sql b/datafusion-cli/tests/sql/integration/load_local_csv.sql new file mode 100644 index 000000000000..8920c48c5f5f --- /dev/null +++ b/datafusion-cli/tests/sql/integration/load_local_csv.sql @@ -0,0 +1,6 @@ +CREATE EXTERNAL TABLE CARS +STORED AS CSV +LOCATION '../datafusion/core/tests/data/cars.csv' +OPTIONS ('has_header' 'TRUE'); + +SELECT * FROM CARS limit 1; \ No newline at end of file diff --git a/datafusion-cli/tests/sql/integration/load_s3_csv.sql 
b/datafusion-cli/tests/sql/integration/load_s3_csv.sql new file mode 100644 index 000000000000..10c2e38b9764 --- /dev/null +++ b/datafusion-cli/tests/sql/integration/load_s3_csv.sql @@ -0,0 +1,5 @@ +CREATE EXTERNAL TABLE CARS +STORED AS CSV +LOCATION 's3://data/cars.csv'; + +SELECT * FROM CARS limit 1; \ No newline at end of file diff --git a/datafusion-cli/tests/data/sql.txt b/datafusion-cli/tests/sql/select.sql similarity index 100% rename from datafusion-cli/tests/data/sql.txt rename to datafusion-cli/tests/sql/select.sql diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index d2bbdd78e3f2..f6b7d641d126 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -73,8 +73,10 @@ tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tonic = "0.12.1" +tracing = { version = "0.1" } +tracing-subscriber = { version = "0.3" } url = { workspace = true } -uuid = "1.15" +uuid = "1.16" [target.'cfg(not(target_os = "windows"))'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] } diff --git a/datafusion-examples/examples/planner_api.rs b/datafusion-examples/examples/planner_api.rs index 41110a3e0a9c..4943e593bd0b 100644 --- a/datafusion-examples/examples/planner_api.rs +++ b/datafusion-examples/examples/planner_api.rs @@ -16,8 +16,8 @@ // under the License. use datafusion::error::Result; -use datafusion::logical_expr::{LogicalPlan, PlanType}; -use datafusion::physical_plan::{displayable, DisplayFormatType}; +use datafusion::logical_expr::LogicalPlan; +use datafusion::physical_plan::displayable; use datafusion::physical_planner::DefaultPhysicalPlanner; use datafusion::prelude::*; @@ -77,13 +77,7 @@ async fn to_physical_plan_in_one_api_demo( println!( "Physical plan direct from logical plan:\n\n{}\n\n", - displayable(physical_plan.as_ref()) - .to_stringified( - false, - PlanType::InitialPhysicalPlan, - DisplayFormatType::Default - ) - .plan + displayable(physical_plan.as_ref()).indent(false) ); Ok(()) @@ -123,13 +117,7 @@ async fn to_physical_plan_step_by_step_demo( .await?; println!( "Final physical plan:\n\n{}\n\n", - displayable(physical_plan.as_ref()) - .to_stringified( - false, - PlanType::InitialPhysicalPlan, - DisplayFormatType::Default - ) - .plan + displayable(physical_plan.as_ref()).indent(false) ); // Call the physical optimizer with an existing physical plan (in this @@ -142,13 +130,7 @@ async fn to_physical_plan_step_by_step_demo( planner.optimize_physical_plan(physical_plan, &ctx.state(), |_, _| {})?; println!( "Optimized physical plan:\n\n{}\n\n", - displayable(physical_plan.as_ref()) - .to_stringified( - false, - PlanType::InitialPhysicalPlan, - DisplayFormatType::Default - ) - .plan + displayable(physical_plan.as_ref()).indent(false) ); Ok(()) diff --git a/datafusion-examples/examples/tracing.rs b/datafusion-examples/examples/tracing.rs new file mode 100644 index 000000000000..334ee0f4e568 --- /dev/null +++ b/datafusion-examples/examples/tracing.rs @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This example demonstrates the tracing injection feature for the DataFusion runtime. +//! Tasks spawned on new threads behave differently depending on whether a tracer is injected. +//! The log output clearly distinguishes the two cases. +//! +//! # Expected Log Output +//! +//! When **no tracer** is injected, logs from tasks running on `tokio-runtime-worker` threads +//! will _not_ include the `run_instrumented_query` span: +//! +//! ```text +//! 10:29:40.714 INFO main ThreadId(01) tracing: ***** RUNNING WITHOUT INJECTED TRACER ***** +//! 10:29:40.714 INFO main ThreadId(01) run_instrumented_query: tracing: Starting query execution +//! 10:29:40.728 INFO main ThreadId(01) run_instrumented_query: tracing: Executing SQL query sql="SELECT COUNT(*), string_col FROM alltypes GROUP BY string_col" +//! 10:29:40.743 DEBUG main ThreadId(01) run_instrumented_query: datafusion_optimizer::optimizer: Optimizer took 6 ms +//! 10:29:40.759 DEBUG tokio-runtime-worker ThreadId(03) datafusion_physical_plan::aggregates::row_hash: Creating GroupedHashAggregateStream +//! 10:29:40.758 DEBUG tokio-runtime-worker ThreadId(04) datafusion_physical_plan::aggregates::row_hash: Creating GroupedHashAggregateStream +//! 10:29:40.771 INFO main ThreadId(01) run_instrumented_query: tracing: Query complete: 6 batches returned +//! 10:29:40.772 INFO main ThreadId(01) tracing: ***** WITHOUT tracer: Non-main tasks did NOT inherit the `run_instrumented_query` span ***** +//! ``` +//! +//! When a tracer **is** injected, tasks spawned on non‑main threads _do_ inherit the span: +//! +//! ```text +//! 10:29:40.772 INFO main ThreadId(01) tracing: Injecting custom tracer... +//! 10:29:40.772 INFO main ThreadId(01) tracing: ***** RUNNING WITH INJECTED TRACER ***** +//! 10:29:40.772 INFO main ThreadId(01) run_instrumented_query: tracing: Starting query execution +//! 10:29:40.775 INFO main ThreadId(01) run_instrumented_query: tracing: Executing SQL query sql="SELECT COUNT(*), string_col FROM alltypes GROUP BY string_col" +//! 10:29:40.784 DEBUG main ThreadId(01) run_instrumented_query: datafusion_optimizer::optimizer: Optimizer took 7 ms +//! 10:29:40.801 DEBUG tokio-runtime-worker ThreadId(03) run_instrumented_query: datafusion_physical_plan::aggregates::row_hash: Creating GroupedHashAggregateStream +//! 10:29:40.801 DEBUG tokio-runtime-worker ThreadId(04) run_instrumented_query: datafusion_physical_plan::aggregates::row_hash: Creating GroupedHashAggregateStream +//! 10:29:40.809 INFO main ThreadId(01) run_instrumented_query: tracing: Query complete: 6 batches returned +//! 10:29:40.809 INFO main ThreadId(01) tracing: ***** WITH tracer: Non-main tasks DID inherit the `run_instrumented_query` span ***** +//! 
``` + +use datafusion::common::runtime::{set_join_set_tracer, JoinSetTracer}; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::listing::ListingOptions; +use datafusion::error::Result; +use datafusion::prelude::*; +use datafusion::test_util::parquet_test_data; +use futures::future::BoxFuture; +use futures::FutureExt; +use std::any::Any; +use std::sync::Arc; +use tracing::{info, instrument, Instrument, Level, Span}; + +#[tokio::main] +async fn main() -> Result<()> { + // Initialize tracing subscriber with thread info. + tracing_subscriber::fmt() + .with_thread_ids(true) + .with_thread_names(true) + .with_max_level(Level::DEBUG) + .init(); + + // Run query WITHOUT tracer injection. + info!("***** RUNNING WITHOUT INJECTED TRACER *****"); + run_instrumented_query().await?; + info!("***** WITHOUT tracer: `tokio-runtime-worker` tasks did NOT inherit the `run_instrumented_query` span *****"); + + // Inject custom tracer so tasks run in the current span. + info!("Injecting custom tracer..."); + set_join_set_tracer(&SpanTracer).expect("Failed to set tracer"); + + // Run query WITH tracer injection. + info!("***** RUNNING WITH INJECTED TRACER *****"); + run_instrumented_query().await?; + info!("***** WITH tracer: `tokio-runtime-worker` tasks DID inherit the `run_instrumented_query` span *****"); + + Ok(()) +} + +/// A simple tracer that ensures any spawned task or blocking closure +/// inherits the current span via `in_current_span`. +struct SpanTracer; + +/// Implement the `JoinSetTracer` trait so we can inject instrumentation +/// for both async futures and blocking closures. +impl JoinSetTracer for SpanTracer { + /// Instruments a boxed future to run in the current span. The future's + /// return type is erased to `Box<dyn Any + Send>`, which we simply + /// run inside the `Span::current()` context. + fn trace_future( + &self, + fut: BoxFuture<'static, Box<dyn Any + Send>>, + ) -> BoxFuture<'static, Box<dyn Any + Send>> { + fut.in_current_span().boxed() + } + + /// Instruments a boxed blocking closure by running it inside the + /// `Span::current()` context.
+ fn trace_block( + &self, + f: Box<dyn FnOnce() -> Box<dyn Any + Send> + Send>, + ) -> Box<dyn FnOnce() -> Box<dyn Any + Send> + Send> { + let span = Span::current(); + Box::new(move || span.in_scope(f)) + } +} + +#[instrument(level = "info")] +async fn run_instrumented_query() -> Result<()> { + info!("Starting query execution"); + + let ctx = SessionContext::new(); + let test_data = parquet_test_data(); + let file_format = ParquetFormat::default().with_enable_pruning(true); + let listing_options = ListingOptions::new(Arc::new(file_format)) + .with_file_extension("alltypes_tiny_pages_plain.parquet"); + + let table_path = format!("file://{test_data}/"); + info!("Registering table 'alltypes' from {}", table_path); + ctx.register_listing_table("alltypes", &table_path, listing_options, None, None) + .await + .expect("Failed to register table"); + + let sql = "SELECT COUNT(*), string_col FROM alltypes GROUP BY string_col"; + info!(sql, "Executing SQL query"); + let result = ctx.sql(sql).await?.collect().await?; + info!("Query complete: {} batches returned", result.len()); + Ok(()) +} diff --git a/datafusion-testing b/datafusion-testing index 3462eaa78745..243047b9dd68 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit 3462eaa787459957e38df267a4a21f5bea605807 +Subproject commit 243047b9dd682be688628539c604daaddfe640f9 diff --git a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs index 9ac8423042d3..7742f5f9a153 100644 --- a/datafusion/catalog-listing/src/helpers.rs +++ b/datafusion/catalog-listing/src/helpers.rs @@ -1079,5 +1079,9 @@ mod tests { fn table_options_mut(&mut self) -> &mut TableOptions { unimplemented!() } + + fn task_ctx(&self) -> Arc<TaskContext> { + unimplemented!() + } } } diff --git a/datafusion/catalog-listing/src/mod.rs b/datafusion/catalog-listing/src/mod.rs index cb0d86d8666e..fb0a960f37b6 100644 --- a/datafusion/catalog-listing/src/mod.rs +++ b/datafusion/catalog-listing/src/mod.rs @@ -20,5 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] pub mod helpers; diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 864749411198..113d68825390 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -35,16 +35,17 @@ arrow = { workspace = true } async-trait = { workspace = true } dashmap = { workspace = true } datafusion-common = { workspace = true } +datafusion-common-runtime = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-physical-expr = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } futures = { workspace = true } itertools = { workspace = true } log = { workspace = true } +object_store = { workspace = true } parking_lot = { workspace = true } - -[dev-dependencies] tokio = { workspace = true } [lints] diff --git a/datafusion/core/src/datasource/cte_worktable.rs b/datafusion/catalog/src/cte_worktable.rs similarity index 93% rename from datafusion/core/src/datasource/cte_worktable.rs rename to datafusion/catalog/src/cte_worktable.rs index b63755f644a8..d72a30909c02 100644 --- a/datafusion/core/src/datasource/cte_worktable.rs +++ b/datafusion/catalog/src/cte_worktable.rs @@ -20,18
+20,17 @@ use std::sync::Arc; use std::{any::Any, borrow::Cow}; +use crate::Session; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_catalog::Session; use datafusion_physical_plan::work_table::WorkTableExec; -use crate::{ - error::Result, - logical_expr::{Expr, LogicalPlan, TableProviderFilterPushDown}, - physical_plan::ExecutionPlan, -}; +use datafusion_physical_plan::ExecutionPlan; -use crate::datasource::{TableProvider, TableType}; +use datafusion_common::error::Result; +use datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableType}; + +use crate::TableProvider; /// The temporary working table where the previous iteration of a recursive query is stored /// Naming is based on PostgreSQL's implementation. diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/catalog/src/default_table_source.rs similarity index 98% rename from datafusion/core/src/datasource/default_table_source.rs rename to datafusion/catalog/src/default_table_source.rs index 541e0b6dfa91..9db8242caa99 100644 --- a/datafusion/core/src/datasource/default_table_source.rs +++ b/datafusion/catalog/src/default_table_source.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use std::{any::Any, borrow::Cow}; -use crate::datasource::TableProvider; +use crate::TableProvider; use arrow::datatypes::SchemaRef; use datafusion_common::{internal_err, Constraints}; @@ -133,7 +133,7 @@ fn preserves_table_type() { async fn scan( &self, - _: &dyn datafusion_catalog::Session, + _: &dyn crate::Session, _: Option<&Vec>, _: &[Expr], _: Option, diff --git a/datafusion/catalog/src/lib.rs b/datafusion/catalog/src/lib.rs index a339d4916b8d..a1c0a6185da4 100644 --- a/datafusion/catalog/src/lib.rs +++ b/datafusion/catalog/src/lib.rs @@ -20,24 +20,18 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] //! Interfaces and default implementations of catalogs and schemas. //! //! Implementations //! * Information schema: [`information_schema`] //! * Simple memory based catalog: [`MemoryCatalogProviderList`], [`MemoryCatalogProvider`], [`MemorySchemaProvider`] +//! 
* Listing schema: [`listing_schema`] pub mod memory; -#[deprecated( - since = "46.0.0", - note = "use datafusion_sql::resolve::resolve_table_references" -)] -pub use datafusion_sql::resolve::resolve_table_references; -#[deprecated( - since = "46.0.0", - note = "use datafusion_common::{ResolvedTableReference, TableReference}" -)] -pub use datafusion_sql::{ResolvedTableReference, TableReference}; pub use memory::{ MemoryCatalogProvider, MemoryCatalogProviderList, MemorySchemaProvider, }; @@ -45,6 +39,7 @@ mod r#async; mod catalog; mod dynamic_file; pub mod information_schema; +pub mod listing_schema; mod schema; mod session; mod table; @@ -54,4 +49,8 @@ pub use r#async::*; pub use schema::*; pub use session::*; pub use table::*; +pub mod cte_worktable; +pub mod default_table_source; +pub mod stream; pub mod streaming; +pub mod view; diff --git a/datafusion/core/src/catalog_common/listing_schema.rs b/datafusion/catalog/src/listing_schema.rs similarity index 97% rename from datafusion/core/src/catalog_common/listing_schema.rs rename to datafusion/catalog/src/listing_schema.rs index dc55a07ef82d..cc2c2ee606b3 100644 --- a/datafusion/core/src/catalog_common/listing_schema.rs +++ b/datafusion/catalog/src/listing_schema.rs @@ -22,9 +22,9 @@ use std::collections::HashSet; use std::path::Path; use std::sync::{Arc, Mutex}; -use crate::catalog::{SchemaProvider, TableProvider, TableProviderFactory}; -use crate::execution::context::SessionState; +use crate::{SchemaProvider, TableProvider, TableProviderFactory}; +use crate::Session; use datafusion_common::{ Constraints, DFSchema, DataFusionError, HashMap, TableReference, }; @@ -88,7 +88,7 @@ impl ListingSchemaProvider { } /// Reload table information from ObjectStore - pub async fn refresh(&self, state: &SessionState) -> datafusion_common::Result<()> { + pub async fn refresh(&self, state: &dyn Session) -> datafusion_common::Result<()> { let entries: Vec<_> = self.store.list(Some(&self.path)).try_collect().await?; let base = Path::new(self.path.as_ref()); let mut tables = HashSet::new(); diff --git a/datafusion/catalog/src/memory.rs b/datafusion/catalog/src/memory/catalog.rs similarity index 70% rename from datafusion/catalog/src/memory.rs rename to datafusion/catalog/src/memory/catalog.rs index d22a98d3d064..b71888c54e9d 100644 --- a/datafusion/catalog/src/memory.rs +++ b/datafusion/catalog/src/memory/catalog.rs @@ -18,10 +18,9 @@ //! [`MemoryCatalogProvider`], [`MemoryCatalogProviderList`]: In-memory //! implementations of [`CatalogProviderList`] and [`CatalogProvider`]. -use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; -use async_trait::async_trait; +use crate::{CatalogProvider, CatalogProviderList, SchemaProvider}; use dashmap::DashMap; -use datafusion_common::{exec_err, DataFusionError}; +use datafusion_common::exec_err; use std::any::Any; use std::sync::Arc; @@ -134,67 +133,3 @@ impl CatalogProvider for MemoryCatalogProvider { } } } - -/// Simple in-memory implementation of a schema. -#[derive(Debug)] -pub struct MemorySchemaProvider { - tables: DashMap>, -} - -impl MemorySchemaProvider { - /// Instantiates a new MemorySchemaProvider with an empty collection of tables. 
- pub fn new() -> Self { - Self { - tables: DashMap::new(), - } - } -} - -impl Default for MemorySchemaProvider { - fn default() -> Self { - Self::new() - } -} - -#[async_trait] -impl SchemaProvider for MemorySchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn table_names(&self) -> Vec { - self.tables - .iter() - .map(|table| table.key().clone()) - .collect() - } - - async fn table( - &self, - name: &str, - ) -> datafusion_common::Result>, DataFusionError> { - Ok(self.tables.get(name).map(|table| Arc::clone(table.value()))) - } - - fn register_table( - &self, - name: String, - table: Arc, - ) -> datafusion_common::Result>> { - if self.table_exist(name.as_str()) { - return exec_err!("The table {name} already exists"); - } - Ok(self.tables.insert(name, table)) - } - - fn deregister_table( - &self, - name: &str, - ) -> datafusion_common::Result>> { - Ok(self.tables.remove(name).map(|(_, table)| table)) - } - - fn table_exist(&self, name: &str) -> bool { - self.tables.contains_key(name) - } -} diff --git a/datafusion/core/src/catalog_common/mod.rs b/datafusion/catalog/src/memory/mod.rs similarity index 77% rename from datafusion/core/src/catalog_common/mod.rs rename to datafusion/catalog/src/memory/mod.rs index 213afb32405e..4c5cf1a9ae9d 100644 --- a/datafusion/core/src/catalog_common/mod.rs +++ b/datafusion/catalog/src/memory/mod.rs @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! Interfaces and default implementations of catalogs and schemas. -//! -//! Implementations -//! * Listing schema: [`listing_schema`] +pub(crate) mod catalog; +pub(crate) mod schema; -pub mod listing_schema; -pub use crate::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; +pub use catalog::*; +pub use schema::*; diff --git a/datafusion/catalog/src/memory/schema.rs b/datafusion/catalog/src/memory/schema.rs new file mode 100644 index 000000000000..f1b3628f7aff --- /dev/null +++ b/datafusion/catalog/src/memory/schema.rs @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`MemorySchemaProvider`]: In-memory implementations of [`SchemaProvider`]. + +use crate::{SchemaProvider, TableProvider}; +use async_trait::async_trait; +use dashmap::DashMap; +use datafusion_common::{exec_err, DataFusionError}; +use std::any::Any; +use std::sync::Arc; + +/// Simple in-memory implementation of a schema. +#[derive(Debug)] +pub struct MemorySchemaProvider { + tables: DashMap>, +} + +impl MemorySchemaProvider { + /// Instantiates a new MemorySchemaProvider with an empty collection of tables. 
+ pub fn new() -> Self { + Self { + tables: DashMap::new(), + } + } +} + +impl Default for MemorySchemaProvider { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl SchemaProvider for MemorySchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.tables + .iter() + .map(|table| table.key().clone()) + .collect() + } + + async fn table( + &self, + name: &str, + ) -> datafusion_common::Result>, DataFusionError> { + Ok(self.tables.get(name).map(|table| Arc::clone(table.value()))) + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> datafusion_common::Result>> { + if self.table_exist(name.as_str()) { + return exec_err!("The table {name} already exists"); + } + Ok(self.tables.insert(name, table)) + } + + fn deregister_table( + &self, + name: &str, + ) -> datafusion_common::Result>> { + Ok(self.tables.remove(name).map(|(_, table)| table)) + } + + fn table_exist(&self, name: &str) -> bool { + self.tables.contains_key(name) + } +} diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index 9dd870e43568..88b9669cff6d 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -132,6 +132,9 @@ pub trait Session: Send + Sync { /// Returns a mutable reference to [`TableOptions`] fn table_options_mut(&mut self) -> &mut TableOptions; + + /// Get a new TaskContext to run in this session + fn task_ctx(&self) -> Arc; } /// Create a new task context instance from Session @@ -145,7 +148,7 @@ impl From<&dyn Session> for TaskContext { state.scalar_functions().clone(), state.aggregate_functions().clone(), state.window_functions().clone(), - state.runtime_env().clone(), + Arc::clone(state.runtime_env()), ) } } diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/catalog/src/stream.rs similarity index 97% rename from datafusion/core/src/datasource/stream.rs rename to datafusion/catalog/src/stream.rs index d5fe070be82c..3fb672490712 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/catalog/src/stream.rs @@ -25,9 +25,7 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; -use crate::catalog::{TableProvider, TableProviderFactory}; -use crate::datasource::create_ordering; - +use crate::{Session, TableProvider, TableProviderFactory}; use arrow::array::{RecordBatch, RecordBatchReader, RecordBatchWriter}; use arrow::datatypes::SchemaRef; use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result}; @@ -35,13 +33,13 @@ use datafusion_common_runtime::SpawnedTask; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; use datafusion_expr::{CreateExternalTable, Expr, SortExpr, TableType}; +use datafusion_physical_expr::create_ordering; use datafusion_physical_plan::insert::{DataSink, DataSinkExec}; use datafusion_physical_plan::stream::RecordBatchReceiverStreamBuilder; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use async_trait::async_trait; -use datafusion_catalog::Session; use futures::StreamExt; /// A [`TableProviderFactory`] for [`StreamTable`] @@ -292,7 +290,7 @@ impl StreamConfig { /// data stored in object storage, should instead consider [`ListingTable`]. 
/// /// [Hadoop]: https://hadoop.apache.org/ -/// [`ListingTable`]: crate::datasource::listing::ListingTable +/// [`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html #[derive(Debug)] pub struct StreamTable(Arc); @@ -400,8 +398,8 @@ impl PartitionStream for StreamRead { struct StreamWrite(Arc); impl DisplayAs for StreamWrite { - fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { - self.0.source.stream_write_display(_t, f) + fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + self.0.source.stream_write_display(t, f) } } diff --git a/datafusion/catalog/src/view.rs b/datafusion/catalog/src/view.rs new file mode 100644 index 000000000000..8dfb79718c9b --- /dev/null +++ b/datafusion/catalog/src/view.rs @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! View data source which uses a LogicalPlan as it's input. + +use std::{any::Any, borrow::Cow, sync::Arc}; + +use crate::Session; +use crate::TableProvider; + +use arrow::datatypes::SchemaRef; +use async_trait::async_trait; +use datafusion_common::error::Result; +use datafusion_common::Column; +use datafusion_expr::TableType; +use datafusion_expr::{Expr, LogicalPlan}; +use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; +use datafusion_physical_plan::ExecutionPlan; + +/// An implementation of `TableProvider` that uses another logical plan. +#[derive(Debug)] +pub struct ViewTable { + /// LogicalPlan of the view + logical_plan: LogicalPlan, + /// File fields + partition columns + table_schema: SchemaRef, + /// SQL used to create the view, if available + definition: Option, +} + +impl ViewTable { + /// Create new view that is executed at query runtime. + /// + /// Takes a `LogicalPlan` and optionally the SQL text of the `CREATE` + /// statement. + /// + /// Notes: the `LogicalPlan` is not validated or type coerced. If this is + /// needed it should be done after calling this function. 
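+ ///
+ /// # Example
+ ///
+ /// A minimal sketch; any pre-built `LogicalPlan` works, and the import path for
+ /// `ViewTable` is illustrative.
+ ///
+ /// ```ignore
+ /// use datafusion_catalog::view::ViewTable;
+ /// use datafusion_expr::LogicalPlanBuilder;
+ ///
+ /// // An empty relation keeps the example small; real views use a full query plan.
+ /// let plan = LogicalPlanBuilder::empty(false).build().unwrap();
+ /// let view = ViewTable::new(plan, Some("CREATE VIEW v AS ...".to_string()));
+ /// assert!(view.definition().is_some());
+ /// ```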
+ pub fn new(logical_plan: LogicalPlan, definition: Option) -> Self { + let table_schema = logical_plan.schema().as_ref().to_owned().into(); + Self { + logical_plan, + table_schema, + definition, + } + } + + #[deprecated( + since = "47.0.0", + note = "Use `ViewTable::new` instead and apply TypeCoercion to the logical plan if needed" + )] + pub fn try_new( + logical_plan: LogicalPlan, + definition: Option, + ) -> Result { + Ok(Self::new(logical_plan, definition)) + } + + /// Get definition ref + pub fn definition(&self) -> Option<&String> { + self.definition.as_ref() + } + + /// Get logical_plan ref + pub fn logical_plan(&self) -> &LogicalPlan { + &self.logical_plan + } +} + +#[async_trait] +impl TableProvider for ViewTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn get_logical_plan(&self) -> Option> { + Some(Cow::Borrowed(&self.logical_plan)) + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.table_schema) + } + + fn table_type(&self) -> TableType { + TableType::View + } + + fn get_table_definition(&self) -> Option<&str> { + self.definition.as_deref() + } + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + // A filter is added on the View when given + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) + } + + async fn scan( + &self, + state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + ) -> Result> { + let filter = filters.iter().cloned().reduce(|acc, new| acc.and(new)); + let plan = self.logical_plan().clone(); + let mut plan = LogicalPlanBuilder::from(plan); + + if let Some(filter) = filter { + plan = plan.filter(filter)?; + } + + let mut plan = if let Some(projection) = projection { + // avoiding adding a redundant projection (e.g. SELECT * FROM view) + let current_projection = + (0..plan.schema().fields().len()).collect::>(); + if projection == ¤t_projection { + plan + } else { + let fields: Vec = projection + .iter() + .map(|i| { + Expr::Column(Column::from( + self.logical_plan.schema().qualified_field(*i), + )) + }) + .collect(); + plan.project(fields)? + } + } else { + plan + }; + + if let Some(limit) = limit { + plan = plan.limit(0, Some(limit))?; + } + + state.create_physical_plan(&plan.build()?).await + } +} diff --git a/datafusion/common-runtime/Cargo.toml b/datafusion/common-runtime/Cargo.toml index 6fd9b7ac8fe8..5e7816b669de 100644 --- a/datafusion/common-runtime/Cargo.toml +++ b/datafusion/common-runtime/Cargo.toml @@ -38,8 +38,9 @@ workspace = true name = "datafusion_common_runtime" [dependencies] +futures = { workspace = true } log = { workspace = true } tokio = { workspace = true } [dev-dependencies] -tokio = { version = "1.36", features = ["rt", "rt-multi-thread", "time"] } +tokio = { version = "1.44", features = ["rt", "rt-multi-thread", "time"] } diff --git a/datafusion/common-runtime/src/common.rs b/datafusion/common-runtime/src/common.rs index 30f7526bc0b2..361f6af95cf1 100644 --- a/datafusion/common-runtime/src/common.rs +++ b/datafusion/common-runtime/src/common.rs @@ -17,7 +17,8 @@ use std::future::Future; -use tokio::task::{JoinError, JoinSet}; +use crate::JoinSet; +use tokio::task::JoinError; /// Helper that provides a simple API to spawn a single task and join it. /// Provides guarantees of aborting on `Drop` to keep it cancel-safe. 
diff --git a/datafusion/common-runtime/src/join_set.rs b/datafusion/common-runtime/src/join_set.rs new file mode 100644 index 000000000000..1857a4111dbc --- /dev/null +++ b/datafusion/common-runtime/src/join_set.rs @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::trace_utils::{trace_block, trace_future}; +use std::future::Future; +use std::task::{Context, Poll}; +use tokio::runtime::Handle; +use tokio::task::{AbortHandle, Id, JoinError, LocalSet}; + +/// A wrapper around Tokio's JoinSet that forwards all API calls while optionally +/// instrumenting spawned tasks and blocking closures with custom tracing behavior. +/// If no tracer is injected via `trace_utils::set_tracer`, tasks and closures are executed +/// without any instrumentation. +#[derive(Debug)] +pub struct JoinSet { + inner: tokio::task::JoinSet, +} + +impl Default for JoinSet { + fn default() -> Self { + Self::new() + } +} + +impl JoinSet { + /// [JoinSet::new](tokio::task::JoinSet::new) - Create a new JoinSet. + pub fn new() -> Self { + Self { + inner: tokio::task::JoinSet::new(), + } + } + + /// [JoinSet::len](tokio::task::JoinSet::len) - Return the number of tasks. + pub fn len(&self) -> usize { + self.inner.len() + } + + /// [JoinSet::is_empty](tokio::task::JoinSet::is_empty) - Check if the JoinSet is empty. + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } +} + +impl JoinSet { + /// [JoinSet::spawn](tokio::task::JoinSet::spawn) - Spawn a new task. + pub fn spawn(&mut self, task: F) -> AbortHandle + where + F: Future, + F: Send + 'static, + T: Send, + { + self.inner.spawn(trace_future(task)) + } + + /// [JoinSet::spawn_on](tokio::task::JoinSet::spawn_on) - Spawn a task on a provided runtime. + pub fn spawn_on(&mut self, task: F, handle: &Handle) -> AbortHandle + where + F: Future, + F: Send + 'static, + T: Send, + { + self.inner.spawn_on(trace_future(task), handle) + } + + /// [JoinSet::spawn_local](tokio::task::JoinSet::spawn_local) - Spawn a local task. + pub fn spawn_local(&mut self, task: F) -> AbortHandle + where + F: Future, + F: 'static, + { + self.inner.spawn_local(task) + } + + /// [JoinSet::spawn_local_on](tokio::task::JoinSet::spawn_local_on) - Spawn a local task on a provided LocalSet. + pub fn spawn_local_on(&mut self, task: F, local_set: &LocalSet) -> AbortHandle + where + F: Future, + F: 'static, + { + self.inner.spawn_local_on(task, local_set) + } + + /// [JoinSet::spawn_blocking](tokio::task::JoinSet::spawn_blocking) - Spawn a blocking task. 
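+ ///
+ /// A small sketch (assumes it runs inside a Tokio runtime):
+ /// ```ignore
+ /// let mut set = datafusion_common_runtime::JoinSet::new();
+ /// set.spawn_blocking(|| 40 + 2);
+ /// // later, inside async code: assert_eq!(set.join_next().await.unwrap().unwrap(), 42);
+ /// ```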
+ pub fn spawn_blocking(&mut self, f: F) -> AbortHandle + where + F: FnOnce() -> T, + F: Send + 'static, + T: Send, + { + self.inner.spawn_blocking(trace_block(f)) + } + + /// [JoinSet::spawn_blocking_on](tokio::task::JoinSet::spawn_blocking_on) - Spawn a blocking task on a provided runtime. + pub fn spawn_blocking_on(&mut self, f: F, handle: &Handle) -> AbortHandle + where + F: FnOnce() -> T, + F: Send + 'static, + T: Send, + { + self.inner.spawn_blocking_on(trace_block(f), handle) + } + + /// [JoinSet::join_next](tokio::task::JoinSet::join_next) - Await the next completed task. + pub async fn join_next(&mut self) -> Option> { + self.inner.join_next().await + } + + /// [JoinSet::try_join_next](tokio::task::JoinSet::try_join_next) - Try to join the next completed task. + pub fn try_join_next(&mut self) -> Option> { + self.inner.try_join_next() + } + + /// [JoinSet::abort_all](tokio::task::JoinSet::abort_all) - Abort all tasks. + pub fn abort_all(&mut self) { + self.inner.abort_all() + } + + /// [JoinSet::detach_all](tokio::task::JoinSet::detach_all) - Detach all tasks. + pub fn detach_all(&mut self) { + self.inner.detach_all() + } + + /// [JoinSet::poll_join_next](tokio::task::JoinSet::poll_join_next) - Poll for the next completed task. + pub fn poll_join_next( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + self.inner.poll_join_next(cx) + } + + /// [JoinSet::join_next_with_id](tokio::task::JoinSet::join_next_with_id) - Await the next completed task with its ID. + pub async fn join_next_with_id(&mut self) -> Option> { + self.inner.join_next_with_id().await + } + + /// [JoinSet::try_join_next_with_id](tokio::task::JoinSet::try_join_next_with_id) - Try to join the next completed task with its ID. + pub fn try_join_next_with_id(&mut self) -> Option> { + self.inner.try_join_next_with_id() + } + + /// [JoinSet::poll_join_next_with_id](tokio::task::JoinSet::poll_join_next_with_id) - Poll for the next completed task with its ID. + pub fn poll_join_next_with_id( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + self.inner.poll_join_next_with_id(cx) + } + + /// [JoinSet::shutdown](tokio::task::JoinSet::shutdown) - Abort all tasks and wait for shutdown. + pub async fn shutdown(&mut self) { + self.inner.shutdown().await + } + + /// [JoinSet::join_all](tokio::task::JoinSet::join_all) - Await all tasks. 
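+ ///
+ /// Sketch (inside an async context):
+ /// ```ignore
+ /// let mut set = JoinSet::new();
+ /// for i in 0..3 {
+ ///     set.spawn(async move { i });
+ /// }
+ /// let mut results = set.join_all().await;
+ /// results.sort();
+ /// assert_eq!(results, vec![0, 1, 2]);
+ /// ```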
+ pub async fn join_all(self) -> Vec { + self.inner.join_all().await + } +} diff --git a/datafusion/common-runtime/src/lib.rs b/datafusion/common-runtime/src/lib.rs index 7bd8dc4cfe36..ec8db0bdcd91 100644 --- a/datafusion/common-runtime/src/lib.rs +++ b/datafusion/common-runtime/src/lib.rs @@ -20,9 +20,14 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] pub mod common; +mod join_set; +mod trace_utils; pub use common::SpawnedTask; +pub use join_set::JoinSet; +pub use trace_utils::{set_join_set_tracer, JoinSetTracer}; diff --git a/datafusion/common-runtime/src/trace_utils.rs b/datafusion/common-runtime/src/trace_utils.rs new file mode 100644 index 000000000000..c3a39c355fc8 --- /dev/null +++ b/datafusion/common-runtime/src/trace_utils.rs @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use futures::future::BoxFuture; +use futures::FutureExt; +use std::any::Any; +use std::error::Error; +use std::fmt::{Display, Formatter, Result as FmtResult}; +use std::future::Future; +use tokio::sync::OnceCell; + +/// A trait for injecting instrumentation into either asynchronous futures or +/// blocking closures at runtime. +pub trait JoinSetTracer: Send + Sync + 'static { + /// Function pointer type for tracing a future. + /// + /// This function takes a boxed future (with its output type erased) + /// and returns a boxed future (with its output still erased). The + /// tracer must apply instrumentation without altering the output. + fn trace_future( + &self, + fut: BoxFuture<'static, Box>, + ) -> BoxFuture<'static, Box>; + + /// Function pointer type for tracing a blocking closure. + /// + /// This function takes a boxed closure (with its return type erased) + /// and returns a boxed closure (with its return type still erased). The + /// tracer must apply instrumentation without changing the return value. + fn trace_block( + &self, + f: Box Box + Send>, + ) -> Box Box + Send>; +} + +/// A no-op tracer that does not modify or instrument any futures or closures. +/// This is used as a fallback if no custom tracer is set. +struct NoopTracer; + +impl JoinSetTracer for NoopTracer { + fn trace_future( + &self, + fut: BoxFuture<'static, Box>, + ) -> BoxFuture<'static, Box> { + fut + } + + fn trace_block( + &self, + f: Box Box + Send>, + ) -> Box Box + Send> { + f + } +} + +/// A custom error type for tracer injection failures. 
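+///
+/// Returned by [`set_join_set_tracer`] when a global tracer has already been
+/// registered. A sketch with a pass-through tracer (the type name is made up
+/// for the example; the boxed `dyn Any + Send` signatures follow the trait above):
+///
+/// ```ignore
+/// use datafusion_common_runtime::{set_join_set_tracer, JoinSetTracer};
+/// use futures::future::BoxFuture;
+/// use std::any::Any;
+///
+/// struct PassThroughTracer;
+///
+/// impl JoinSetTracer for PassThroughTracer {
+///     fn trace_future(
+///         &self,
+///         fut: BoxFuture<'static, Box<dyn Any + Send>>,
+///     ) -> BoxFuture<'static, Box<dyn Any + Send>> {
+///         fut
+///     }
+///     fn trace_block(
+///         &self,
+///         f: Box<dyn FnOnce() -> Box<dyn Any + Send> + Send>,
+///     ) -> Box<dyn FnOnce() -> Box<dyn Any + Send> + Send> {
+///         f
+///     }
+/// }
+///
+/// static TRACER: PassThroughTracer = PassThroughTracer;
+/// set_join_set_tracer(&TRACER).unwrap();
+/// // A second registration fails with `JoinSetTracerError::AlreadySet`:
+/// assert!(set_join_set_tracer(&TRACER).is_err());
+/// ```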
+#[derive(Debug)] +pub enum JoinSetTracerError { + /// The global tracer has already been set. + AlreadySet, +} + +impl Display for JoinSetTracerError { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + JoinSetTracerError::AlreadySet => { + write!(f, "The global JoinSetTracer is already set") + } + } + } +} + +impl Error for JoinSetTracerError {} + +/// Global storage for an injected tracer. If no tracer is injected, a no-op +/// tracer is used instead. This ensures that calls to [`trace_future`] or +/// [`trace_block`] never panic due to missing instrumentation. +static GLOBAL_TRACER: OnceCell<&'static dyn JoinSetTracer> = OnceCell::const_new(); + +/// A no-op tracer singleton that is returned by [`get_tracer`] if no custom +/// tracer has been registered. +static NOOP_TRACER: NoopTracer = NoopTracer; + +/// Return the currently registered tracer, or the no-op tracer if none was +/// registered. +#[inline] +fn get_tracer() -> &'static dyn JoinSetTracer { + GLOBAL_TRACER.get().copied().unwrap_or(&NOOP_TRACER) +} + +/// Set the custom tracer for both futures and blocking closures. +/// +/// This should be called once at startup. If called more than once, an +/// `Err(JoinSetTracerError)` is returned. If not called at all, a no-op tracer that does nothing +/// is used. +pub fn set_join_set_tracer( + tracer: &'static dyn JoinSetTracer, +) -> Result<(), JoinSetTracerError> { + GLOBAL_TRACER + .set(tracer) + .map_err(|_set_err| JoinSetTracerError::AlreadySet) +} + +/// Optionally instruments a future with custom tracing. +/// +/// If a tracer has been injected via `set_tracer`, the future's output is +/// boxed (erasing its type), passed to the tracer, and then downcast back +/// to the expected type. If no tracer is set, the original future is returned. +/// +/// # Type Parameters +/// * `T` - The concrete output type of the future. +/// * `F` - The future type. +/// +/// # Parameters +/// * `future` - The future to potentially instrument. +pub fn trace_future(future: F) -> BoxFuture<'static, T> +where + F: Future + Send + 'static, + T: Send + 'static, +{ + // Erase the future’s output type first: + let erased_future = async move { + let result = future.await; + Box::new(result) as Box + } + .boxed(); + + // Forward through the global tracer: + get_tracer() + .trace_future(erased_future) + // Downcast from `Box` back to `T`: + .map(|any_box| { + *any_box + .downcast::() + .expect("Tracer must preserve the future’s output type!") + }) + .boxed() +} + +/// Optionally instruments a blocking closure with custom tracing. +/// +/// If a tracer has been injected via `set_tracer`, the closure is wrapped so that +/// its return value is boxed (erasing its type), passed to the tracer, and then the +/// result is downcast back to the original type. If no tracer is set, the closure is +/// returned unmodified (except for being boxed). +/// +/// # Type Parameters +/// * `T` - The concrete return type of the closure. +/// * `F` - The closure type. +/// +/// # Parameters +/// * `f` - The blocking closure to potentially instrument. 
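+///
+/// # Example (crate-internal sketch)
+///
+/// ```ignore
+/// // With or without a registered tracer, the traced closure returns the same value.
+/// let traced = trace_block(|| 21 * 2);
+/// assert_eq!(traced(), 42);
+/// ```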
+pub fn trace_block(f: F) -> Box T + Send> +where + F: FnOnce() -> T + Send + 'static, + T: Send + 'static, +{ + // Erase the closure’s return type first: + let erased_closure = Box::new(|| { + let result = f(); + Box::new(result) as Box + }); + + // Forward through the global tracer: + let traced_closure = get_tracer().trace_block(erased_closure); + + // Downcast from `Box` back to `T`: + Box::new(move || { + let any_box = traced_closure(); + *any_box + .downcast::() + .expect("Tracer must preserve the closure’s return type!") + }) +} diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 9f039fe184a6..39b47a96bccf 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -58,7 +58,7 @@ base64 = "0.22.1" half = { workspace = true } hashbrown = { workspace = true } indexmap = { workspace = true } -libc = "0.2.170" +libc = "0.2.171" log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } @@ -73,4 +73,5 @@ web-time = "1.1.0" [dev-dependencies] chrono = { workspace = true } +insta = { workspace = true } rand = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 8c093a9db899..b0f17630c910 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -252,6 +252,11 @@ config_namespace! { /// string length and thus DataFusion can not enforce such limits. pub support_varchar_with_length: bool, default = true + /// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. + /// If false, `VARCHAR` is mapped to `Utf8` during SQL planning. + /// Default is false. + pub map_varchar_to_utf8view: bool, default = false + /// When set to true, the source locations relative to the original SQL /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected /// and recorded in the logical plan nodes. diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 65bb40810f18..43d082f9dc93 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -564,6 +564,7 @@ impl DFSchema { } /// Check to see if fields in 2 Arrow schemas are compatible + #[deprecated(since = "47.0.0", note = "This method is no longer used")] pub fn check_arrow_schema_type_compatible( &self, arrow_schema: &Schema, @@ -604,26 +605,57 @@ impl DFSchema { }) } - /// Returns true if the two schemas have the same qualified named - /// fields with the same data types. Returns false otherwise. + #[deprecated(since = "47.0.0", note = "Use has_equivalent_names_and_types` instead")] + pub fn equivalent_names_and_types(&self, other: &Self) -> bool { + self.has_equivalent_names_and_types(other).is_ok() + } + + /// Returns Ok if the two schemas have the same qualified named + /// fields with the compatible data types. /// - /// This is a specialized version of Eq that ignores differences - /// in nullability and metadata. + /// Returns an `Err` with a message otherwise. + /// + /// This is a specialized version of Eq that ignores differences in + /// nullability and metadata. /// /// Use [DFSchema]::logically_equivalent_names_and_types for a weaker /// logical type checking, which for example would consider a dictionary /// encoded UTF8 array to be equivalent to a plain UTF8 array. 
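+    ///
+    /// Sketch of intended usage (schema construction elided):
+    ///
+    /// ```ignore
+    /// // same qualified field names with castable types => Ok(())
+    /// assert!(schema_a.has_equivalent_names_and_types(&schema_b).is_ok());
+    /// // mismatched field name or incompatible type => Err with a descriptive message
+    /// assert!(schema_a.has_equivalent_names_and_types(&schema_c).is_err());
+    /// ```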
- pub fn equivalent_names_and_types(&self, other: &Self) -> bool { + pub fn has_equivalent_names_and_types(&self, other: &Self) -> Result<()> { + // case 1 : schema length mismatch if self.fields().len() != other.fields().len() { - return false; + _plan_err!( + "Schema mismatch: the schema length are not same \ + Expected schema length: {}, got: {}", + self.fields().len(), + other.fields().len() + ) + } else { + // case 2 : schema length match, but fields mismatch + // check if the fields name are the same and have the same data types + self.fields() + .iter() + .zip(other.fields().iter()) + .try_for_each(|(f1, f2)| { + if f1.name() != f2.name() + || (!DFSchema::datatype_is_semantically_equal( + f1.data_type(), + f2.data_type(), + ) && !can_cast_types(f2.data_type(), f1.data_type())) + { + _plan_err!( + "Schema mismatch: Expected field '{}' with type {:?}, \ + but got '{}' with type {:?}.", + f1.name(), + f1.data_type(), + f2.name(), + f2.data_type() + ) + } else { + Ok(()) + } + }) } - let self_fields = self.iter(); - let other_fields = other.iter(); - self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| { - q1 == q2 - && f1.name() == f2.name() - && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type()) - }) } /// Checks if two [`DataType`]s are logically equal. This is a notably weaker constraint diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index d5b7c22a546c..b137624532b9 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] mod column; diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 367f359ae742..2b758f456876 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -38,6 +38,7 @@ use crate::cast::{ as_fixed_size_binary_array, as_fixed_size_list_array, }; use crate::error::{DataFusionError, Result, _exec_err, _internal_err, _not_impl_err}; +use crate::format::DEFAULT_CAST_OPTIONS; use crate::hash_utils::create_hashes; use crate::utils::SingleRowListArrayBuilder; use arrow::array::{ @@ -58,8 +59,6 @@ use arrow::datatypes::{ UInt8Type, UnionFields, UnionMode, DECIMAL128_MAX_PRECISION, }; use arrow::util::display::{array_value_to_string, ArrayFormatter, FormatOptions}; - -use crate::format::DEFAULT_CAST_OPTIONS; use half::f16; pub use struct_builder::ScalarStructBuilder; @@ -3976,7 +3975,7 @@ mod tests { as_map_array, as_string_array, as_struct_array, as_uint32_array, as_uint64_array, }; - use crate::assert_batches_eq; + use crate::test_util::batches_to_string; use arrow::array::{types::Float64Type, NullBufferBuilder}; use arrow::buffer::{Buffer, OffsetBuffer}; use arrow::compute::{is_null, kernels}; @@ -3984,6 +3983,7 @@ mod tests { use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_columns; use chrono::NaiveDate; + use insta::assert_snapshot; use rand::Rng; #[test] @@ -6910,14 +6910,13 @@ mod tests { //verify compared to arrow display let batch = RecordBatch::try_from_iter(vec![("s", arr as _)]).unwrap(); - let expected = [ - "+-------------+", - "| s |", - "+-------------+", - "| 
{a: 1, b: } |", - "+-------------+", - ]; - assert_batches_eq!(&expected, &[batch]); + assert_snapshot!(batches_to_string(&[batch]), @r" + +-------------+ + | s | + +-------------+ + | {a: 1, b: } | + +-------------+ + "); } #[test] @@ -6946,14 +6945,13 @@ mod tests { //verify compared to arrow display let batch = RecordBatch::try_from_iter(vec![("s", arr as _)]).unwrap(); - let expected = [ - "+--------------+", - "| s |", - "+--------------+", - "| {a: 1, b: 2} |", - "+--------------+", - ]; - assert_batches_eq!(&expected, &[batch]); + assert_snapshot!(batches_to_string(&[batch]), @r" + +--------------+ + | s | + +--------------+ + | {a: 1, b: 2} | + +--------------+ + "); } #[test] @@ -6969,15 +6967,13 @@ mod tests { //verify compared to arrow display let batch = RecordBatch::try_from_iter(vec![("s", arr as _)]).unwrap(); - #[rustfmt::skip] - let expected = [ - "+---+", - "| s |", - "+---+", - "| |", - "+---+", - ]; - assert_batches_eq!(&expected, &[batch]); + assert_snapshot!(batches_to_string(&[batch]), @r" + +---+ + | s | + +---+ + | | + +---+ + "); } #[test] @@ -7011,17 +7007,16 @@ mod tests { //verify compared to arrow display let batch = RecordBatch::try_from_iter(vec![("m", arr as _)]).unwrap(); - let expected = [ - "+--------------------+", - "| m |", - "+--------------------+", - "| {joe: 1} |", - "| {blogs: 2, foo: 4} |", - "| {} |", - "| |", - "+--------------------+", - ]; - assert_batches_eq!(&expected, &[batch]); + assert_snapshot!(batches_to_string(&[batch]), @r" + +--------------------+ + | m | + +--------------------+ + | {joe: 1} | + | {blogs: 2, foo: 4} | + | {} | + | | + +--------------------+ + "); } #[test] diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index 298f54389cf8..b801c452af2c 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -17,6 +17,9 @@ //! Utility functions to make testing DataFusion based crates easier +use crate::arrow::util::pretty::pretty_format_batches_with_options; +use crate::format::DEFAULT_FORMAT_OPTIONS; +use arrow::array::RecordBatch; use std::{error::Error, path::PathBuf}; /// Compares formatted output of a record batch with an expected @@ -73,6 +76,31 @@ macro_rules! assert_batches_eq { }; } +pub fn batches_to_string(batches: &[RecordBatch]) -> String { + let actual = pretty_format_batches_with_options(batches, &DEFAULT_FORMAT_OPTIONS) + .unwrap() + .to_string(); + + actual.trim().to_string() +} + +pub fn batches_to_sort_string(batches: &[RecordBatch]) -> String { + let actual_lines = + pretty_format_batches_with_options(batches, &DEFAULT_FORMAT_OPTIONS) + .unwrap() + .to_string(); + + let mut actual_lines: Vec<&str> = actual_lines.trim().lines().collect(); + + // sort except for header + footer + let num_lines = actual_lines.len(); + if num_lines > 3 { + actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + actual_lines.join("\n") +} + /// Compares formatted output of a record batch with an expected /// vector of strings in a way that order does not matter. 
/// This is a macro so errors appear on the correct line diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index af029b177607..3073abf8bb1a 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -79,7 +79,12 @@ recursive_protection = [ "datafusion-physical-optimizer/recursive_protection", "datafusion-sql/recursive_protection", ] -serde = ["dep:serde"] +serde = [ + "dep:serde", + # Enable `#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]` + # statements in `arrow-schema` crate + "arrow-schema/serde", +] string_expressions = ["datafusion-functions/string_expressions"] unicode_expressions = [ "datafusion-sql/unicode_expressions", @@ -93,7 +98,7 @@ arrow-ipc = { workspace = true } arrow-schema = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } -bzip2 = { version = "0.5.1", optional = true } +bzip2 = { version = "0.5.2", optional = true } chrono = { workspace = true } datafusion-catalog = { workspace = true } datafusion-catalog-listing = { workspace = true } @@ -133,13 +138,13 @@ sqlparser = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } url = { workspace = true } -uuid = { version = "1.15", features = ["v4", "js"] } +uuid = { version = "1.16", features = ["v4", "js"] } xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } [dev-dependencies] async-trait = { workspace = true } -criterion = { workspace = true, features = ["async_tokio"] } +criterion = { workspace = true, features = ["async_tokio", "async_futures"] } ctor = { workspace = true } dashmap = "6.1.0" datafusion-doc = { workspace = true } @@ -147,6 +152,7 @@ datafusion-functions-window-common = { workspace = true } datafusion-physical-optimizer = { workspace = true } doc-comment = { workspace = true } env_logger = { workspace = true } +insta = { workspace = true } paste = "^1.0" rand = { workspace = true, features = ["small_rng"] } rand_distr = "0.4.3" diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index e998e489a927..9c27c7c5d307 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -54,6 +54,7 @@ use datafusion_common::{ exec_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaError, UnnestOptions, }; +use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{ case, dml::InsertOp, @@ -342,13 +343,28 @@ impl DataFrame { /// # Ok(()) /// # } /// ``` - pub fn select(self, expr_list: Vec) -> Result { - let window_func_exprs = find_window_exprs(&expr_list); + pub fn select( + self, + expr_list: impl IntoIterator>, + ) -> Result { + let expr_list: Vec = + expr_list.into_iter().map(|e| e.into()).collect::>(); + + let expressions = expr_list + .iter() + .filter_map(|e| match e { + SelectExpr::Expression(expr) => Some(expr.clone()), + _ => None, + }) + .collect::>(); + + let window_func_exprs = find_window_exprs(&expressions); let plan = if window_func_exprs.is_empty() { self.plan } else { LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)? 
}; + let project_plan = LogicalPlanBuilder::from(plan).project(expr_list)?.build()?; Ok(DataFrame { @@ -2039,6 +2055,7 @@ impl DataFrame { })), relation: None, name: field.name().to_string(), + metadata: None, }), Err(_) => col(field.name()), } diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 6835d9a6da2a..3172e5692559 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -46,7 +46,7 @@ use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ not_impl_err, DataFusionError, GetExt, Statistics, DEFAULT_ARROW_EXTENSION, }; -use datafusion_common_runtime::SpawnedTask; +use datafusion_common_runtime::{JoinSet, SpawnedTask}; use datafusion_datasource::display::FileGroupDisplay; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; @@ -62,7 +62,6 @@ use futures::stream::BoxStream; use futures::StreamExt; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; use tokio::io::AsyncWriteExt; -use tokio::task::JoinSet; /// Initial writing buffer size. Note this is just a size hint for efficiency. It /// will grow beyond the set value if needed. @@ -302,8 +301,8 @@ impl DisplayAs for ArrowFileSink { write!(f, ")") } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + writeln!(f, "format: arrow")?; + write!(f, "file={}", &self.config.original_url) } } } diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index e2c7d1ecafa3..3b71593b3334 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -1373,6 +1373,7 @@ mod tests { let object_store_url = ObjectStoreUrl::local_filesystem(); let file_sink_config = FileSinkConfig { + original_url: String::default(), object_store_url: object_store_url.clone(), file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], table_paths: vec![ListingTableUrl::parse(table_path)?], @@ -1458,6 +1459,7 @@ mod tests { // set file config to include partitioning on field_a let file_sink_config = FileSinkConfig { + original_url: String::default(), object_store_url: object_store_url.clone(), file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], table_paths: vec![ListingTableUrl::parse("file:///")?], @@ -1541,6 +1543,7 @@ mod tests { let object_store_url = ObjectStoreUrl::local_filesystem(); let file_sink_config = FileSinkConfig { + original_url: String::default(), object_store_url: object_store_url.clone(), file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], table_paths: vec![ListingTableUrl::parse("file:///")?], diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 4d7762784d78..21b35bac2174 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1037,6 +1037,7 @@ impl TableProvider for ListingTable { // Sink related option, apart from format let config = FileSinkConfig { + original_url: String::default(), object_store_url: self.table_paths()[0].object_store(), table_paths: self.table_paths().clone(), file_groups, diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index d96944fa7a69..23d09719f202 100644 --- a/datafusion/core/src/datasource/memory.rs +++ 
b/datafusion/core/src/datasource/memory.rs @@ -24,7 +24,6 @@ use std::sync::Arc; use crate::datasource::{TableProvider, TableType}; use crate::error::Result; -use crate::execution::context::SessionState; use crate::logical_expr::Expr; use crate::physical_plan::insert::{DataSink, DataSinkExec}; use crate::physical_plan::repartition::RepartitionExec; @@ -38,6 +37,7 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_catalog::Session; use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; +use datafusion_common_runtime::JoinSet; pub use datafusion_datasource::memory::MemorySourceConfig; pub use datafusion_datasource::source::DataSourceExec; use datafusion_execution::TaskContext; @@ -49,7 +49,6 @@ use futures::StreamExt; use log::debug; use parking_lot::Mutex; use tokio::sync::RwLock; -use tokio::task::JoinSet; /// Type alias for partition data pub type PartitionData = Arc>>; @@ -129,7 +128,7 @@ impl MemTable { pub async fn load( t: Arc, output_partitions: Option, - state: &SessionState, + state: &dyn Session, ) -> Result { let schema = t.schema(); let constraints = t.constraints(); @@ -267,6 +266,8 @@ impl TableProvider for MemTable { /// # Returns /// /// * A plan that returns the number of rows written. + /// + /// [`SessionState`]: crate::execution::context::SessionState async fn insert_into( &self, _state: &dyn Session, diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 18a1318dd40d..a932ae76c621 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -19,8 +19,6 @@ //! //! [`ListingTable`]: crate::datasource::listing::ListingTable -pub mod cte_worktable; -pub mod default_table_source; pub mod dynamic_file; pub mod empty; pub mod file_format; @@ -30,11 +28,7 @@ pub mod memory; pub mod physical_plan; pub mod provider; mod statistics; -pub mod stream; -pub mod view; - -pub use datafusion_datasource::schema_adapter; -pub use datafusion_datasource::source; +mod view_test; // backwards compatibility pub use self::default_table_source::{ @@ -44,96 +38,16 @@ pub use self::memory::MemTable; pub use self::view::ViewTable; pub use crate::catalog::TableProvider; pub use crate::logical_expr::TableType; +pub use datafusion_catalog::cte_worktable; +pub use datafusion_catalog::default_table_source; +pub use datafusion_catalog::stream; +pub use datafusion_catalog::view; +pub use datafusion_datasource::schema_adapter; +pub use datafusion_datasource::source; pub use datafusion_execution::object_store; +pub use datafusion_physical_expr::create_ordering; pub use statistics::get_statistics_with_limit; -use arrow::compute::SortOptions; -use arrow::datatypes::Schema; -use datafusion_common::{plan_err, Result}; -use datafusion_expr::{Expr, SortExpr}; -use datafusion_physical_expr::{expressions, LexOrdering, PhysicalSortExpr}; - -/// Converts logical sort expressions to physical sort expressions -/// -/// This function transforms a collection of logical sort expressions into their physical -/// representation that can be used during query execution. 
-/// -/// # Arguments -/// -/// * `schema` - The schema containing column definitions -/// * `sort_order` - A collection of logical sort expressions grouped into lexicographic orderings -/// -/// # Returns -/// -/// A vector of lexicographic orderings for physical execution, or an error if the transformation fails -/// -/// # Examples -/// -/// ``` -/// // Create orderings from columns "id" and "name" -/// # use arrow::datatypes::{Schema, Field, DataType}; -/// # use datafusion::datasource::create_ordering; -/// # use datafusion_common::Column; -/// # use datafusion_expr::{Expr, SortExpr}; -/// # -/// // Create a schema with two fields -/// let schema = Schema::new(vec![ -/// Field::new("id", DataType::Int32, false), -/// Field::new("name", DataType::Utf8, false), -/// ]); -/// -/// let sort_exprs = vec![ -/// vec![ -/// SortExpr { expr: Expr::Column(Column::new(Some("t"), "id")), asc: true, nulls_first: false } -/// ], -/// vec![ -/// SortExpr { expr: Expr::Column(Column::new(Some("t"), "name")), asc: false, nulls_first: true } -/// ] -/// ]; -/// let result = create_ordering(&schema, &sort_exprs).unwrap(); -/// ``` -pub fn create_ordering( - schema: &Schema, - sort_order: &[Vec], -) -> Result> { - let mut all_sort_orders = vec![]; - - for (group_idx, exprs) in sort_order.iter().enumerate() { - // Construct PhysicalSortExpr objects from Expr objects: - let mut sort_exprs = LexOrdering::default(); - for (expr_idx, sort) in exprs.iter().enumerate() { - match &sort.expr { - Expr::Column(col) => match expressions::col(&col.name, schema) { - Ok(expr) => { - sort_exprs.push(PhysicalSortExpr { - expr, - options: SortOptions { - descending: !sort.asc, - nulls_first: sort.nulls_first, - }, - }); - } - // Cannot find expression in the projected_schema, stop iterating - // since rest of the orderings are violated - Err(_) => break, - }, - expr => { - return plan_err!( - "Expected single column reference in sort_order[{}][{}], got {}", - group_idx, - expr_idx, - expr - ); - } - } - } - if !sort_exprs.is_empty() { - all_sort_orders.push(sort_exprs); - } - } - Ok(all_sort_orders) -} - #[cfg(all(test, feature = "parquet"))] mod tests { @@ -347,12 +261,5 @@ mod tests { Ok(RecordBatch::try_new(schema, new_columns).unwrap()) } - - fn map_partial_batch( - &self, - batch: RecordBatch, - ) -> datafusion_common::Result { - self.map_batch(batch) - } } } diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 888f3ad9e3b9..b5534d6b3d1c 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -224,6 +224,327 @@ mod tests { ) } + #[tokio::test] + async fn test_pushdown_with_missing_column_in_file() { + let c1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + + let file_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)])); + + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new(file_schema.clone(), vec![c1]).unwrap(); + + // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // the default behavior is to fill in missing columns with nulls. + // Thus this predicate will come back as false. 
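+        // (Strictly speaking, `c2 = 1` evaluates to NULL rather than false for every
+        // row, so no rows pass the pushed-down filter.)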
+ let filter = col("c2").eq(lit(1_i32)); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let total_rows = rt + .batches + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum::(); + assert_eq!(total_rows, 0, "Expected no rows to match the predicate"); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 3, "Expected all rows to be pruned"); + + // If we excplicitly allow nulls the rest of the predicate should work + let filter = col("c2").is_null().and(col("c1").eq(lit(1_i32))); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let batches = rt.batches.unwrap(); + #[rustfmt::skip] + let expected = [ + "+----+----+", + "| c1 | c2 |", + "+----+----+", + "| 1 | |", + "+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 2, "Expected all rows to be pruned"); + } + + #[tokio::test] + async fn test_pushdown_with_missing_column_in_file_multiple_types() { + let c1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + + let file_schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)])); + + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Utf8, true), + ])); + + let batch = RecordBatch::try_new(file_schema.clone(), vec![c1]).unwrap(); + + // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // the default behavior is to fill in missing columns with nulls. + // Thus this predicate will come back as false. 
+ let filter = col("c2").eq(lit("abc")); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let total_rows = rt + .batches + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum::(); + assert_eq!(total_rows, 0, "Expected no rows to match the predicate"); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 3, "Expected all rows to be pruned"); + + // If we excplicitly allow nulls the rest of the predicate should work + let filter = col("c2").is_null().and(col("c1").eq(lit(1_i32))); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let batches = rt.batches.unwrap(); + #[rustfmt::skip] + let expected = [ + "+----+----+", + "| c1 | c2 |", + "+----+----+", + "| 1 | |", + "+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 2, "Expected all rows to be pruned"); + } + + #[tokio::test] + async fn test_pushdown_with_missing_middle_column() { + let c1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let c3 = Arc::new(Int32Array::from(vec![7, 8, 9])); + + let file_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c3", DataType::Int32, true), + ])); + + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Utf8, true), + Field::new("c3", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new(file_schema.clone(), vec![c1, c3]).unwrap(); + + // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // the default behavior is to fill in missing columns with nulls. + // Thus this predicate will come back as false. 
+ let filter = col("c2").eq(lit("abc")); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let total_rows = rt + .batches + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum::(); + assert_eq!(total_rows, 0, "Expected no rows to match the predicate"); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 3, "Expected all rows to be pruned"); + + // If we excplicitly allow nulls the rest of the predicate should work + let filter = col("c2").is_null().and(col("c1").eq(lit(1_i32))); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let batches = rt.batches.unwrap(); + #[rustfmt::skip] + let expected = [ + "+----+----+----+", + "| c1 | c2 | c3 |", + "+----+----+----+", + "| 1 | | 7 |", + "+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 2, "Expected all rows to be pruned"); + } + + #[tokio::test] + async fn test_pushdown_with_file_column_order_mismatch() { + let c3 = Arc::new(Int32Array::from(vec![7, 8, 9])); + + let file_schema = Arc::new(Schema::new(vec![ + Field::new("c3", DataType::Int32, true), + Field::new("c3", DataType::Int32, true), + ])); + + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Utf8, true), + Field::new("c3", DataType::Int32, true), + ])); + + let batch = + RecordBatch::try_new(file_schema.clone(), vec![c3.clone(), c3]).unwrap(); + + // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`, + // the default behavior is to fill in missing columns with nulls. + // Thus this predicate will come back as false. 
+ let filter = col("c2").eq(lit("abc")); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let total_rows = rt + .batches + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum::(); + assert_eq!(total_rows, 0, "Expected no rows to match the predicate"); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 3, "Expected all rows to be pruned"); + + // If we excplicitly allow nulls the rest of the predicate should work + let filter = col("c2").is_null().and(col("c3").eq(lit(7_i32))); + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + let batches = rt.batches.unwrap(); + #[rustfmt::skip] + let expected = [ + "+----+----+----+", + "| c1 | c2 | c3 |", + "+----+----+----+", + "| | | 7 |", + "+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 2, "Expected all rows to be pruned"); + } + + #[tokio::test] + async fn test_pushdown_with_missing_column_nested_conditions() { + // Create test data with c1 and c3 columns + let c1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let c3: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30, 40, 50])); + + let file_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c3", DataType::Int32, true), + ])); + + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Int32, true), + Field::new("c3", DataType::Int32, true), + ])); + + let batch = RecordBatch::try_new(file_schema.clone(), vec![c1, c3]).unwrap(); + + // Test with complex nested AND/OR: + // (c1 = 1 OR c2 = 5) AND (c3 = 10 OR c2 IS NULL) + // Should return 1 row where c1=1 AND c3=10 (since c2 IS NULL is always true) + let filter = col("c1") + .eq(lit(1_i32)) + .or(col("c2").eq(lit(5_i32))) + .and(col("c3").eq(lit(10_i32)).or(col("c2").is_null())); + + let rt = RoundTrip::new() + .with_schema(table_schema.clone()) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch.clone()]) + .await; + + let batches = rt.batches.unwrap(); + #[rustfmt::skip] + let expected = [ + "+----+----+----+", + "| c1 | c2 | c3 |", + "+----+----+----+", + "| 1 | | 10 |", + "+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 4, "Expected 4 rows to be pruned"); + + // Test a more complex nested condition: + // (c1 < 3 AND c2 IS NOT NULL) OR (c3 > 20 AND c2 IS NULL) + // First part should return 0 rows (c2 IS NOT NULL is always false) + // Second part should return rows where c3 > 20 (3 rows: where c3 is 30, 40, 50) + let filter = col("c1") + .lt(lit(3_i32)) + .and(col("c2").is_not_null()) + .or(col("c3").gt(lit(20_i32)).and(col("c2").is_null())); + + let rt = RoundTrip::new() + .with_schema(table_schema) + .with_predicate(filter.clone()) + .with_pushdown_predicate() + .round_trip(vec![batch]) + .await; + + let batches = rt.batches.unwrap(); + #[rustfmt::skip] + let expected = [ + "+----+----+----+", + "| c1 | c2 | c3 |", + "+----+----+----+", + "| 3 | | 30 
|", + "| 4 | | 40 |", + "| 5 | | 50 |", + "+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + let metrics = rt.parquet_exec.metrics().unwrap(); + let metric = get_value(&metrics, "pushdown_rows_pruned"); + assert_eq!(metric, 2, "Expected 2 rows to be pruned"); + } + #[tokio::test] async fn evolved_schema() { let c1: ArrayRef = diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view_test.rs similarity index 77% rename from datafusion/core/src/datasource/view.rs rename to datafusion/core/src/datasource/view_test.rs index 91e9b6789fda..c3dd5a2dd979 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view_test.rs @@ -17,158 +17,14 @@ //! View data source which uses a LogicalPlan as it's input. -use std::{any::Any, borrow::Cow, sync::Arc}; - -use crate::{ - error::Result, - logical_expr::{Expr, LogicalPlan}, - physical_plan::ExecutionPlan, -}; -use arrow::datatypes::SchemaRef; -use async_trait::async_trait; -use datafusion_catalog::Session; -use datafusion_common::config::ConfigOptions; -use datafusion_common::Column; -use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; -use datafusion_optimizer::analyzer::expand_wildcard_rule::ExpandWildcardRule; -use datafusion_optimizer::analyzer::type_coercion::TypeCoercion; -use datafusion_optimizer::Analyzer; - -use crate::datasource::{TableProvider, TableType}; - -/// An implementation of `TableProvider` that uses another logical plan. -#[derive(Debug)] -pub struct ViewTable { - /// LogicalPlan of the view - logical_plan: LogicalPlan, - /// File fields + partition columns - table_schema: SchemaRef, - /// SQL used to create the view, if available - definition: Option, -} - -impl ViewTable { - /// Create new view that is executed at query runtime. - /// Takes a `LogicalPlan` and an optional create statement as input. 
- pub fn try_new( - logical_plan: LogicalPlan, - definition: Option, - ) -> Result { - let logical_plan = Self::apply_required_rule(logical_plan)?; - let table_schema = logical_plan.schema().as_ref().to_owned().into(); - - let view = Self { - logical_plan, - table_schema, - definition, - }; - - Ok(view) - } - - fn apply_required_rule(logical_plan: LogicalPlan) -> Result { - let options = ConfigOptions::default(); - Analyzer::with_rules(vec![ - Arc::new(ExpandWildcardRule::new()), - Arc::new(TypeCoercion::new()), - ]) - .execute_and_check(logical_plan, &options, |_, _| {}) - } - - /// Get definition ref - pub fn definition(&self) -> Option<&String> { - self.definition.as_ref() - } - - /// Get logical_plan ref - pub fn logical_plan(&self) -> &LogicalPlan { - &self.logical_plan - } -} - -#[async_trait] -impl TableProvider for ViewTable { - fn as_any(&self) -> &dyn Any { - self - } - - fn get_logical_plan(&self) -> Option> { - Some(Cow::Borrowed(&self.logical_plan)) - } - - fn schema(&self) -> SchemaRef { - Arc::clone(&self.table_schema) - } - - fn table_type(&self) -> TableType { - TableType::View - } - - fn get_table_definition(&self) -> Option<&str> { - self.definition.as_deref() - } - fn supports_filters_pushdown( - &self, - filters: &[&Expr], - ) -> Result> { - // A filter is added on the View when given - Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) - } - - async fn scan( - &self, - state: &dyn Session, - projection: Option<&Vec>, - filters: &[Expr], - limit: Option, - ) -> Result> { - let filter = filters.iter().cloned().reduce(|acc, new| acc.and(new)); - let plan = self.logical_plan().clone(); - let mut plan = LogicalPlanBuilder::from(plan); - - if let Some(filter) = filter { - plan = plan.filter(filter)?; - } - - let mut plan = if let Some(projection) = projection { - // avoiding adding a redundant projection (e.g. SELECT * FROM view) - let current_projection = - (0..plan.schema().fields().len()).collect::>(); - if projection == ¤t_projection { - plan - } else { - let fields: Vec = projection - .iter() - .map(|i| { - Expr::Column(Column::from( - self.logical_plan.schema().qualified_field(*i), - )) - }) - .collect(); - plan.project(fields)? 
- } - } else { - plan - }; - - if let Some(limit) = limit { - plan = plan.limit(0, Some(limit))?; - } - - state.create_physical_plan(&plan.build()?).await - } -} - #[cfg(test)] mod tests { - use datafusion_expr::{col, lit}; - + use crate::error::Result; use crate::execution::options::ParquetReadOptions; use crate::prelude::SessionContext; use crate::test_util::parquet_test_data; use crate::{assert_batches_eq, execution::context::SessionConfig}; - - use super::*; + use datafusion_expr::{col, lit}; #[tokio::test] async fn issue_3242() -> Result<()> { diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs index 3e7db1caa20f..15d6d21f038a 100644 --- a/datafusion/core/src/execution/context/csv.rs +++ b/datafusion/core/src/execution/context/csv.rs @@ -89,8 +89,9 @@ impl SessionContext { #[cfg(test)] mod tests { use super::*; - use crate::assert_batches_eq; use crate::test_util::{plan_and_collect, populate_csv_partitions}; + use datafusion_common::test_util::batches_to_string; + use insta::assert_snapshot; use tempfile::TempDir; @@ -115,14 +116,13 @@ mod tests { plan_and_collect(&ctx, "SELECT sum(c1), sum(c2), count(*) FROM test").await?; assert_eq!(results.len(), 1); - let expected = [ - "+--------------+--------------+----------+", - "| sum(test.c1) | sum(test.c2) | count(*) |", - "+--------------+--------------+----------+", - "| 10 | 110 | 20 |", - "+--------------+--------------+----------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!(batches_to_string(&results), @r" + +--------------+--------------+----------+ + | sum(test.c1) | sum(test.c2) | count(*) | + +--------------+--------------+----------+ + | 10 | 110 | 20 | + +--------------+--------------+----------+ + "); Ok(()) } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index ad0993ed43ca..714e94234a2c 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -25,10 +25,10 @@ use std::sync::{Arc, Weak}; use super::options::ReadOptions; use crate::{ + catalog::listing_schema::ListingSchemaProvider, catalog::{ CatalogProvider, CatalogProviderList, TableProvider, TableProviderFactory, }, - catalog_common::listing_schema::ListingSchemaProvider, dataframe::DataFrame, datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, @@ -75,9 +75,12 @@ use chrono::{DateTime, Utc}; use datafusion_catalog::{ DynamicFileCatalog, SessionStore, TableFunction, TableFunctionImpl, UrlTableFactory, }; +use datafusion_common::config::ConfigOptions; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; +use datafusion_optimizer::analyzer::type_coercion::TypeCoercion; +use datafusion_optimizer::Analyzer; use datafusion_optimizer::{AnalyzerRule, OptimizerRule}; use object_store::ObjectStore; use parking_lot::RwLock; @@ -856,6 +859,16 @@ impl SessionContext { } } + /// Applies the `TypeCoercion` rewriter to the logical plan. 
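+    ///
+    /// This replaces the coercion step that `ViewTable::try_new` previously applied.
+    /// Illustrative effect on a view predicate (plan text is a sketch, not verbatim output):
+    ///
+    /// ```text
+    /// before: Filter: t.a = Utf8("1")                  -- t.a is Int64
+    /// after:  Filter: t.a = CAST(Utf8("1") AS Int64)
+    /// ```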
+ fn apply_type_coercion(logical_plan: LogicalPlan) -> Result { + let options = ConfigOptions::default(); + Analyzer::with_rules(vec![Arc::new(TypeCoercion::new())]).execute_and_check( + logical_plan, + &options, + |_, _| {}, + ) + } + async fn create_view(&self, cmd: CreateView) -> Result { let CreateView { name, @@ -874,13 +887,14 @@ impl SessionContext { match (or_replace, view) { (true, Ok(_)) => { self.deregister_table(name.clone())?; - let table = Arc::new(ViewTable::try_new((*input).clone(), definition)?); - + let input = Self::apply_type_coercion(input.as_ref().clone())?; + let table = Arc::new(ViewTable::new(input, definition)); self.register_table(name, table)?; self.return_empty_dataframe() } (_, Err(_)) => { - let table = Arc::new(ViewTable::try_new((*input).clone(), definition)?); + let input = Self::apply_type_coercion(input.as_ref().clone())?; + let table = Arc::new(ViewTable::new(input, definition)); self.register_table(name, table)?; self.return_empty_dataframe() } @@ -1814,7 +1828,6 @@ impl<'n> TreeNodeVisitor<'n> for BadPlanVisitor<'_> { #[cfg(test)] mod tests { use super::{super::options::CsvReadOptions, *}; - use crate::assert_batches_eq; use crate::execution::memory_pool::MemoryConsumer; use crate::test; use crate::test_util::{plan_and_collect, populate_csv_partitions}; @@ -1823,7 +1836,9 @@ mod tests { use std::error::Error; use std::path::PathBuf; + use datafusion_common::test_util::batches_to_string; use datafusion_common_runtime::SpawnedTask; + use insta::{allow_duplicates, assert_snapshot}; use crate::catalog::SchemaProvider; use crate::execution::session_state::SessionStateBuilder; @@ -1886,14 +1901,13 @@ mod tests { plan_and_collect(&ctx, "SELECT @@version, @name, @integer + 1 FROM dual") .await?; - let expected = [ - "+----------------------+------------------------+---------------------+", - "| @@version | @name | @integer + Int64(1) |", - "+----------------------+------------------------+---------------------+", - "| system-var-@@version | user-defined-var-@name | 42 |", - "+----------------------+------------------------+---------------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!(batches_to_string(&results), @r" + +----------------------+------------------------+---------------------+ + | @@version | @name | @integer + Int64(1) | + +----------------------+------------------------+---------------------+ + | system-var-@@version | user-defined-var-@name | 42 | + +----------------------+------------------------+---------------------+ + "); Ok(()) } @@ -1974,14 +1988,15 @@ mod tests { let actual = arrow::util::pretty::pretty_format_batches(&result) .unwrap() .to_string(); - let expected = r#"+--------------------+ -| c_name | -+--------------------+ -| Customer#000000002 | -| Customer#000000003 | -| Customer#000000004 | -+--------------------+"#; - assert_eq!(actual, expected); + assert_snapshot!(actual, @r" + +--------------------+ + | c_name | + +--------------------+ + | Customer#000000002 | + | Customer#000000003 | + | Customer#000000004 | + +--------------------+ + "); Ok(()) } @@ -2006,14 +2021,15 @@ mod tests { let actual = arrow::util::pretty::pretty_format_batches(&result) .unwrap() .to_string(); - let expected = r#"+--------------------+ -| c_name | -+--------------------+ -| Customer#000000002 | -| Customer#000000003 | -| Customer#000000004 | -+--------------------+"#; - assert_eq!(actual, expected); + assert_snapshot!(actual, @r" + +--------------------+ + | c_name | + +--------------------+ + | Customer#000000002 | + 
| Customer#000000003 | + | Customer#000000004 | + +--------------------+ + "); Ok(()) } @@ -2096,6 +2112,8 @@ mod tests { .unwrap(); ctx.register_catalog("my_catalog", Arc::new(catalog)); + let mut results = Vec::new(); + for table_ref in &["my_catalog.my_schema.test", "my_schema.test", "test"] { let result = plan_and_collect( &ctx, @@ -2104,14 +2122,18 @@ mod tests { .await .unwrap(); - let expected = [ - "+-------+", - "| count |", - "+-------+", - "| 1 |", - "+-------+", - ]; - assert_batches_eq!(expected, &result); + results.push(result); + } + allow_duplicates! { + for result in &results { + assert_snapshot!(batches_to_string(result), @r" + +-------+ + | count | + +-------+ + | 1 | + +-------+ + "); + } } } @@ -2146,15 +2168,14 @@ mod tests { ) .await?; - let expected = [ - "+-----+-------+", - "| cat | total |", - "+-----+-------+", - "| a | 1 |", - "| b | 3 |", - "+-----+-------+", - ]; - assert_batches_eq!(expected, &result); + assert_snapshot!(batches_to_string(&result), @r" + +-----+-------+ + | cat | total | + +-----+-------+ + | a | 1 | + | b | 3 | + +-----+-------+ + "); Ok(()) } @@ -2243,14 +2264,13 @@ mod tests { .await? .collect() .await?; - let expected = [ - "+-----------------------------+", - "| Utf8(\"2021-01-01 00:00:00\") |", - "+-----------------------------+", - "| 2021-01-01T00:00:00 |", - "+-----------------------------+", - ]; - assert_batches_eq!(expected, &result); + assert_snapshot!(batches_to_string(&result), @r#" + +-----------------------------+ + | Utf8("2021-01-01 00:00:00") | + +-----------------------------+ + | 2021-01-01T00:00:00 | + +-----------------------------+ + "#); Ok(()) } #[test] diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs index 10aa16ffe47a..2e3e09685bcc 100644 --- a/datafusion/core/src/execution/mod.rs +++ b/datafusion/core/src/execution/mod.rs @@ -27,6 +27,4 @@ pub use session_state_defaults::SessionStateDefaults; // backwards compatibility pub use crate::datasource::file_format::options; - -// backwards compatibility pub use datafusion_execution::*; diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 0e83156ab53f..515163102c41 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -266,6 +266,10 @@ impl Session for SessionState { fn table_options_mut(&mut self) -> &mut TableOptions { self.table_options_mut() } + + fn task_ctx(&self) -> Arc { + self.task_ctx() + } } impl SessionState { @@ -436,16 +440,16 @@ impl SessionState { /// Resolve all table references in the SQL statement. Does not include CTE references. /// - /// See [`datafusion_catalog::resolve_table_references`] for more information. + /// See [`datafusion_sql::resolve::resolve_table_references`] for more information. 
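+ ///
+ /// Rough usage sketch; the SQL text, the `"generic"` dialect, and the
+ /// `state` binding are placeholders for illustration, assuming the
+ /// statement has been parsed beforehand (for example via `sql_to_statement`):
+ ///
+ /// ```ignore
+ /// // Parse the SQL first, then collect the table references it mentions.
+ /// let statement = state.sql_to_statement("SELECT * FROM t1, t2", "generic")?;
+ /// let table_refs = state.resolve_table_references(&statement)?;
+ /// ```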
/// - /// [`datafusion_catalog::resolve_table_references`]: datafusion_catalog::resolve_table_references + /// [`datafusion_sql::resolve::resolve_table_references`]: datafusion_sql::resolve::resolve_table_references pub fn resolve_table_references( &self, statement: &Statement, ) -> datafusion_common::Result> { let enable_ident_normalization = self.config.options().sql_parser.enable_ident_normalization; - let (table_refs, _) = datafusion_catalog::resolve_table_references( + let (table_refs, _) = datafusion_sql::resolve::resolve_table_references( statement, enable_ident_normalization, )?; @@ -489,6 +493,7 @@ impl SessionState { enable_options_value_normalization: sql_parser_options .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, + map_varchar_to_utf8view: sql_parser_options.map_varchar_to_utf8view, collect_spans: sql_parser_options.collect_spans, } } @@ -576,6 +581,7 @@ impl SessionState { return Ok(LogicalPlan::Explain(Explain { verbose: e.verbose, plan: Arc::clone(&e.plan), + explain_format: e.explain_format.clone(), stringified_plans, schema: Arc::clone(&e.schema), logical_optimization_succeeded: false, @@ -611,6 +617,7 @@ impl SessionState { Ok(LogicalPlan::Explain(Explain { verbose: e.verbose, + explain_format: e.explain_format.clone(), plan, stringified_plans, schema: Arc::clone(&e.schema), @@ -1412,10 +1419,14 @@ impl SessionStateBuilder { &state.runtime_env, ); - state.catalog_list.register_catalog( + let existing_default_catalog = state.catalog_list.register_catalog( state.config.options().catalog.default_catalog.clone(), Arc::new(default_catalog), ); + + if existing_default_catalog.is_some() { + debug!("Overwrote the default catalog"); + } } if let Some(analyzer_rules) = analyzer_rules { diff --git a/datafusion/core/src/execution/session_state_defaults.rs b/datafusion/core/src/execution/session_state_defaults.rs index b48ef90f2bd5..a241738bd3a4 100644 --- a/datafusion/core/src/execution/session_state_defaults.rs +++ b/datafusion/core/src/execution/session_state_defaults.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+use crate::catalog::listing_schema::ListingSchemaProvider; use crate::catalog::{CatalogProvider, TableProviderFactory}; -use crate::catalog_common::listing_schema::ListingSchemaProvider; use crate::datasource::file_format::arrow::ArrowFormatFactory; #[cfg(feature = "avro")] use crate::datasource::file_format::avro::AvroFormatFactory; diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index b4d5f9740baa..803ec66eaab3 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] #![warn(missing_docs, clippy::needless_borrow)] @@ -699,7 +700,6 @@ pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION"); extern crate core; extern crate sqlparser; -pub mod catalog_common; pub mod dataframe; pub mod datasource; pub mod error; diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index f200fb0e0f71..600a8de1deb0 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -19,7 +19,6 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::str::FromStr; use std::sync::Arc; use crate::datasource::file_format::file_type_to_format; @@ -40,7 +39,6 @@ use crate::physical_expr::{create_physical_expr, create_physical_exprs}; use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; use crate::physical_plan::analyze::AnalyzeExec; use crate::physical_plan::explain::ExplainExec; -use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::filter::FilterExec; use crate::physical_plan::joins::utils as join_utils; use crate::physical_plan::joins::{ @@ -78,8 +76,9 @@ use datafusion_expr::expr::{ use datafusion_expr::expr_rewriter::unnormalize_cols; use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary; use datafusion_expr::{ - DescribeTable, DmlStatement, Extension, FetchType, Filter, JoinType, RecursiveQuery, - SkipType, SortExpr, StringifiedPlan, WindowFrame, WindowFrameBound, WriteOp, + Analyze, DescribeTable, DmlStatement, Explain, ExplainFormat, Extension, FetchType, + Filter, JoinType, RecursiveQuery, SkipType, StringifiedPlan, WindowFrame, + WindowFrameBound, WriteOp, }; use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; use datafusion_physical_expr::expressions::Literal; @@ -88,7 +87,6 @@ use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::execution_plan::InvariantLevel; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::unnest::ListUnnest; -use datafusion_physical_plan::DisplayFormatType; use crate::schema_equivalence::schema_satisfied_by; use async_trait::async_trait; @@ -177,16 +175,17 @@ impl PhysicalPlanner for DefaultPhysicalPlanner { logical_plan: &LogicalPlan, session_state: &SessionState, ) -> Result> { - match self.handle_explain(logical_plan, session_state).await? 
{ - Some(plan) => Ok(plan), - None => { - let plan = self - .create_initial_plan(logical_plan, session_state) - .await?; - - self.optimize_physical_plan(plan, session_state, |_, _| {}) - } + if let Some(plan) = self + .handle_explain_or_analyze(logical_plan, session_state) + .await? + { + return Ok(plan); } + let plan = self + .create_initial_plan(logical_plan, session_state) + .await?; + + self.optimize_physical_plan(plan, session_state, |_, _| {}) } /// Create a physical expression from a logical expression @@ -500,6 +499,7 @@ impl DefaultPhysicalPlanner { partition_by, options: source_option_tuples, }) => { + let original_url = output_url.clone(); let input_exec = children.one()?; let parsed_url = ListingTableUrl::parse(output_url)?; let object_store_url = parsed_url.object_store(); @@ -529,6 +529,7 @@ impl DefaultPhysicalPlanner { // Set file sink related options let config = FileSinkConfig { + original_url, object_store_url, table_paths: vec![parsed_url], file_groups: vec![], @@ -1586,6 +1587,7 @@ type AggregateExprWithOptionalArgs = ( pub fn create_aggregate_expr_with_name_and_maybe_filter( e: &Expr, name: Option, + human_displan: String, logical_input_schema: &DFSchema, physical_input_schema: &Schema, execution_props: &ExecutionProps, @@ -1640,6 +1642,7 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter( .order_by(ordering_reqs) .schema(Arc::new(physical_input_schema.to_owned())) .alias(name) + .human_display(human_displan) .with_ignore_nulls(ignore_nulls) .with_distinct(*distinct) .build() @@ -1662,52 +1665,35 @@ pub fn create_aggregate_expr_and_maybe_filter( execution_props: &ExecutionProps, ) -> Result { // unpack (nested) aliased logical expressions, e.g. "sum(col) as total" - let (name, e) = match e { - Expr::Alias(Alias { expr, name, .. }) => (Some(name.clone()), expr.as_ref()), - Expr::AggregateFunction(_) => (Some(e.schema_name().to_string()), e), - _ => (None, e), + let (name, human_display, e) = match e { + Expr::Alias(Alias { expr, name, .. 
}) => { + (Some(name.clone()), String::default(), expr.as_ref()) + } + Expr::AggregateFunction(_) => ( + Some(e.schema_name().to_string()), + e.human_display().to_string(), + e, + ), + _ => (None, String::default(), e), }; create_aggregate_expr_with_name_and_maybe_filter( e, name, + human_display, logical_input_schema, physical_input_schema, execution_props, ) } -/// Create a physical sort expression from a logical expression -pub fn create_physical_sort_expr( - e: &SortExpr, - input_dfschema: &DFSchema, - execution_props: &ExecutionProps, -) -> Result { - let SortExpr { - expr, - asc, - nulls_first, - } = e; - Ok(PhysicalSortExpr { - expr: create_physical_expr(expr, input_dfschema, execution_props)?, - options: SortOptions { - descending: !asc, - nulls_first: *nulls_first, - }, - }) -} - -/// Create vector of physical sort expression from a vector of logical expression -pub fn create_physical_sort_exprs( - exprs: &[SortExpr], - input_dfschema: &DFSchema, - execution_props: &ExecutionProps, -) -> Result { - exprs - .iter() - .map(|expr| create_physical_sort_expr(expr, input_dfschema, execution_props)) - .collect::>() -} +#[deprecated( + since = "47.0.0", + note = "use datafusion::{create_physical_sort_expr, create_physical_sort_exprs}" +)] +pub use datafusion_physical_expr::{ + create_physical_sort_expr, create_physical_sort_exprs, +}; impl DefaultPhysicalPlanner { /// Handles capturing the various plans for EXPLAIN queries @@ -1715,167 +1701,213 @@ impl DefaultPhysicalPlanner { /// Returns /// Some(plan) if optimized, and None if logical_plan was not an /// explain (and thus needs to be optimized as normal) - async fn handle_explain( + async fn handle_explain_or_analyze( &self, logical_plan: &LogicalPlan, session_state: &SessionState, ) -> Result>> { - if let LogicalPlan::Explain(e) = logical_plan { - use PlanType::*; - let mut stringified_plans = vec![]; + let execution_plan = match logical_plan { + LogicalPlan::Explain(e) => self.handle_explain(e, session_state).await?, + LogicalPlan::Analyze(a) => self.handle_analyze(a, session_state).await?, + _ => return Ok(None), + }; + Ok(Some(execution_plan)) + } - let config = &session_state.config_options().explain; - let explain_format = DisplayFormatType::from_str(&config.format)?; + /// Planner for `LogicalPlan::Explain` + async fn handle_explain( + &self, + e: &Explain, + session_state: &SessionState, + ) -> Result> { + use PlanType::*; + let mut stringified_plans = vec![]; - let skip_logical_plan = config.physical_plan_only - || explain_format == DisplayFormatType::TreeRender; + let config = &session_state.config_options().explain; + let explain_format = &e.explain_format; - if !skip_logical_plan { - stringified_plans.clone_from(&e.stringified_plans); - if e.logical_optimization_succeeded { - stringified_plans.push(e.plan.to_stringified(FinalLogicalPlan)); - } + match explain_format { + ExplainFormat::Indent => { /* fall through */ } + ExplainFormat::Tree => { + // Tree render does not try to explain errors, + let physical_plan = self + .create_initial_plan(e.plan.as_ref(), session_state) + .await?; + + let optimized_plan = self.optimize_physical_plan( + physical_plan, + session_state, + |_plan, _optimizer| {}, + )?; + + stringified_plans.push(StringifiedPlan::new( + FinalPhysicalPlan, + displayable(optimized_plan.as_ref()) + .tree_render() + .to_string(), + )); + } + ExplainFormat::PostgresJSON => { + stringified_plans.push(StringifiedPlan::new( + FinalLogicalPlan, + e.plan.display_pg_json().to_string(), + )); + } + ExplainFormat::Graphviz 
=> { + stringified_plans.push(StringifiedPlan::new( + FinalLogicalPlan, + e.plan.display_graphviz().to_string(), + )); } + }; - if !config.logical_plan_only && e.logical_optimization_succeeded { - match self - .create_initial_plan(e.plan.as_ref(), session_state) - .await - { - Ok(input) => { - // Include statistics / schema if enabled - stringified_plans.push( - displayable(input.as_ref()) - .set_show_statistics(config.show_statistics) - .set_show_schema(config.show_schema) - .to_stringified( - e.verbose, - InitialPhysicalPlan, - explain_format, - ), - ); + if !stringified_plans.is_empty() { + return Ok(Arc::new(ExplainExec::new( + Arc::clone(e.schema.inner()), + stringified_plans, + e.verbose, + ))); + } - // Show statistics + schema in verbose output even if not - // explicitly requested - if e.verbose { - if !config.show_statistics { - stringified_plans.push( - displayable(input.as_ref()) - .set_show_statistics(true) - .to_stringified( - e.verbose, - InitialPhysicalPlanWithStats, - explain_format, - ), - ); - } - if !config.show_schema { - stringified_plans.push( - displayable(input.as_ref()) - .set_show_schema(true) - .to_stringified( - e.verbose, - InitialPhysicalPlanWithSchema, - explain_format, - ), - ); - } + // The indent mode is quite sophisticated, and handles quite a few + // different cases / options for displaying the plan. + if !config.physical_plan_only { + stringified_plans.clone_from(&e.stringified_plans); + if e.logical_optimization_succeeded { + stringified_plans.push(e.plan.to_stringified(FinalLogicalPlan)); + } + } + + if !config.logical_plan_only && e.logical_optimization_succeeded { + match self + .create_initial_plan(e.plan.as_ref(), session_state) + .await + { + Ok(input) => { + // Include statistics / schema if enabled + stringified_plans.push(StringifiedPlan::new( + InitialPhysicalPlan, + displayable(input.as_ref()) + .set_show_statistics(config.show_statistics) + .set_show_schema(config.show_schema) + .indent(e.verbose) + .to_string(), + )); + + // Show statistics + schema in verbose output even if not + // explicitly requested + if e.verbose { + if !config.show_statistics { + stringified_plans.push(StringifiedPlan::new( + InitialPhysicalPlanWithStats, + displayable(input.as_ref()) + .set_show_statistics(true) + .indent(e.verbose) + .to_string(), + )); } + if !config.show_schema { + stringified_plans.push(StringifiedPlan::new( + InitialPhysicalPlanWithSchema, + displayable(input.as_ref()) + .set_show_schema(true) + .indent(e.verbose) + .to_string(), + )); + } + } - let optimized_plan = self.optimize_physical_plan( - input, - session_state, - |plan, optimizer| { - let optimizer_name = optimizer.name().to_string(); - let plan_type = OptimizedPhysicalPlan { optimizer_name }; - stringified_plans.push( - displayable(plan) - .set_show_statistics(config.show_statistics) - .set_show_schema(config.show_schema) - .to_stringified( - e.verbose, - plan_type, - explain_format, - ), - ); - }, - ); - match optimized_plan { - Ok(input) => { - // This plan will includes statistics if show_statistics is on - stringified_plans.push( - displayable(input.as_ref()) - .set_show_statistics(config.show_statistics) - .set_show_schema(config.show_schema) - .to_stringified( - e.verbose, - FinalPhysicalPlan, - explain_format, - ), - ); - - // Show statistics + schema in verbose output even if not - // explicitly requested - if e.verbose { - if !config.show_statistics { - stringified_plans.push( - displayable(input.as_ref()) - .set_show_statistics(true) - .to_stringified( - e.verbose, - 
FinalPhysicalPlanWithStats, - explain_format, - ), - ); - } - if !config.show_schema { - stringified_plans.push( - displayable(input.as_ref()) - .set_show_schema(true) - .to_stringified( - e.verbose, - FinalPhysicalPlanWithSchema, - explain_format, - ), - ); - } + let optimized_plan = self.optimize_physical_plan( + input, + session_state, + |plan, optimizer| { + let optimizer_name = optimizer.name().to_string(); + let plan_type = OptimizedPhysicalPlan { optimizer_name }; + stringified_plans.push(StringifiedPlan::new( + plan_type, + displayable(plan) + .set_show_statistics(config.show_statistics) + .set_show_schema(config.show_schema) + .indent(e.verbose) + .to_string(), + )); + }, + ); + match optimized_plan { + Ok(input) => { + // This plan will includes statistics if show_statistics is on + stringified_plans.push(StringifiedPlan::new( + FinalPhysicalPlan, + displayable(input.as_ref()) + .set_show_statistics(config.show_statistics) + .set_show_schema(config.show_schema) + .indent(e.verbose) + .to_string(), + )); + + // Show statistics + schema in verbose output even if not + // explicitly requested + if e.verbose { + if !config.show_statistics { + stringified_plans.push(StringifiedPlan::new( + FinalPhysicalPlanWithStats, + displayable(input.as_ref()) + .set_show_statistics(true) + .indent(e.verbose) + .to_string(), + )); + } + if !config.show_schema { + stringified_plans.push(StringifiedPlan::new( + FinalPhysicalPlanWithSchema, + // This will include schema if show_schema is on + // and will be set to true if verbose is on + displayable(input.as_ref()) + .set_show_schema(true) + .indent(e.verbose) + .to_string(), + )); } } - Err(DataFusionError::Context(optimizer_name, e)) => { - let plan_type = OptimizedPhysicalPlan { optimizer_name }; - stringified_plans - .push(StringifiedPlan::new(plan_type, e.to_string())) - } - Err(e) => return Err(e), } - } - Err(err) => { - stringified_plans.push(StringifiedPlan::new( - PhysicalPlanError, - err.strip_backtrace(), - )); + Err(DataFusionError::Context(optimizer_name, e)) => { + let plan_type = OptimizedPhysicalPlan { optimizer_name }; + stringified_plans + .push(StringifiedPlan::new(plan_type, e.to_string())) + } + Err(e) => return Err(e), } } + Err(err) => { + stringified_plans.push(StringifiedPlan::new( + PhysicalPlanError, + err.strip_backtrace(), + )); + } } - - Ok(Some(Arc::new(ExplainExec::new( - SchemaRef::new(e.schema.as_ref().to_owned().into()), - stringified_plans, - e.verbose, - )))) - } else if let LogicalPlan::Analyze(a) = logical_plan { - let input = self.create_physical_plan(&a.input, session_state).await?; - let schema = SchemaRef::new((*a.schema).clone().into()); - let show_statistics = session_state.config_options().explain.show_statistics; - Ok(Some(Arc::new(AnalyzeExec::new( - a.verbose, - show_statistics, - input, - schema, - )))) - } else { - Ok(None) } + + Ok(Arc::new(ExplainExec::new( + Arc::clone(e.schema.inner()), + stringified_plans, + e.verbose, + ))) + } + + async fn handle_analyze( + &self, + a: &Analyze, + session_state: &SessionState, + ) -> Result> { + let input = self.create_physical_plan(&a.input, session_state).await?; + let schema = SchemaRef::new((*a.schema).clone().into()); + let show_statistics = session_state.config_options().explain.show_statistics; + Ok(Arc::new(AnalyzeExec::new( + a.verbose, + show_statistics, + input, + schema, + ))) } /// Optimize a physical plan by applying each physical optimizer, diff --git a/datafusion/core/tests/catalog/memory.rs b/datafusion/core/tests/catalog/memory.rs index 
3e45fb753226..b0753eb5c949 100644 --- a/datafusion/core/tests/catalog/memory.rs +++ b/datafusion/core/tests/catalog/memory.rs @@ -24,7 +24,8 @@ use datafusion::datasource::listing::{ use datafusion::prelude::SessionContext; use datafusion_catalog::memory::*; use datafusion_catalog::{SchemaProvider, TableProvider}; -use datafusion_common::assert_batches_eq; +use datafusion_common::test_util::batches_to_string; +use insta::assert_snapshot; use std::any::Any; use std::sync::Arc; @@ -152,19 +153,18 @@ async fn test_schema_register_listing_table() { let actual = df.collect().await.unwrap(); - let expected = [ - "+----+----------+", - "| id | bool_col |", - "+----+----------+", - "| 4 | true |", - "| 5 | false |", - "| 6 | true |", - "| 7 | false |", - "| 2 | true |", - "| 3 | false |", - "| 0 | true |", - "| 1 | false |", - "+----+----------+", - ]; - assert_batches_eq!(expected, &actual); + assert_snapshot!(batches_to_string(&actual), @r" + +----+----------+ + | id | bool_col | + +----+----------+ + | 4 | true | + | 5 | false | + | 6 | true | + | 7 | false | + | 2 | true | + | 3 | false | + | 0 | true | + | 1 | false | + +----+----------+ + "); } diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs index 66b4103160e7..9bcb9e41f86a 100644 --- a/datafusion/core/tests/core_integration.rs +++ b/datafusion/core/tests/core_integration.rs @@ -42,8 +42,13 @@ mod custom_sources_cases; /// Run all tests that are found in the `optimizer` directory mod optimizer; +/// Run all tests that are found in the `physical_optimizer` directory mod physical_optimizer; +/// Run all tests that are found in the `serde` directory +mod serde; + +/// Run all tests that are found in the `catalog` directory mod catalog; #[cfg(test)] diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index fec3ab786fce..c763d4c8de2d 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -28,13 +28,13 @@ use std::sync::Arc; use datafusion::error::Result; use datafusion::prelude::*; - -use datafusion::assert_batches_eq; +use datafusion_common::test_util::batches_to_string; use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::expr::Alias; use datafusion_expr::{table_scan, ExprSchemable, LogicalPlanBuilder}; use datafusion_functions_aggregate::expr_fn::{approx_median, approx_percentile_cont}; use datafusion_functions_nested::map::map; +use insta::assert_snapshot; fn test_schema() -> SchemaRef { Arc::new(Schema::new(vec![ @@ -75,34 +75,32 @@ async fn create_test_table() -> Result { } /// Executes an expression on the test dataframe as a select. -/// Compares formatted output of a record batch with an expected -/// vector of strings, using the assert_batch_eq! macro -macro_rules! 
assert_fn_batches { - ($EXPR:expr, $EXPECTED: expr) => { - assert_fn_batches!($EXPR, $EXPECTED, 10) - }; - ($EXPR:expr, $EXPECTED: expr, $LIMIT: expr) => { - let df = create_test_table().await?; - let df = df.select(vec![$EXPR])?.limit(0, Some($LIMIT))?; - let batches = df.collect().await?; - - assert_batches_eq!($EXPECTED, &batches); - }; +async fn get_batches(expr: Expr) -> Result> { + get_batches_with_limit(expr, 10).await +} + +async fn get_batches_with_limit(expr: Expr, limit: usize) -> Result> { + let df = create_test_table().await?; + let df = df.select(vec![expr])?.limit(0, Some(limit))?; + df.collect().await } #[tokio::test] async fn test_fn_ascii() -> Result<()> { let expr = ascii(col("a")); - let expected = [ - "+---------------+", - "| ascii(test.a) |", - "+---------------+", - "| 97 |", - "+---------------+", - ]; - - assert_fn_batches!(expr, expected, 1); + let batches = get_batches_with_limit(expr, 1).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +---------------+ + | ascii(test.a) | + +---------------+ + | 97 | + +---------------+ + " + ); Ok(()) } @@ -111,17 +109,21 @@ async fn test_fn_ascii() -> Result<()> { async fn test_fn_bit_length() -> Result<()> { let expr = bit_length(col("a")); - let expected = [ - "+--------------------+", - "| bit_length(test.a) |", - "+--------------------+", - "| 48 |", - "| 48 |", - "| 48 |", - "| 72 |", - "+--------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +--------------------+ + | bit_length(test.a) | + +--------------------+ + | 48 | + | 48 | + | 48 | + | 72 | + +--------------------+ + " + ); Ok(()) } @@ -130,15 +132,17 @@ async fn test_fn_bit_length() -> Result<()> { async fn test_fn_btrim() -> Result<()> { let expr = btrim(vec![lit(" a b c ")]); - let expected = [ - "+-----------------------------------------+", - "| btrim(Utf8(\" a b c \")) |", - "+-----------------------------------------+", - "| a b c |", - "+-----------------------------------------+", - ]; + let batches = get_batches_with_limit(expr, 1).await?; - assert_fn_batches!(expr, expected, 1); + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-----------------------------------------+ + | btrim(Utf8(" a b c ")) | + +-----------------------------------------+ + | a b c | + +-----------------------------------------+ + "#); Ok(()) } @@ -147,18 +151,21 @@ async fn test_fn_btrim() -> Result<()> { async fn test_fn_btrim_with_chars() -> Result<()> { let expr = btrim(vec![col("a"), lit("ab")]); - let expected = [ - "+--------------------------+", - "| btrim(test.a,Utf8(\"ab\")) |", - "+--------------------------+", - "| cDEF |", - "| c123 |", - "| CBAdef |", - "| 123AbcDef |", - "+--------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +--------------------------+ + | btrim(test.a,Utf8("ab")) | + +--------------------------+ + | cDEF | + | c123 | + | CBAdef | + | 123AbcDef | + +--------------------------+ + "# + ); Ok(()) } @@ -167,18 +174,20 @@ async fn test_fn_btrim_with_chars() -> Result<()> { async fn test_fn_nullif() -> Result<()> { let expr = nullif(col("a"), lit("abcDEF")); - let expected = [ - "+-------------------------------+", - "| nullif(test.a,Utf8(\"abcDEF\")) |", - "+-------------------------------+", - "| |", - "| abc123 |", - "| CBAdef |", - "| 123AbcDef |", - 
"+-------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-------------------------------+ + | nullif(test.a,Utf8("abcDEF")) | + +-------------------------------+ + | | + | abc123 | + | CBAdef | + | 123AbcDef | + +-------------------------------+ + "#); Ok(()) } @@ -187,18 +196,20 @@ async fn test_fn_nullif() -> Result<()> { async fn test_fn_arrow_cast() -> Result<()> { let expr = arrow_typeof(arrow_cast(col("b"), lit("Float64"))); - let expected = [ - "+--------------------------------------------------+", - "| arrow_typeof(arrow_cast(test.b,Utf8(\"Float64\"))) |", - "+--------------------------------------------------+", - "| Float64 |", - "| Float64 |", - "| Float64 |", - "| Float64 |", - "+--------------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +--------------------------------------------------+ + | arrow_typeof(arrow_cast(test.b,Utf8("Float64"))) | + +--------------------------------------------------+ + | Float64 | + | Float64 | + | Float64 | + | Float64 | + +--------------------------------------------------+ + "#); Ok(()) } @@ -215,21 +226,24 @@ async fn test_nvl() -> Result<()> { ) .alias("nvl_expr"); - let expected = [ - "+-------------+", - "| nvl_expr |", - "+-------------+", - "| TURNED_NULL |", - "| abc123 |", - "| CBAdef |", - "| 123AbcDef |", - "+-------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +-------------+ + | nvl_expr | + +-------------+ + | TURNED_NULL | + | abc123 | + | CBAdef | + | 123AbcDef | + +-------------+ + "); Ok(()) } + #[tokio::test] async fn test_nvl2() -> Result<()> { let lit_null = lit(ScalarValue::Utf8(None)); @@ -243,18 +257,20 @@ async fn test_nvl2() -> Result<()> { ) .alias("nvl2_expr"); - let expected = [ - "+-------------+", - "| nvl2_expr |", - "+-------------+", - "| TURNED_NULL |", - "| NON_NULL |", - "| NON_NULL |", - "| NON_NULL |", - "+-------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +-------------+ + | nvl2_expr | + +-------------+ + | TURNED_NULL | + | NON_NULL | + | NON_NULL | + | NON_NULL | + +-------------+ + "); Ok(()) } @@ -262,18 +278,20 @@ async fn test_nvl2() -> Result<()> { async fn test_fn_arrow_typeof() -> Result<()> { let expr = arrow_typeof(col("l")); - let expected = [ - "+------------------------------------------------------------------------------------------------------------------+", - "| arrow_typeof(test.l) |", - "+------------------------------------------------------------------------------------------------------------------+", - "| List(Field { name: \"item\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |", - "| List(Field { name: \"item\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |", - "| List(Field { name: \"item\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |", - "| List(Field { name: \"item\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |", - 
"+------------------------------------------------------------------------------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +------------------------------------------------------------------------------------------------------------------+ + | arrow_typeof(test.l) | + +------------------------------------------------------------------------------------------------------------------+ + | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | + | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | + | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | + | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | + +------------------------------------------------------------------------------------------------------------------+ + "#); Ok(()) } @@ -282,18 +300,20 @@ async fn test_fn_arrow_typeof() -> Result<()> { async fn test_fn_struct() -> Result<()> { let expr = r#struct(vec![col("a"), col("b")]); - let expected = [ - "+--------------------------+", - "| struct(test.a,test.b) |", - "+--------------------------+", - "| {c0: abcDEF, c1: 1} |", - "| {c0: abc123, c1: 10} |", - "| {c0: CBAdef, c1: 10} |", - "| {c0: 123AbcDef, c1: 100} |", - "+--------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +--------------------------+ + | struct(test.a,test.b) | + +--------------------------+ + | {c0: abcDEF, c1: 1} | + | {c0: abc123, c1: 10} | + | {c0: CBAdef, c1: 10} | + | {c0: 123AbcDef, c1: 100} | + +--------------------------+ + "); Ok(()) } @@ -302,18 +322,20 @@ async fn test_fn_struct() -> Result<()> { async fn test_fn_named_struct() -> Result<()> { let expr = named_struct(vec![lit("column_a"), col("a"), lit("column_b"), col("b")]); - let expected = [ - "+---------------------------------------------------------------+", - "| named_struct(Utf8(\"column_a\"),test.a,Utf8(\"column_b\"),test.b) |", - "+---------------------------------------------------------------+", - "| {column_a: abcDEF, column_b: 1} |", - "| {column_a: abc123, column_b: 10} |", - "| {column_a: CBAdef, column_b: 10} |", - "| {column_a: 123AbcDef, column_b: 100} |", - "+---------------------------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +---------------------------------------------------------------+ + | named_struct(Utf8("column_a"),test.a,Utf8("column_b"),test.b) | + +---------------------------------------------------------------+ + | {column_a: abcDEF, column_b: 1} | + | {column_a: abc123, column_b: 10} | + | {column_a: CBAdef, column_b: 10} | + | {column_a: 123AbcDef, column_b: 100} | + +---------------------------------------------------------------+ + "#); Ok(()) } @@ -322,18 +344,20 @@ async fn test_fn_named_struct() -> Result<()> { async fn test_fn_coalesce() -> Result<()> { let expr = coalesce(vec![lit(ScalarValue::Utf8(None)), lit("ab")]); - let expected = [ - "+---------------------------------+", - "| coalesce(Utf8(NULL),Utf8(\"ab\")) |", - 
"+---------------------------------+", - "| ab |", - "| ab |", - "| ab |", - "| ab |", - "+---------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +---------------------------------+ + | coalesce(Utf8(NULL),Utf8("ab")) | + +---------------------------------+ + | ab | + | ab | + | ab | + | ab | + +---------------------------------+ + "#); Ok(()) } @@ -342,18 +366,18 @@ async fn test_fn_coalesce() -> Result<()> { async fn test_fn_approx_median() -> Result<()> { let expr = approx_median(col("b")); - let expected = [ - "+-----------------------+", - "| approx_median(test.b) |", - "+-----------------------+", - "| 10 |", - "+-----------------------+", - ]; - let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; - assert_batches_eq!(expected, &batches); + assert_snapshot!( + batches_to_string(&batches), + @r" + +-----------------------+ + | approx_median(test.b) | + +-----------------------+ + | 10 | + +-----------------------+ + "); Ok(()) } @@ -362,18 +386,18 @@ async fn test_fn_approx_median() -> Result<()> { async fn test_fn_approx_percentile_cont() -> Result<()> { let expr = approx_percentile_cont(col("b"), lit(0.5), None); - let expected = [ - "+---------------------------------------------+", - "| approx_percentile_cont(test.b,Float64(0.5)) |", - "+---------------------------------------------+", - "| 10 |", - "+---------------------------------------------+", - ]; - let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; - assert_batches_eq!(expected, &batches); + assert_snapshot!( + batches_to_string(&batches), + @r" + +---------------------------------------------+ + | approx_percentile_cont(test.b,Float64(0.5)) | + +---------------------------------------------+ + | 10 | + +---------------------------------------------+ + "); // the arg2 parameter is a complex expr, but it can be evaluated to the literal value let alias_expr = Expr::Alias(Alias::new( @@ -383,31 +407,34 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { )); let expr = approx_percentile_cont(col("b"), alias_expr, None); let df = create_test_table().await?; - let expected = [ - "+--------------------------------------+", - "| approx_percentile_cont(test.b,arg_2) |", - "+--------------------------------------+", - "| 10 |", - "+--------------------------------------+", - ]; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; - assert_batches_eq!(expected, &batches); + assert_snapshot!( + batches_to_string(&batches), + @r" + +--------------------------------------+ + | approx_percentile_cont(test.b,arg_2) | + +--------------------------------------+ + | 10 | + +--------------------------------------+ + " + ); // with number of centroids set let expr = approx_percentile_cont(col("b"), lit(0.5), Some(lit(2))); - let expected = [ - "+------------------------------------------------------+", - "| approx_percentile_cont(test.b,Float64(0.5),Int32(2)) |", - "+------------------------------------------------------+", - "| 30 |", - "+------------------------------------------------------+", - ]; let df = create_test_table().await?; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; - assert_batches_eq!(expected, &batches); + assert_snapshot!( + batches_to_string(&batches), + @r" + +------------------------------------------------------+ + | 
approx_percentile_cont(test.b,Float64(0.5),Int32(2)) | + +------------------------------------------------------+ + | 30 | + +------------------------------------------------------+ + "); Ok(()) } @@ -417,18 +444,20 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { async fn test_fn_character_length() -> Result<()> { let expr = character_length(col("a")); - let expected = [ - "+--------------------------+", - "| character_length(test.a) |", - "+--------------------------+", - "| 6 |", - "| 6 |", - "| 6 |", - "| 9 |", - "+--------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +--------------------------+ + | character_length(test.a) | + +--------------------------+ + | 6 | + | 6 | + | 6 | + | 9 | + +--------------------------+ + "); Ok(()) } @@ -437,15 +466,17 @@ async fn test_fn_character_length() -> Result<()> { async fn test_fn_chr() -> Result<()> { let expr = chr(lit(128175)); - let expected = [ - "+--------------------+", - "| chr(Int32(128175)) |", - "+--------------------+", - "| 💯 |", - "+--------------------+", - ]; + let batches = get_batches_with_limit(expr, 1).await?; - assert_fn_batches!(expr, expected, 1); + assert_snapshot!( + batches_to_string(&batches), + @r" + +--------------------+ + | chr(Int32(128175)) | + +--------------------+ + | 💯 | + +--------------------+ + "); Ok(()) } @@ -454,18 +485,20 @@ async fn test_fn_chr() -> Result<()> { async fn test_fn_initcap() -> Result<()> { let expr = initcap(col("a")); - let expected = [ - "+-----------------+", - "| initcap(test.a) |", - "+-----------------+", - "| Abcdef |", - "| Abc123 |", - "| Cbadef |", - "| 123abcdef |", - "+-----------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +-----------------+ + | initcap(test.a) | + +-----------------+ + | Abcdef | + | Abc123 | + | Cbadef | + | 123abcdef | + +-----------------+ + "); Ok(()) } @@ -475,18 +508,20 @@ async fn test_fn_initcap() -> Result<()> { async fn test_fn_left() -> Result<()> { let expr = left(col("a"), lit(3)); - let expected = [ - "+-----------------------+", - "| left(test.a,Int32(3)) |", - "+-----------------------+", - "| abc |", - "| abc |", - "| CBA |", - "| 123 |", - "+-----------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +-----------------------+ + | left(test.a,Int32(3)) | + +-----------------------+ + | abc | + | abc | + | CBA | + | 123 | + +-----------------------+ + "); Ok(()) } @@ -495,18 +530,20 @@ async fn test_fn_left() -> Result<()> { async fn test_fn_lower() -> Result<()> { let expr = lower(col("a")); - let expected = [ - "+---------------+", - "| lower(test.a) |", - "+---------------+", - "| abcdef |", - "| abc123 |", - "| cbadef |", - "| 123abcdef |", - "+---------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +---------------+ + | lower(test.a) | + +---------------+ + | abcdef | + | abc123 | + | cbadef | + | 123abcdef | + +---------------+ + "); Ok(()) } @@ -516,18 +553,20 @@ async fn test_fn_lower() -> Result<()> { async fn test_fn_lpad() -> Result<()> { let expr = lpad(vec![col("a"), lit(10)]); - let expected = [ - "+------------------------+", - "| lpad(test.a,Int32(10)) |", 
- "+------------------------+", - "| abcDEF |", - "| abc123 |", - "| CBAdef |", - "| 123AbcDef |", - "+------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +------------------------+ + | lpad(test.a,Int32(10)) | + +------------------------+ + | abcDEF | + | abc123 | + | CBAdef | + | 123AbcDef | + +------------------------+ + "); Ok(()) } @@ -537,18 +576,20 @@ async fn test_fn_lpad() -> Result<()> { async fn test_fn_lpad_with_string() -> Result<()> { let expr = lpad(vec![col("a"), lit(10), lit("*")]); - let expected = [ - "+----------------------------------+", - "| lpad(test.a,Int32(10),Utf8(\"*\")) |", - "+----------------------------------+", - "| ****abcDEF |", - "| ****abc123 |", - "| ****CBAdef |", - "| *123AbcDef |", - "+----------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +----------------------------------+ + | lpad(test.a,Int32(10),Utf8("*")) | + +----------------------------------+ + | ****abcDEF | + | ****abc123 | + | ****CBAdef | + | *123AbcDef | + +----------------------------------+ + "#); Ok(()) } @@ -557,15 +598,17 @@ async fn test_fn_lpad_with_string() -> Result<()> { async fn test_fn_ltrim() -> Result<()> { let expr = ltrim(vec![lit(" a b c ")]); - let expected = [ - "+-----------------------------------------+", - "| ltrim(Utf8(\" a b c \")) |", - "+-----------------------------------------+", - "| a b c |", - "+-----------------------------------------+", - ]; + let batches = get_batches_with_limit(expr, 1).await?; - assert_fn_batches!(expr, expected, 1); + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-----------------------------------------+ + | ltrim(Utf8(" a b c ")) | + +-----------------------------------------+ + | a b c | + +-----------------------------------------+ + "#); Ok(()) } @@ -574,18 +617,20 @@ async fn test_fn_ltrim() -> Result<()> { async fn test_fn_ltrim_with_columns() -> Result<()> { let expr = ltrim(vec![col("a")]); - let expected = [ - "+---------------+", - "| ltrim(test.a) |", - "+---------------+", - "| abcDEF |", - "| abc123 |", - "| CBAdef |", - "| 123AbcDef |", - "+---------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +---------------+ + | ltrim(test.a) | + +---------------+ + | abcDEF | + | abc123 | + | CBAdef | + | 123AbcDef | + +---------------+ + "); Ok(()) } @@ -595,18 +640,20 @@ async fn test_fn_ltrim_with_columns() -> Result<()> { async fn test_fn_md5() -> Result<()> { let expr = md5(col("a")); - let expected = [ - "+----------------------------------+", - "| md5(test.a) |", - "+----------------------------------+", - "| ea2de8bd80f3a1f52c754214fc9b0ed1 |", - "| e99a18c428cb38d5f260853678922e03 |", - "| 11ed4a6e9985df40913eead67f022e27 |", - "| 8f5e60e523c9253e623ae38bb58c399a |", - "+----------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +----------------------------------+ + | md5(test.a) | + +----------------------------------+ + | ea2de8bd80f3a1f52c754214fc9b0ed1 | + | e99a18c428cb38d5f260853678922e03 | + | 11ed4a6e9985df40913eead67f022e27 | + | 8f5e60e523c9253e623ae38bb58c399a | + +----------------------------------+ + "); 
Ok(()) } @@ -616,33 +663,37 @@ async fn test_fn_md5() -> Result<()> { async fn test_fn_regexp_like() -> Result<()> { let expr = regexp_like(col("a"), lit("[a-z]"), None); - let expected = [ - "+-----------------------------------+", - "| regexp_like(test.a,Utf8(\"[a-z]\")) |", - "+-----------------------------------+", - "| true |", - "| true |", - "| true |", - "| true |", - "+-----------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-----------------------------------+ + | regexp_like(test.a,Utf8("[a-z]")) | + +-----------------------------------+ + | true | + | true | + | true | + | true | + +-----------------------------------+ + "#); let expr = regexp_like(col("a"), lit("abc"), Some(lit("i"))); - let expected = [ - "+-------------------------------------------+", - "| regexp_like(test.a,Utf8(\"abc\"),Utf8(\"i\")) |", - "+-------------------------------------------+", - "| true |", - "| true |", - "| false |", - "| true |", - "+-------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-------------------------------------------+ + | regexp_like(test.a,Utf8("abc"),Utf8("i")) | + +-------------------------------------------+ + | true | + | true | + | false | + | true | + +-------------------------------------------+ + "#); Ok(()) } @@ -652,33 +703,37 @@ async fn test_fn_regexp_like() -> Result<()> { async fn test_fn_regexp_match() -> Result<()> { let expr = regexp_match(col("a"), lit("[a-z]"), None); - let expected = [ - "+------------------------------------+", - "| regexp_match(test.a,Utf8(\"[a-z]\")) |", - "+------------------------------------+", - "| [a] |", - "| [a] |", - "| [d] |", - "| [b] |", - "+------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +------------------------------------+ + | regexp_match(test.a,Utf8("[a-z]")) | + +------------------------------------+ + | [a] | + | [a] | + | [d] | + | [b] | + +------------------------------------+ + "#); let expr = regexp_match(col("a"), lit("[A-Z]"), Some(lit("i"))); - let expected = [ - "+----------------------------------------------+", - "| regexp_match(test.a,Utf8(\"[A-Z]\"),Utf8(\"i\")) |", - "+----------------------------------------------+", - "| [a] |", - "| [a] |", - "| [C] |", - "| [A] |", - "+----------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +----------------------------------------------+ + | regexp_match(test.a,Utf8("[A-Z]"),Utf8("i")) | + +----------------------------------------------+ + | [a] | + | [a] | + | [C] | + | [A] | + +----------------------------------------------+ + "#); Ok(()) } @@ -688,33 +743,37 @@ async fn test_fn_regexp_match() -> Result<()> { async fn test_fn_regexp_replace() -> Result<()> { let expr = regexp_replace(col("a"), lit("[a-z]"), lit("x"), Some(lit("g"))); - let expected = [ - "+----------------------------------------------------------+", - "| regexp_replace(test.a,Utf8(\"[a-z]\"),Utf8(\"x\"),Utf8(\"g\")) |", - "+----------------------------------------------------------+", - "| xxxDEF |", - "| xxx123 |", - "| CBAxxx |", - "| 123AxxDxx |", - 
"+----------------------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +----------------------------------------------------------+ + | regexp_replace(test.a,Utf8("[a-z]"),Utf8("x"),Utf8("g")) | + +----------------------------------------------------------+ + | xxxDEF | + | xxx123 | + | CBAxxx | + | 123AxxDxx | + +----------------------------------------------------------+ + "#); let expr = regexp_replace(col("a"), lit("[a-z]"), lit("x"), None); - let expected = [ - "+------------------------------------------------+", - "| regexp_replace(test.a,Utf8(\"[a-z]\"),Utf8(\"x\")) |", - "+------------------------------------------------+", - "| xbcDEF |", - "| xbc123 |", - "| CBAxef |", - "| 123AxcDef |", - "+------------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +------------------------------------------------+ + | regexp_replace(test.a,Utf8("[a-z]"),Utf8("x")) | + +------------------------------------------------+ + | xbcDEF | + | xbc123 | + | CBAxef | + | 123AxcDef | + +------------------------------------------------+ + "#); Ok(()) } @@ -723,18 +782,20 @@ async fn test_fn_regexp_replace() -> Result<()> { async fn test_fn_replace() -> Result<()> { let expr = replace(col("a"), lit("abc"), lit("x")); - let expected = [ - "+---------------------------------------+", - "| replace(test.a,Utf8(\"abc\"),Utf8(\"x\")) |", - "+---------------------------------------+", - "| xDEF |", - "| x123 |", - "| CBAdef |", - "| 123AbcDef |", - "+---------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +---------------------------------------+ + | replace(test.a,Utf8("abc"),Utf8("x")) | + +---------------------------------------+ + | xDEF | + | x123 | + | CBAdef | + | 123AbcDef | + +---------------------------------------+ + "#); Ok(()) } @@ -743,18 +804,20 @@ async fn test_fn_replace() -> Result<()> { async fn test_fn_repeat() -> Result<()> { let expr = repeat(col("a"), lit(2)); - let expected = [ - "+-------------------------+", - "| repeat(test.a,Int32(2)) |", - "+-------------------------+", - "| abcDEFabcDEF |", - "| abc123abc123 |", - "| CBAdefCBAdef |", - "| 123AbcDef123AbcDef |", - "+-------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +-------------------------+ + | repeat(test.a,Int32(2)) | + +-------------------------+ + | abcDEFabcDEF | + | abc123abc123 | + | CBAdefCBAdef | + | 123AbcDef123AbcDef | + +-------------------------+ + "); Ok(()) } @@ -764,18 +827,20 @@ async fn test_fn_repeat() -> Result<()> { async fn test_fn_reverse() -> Result<()> { let expr = reverse(col("a")); - let expected = [ - "+-----------------+", - "| reverse(test.a) |", - "+-----------------+", - "| FEDcba |", - "| 321cba |", - "| fedABC |", - "| feDcbA321 |", - "+-----------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +-----------------+ + | reverse(test.a) | + +-----------------+ + | FEDcba | + | 321cba | + | fedABC | + | feDcbA321 | + +-----------------+ + "); Ok(()) } @@ 
-785,18 +850,20 @@ async fn test_fn_reverse() -> Result<()> { async fn test_fn_right() -> Result<()> { let expr = right(col("a"), lit(3)); - let expected = [ - "+------------------------+", - "| right(test.a,Int32(3)) |", - "+------------------------+", - "| DEF |", - "| 123 |", - "| def |", - "| Def |", - "+------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +------------------------+ + | right(test.a,Int32(3)) | + +------------------------+ + | DEF | + | 123 | + | def | + | Def | + +------------------------+ + "); Ok(()) } @@ -806,18 +873,20 @@ async fn test_fn_right() -> Result<()> { async fn test_fn_rpad() -> Result<()> { let expr = rpad(vec![col("a"), lit(11)]); - let expected = [ - "+------------------------+", - "| rpad(test.a,Int32(11)) |", - "+------------------------+", - "| abcDEF |", - "| abc123 |", - "| CBAdef |", - "| 123AbcDef |", - "+------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +------------------------+ + | rpad(test.a,Int32(11)) | + +------------------------+ + | abcDEF | + | abc123 | + | CBAdef | + | 123AbcDef | + +------------------------+ + "); Ok(()) } @@ -827,18 +896,20 @@ async fn test_fn_rpad() -> Result<()> { async fn test_fn_rpad_with_characters() -> Result<()> { let expr = rpad(vec![col("a"), lit(11), lit("x")]); - let expected = [ - "+----------------------------------+", - "| rpad(test.a,Int32(11),Utf8(\"x\")) |", - "+----------------------------------+", - "| abcDEFxxxxx |", - "| abc123xxxxx |", - "| CBAdefxxxxx |", - "| 123AbcDefxx |", - "+----------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +----------------------------------+ + | rpad(test.a,Int32(11),Utf8("x")) | + +----------------------------------+ + | abcDEFxxxxx | + | abc123xxxxx | + | CBAdefxxxxx | + | 123AbcDefxx | + +----------------------------------+ + "#); Ok(()) } @@ -848,18 +919,20 @@ async fn test_fn_rpad_with_characters() -> Result<()> { async fn test_fn_sha224() -> Result<()> { let expr = sha224(col("a")); - let expected = [ - "+----------------------------------------------------------+", - "| sha224(test.a) |", - "+----------------------------------------------------------+", - "| 8b9ef961d2b19cfe7ee2a8452e3adeea98c7b22954b4073976bf80ee |", - "| 5c69bb695cc29b93d655e1a4bb5656cda624080d686f74477ea09349 |", - "| b3b3783b7470594e7ddb845eca0aec5270746dd6d0bc309bb948ceab |", - "| fc8a30d59386d78053328440c6670c3b583404a905cbe9bbd491a517 |", - "+----------------------------------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +----------------------------------------------------------+ + | sha224(test.a) | + +----------------------------------------------------------+ + | 8b9ef961d2b19cfe7ee2a8452e3adeea98c7b22954b4073976bf80ee | + | 5c69bb695cc29b93d655e1a4bb5656cda624080d686f74477ea09349 | + | b3b3783b7470594e7ddb845eca0aec5270746dd6d0bc309bb948ceab | + | fc8a30d59386d78053328440c6670c3b583404a905cbe9bbd491a517 | + +----------------------------------------------------------+ + "); Ok(()) } @@ -868,17 +941,20 @@ async fn test_fn_sha224() -> Result<()> { async fn test_fn_split_part() -> 
Result<()> { let expr = split_part(col("a"), lit("b"), lit(1)); - let expected = [ - "+---------------------------------------+", - "| split_part(test.a,Utf8(\"b\"),Int32(1)) |", - "+---------------------------------------+", - "| a |", - "| a |", - "| CBAdef |", - "| 123A |", - "+---------------------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +---------------------------------------+ + | split_part(test.a,Utf8("b"),Int32(1)) | + +---------------------------------------+ + | a | + | a | + | CBAdef | + | 123A | + +---------------------------------------+ + "#); Ok(()) } @@ -887,18 +963,20 @@ async fn test_fn_split_part() -> Result<()> { async fn test_fn_starts_with() -> Result<()> { let expr = starts_with(col("a"), lit("abc")); - let expected = [ - "+---------------------------------+", - "| starts_with(test.a,Utf8(\"abc\")) |", - "+---------------------------------+", - "| true |", - "| true |", - "| false |", - "| false |", - "+---------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +---------------------------------+ + | starts_with(test.a,Utf8("abc")) | + +---------------------------------+ + | true | + | true | + | false | + | false | + +---------------------------------+ + "#); Ok(()) } @@ -907,18 +985,20 @@ async fn test_fn_starts_with() -> Result<()> { async fn test_fn_ends_with() -> Result<()> { let expr = ends_with(col("a"), lit("DEF")); - let expected = [ - "+-------------------------------+", - "| ends_with(test.a,Utf8(\"DEF\")) |", - "+-------------------------------+", - "| true |", - "| false |", - "| false |", - "| false |", - "+-------------------------------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-------------------------------+ + | ends_with(test.a,Utf8("DEF")) | + +-------------------------------+ + | true | + | false | + | false | + | false | + +-------------------------------+ + "#); Ok(()) } @@ -928,17 +1008,20 @@ async fn test_fn_ends_with() -> Result<()> { async fn test_fn_strpos() -> Result<()> { let expr = strpos(col("a"), lit("f")); - let expected = [ - "+--------------------------+", - "| strpos(test.a,Utf8(\"f\")) |", - "+--------------------------+", - "| 0 |", - "| 0 |", - "| 6 |", - "| 9 |", - "+--------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +--------------------------+ + | strpos(test.a,Utf8("f")) | + +--------------------------+ + | 0 | + | 0 | + | 6 | + | 9 | + +--------------------------+ + "#); Ok(()) } @@ -948,17 +1031,20 @@ async fn test_fn_strpos() -> Result<()> { async fn test_fn_substr() -> Result<()> { let expr = substr(col("a"), lit(2)); - let expected = [ - "+-------------------------+", - "| substr(test.a,Int32(2)) |", - "+-------------------------+", - "| bcDEF |", - "| bc123 |", - "| BAdef |", - "| 23AbcDef |", - "+-------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +-------------------------+ + | substr(test.a,Int32(2)) | + +-------------------------+ + | bcDEF | + | bc123 | + | BAdef | + | 23AbcDef | + +-------------------------+ 
+ "); Ok(()) } @@ -966,18 +1052,20 @@ async fn test_fn_substr() -> Result<()> { #[tokio::test] async fn test_cast() -> Result<()> { let expr = cast(col("b"), DataType::Float64); - let expected = [ - "+--------+", - "| test.b |", - "+--------+", - "| 1.0 |", - "| 10.0 |", - "| 10.0 |", - "| 100.0 |", - "+--------+", - ]; - - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +--------+ + | test.b | + +--------+ + | 1.0 | + | 10.0 | + | 10.0 | + | 100.0 | + +--------+ + "); Ok(()) } @@ -986,17 +1074,20 @@ async fn test_cast() -> Result<()> { async fn test_fn_to_hex() -> Result<()> { let expr = to_hex(col("b")); - let expected = [ - "+----------------+", - "| to_hex(test.b) |", - "+----------------+", - "| 1 |", - "| a |", - "| a |", - "| 64 |", - "+----------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +----------------+ + | to_hex(test.b) | + +----------------+ + | 1 | + | a | + | a | + | 64 | + +----------------+ + "); Ok(()) } @@ -1006,17 +1097,20 @@ async fn test_fn_to_hex() -> Result<()> { async fn test_fn_translate() -> Result<()> { let expr = translate(col("a"), lit("bc"), lit("xx")); - let expected = [ - "+-----------------------------------------+", - "| translate(test.a,Utf8(\"bc\"),Utf8(\"xx\")) |", - "+-----------------------------------------+", - "| axxDEF |", - "| axx123 |", - "| CBAdef |", - "| 123AxxDef |", - "+-----------------------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-----------------------------------------+ + | translate(test.a,Utf8("bc"),Utf8("xx")) | + +-----------------------------------------+ + | axxDEF | + | axx123 | + | CBAdef | + | 123AxxDef | + +-----------------------------------------+ + "#); Ok(()) } @@ -1025,17 +1119,20 @@ async fn test_fn_translate() -> Result<()> { async fn test_fn_upper() -> Result<()> { let expr = upper(col("a")); - let expected = [ - "+---------------+", - "| upper(test.a) |", - "+---------------+", - "| ABCDEF |", - "| ABC123 |", - "| CBADEF |", - "| 123ABCDEF |", - "+---------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r" + +---------------+ + | upper(test.a) | + +---------------+ + | ABCDEF | + | ABC123 | + | CBADEF | + | 123ABCDEF | + +---------------+ + "); Ok(()) } @@ -1044,17 +1141,20 @@ async fn test_fn_upper() -> Result<()> { async fn test_fn_encode() -> Result<()> { let expr = encode(col("a"), lit("hex")); - let expected = [ - "+----------------------------+", - "| encode(test.a,Utf8(\"hex\")) |", - "+----------------------------+", - "| 616263444546 |", - "| 616263313233 |", - "| 434241646566 |", - "| 313233416263446566 |", - "+----------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +----------------------------+ + | encode(test.a,Utf8("hex")) | + +----------------------------+ + | 616263444546 | + | 616263313233 | + | 434241646566 | + | 313233416263446566 | + +----------------------------+ + "#); Ok(()) } @@ -1070,17 +1170,20 @@ async fn test_fn_decode() -> Result<()> { // so it looks like nothing is done .cast_to(&DataType::Utf8, &df_schema)?; - let expected = [ - 
"+------------------------------------------------+", - "| decode(encode(test.a,Utf8(\"hex\")),Utf8(\"hex\")) |", - "+------------------------------------------------+", - "| abcDEF |", - "| abc123 |", - "| CBAdef |", - "| 123AbcDef |", - "+------------------------------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +------------------------------------------------+ + | decode(encode(test.a,Utf8("hex")),Utf8("hex")) | + +------------------------------------------------+ + | abcDEF | + | abc123 | + | CBAdef | + | 123AbcDef | + +------------------------------------------------+ + "#); Ok(()) } @@ -1089,17 +1192,20 @@ async fn test_fn_decode() -> Result<()> { async fn test_fn_array_to_string() -> Result<()> { let expr = array_to_string(col("l"), lit("***")); - let expected = [ - "+-------------------------------------+", - "| array_to_string(test.l,Utf8(\"***\")) |", - "+-------------------------------------+", - "| 0***1***2 |", - "| |", - "| 3***5 |", - "| 6***7 |", - "+-------------------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +-------------------------------------+ + | array_to_string(test.l,Utf8("***")) | + +-------------------------------------+ + | 0***1***2 | + | | + | 3***5 | + | 6***7 | + +-------------------------------------+ + "#); Ok(()) } @@ -1110,17 +1216,20 @@ async fn test_fn_map() -> Result<()> { vec![lit("a"), lit("b"), lit("c")], vec![lit(1), lit(2), lit(3)], ); - let expected = [ - "+---------------------------------------------------------------------------------------+", - "| map(make_array(Utf8(\"a\"),Utf8(\"b\"),Utf8(\"c\")),make_array(Int32(1),Int32(2),Int32(3))) |", - "+---------------------------------------------------------------------------------------+", - "| {a: 1, b: 2, c: 3} |", - "| {a: 1, b: 2, c: 3} |", - "| {a: 1, b: 2, c: 3} |", - "| {a: 1, b: 2, c: 3} |", - "+---------------------------------------------------------------------------------------+", - ]; - assert_fn_batches!(expr, expected); + let batches = get_batches(expr).await?; + + assert_snapshot!( + batches_to_string(&batches), + @r#" + +---------------------------------------------------------------------------------------+ + | map(make_array(Utf8("a"),Utf8("b"),Utf8("c")),make_array(Int32(1),Int32(2),Int32(3))) | + +---------------------------------------------------------------------------------------+ + | {a: 1, b: 2, c: 3} | + | {a: 1, b: 2, c: 3} | + | {a: 1, b: 2, c: 3} | + | {a: 1, b: 2, c: 3} | + +---------------------------------------------------------------------------------------+ + "#); Ok(()) } @@ -1145,13 +1254,14 @@ async fn test_count_wildcard() -> Result<()> { .build() .unwrap(); - let expected = "Sort: count(*) ASC NULLS LAST [count(*):Int64]\ - \n Projection: count(*) [count(*):Int64]\ - \n Aggregate: groupBy=[[test.b]], aggr=[[count(Int64(1)) AS count(*)]] [b:UInt32, count(*):Int64]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - let formatted_plan = plan.display_indent_schema().to_string(); - assert_eq!(formatted_plan, expected); + assert_snapshot!(formatted_plan, + @r" + Sort: count(*) ASC NULLS LAST [count(*):Int64] + Projection: count(*) [count(*):Int64] + Aggregate: groupBy=[[test.b]], aggr=[[count(Int64(1)) AS count(*)]] [b:UInt32, count(*):Int64] + TableScan: test [a:UInt32, b:UInt32, c:UInt32] + "); Ok(()) } 
diff --git a/datafusion/core/tests/dataframe/describe.rs b/datafusion/core/tests/dataframe/describe.rs index 9321481efbd2..9bd69dfa72b4 100644 --- a/datafusion/core/tests/dataframe/describe.rs +++ b/datafusion/core/tests/dataframe/describe.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. -use datafusion::{ - assert_batches_eq, - prelude::{ParquetReadOptions, SessionContext}, -}; +use datafusion::prelude::{ParquetReadOptions, SessionContext}; +use datafusion_common::test_util::batches_to_string; use datafusion_common::{test_util::parquet_test_data, Result}; +use insta::assert_snapshot; #[tokio::test] async fn describe() -> Result<()> { @@ -33,21 +32,21 @@ async fn describe() -> Result<()> { .collect() .await?; - #[rustfmt::skip] - let expected = [ - "+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+", - "| describe | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | year | month |", - "+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+", - "| count | 7300.0 | 7300 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300 | 7300 | 7300 | 7300.0 | 7300.0 |", - "| null_count | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 |", - "| mean | 3649.5 | null | 4.5 | 4.5 | 4.5 | 45.0 | 4.949999964237213 | 45.45 | null | null | null | 2009.5 | 6.526027397260274 |", - "| std | 2107.472815166704 | null | 2.8724780750809518 | 2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 | 3.1597258182544645 | 29.012028558317645 | null | null | null | 0.5000342500942125 | 3.44808750051728 |", - "| min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 01/01/09 | 0 | 2008-12-31T23:00:00 | 2009.0 | 1.0 |", - "| max | 7299.0 | null | 9.0 | 9.0 | 9.0 | 90.0 | 9.899999618530273 | 90.89999999999999 | 12/31/10 | 9 | 2010-12-31T04:09:13.860 | 2010.0 | 12.0 |", - "| median | 3649.0 | null | 4.0 | 4.0 | 4.0 | 45.0 | 4.949999809265137 | 45.45 | null | null | null | 2009.0 | 7.0 |", - "+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+", - ]; - assert_batches_eq!(expected, &describe_record_batch); + assert_snapshot!( + batches_to_string(&describe_record_batch), + @r" + +------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+ + | describe | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | year | month | + 
+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+ + | count | 7300.0 | 7300 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300 | 7300 | 7300 | 7300.0 | 7300.0 | + | null_count | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 | + | mean | 3649.5 | null | 4.5 | 4.5 | 4.5 | 45.0 | 4.949999964237213 | 45.45 | null | null | null | 2009.5 | 6.526027397260274 | + | std | 2107.472815166704 | null | 2.8724780750809518 | 2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 | 3.1597258182544645 | 29.012028558317645 | null | null | null | 0.5000342500942125 | 3.44808750051728 | + | min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 01/01/09 | 0 | 2008-12-31T23:00:00 | 2009.0 | 1.0 | + | max | 7299.0 | null | 9.0 | 9.0 | 9.0 | 90.0 | 9.899999618530273 | 90.89999999999999 | 12/31/10 | 9 | 2010-12-31T04:09:13.860 | 2010.0 | 12.0 | + | median | 3649.0 | null | 4.0 | 4.0 | 4.0 | 45.0 | 4.949999809265137 | 45.45 | null | null | null | 2009.0 | 7.0 | + +------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+ + "); Ok(()) } @@ -63,21 +62,22 @@ async fn describe_boolean_binary() -> Result<()> { .await? .collect() .await?; - #[rustfmt::skip] - let expected = [ - "+------------+------+------+", - "| describe | a | b |", - "+------------+------+------+", - "| count | 1 | 1 |", - "| null_count | 0 | 0 |", - "| mean | null | null |", - "| std | null | null |", - "| min | a | null |", - "| max | a | null |", - "| median | null | null |", - "+------------+------+------+" - ]; - assert_batches_eq!(expected, &result); + + assert_snapshot!( + batches_to_string(&result), + @r" + +------------+------+------+ + | describe | a | b | + +------------+------+------+ + | count | 1 | 1 | + | null_count | 0 | 0 | + | mean | null | null | + | std | null | null | + | min | a | null | + | max | a | null | + | median | null | null | + +------------+------+------+ + "); Ok(()) } @@ -93,21 +93,22 @@ async fn describe_null() -> Result<()> { .await? 
.collect()
         .await?;
 
-    #[rustfmt::skip]
-    let expected = [
-        "+------------+------+------+",
-        "| describe   | a    | b    |",
-        "+------------+------+------+",
-        "| count      | 1    | 0    |",
-        "| null_count | 0    | 1    |",
-        "| mean       | null | null |",
-        "| std        | null | null |",
-        "| min        | a    | null |",
-        "| max        | a    | null |",
-        "| median     | null | null |",
-        "+------------+------+------+"
-    ];
-    assert_batches_eq!(expected, &result);
+
+    assert_snapshot!(
+        batches_to_string(&result),
+        @r"
+    +------------+------+------+
+    | describe   | a    | b    |
+    +------------+------+------+
+    | count      | 1    | 0    |
+    | null_count | 0    | 1    |
+    | mean       | null | null |
+    | std        | null | null |
+    | min        | a    | null |
+    | max        | a    | null |
+    | median     | null | null |
+    +------------+------+------+
+    ");
 
     Ok(())
 }
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index dadec5b2be5d..b19c0b978605 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -38,6 +38,7 @@ use datafusion_functions_aggregate::expr_fn::{
 };
 use datafusion_functions_nested::make_array::make_array_udf;
 use datafusion_functions_window::expr_fn::{first_value, row_number};
+use insta::assert_snapshot;
 use object_store::local::LocalFileSystem;
 use sqlparser::ast::NullTreatment;
 use std::collections::HashMap;
@@ -59,8 +60,8 @@ use datafusion::test_util::{
     parquet_test_data, populate_csv_partitions, register_aggregate_csv, test_table,
     test_table_with_name,
 };
-use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
 use datafusion_catalog::TableProvider;
+use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
 use datafusion_common::{
     assert_contains, Constraint, Constraints, DataFusionError, ParamValues, ScalarValue,
     TableReference, UnnestOptions,
@@ -72,28 +73,25 @@ use datafusion_expr::expr::{GroupingSet, Sort, WindowFunction};
 use datafusion_expr::var_provider::{VarProvider, VarType};
 use datafusion_expr::{
     cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder,
-    scalar_subquery, when, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan,
+    scalar_subquery, when, wildcard, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan,
     ScalarFunctionImplementation, WindowFrame, WindowFrameBound, WindowFrameUnits,
     WindowFunctionDefinition,
 };
 use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::Partitioning;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};
+use datafusion_physical_plan::{displayable, ExecutionPlanProperties};
 
 // Get string representation of the plan
-async fn assert_physical_plan(df: &DataFrame, expected: Vec<&str>) {
+async fn physical_plan_to_string(df: &DataFrame) -> String {
     let physical_plan = df
         .clone()
         .create_physical_plan()
         .await
         .expect("Error creating physical plan");
 
-    let actual = get_plan_string(&physical_plan);
-    assert_eq!(
-        expected, actual,
-        "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
-    );
+    let formatted = displayable(physical_plan.as_ref()).indent(true);
+    formatted.to_string()
 }
 
 pub fn table_with_constraints() -> Arc<dyn TableProvider> {
@@ -271,6 +269,16 @@ async fn select_expr() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn select_all() -> Result<()> {
+    let t = test_table().await?;
+    let plan = t.select([wildcard()])?.logical_plan().clone();
+    let sql_plan = create_plan("SELECT * FROM aggregate_test_100").await?;
+    assert_same_plan(&plan, &sql_plan);
+
+    Ok(())
+}
+
#[tokio::test] async fn select_exprs() -> Result<()> { // build plan using `select_expr`` @@ -330,9 +338,16 @@ async fn select_with_periods() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - ["+------+", "| f.c1 |", "+------+", "| 1 |", "| 10 |", "+------+"], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+ + | f.c1 | + +------+ + | 1 | + | 10 | + +------+ + "### ); Ok(()) @@ -429,16 +444,16 @@ async fn drop_with_quotes() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - [ - "+------+", - "| f\"c2 |", - "+------+", - "| 2 |", - "| 11 |", - "+------+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+ + | f"c2 | + +------+ + | 11 | + | 2 | + +------+ + "### ); Ok(()) @@ -461,9 +476,16 @@ async fn drop_with_periods() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - ["+------+", "| f.c2 |", "+------+", "| 2 |", "| 11 |", "+------+"], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+ + | f.c2 | + +------+ + | 11 | + | 2 | + +------+ + "### ); Ok(()) @@ -485,18 +507,20 @@ async fn aggregate() -> Result<()> { let df: Vec = df.aggregate(group_expr, aggr_expr)?.collect().await?; - assert_batches_sorted_eq!( - ["+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+", - "| c1 | min(aggregate_test_100.c12) | max(aggregate_test_100.c12) | avg(aggregate_test_100.c12) | sum(aggregate_test_100.c12) | count(aggregate_test_100.c12) | count(DISTINCT aggregate_test_100.c12) |", - "+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+", - "| a | 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 |", - "| b | 0.04893135681998029 | 0.9185813970744787 | 0.41040709263815384 | 7.797734760124923 | 19 | 19 |", - "| c | 0.0494924465469434 | 0.991517828651004 | 0.6600456536439784 | 13.860958726523545 | 21 | 21 |", - "| d | 0.061029375346466685 | 0.9748360509016578 | 0.48855379387549824 | 8.793968289758968 | 18 | 18 |", - "| e | 0.01479305307777301 | 0.9965400387585364 | 0.48600669271341534 | 10.206140546981722 | 21 | 21 |", - "+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+"], - &df - ); + assert_snapshot!( + batches_to_sort_string(&df), + @r###" + +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ + | c1 | min(aggregate_test_100.c12) | max(aggregate_test_100.c12) | avg(aggregate_test_100.c12) | sum(aggregate_test_100.c12) | count(aggregate_test_100.c12) | count(DISTINCT aggregate_test_100.c12) | + +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ + | a | 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 | + | b | 0.04893135681998029 | 0.9185813970744787 | 
0.41040709263815384 | 7.797734760124923 | 19 | 19 | + | c | 0.0494924465469434 | 0.991517828651004 | 0.6600456536439784 | 13.860958726523545 | 21 | 21 | + | d | 0.061029375346466685 | 0.9748360509016578 | 0.48855379387549824 | 8.793968289758968 | 18 | 18 | + | e | 0.01479305307777301 | 0.9965400387585364 | 0.48600669271341534 | 10.206140546981722 | 21 | 21 | + +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ + "### + ); Ok(()) } @@ -542,27 +566,27 @@ async fn test_aggregate_with_pk() -> Result<()> { // expression even if it is not part of the group by expression and can // select "name" column even though it wasn't explicitly grouped let df = df.select(vec![col("id"), col("name")])?; - assert_physical_plan( - &df, - vec![ - "AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]", - " DataSourceExec: partitions=1, partition_sizes=[1]", - ], - ) - .await; + + assert_snapshot!( + physical_plan_to_string(&df).await, + @r###" + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + "### + ); let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----+------+", - "| id | name |", - "+----+------+", - "| 1 | a |", - "+----+------+" - ], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 1 | a | + +----+------+ + "### + ); Ok(()) } @@ -584,30 +608,30 @@ async fn test_aggregate_with_pk2() -> Result<()> { // id = 1 AND name = 'a' let predicate = col("id").eq(lit(1i32)).and(col("name").eq(lit("a"))); let df = df.filter(predicate)?; - assert_physical_plan( - &df, - vec![ - "CoalesceBatchesExec: target_batch_size=8192", - " FilterExec: id@0 = 1 AND name@1 = a", - " AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]", - " DataSourceExec: partitions=1, partition_sizes=[1]", - ], - ) - .await; + assert_snapshot!( + physical_plan_to_string(&df).await, + @r###" + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 AND name@1 = a + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + "### + ); // Since id and name are functionally dependant, we can use name among expression // even if it is not part of the group by expression. let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+------+", - "| id | name |", - "+----+------+", - "| 1 | a |", - "+----+------+",], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 1 | a | + +----+------+ + "### + ); Ok(()) } @@ -633,30 +657,30 @@ async fn test_aggregate_with_pk3() -> Result<()> { // Select expression refers to id, and name columns. 
// id, name let df = df.select(vec![col("id"), col("name")])?; - assert_physical_plan( - &df, - vec![ - "CoalesceBatchesExec: target_batch_size=8192", - " FilterExec: id@0 = 1", - " AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]", - " DataSourceExec: partitions=1, partition_sizes=[1]", - ], - ) - .await; + assert_snapshot!( + physical_plan_to_string(&df).await, + @r###" + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + "### + ); // Since id and name are functionally dependant, we can use name among expression // even if it is not part of the group by expression. let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+------+", - "| id | name |", - "+----+------+", - "| 1 | a |", - "+----+------+",], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 1 | a | + +----+------+ + "### + ); Ok(()) } @@ -684,28 +708,28 @@ async fn test_aggregate_with_pk4() -> Result<()> { // In this case aggregate shouldn't be expanded, since these // columns are not used. - assert_physical_plan( - &df, - vec![ - "CoalesceBatchesExec: target_batch_size=8192", - " FilterExec: id@0 = 1", - " AggregateExec: mode=Single, gby=[id@0 as id], aggr=[]", - " DataSourceExec: partitions=1, partition_sizes=[1]", - ], - ) - .await; + assert_snapshot!( + physical_plan_to_string(&df).await, + @r###" + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 + AggregateExec: mode=Single, gby=[id@0 as id], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + "### + ); let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----+", - "| id |", - "+----+", - "| 1 |", - "+----+",], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | id | + +----+ + | 1 | + +----+ + "### + ); Ok(()) } @@ -724,20 +748,20 @@ async fn test_aggregate_alias() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----+", - "| c2 |", - "+----+", - "| 2 |", - "| 3 |", - "| 4 |", - "| 5 |", - "| 6 |", - "+----+", - ], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c2 | + +----+ + | 2 | + | 3 | + | 4 | + | 5 | + | 6 | + +----+ + "### + ); Ok(()) } @@ -771,22 +795,20 @@ async fn test_aggregate_with_union() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - [ - "+----+------------+", - "| c1 | sum_result |", - "+----+------------+", - "| a | 84 |", - "| b | 69 |", - "| c | 124 |", - "| d | 126 |", - "| e | 121 |", - "+----+------------+" - ], - &df_results - ); - + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------------+ + | c1 | sum_result | + +----+------------+ + | a | 84 | + | b | 69 | + | c | 124 | + | d | 126 | + | e | 121 | + +----+------------+ + "### + ); Ok(()) } @@ -809,20 +831,20 @@ async fn test_aggregate_subexpr() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----------------+------+", - "| c2 + Int32(10) | sum |", - "+----------------+------+", - "| 12 | 431 |", - "| 13 | 248 |", - "| 14 | 453 |", - "| 15 | 95 |", - "| 16 | -146 |", - "+----------------+------+", - ], - &df_results - ); + assert_snapshot!( + 
batches_to_sort_string(&df_results), + @r###" + +----------------+------+ + | c2 + Int32(10) | sum | + +----------------+------+ + | 12 | 431 | + | 13 | 248 | + | 14 | 453 | + | 15 | 95 | + | 16 | -146 | + +----------------+------+ + "### + ); Ok(()) } @@ -844,9 +866,7 @@ async fn test_aggregate_name_collision() -> Result<()> { // The select expr has the same display_name as the group_expr, // but since they are different expressions, it should fail. .expect_err("Expected error"); - let expected = "Schema error: No field named aggregate_test_100.c2. \ - Valid fields are \"aggregate_test_100.c2 + aggregate_test_100.c3\"."; - assert_eq!(df.strip_backtrace(), expected); + assert_snapshot!(df.strip_backtrace(), @r###"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."###); Ok(()) } @@ -903,36 +923,36 @@ async fn window_using_aggregates() -> Result<()> { let df: Vec = df.select(aggr_expr)?.collect().await?; - assert_batches_sorted_eq!( - [ - "+-------------+----------+-----------------+---------------+--------+-----+------+----+------+", - "| first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 |", - "+-------------+----------+-----------------+---------------+--------+-----+------+----+------+", - "| | | | | | | | 1 | -85 |", - "| -85 | -101 | 14 | -12 | -101 | 83 | -101 | 4 | -54 |", - "| -85 | -101 | 17 | -25 | -101 | 83 | -101 | 5 | -31 |", - "| -85 | -12 | 10 | -32 | -12 | 83 | -85 | 3 | 13 |", - "| -85 | -25 | 3 | -56 | -25 | -25 | -85 | 1 | -5 |", - "| -85 | -31 | 18 | -29 | -31 | 83 | -101 | 5 | 36 |", - "| -85 | -38 | 16 | -25 | -38 | 83 | -101 | 4 | 65 |", - "| -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 |", - "| -85 | -48 | 6 | -35 | -48 | 83 | -85 | 2 | -43 |", - "| -85 | -5 | 4 | -37 | -5 | -5 | -85 | 1 | 83 |", - "| -85 | -54 | 15 | -17 | -54 | 83 | -101 | 4 | -38 |", - "| -85 | -56 | 2 | -70 | -56 | -56 | -85 | 1 | -25 |", - "| -85 | -72 | 9 | -43 | -72 | 83 | -85 | 3 | -12 |", - "| -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 |", - "| -85 | 13 | 11 | -17 | 13 | 83 | -85 | 3 | 14 |", - "| -85 | 13 | 11 | -25 | 13 | 83 | -85 | 3 | 13 |", - "| -85 | 14 | 12 | -12 | 14 | 83 | -85 | 3 | 17 |", - "| -85 | 17 | 13 | -11 | 17 | 83 | -85 | 4 | -101 |", - "| -85 | 45 | 8 | -34 | 45 | 83 | -85 | 3 | -72 |", - "| -85 | 65 | 17 | -17 | 65 | 83 | -101 | 5 | -101 |", - "| -85 | 83 | 5 | -25 | 83 | 83 | -85 | 2 | -48 |", - "+-------------+----------+-----------------+---------------+--------+-----+------+----+------+", - ], - &df - ); + assert_snapshot!( + batches_to_sort_string(&df), + @r###" + +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ + | first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 | + +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ + | | | | | | | | 1 | -85 | + | -85 | -101 | 14 | -12 | -101 | 83 | -101 | 4 | -54 | + | -85 | -101 | 17 | -25 | -101 | 83 | -101 | 5 | -31 | + | -85 | -12 | 10 | -32 | -12 | 83 | -85 | 3 | 13 | + | -85 | -25 | 3 | -56 | -25 | -25 | -85 | 1 | -5 | + | -85 | -31 | 18 | -29 | -31 | 83 | -101 | 5 | 36 | + | -85 | -38 | 16 | -25 | -38 | 83 | -101 | 4 | 65 | + | -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 | + | -85 | -48 | 6 | -35 | -48 | 83 | -85 | 2 | -43 | + | -85 | -5 | 4 | -37 | -5 | -5 | -85 | 1 | 83 | + | -85 | -54 | 15 | -17 | -54 | 83 | -101 | 4 | -38 | + | -85 | -56 | 2 | -70 | -56 | -56 | -85 | 1 | -25 | + | 
-85 | -72 | 9 | -43 | -72 | 83 | -85 | 3 | -12 | + | -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 | + | -85 | 13 | 11 | -17 | 13 | 83 | -85 | 3 | 14 | + | -85 | 13 | 11 | -25 | 13 | 83 | -85 | 3 | 13 | + | -85 | 14 | 12 | -12 | 14 | 83 | -85 | 3 | 17 | + | -85 | 17 | 13 | -11 | 17 | 83 | -85 | 4 | -101 | + | -85 | 45 | 8 | -34 | 45 | 83 | -85 | 3 | -72 | + | -85 | 65 | 17 | -17 | 65 | 83 | -101 | 5 | -101 | + | -85 | 83 | 5 | -25 | 83 | 83 | -85 | 2 | -48 | + +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ + "### + ); Ok(()) } @@ -985,19 +1005,20 @@ async fn test_distinct_sort_by() -> Result<()> { let df_results = plan.clone().collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+", - "| c1 |", - "+----+", - "| a |", - "| b |", - "| c |", - "| d |", - "| e |", - "+----+"], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c1 | + +----+ + | a | + | b | + | c | + | d | + | e | + +----+ + "### + ); Ok(()) } @@ -1013,7 +1034,7 @@ async fn test_distinct_sort_by_unprojected() -> Result<()> { // try to sort on some value not present in input to distinct .sort(vec![col("c2").sort(true, true)]) .unwrap_err(); - assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); + assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); Ok(()) } @@ -1032,19 +1053,20 @@ async fn test_distinct_on() -> Result<()> { let df_results = plan.clone().collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+", - "| c1 |", - "+----+", - "| a |", - "| b |", - "| c |", - "| d |", - "| e |", - "+----+"], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c1 | + +----+ + | a | + | b | + | c | + | d | + | e | + +----+ + "### + ); Ok(()) } @@ -1066,19 +1088,20 @@ async fn test_distinct_on_sort_by() -> Result<()> { let df_results = plan.clone().collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+", - "| c1 |", - "+----+", - "| a |", - "| b |", - "| c |", - "| d |", - "| e |", - "+----+"], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c1 | + +----+ + | a | + | b | + | c | + | d | + | e | + +----+ + "### + ); Ok(()) } @@ -1098,7 +1121,7 @@ async fn test_distinct_on_sort_by_unprojected() -> Result<()> { // try to sort on some value not present in input to distinct .sort(vec![col("c2").sort(true, true)]) .unwrap_err(); - assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); + assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); Ok(()) } @@ -1139,15 +1162,15 @@ async fn join_coercion_unnamed() -> Result<()> { let join = right.join(left, JoinType::LeftAnti, &cols, &cols, filter)?; let results = join.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+------+", - "| id | name |", - "+----+------+", - "| 10 | d |", - "+----+------+", - ], - &results + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 10 | d | + +----+------+ + "### ); Ok(()) } @@ -1166,12 +1189,13 @@ async fn join_on() -> Result<()> { [col("a.c1").not_eq(col("b.c1")), col("a.c2").eq(col("b.c2"))], )?; - let 
expected_plan = "Inner Join: Filter: a.c1 != b.c1 AND a.c2 = b.c2\ - \n Projection: a.c1, a.c2\ - \n TableScan: a\ - \n Projection: b.c1, b.c2\ - \n TableScan: b"; - assert_eq!(expected_plan, format!("{}", join.logical_plan())); + assert_snapshot!(join.logical_plan(), @r###" + Inner Join: Filter: a.c1 != b.c1 AND a.c2 = b.c2 + Projection: a.c1, a.c2 + TableScan: a + Projection: b.c1, b.c2 + TableScan: b + "###); Ok(()) } @@ -1187,15 +1211,14 @@ async fn join_on_filter_datatype() -> Result<()> { JoinType::Inner, Some(Expr::Literal(ScalarValue::Null)), )?; - let expected_plan = "EmptyRelation"; - assert_eq!(expected_plan, format!("{}", join.into_optimized_plan()?)); + assert_snapshot!(join.into_optimized_plan().unwrap(), @"EmptyRelation"); // JOIN ON expression must be boolean type let join = left.join_on(right, JoinType::Inner, Some(lit("TRUE")))?; - let expected = join.into_optimized_plan().unwrap_err(); - assert_eq!( - expected.strip_backtrace(), - "type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8" + let err = join.into_optimized_plan().unwrap_err(); + assert_snapshot!( + err.strip_backtrace(), + @"type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8" ); Ok(()) } @@ -1212,8 +1235,7 @@ async fn join_ambiguous_filter() -> Result<()> { let join = left .join_on(right, JoinType::Inner, [col("c1").eq(col("c1"))]) .expect_err("join didn't fail check"); - let expected = "Schema error: Ambiguous reference to unqualified field c1"; - assert_eq!(join.strip_backtrace(), expected); + assert_snapshot!(join.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field c1"); Ok(()) } @@ -1359,35 +1381,35 @@ async fn register_table() -> Result<()> { .await?; let table_results = &table.aggregate(group_expr, aggr_expr)?.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+-----------------------------+", - "| c1 | sum(aggregate_test_100.c12) |", - "+----+-----------------------------+", - "| a | 10.238448667882977 |", - "| b | 7.797734760124923 |", - "| c | 13.860958726523545 |", - "| d | 8.793968289758968 |", - "| e | 10.206140546981722 |", - "+----+-----------------------------+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+-----------------------------+ + | c1 | sum(aggregate_test_100.c12) | + +----+-----------------------------+ + | a | 10.238448667882977 | + | b | 7.797734760124923 | + | c | 13.860958726523545 | + | d | 8.793968289758968 | + | e | 10.206140546981722 | + +----+-----------------------------+ + "### ); // the results are the same as the results from the view, modulo the leaf table name - assert_batches_sorted_eq!( - [ - "+----+---------------------+", - "| c1 | sum(test_table.c12) |", - "+----+---------------------+", - "| a | 10.238448667882977 |", - "| b | 7.797734760124923 |", - "| c | 13.860958726523545 |", - "| d | 8.793968289758968 |", - "| e | 10.206140546981722 |", - "+----+---------------------+" - ], - table_results + assert_snapshot!( + batches_to_sort_string(table_results), + @r###" + +----+---------------------+ + | c1 | sum(test_table.c12) | + +----+---------------------+ + | a | 10.238448667882977 | + | b | 7.797734760124923 | + | c | 13.860958726523545 | + | d | 8.793968289758968 | + | e | 10.206140546981722 | + +----+---------------------+ + "### ); Ok(()) } @@ -1417,20 +1439,20 @@ async fn with_column() -> Result<()> { // check that new column added let df_results = df.clone().collect().await?; - 
assert_batches_sorted_eq!( - [ - "+----+----+-----+-----+", - "| c1 | c2 | c3 | sum |", - "+----+----+-----+-----+", - "| a | 3 | -12 | -9 |", - "| a | 3 | -72 | -69 |", - "| a | 3 | 13 | 16 |", - "| a | 3 | 13 | 16 |", - "| a | 3 | 14 | 17 |", - "| a | 3 | 17 | 20 |", - "+----+----+-----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+-----+ + | c1 | c2 | c3 | sum | + +----+----+-----+-----+ + | a | 3 | -12 | -9 | + | a | 3 | -72 | -69 | + | a | 3 | 13 | 16 | + | a | 3 | 13 | 16 | + | a | 3 | 14 | 17 | + | a | 3 | 17 | 20 | + +----+----+-----+-----+ + "### ); // check that col with the same name overwritten @@ -1440,20 +1462,20 @@ async fn with_column() -> Result<()> { .collect() .await?; - assert_batches_sorted_eq!( - [ - "+-----+----+-----+-----+", - "| c1 | c2 | c3 | sum |", - "+-----+----+-----+-----+", - "| -69 | 3 | -72 | -69 |", - "| -9 | 3 | -12 | -9 |", - "| 16 | 3 | 13 | 16 |", - "| 16 | 3 | 13 | 16 |", - "| 17 | 3 | 14 | 17 |", - "| 20 | 3 | 17 | 20 |", - "+-----+----+-----+-----+" - ], - &df_results_overwrite + assert_snapshot!( + batches_to_sort_string(&df_results_overwrite), + @r###" + +-----+----+-----+-----+ + | c1 | c2 | c3 | sum | + +-----+----+-----+-----+ + | -69 | 3 | -72 | -69 | + | -9 | 3 | -12 | -9 | + | 16 | 3 | 13 | 16 | + | 16 | 3 | 13 | 16 | + | 17 | 3 | 14 | 17 | + | 20 | 3 | 17 | 20 | + +-----+----+-----+-----+ + "### ); // check that col with the same name overwritten using same name as reference @@ -1463,20 +1485,20 @@ async fn with_column() -> Result<()> { .collect() .await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+-----+", - "| c1 | c2 | c3 | sum |", - "+----+----+-----+-----+", - "| a | 4 | -12 | -9 |", - "| a | 4 | -72 | -69 |", - "| a | 4 | 13 | 16 |", - "| a | 4 | 13 | 16 |", - "| a | 4 | 14 | 17 |", - "| a | 4 | 17 | 20 |", - "+----+----+-----+-----+" - ], - &df_results_overwrite_self + assert_snapshot!( + batches_to_sort_string(&df_results_overwrite_self), + @r###" + +----+----+-----+-----+ + | c1 | c2 | c3 | sum | + +----+----+-----+-----+ + | a | 4 | -12 | -9 | + | a | 4 | -72 | -69 | + | a | 4 | 13 | 16 | + | a | 4 | 13 | 16 | + | a | 4 | 14 | 17 | + | a | 4 | 17 | 20 | + +----+----+-----+-----+ + "### ); Ok(()) @@ -1502,16 +1524,16 @@ async fn test_window_function_with_column() -> Result<()> { assert_eq!(5, df.schema().fields().len()); let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+-----+---+", - "| c1 | c2 | c3 | s | r |", - "+----+----+-----+-----+---+", - "| c | 2 | 1 | 3 | 1 |", - "| d | 5 | -40 | -35 | 2 |", - "+----+----+-----+-----+---+", - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+-----+---+ + | c1 | c2 | c3 | s | r | + +----+----+-----+-----+---+ + | c | 2 | 1 | 3 | 1 | + | d | 5 | -40 | -35 | 2 | + +----+----+-----+-----+---+ + "### ); Ok(()) @@ -1544,54 +1566,61 @@ async fn with_column_join_same_columns() -> Result<()> { .limit(0, Some(1))?; let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+", - "| c1 | c1 |", - "+----+----+", - "| a | a |", - "+----+----+", - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+ + | c1 | c1 | + +----+----+ + | a | a | + +----+----+ + "### ); let df_with_column = df.clone().with_column("new_column", lit(true))?; - assert_eq!( - "\ - Projection: t1.c1, t2.c1, Boolean(true) AS new_column\ - \n Limit: skip=0, fetch=1\ - \n Sort: 
t1.c1 ASC NULLS FIRST\ - \n Inner Join: t1.c1 = t2.c1\ - \n TableScan: t1\ - \n TableScan: t2", - format!("{}", df_with_column.logical_plan()) + assert_snapshot!( + df_with_column.logical_plan(), + @r" + Projection: t1.c1, t2.c1, Boolean(true) AS new_column + Limit: skip=0, fetch=1 + Sort: t1.c1 ASC NULLS FIRST + Inner Join: t1.c1 = t2.c1 + SubqueryAlias: t1 + Projection: aggregate_test_100.c1 + TableScan: aggregate_test_100 + SubqueryAlias: t2 + Projection: aggregate_test_100.c1 + TableScan: aggregate_test_100 + " ); - assert_eq!( - "\ - Projection: t1.c1, t2.c1, Boolean(true) AS new_column\ - \n Sort: t1.c1 ASC NULLS FIRST, fetch=1\ - \n Inner Join: t1.c1 = t2.c1\ - \n SubqueryAlias: t1\ - \n TableScan: aggregate_test_100 projection=[c1]\ - \n SubqueryAlias: t2\ - \n TableScan: aggregate_test_100 projection=[c1]", - format!("{}", df_with_column.clone().into_optimized_plan()?) + assert_snapshot!( + df_with_column.clone().into_optimized_plan().unwrap(), + @r###" + Projection: t1.c1, t2.c1, Boolean(true) AS new_column + Sort: t1.c1 ASC NULLS FIRST, fetch=1 + Inner Join: t1.c1 = t2.c1 + SubqueryAlias: t1 + TableScan: aggregate_test_100 projection=[c1] + SubqueryAlias: t2 + TableScan: aggregate_test_100 projection=[c1] + "### ); let df_results = df_with_column.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+------------+", - "| c1 | c1 | new_column |", - "+----+----+------------+", - "| a | a | true |", - "+----+----+------------+", - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+------------+ + | c1 | c1 | new_column | + +----+----+------------+ + | a | a | true | + +----+----+------------+ + "### ); + Ok(()) } @@ -1637,15 +1666,15 @@ async fn with_column_renamed() -> Result<()> { let batches = &df_sum_renamed.collect().await?; - assert_batches_sorted_eq!( - [ - "+-----+-----+-----+-------+", - "| one | two | c3 | total |", - "+-----+-----+-----+-------+", - "| a | 3 | -72 | -69 |", - "+-----+-----+-----+-------+", - ], - batches + assert_snapshot!( + batches_to_sort_string(batches), + @r###" + +-----+-----+-----+-------+ + | one | two | c3 | total | + +-----+-----+-----+-------+ + | a | 3 | -72 | -69 | + +-----+-----+-----+-------+ + "### ); Ok(()) @@ -1673,8 +1702,7 @@ async fn with_column_renamed_ambiguous() -> Result<()> { // can be t1.c2 or t2.c2 .with_column_renamed("c2", "AAA") .unwrap_err(); - let expected_err = "Schema error: Ambiguous reference to unqualified field c2"; - assert_eq!(actual_err.strip_backtrace(), expected_err); + assert_snapshot!(actual_err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field c2"); Ok(()) } @@ -1709,51 +1737,59 @@ async fn with_column_renamed_join() -> Result<()> { .limit(0, Some(1))?; let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+----+----+-----+", - "| c1 | c2 | c3 | c1 | c2 | c3 |", - "+----+----+-----+----+----+-----+", - "| a | 1 | -85 | a | 1 | -85 |", - "+----+----+-----+----+----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+----+----+-----+ + | c1 | c2 | c3 | c1 | c2 | c3 | + +----+----+-----+----+----+-----+ + | a | 1 | -85 | a | 1 | -85 | + +----+----+-----+----+----+-----+ + "### ); let df_renamed = df.clone().with_column_renamed("t1.c1", "AAA")?; - assert_eq!("\ - Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3\ - \n Limit: skip=0, fetch=1\ - \n Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS 
FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST\ - \n Inner Join: t1.c1 = t2.c1\ - \n TableScan: t1\ - \n TableScan: t2", - format!("{}", df_renamed.logical_plan()) + assert_snapshot!( + df_renamed.logical_plan(), + @r" + Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3 + Limit: skip=0, fetch=1 + Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST + Inner Join: t1.c1 = t2.c1 + SubqueryAlias: t1 + Projection: aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c3 + TableScan: aggregate_test_100 + SubqueryAlias: t2 + Projection: aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c3 + TableScan: aggregate_test_100 + " ); - assert_eq!("\ - Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3\ - \n Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST, fetch=1\ - \n Inner Join: t1.c1 = t2.c1\ - \n SubqueryAlias: t1\ - \n TableScan: aggregate_test_100 projection=[c1, c2, c3]\ - \n SubqueryAlias: t2\ - \n TableScan: aggregate_test_100 projection=[c1, c2, c3]", - format!("{}", df_renamed.clone().into_optimized_plan()?) + assert_snapshot!( + df_renamed.clone().into_optimized_plan().unwrap(), + @r###" + Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3 + Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST, fetch=1 + Inner Join: t1.c1 = t2.c1 + SubqueryAlias: t1 + TableScan: aggregate_test_100 projection=[c1, c2, c3] + SubqueryAlias: t2 + TableScan: aggregate_test_100 projection=[c1, c2, c3] + "### ); let df_results = df_renamed.collect().await?; - assert_batches_sorted_eq!( - [ - "+-----+----+-----+----+----+-----+", - "| AAA | c2 | c3 | c1 | c2 | c3 |", - "+-----+----+-----+----+----+-----+", - "| a | 1 | -85 | a | 1 | -85 |", - "+-----+----+-----+----+----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +-----+----+-----+----+----+-----+ + | AAA | c2 | c3 | c1 | c2 | c3 | + +-----+----+-----+----+----+-----+ + | a | 1 | -85 | a | 1 | -85 | + +-----+----+-----+----+----+-----+ + "### ); Ok(()) @@ -1786,15 +1822,15 @@ async fn with_column_renamed_case_sensitive() -> Result<()> { let res = &df_renamed.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+---------+", - "| CoLuMn1 |", - "+---------+", - "| a |", - "+---------+" - ], - res + assert_snapshot!( + batches_to_sort_string(res), + @r###" + +---------+ + | CoLuMn1 | + +---------+ + | a | + +---------+ + "### ); let df_renamed = df_renamed @@ -1802,9 +1838,15 @@ async fn with_column_renamed_case_sensitive() -> Result<()> { .collect() .await?; - assert_batches_sorted_eq!( - ["+----+", "| c1 |", "+----+", "| a |", "+----+"], - &df_renamed + assert_snapshot!( + batches_to_sort_string(&df_renamed), + @r###" + +----+ + | c1 | + +----+ + | a | + +----+ + "### ); Ok(()) @@ -1820,15 +1862,15 @@ async fn cast_expr_test() -> Result<()> { let df_results = df.clone().collect().await?; df.clone().show().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+", - "| c2 | c3 | sum |", - "+----+----+-----+", - "| 2 | 1 | 3 |", - "+----+----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+ + | c2 | c3 | sum | + +----+----+-----+ + | 2 | 1 | 3 | + +----+----+-----+ + "### ); Ok(()) @@ 
-1886,16 +1928,16 @@ async fn with_column_name() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - [ - "+------+-------+", - "| f.c1 | f.c2 |", - "+------+-------+", - "| 1 | hello |", - "| 10 | hello |", - "+------+-------+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+-------+ + | f.c1 | f.c2 | + +------+-------+ + | 1 | hello | + | 10 | hello | + +------+-------+ + "### ); Ok(()) @@ -1922,22 +1964,22 @@ async fn cache_test() -> Result<()> { let cached_df = df.clone().cache().await?; - assert_eq!( - "TableScan: ?table? projection=[c2, c3, sum]", - format!("{}", cached_df.clone().into_optimized_plan()?) + assert_snapshot!( + cached_df.clone().into_optimized_plan().unwrap(), + @"TableScan: ?table? projection=[c2, c3, sum]" ); let df_results = df.collect().await?; let cached_df_results = cached_df.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+", - "| c2 | c3 | sum |", - "+----+----+-----+", - "| 2 | 1 | 3 |", - "+----+----+-----+" - ], - &cached_df_results + assert_snapshot!( + batches_to_sort_string(&cached_df_results), + @r###" + +----+----+-----+ + | c2 | c3 | sum | + +----+----+-----+ + | 2 | 1 | 3 | + +----+----+-----+ + "### ); assert_eq!(&df_results, &cached_df_results); @@ -2210,15 +2252,15 @@ async fn filtered_aggr_with_param_values() -> Result<()> { .with_param_values(ParamValues::List(vec![ScalarValue::from(10u64)])); let df_results = df?.collect().await?; - assert_batches_eq!( - &[ - "+------------------------------------------------+", - "| count(table1.c2) FILTER (WHERE table1.c3 > $1) |", - "+------------------------------------------------+", - "| 54 |", - "+------------------------------------------------+", - ], - &df_results + assert_snapshot!( + batches_to_string(&df_results), + @r###" + +------------------------------------------------+ + | count(table1.c2) FILTER (WHERE table1.c3 > $1) | + +------------------------------------------------+ + | 54 | + +------------------------------------------------+ + "### ); Ok(()) @@ -2264,20 +2306,21 @@ async fn write_parquet_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+---+---+", - "| a | b |", - "+---+---+", - "| 1 | 2 |", - "| 2 | 6 |", - "| 3 | 5 |", - "| 5 | 3 |", - "| 7 | 4 |", - "+---+---+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +---+---+ + | a | b | + +---+---+ + | 1 | 2 | + | 2 | 6 | + | 3 | 5 | + | 5 | 3 | + | 7 | 4 | + +---+---+ + "### ); + Ok(()) } @@ -2321,19 +2364,19 @@ async fn write_csv_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+---+---+", - "| a | b |", - "+---+---+", - "| 1 | 2 |", - "| 2 | 6 |", - "| 3 | 5 |", - "| 5 | 3 |", - "| 7 | 4 |", - "+---+---+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +---+---+ + | a | b | + +---+---+ + | 1 | 2 | + | 2 | 6 | + | 3 | 5 | + | 5 | 3 | + | 7 | 4 | + +---+---+ + "### ); Ok(()) } @@ -2378,19 +2421,19 @@ async fn write_json_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+---+---+", - "| a | b |", - "+---+---+", - "| 1 | 2 |", - "| 2 | 6 |", - "| 3 | 5 |", - "| 5 | 3 |", - "| 7 | 4 |", - "+---+---+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +---+---+ + | a | b | + 
+---+---+ + | 1 | 2 | + | 2 | 6 | + | 3 | 5 | + | 5 | 3 | + | 7 | 4 | + +---+---+ + "### ); Ok(()) } @@ -2432,19 +2475,19 @@ async fn write_table_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+-----------+", - "| tablecol1 |", - "+-----------+", - "| a |", - "| b |", - "| c |", - "| x |", - "| z |", - "+-----------+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------+ + | tablecol1 | + +-----------+ + | a | + | b | + | c | + | x | + | z | + +-----------+ + "### ); Ok(()) } @@ -2469,50 +2512,52 @@ async fn test_count_wildcard_on_sort() -> Result<()> { .collect() .await?; - let expected_sql_result = "+---------------+------------------------------------------------------------------------------------+\ - \n| plan_type | plan |\ - \n+---------------+------------------------------------------------------------------------------------+\ - \n| logical_plan | Sort: count(*) ASC NULLS LAST |\ - \n| | Projection: t1.b, count(Int64(1)) AS count(*) |\ - \n| | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] |\ - \n| | TableScan: t1 projection=[b] |\ - \n| physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] |\ - \n| | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] |\ - \n| | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*)] |\ - \n| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] |\ - \n| | CoalesceBatchesExec: target_batch_size=8192 |\ - \n| | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 |\ - \n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ - \n| | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] |\ - \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ - \n| | |\ - \n+---------------+------------------------------------------------------------------------------------+"; - - assert_eq!( - expected_sql_result, - pretty_format_batches(&sql_results)?.to_string() - ); - - let expected_df_result = "+---------------+--------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+--------------------------------------------------------------------------------+\ -\n| logical_plan | Sort: count(*) ASC NULLS LAST |\ -\n| | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] |\ -\n| | TableScan: t1 projection=[b] |\ -\n| physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] |\ -\n| | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] |\ -\n| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 |\ -\n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ -\n| | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+--------------------------------------------------------------------------------+"; + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + 
+---------------+------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: t1.b, count(*) | + | | Sort: count(Int64(1)) AS count(*) AS count(*) ASC NULLS LAST | + | | Projection: t1.b, count(Int64(1)) AS count(*), count(Int64(1)) | + | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] | + | | TableScan: t1 projection=[b] | + | physical_plan | ProjectionExec: expr=[b@0 as b, count(*)@1 as count(*)] | + | | SortPreservingMergeExec: [count(Int64(1))@2 ASC NULLS LAST] | + | | SortExec: expr=[count(Int64(1))@2 ASC NULLS LAST], preserve_partitioning=[true] | + | | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] | + | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+------------------------------------------------------------------------------------------------------------+ + "### + ); - assert_eq!( - expected_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+--------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+--------------------------------------------------------------------------------+ + | logical_plan | Sort: count(*) ASC NULLS LAST | + | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t1 projection=[b] | + | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] | + | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | + | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+--------------------------------------------------------------------------------+ + "### ); Ok(()) } @@ -2527,27 +2572,27 @@ async fn test_count_wildcard_on_where_in() -> Result<()> { .collect() .await?; - let expected_sql_result = "+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) |\ -\n| | TableScan: t1 projection=[a, b] |\ -\n| | SubqueryAlias: __correlated_sq_1 |\ -\n| | Projection: count(Int64(1)) AS count(*) |\ -\n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |\ -\n| | TableScan: t2 projection=[] |\ -\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |\ -\n| | ProjectionExec: expr=[4 
as count(*)] |\ -\n| | PlaceholderRowExec |\ -\n| | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+"; - - assert_eq!( - expected_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Projection: count(Int64(1)) AS count(*) | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | + | | TableScan: t2 projection=[] | + | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | + | | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | + | | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+------------------------------------------------------------------------------------------------------------------------+ + "### ); // In the same SessionContext, AliasGenerator will increase subquery_alias id by 1 @@ -2572,27 +2617,27 @@ async fn test_count_wildcard_on_where_in() -> Result<()> { .collect() .await?; - let actual_df_result= "+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) |\ -\n| | TableScan: t1 projection=[a, b] |\ -\n| | SubqueryAlias: __correlated_sq_1 |\ -\n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |\ -\n| | TableScan: t2 projection=[] |\ -\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |\ -\n| | ProjectionExec: expr=[4 as count(*)] |\ -\n| | PlaceholderRowExec |\ -\n| | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+"; - // make sure sql plan same with df plan - assert_eq!( - actual_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + 
+---------------+------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t2 projection=[] | + | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | + | | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | + | | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+------------------------------------------------------------------------------------------------------------------------+ + "### ); Ok(()) @@ -2608,26 +2653,25 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> { .collect() .await?; - let actual_sql_result = - "+---------------+---------------------------------------------------------+\ - \n| plan_type | plan |\ - \n+---------------+---------------------------------------------------------+\ - \n| logical_plan | LeftSemi Join: |\ - \n| | TableScan: t1 projection=[a, b] |\ - \n| | SubqueryAlias: __correlated_sq_1 |\ - \n| | Projection: |\ - \n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |\ - \n| | TableScan: t2 projection=[] |\ - \n| physical_plan | NestedLoopJoinExec: join_type=RightSemi |\ - \n| | ProjectionExec: expr=[] |\ - \n| | PlaceholderRowExec |\ - \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ - \n| | |\ - \n+---------------+---------------------------------------------------------+"; - - assert_eq!( - actual_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------+ + | logical_plan | LeftSemi Join: | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Projection: | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | + | | TableScan: t2 projection=[] | + | physical_plan | NestedLoopJoinExec: join_type=RightSemi | + | | ProjectionExec: expr=[] | + | | PlaceholderRowExec | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------+ + "### ); let df_results = ctx @@ -2648,25 +2692,25 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> { .collect() .await?; - let actual_df_result = "+---------------+---------------------------------------------------------------------+\ - \n| plan_type | plan |\ - \n+---------------+---------------------------------------------------------------------+\ - \n| logical_plan | LeftSemi Join: |\ - \n| | TableScan: t1 projection=[a, b] |\ - \n| | SubqueryAlias: __correlated_sq_1 |\ - \n| | Projection: |\ - \n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |\ - \n| | TableScan: t2 projection=[] |\ - \n| physical_plan | NestedLoopJoinExec: join_type=RightSemi |\ - \n| | ProjectionExec: expr=[] |\ - \n| | PlaceholderRowExec |\ - \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ - \n| | |\ - 
\n+---------------+---------------------------------------------------------------------+"; - - assert_eq!( - actual_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Projection: | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t2 projection=[] | + | physical_plan | NestedLoopJoinExec: join_type=RightSemi | + | | ProjectionExec: expr=[] | + | | PlaceholderRowExec | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------------------+ + "### ); Ok(()) @@ -2683,22 +2727,22 @@ async fn test_count_wildcard_on_window() -> Result<()> { .collect() .await?; - let actual_sql_result = "+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING |\ -\n| | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] |\ -\n| | TableScan: t1 projection=[a] |\ -\n| physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] |\ -\n| | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: \"count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] |\ -\n| | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ 
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+"; - - assert_eq!( - actual_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + "### ); let df_results = ctx @@ -2717,22 +2761,22 @@ async fn test_count_wildcard_on_window() -> Result<()> { .collect() .await?; - let actual_df_result = 
"+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING |\ -\n| | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] |\ -\n| | TableScan: t1 projection=[a] |\ -\n| physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] |\ -\n| | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: \"count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] |\ -\n| | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+"; - - assert_eq!( - actual_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + 
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + "### ); Ok(()) @@ -2750,20 +2794,20 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> { .collect() .await?; - let actual_sql_result = - "+---------------+-----------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+-----------------------------------------------------+\ -\n| logical_plan | Projection: count(Int64(1)) AS count(*) |\ -\n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |\ -\n| | TableScan: t1 projection=[] |\ -\n| physical_plan | ProjectionExec: expr=[4 as count(*)] |\ -\n| | PlaceholderRowExec |\ -\n| | |\ -\n+---------------+-----------------------------------------------------+"; - assert_eq!( - actual_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+-----------------------------------------------------+ + | plan_type | plan | + +---------------+-----------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) AS count(*) | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | + | | TableScan: t1 projection=[] | + | physical_plan | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | | + +---------------+-----------------------------------------------------+ + "### ); // add `.select(vec![count_wildcard()])?` to make sure we can analyze all node instead of just top node. 
@@ -2776,18 +2820,19 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> { .collect() .await?; - let actual_df_result = "+---------------+---------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+---------------------------------------------------------------+\ -\n| logical_plan | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |\ -\n| | TableScan: t1 projection=[] |\ -\n| physical_plan | ProjectionExec: expr=[4 as count(*)] |\ -\n| | PlaceholderRowExec |\ -\n| | |\ -\n+---------------+---------------------------------------------------------------+"; - assert_eq!( - actual_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------------+ + | logical_plan | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t1 projection=[] | + | physical_plan | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | | + +---------------+---------------------------------------------------------------+ + "### ); Ok(()) @@ -2804,37 +2849,38 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { .collect() .await?; - let actual_sql_result = "+---------------+---------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | Projection: t1.a, t1.b |\ -\n| | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) |\ -\n| | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true |\ -\n| | Left Join: t1.a = __scalar_sq_1.a |\ -\n| | TableScan: t1 projection=[a, b] |\ -\n| | SubqueryAlias: __scalar_sq_1 |\ -\n| | Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true |\ -\n| | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]] |\ -\n| | TableScan: t2 projection=[a] |\ -\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true] |\ -\n| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 |\ -\n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ -\n| | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ 
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------+"; - assert_eq!( - actual_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: t1.a, t1.b | + | | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) | + | | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true | + | | Left Join: t1.a = __scalar_sq_1.a | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __scalar_sq_1 | + | | Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true | + | | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]] | + | | TableScan: t2 projection=[a] | + | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | + | | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true] | + | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------------------------------------------------------------------------+ + "### ); // In the same SessionContext, AliasGenerator will increase subquery_alias id by 1 @@ -2862,37 +2908,38 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> { .collect() .await?; - let actual_df_result = "+---------------+---------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | Projection: t1.a, t1.b |\ -\n| | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) |\ -\n| | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true |\ -\n| | Left Join: t1.a = __scalar_sq_1.a |\ -\n| | TableScan: t1 projection=[a, b] |\ -\n| | SubqueryAlias: __scalar_sq_1 |\ -\n| | Projection: count(*), t2.a, Boolean(true) AS __always_true |\ -\n| | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]] |\ -\n| | TableScan: t2 projection=[a] |\ -\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192 |\ 
-\n| | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true] |\ -\n| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 |\ -\n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ -\n| | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------+"; - assert_eq!( - actual_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: t1.a, t1.b | + | | Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0) | + | | Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true | + | | Left Join: t1.a = __scalar_sq_1.a | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __scalar_sq_1 | + | | Projection: count(*), t2.a, Boolean(true) AS __always_true | + | | Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t2 projection=[a] | + | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | + | | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true] | + | | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------------------------------------------------------------------------+ + "### ); Ok(()) @@ -2974,16 +3021,19 @@ async fn sort_on_unprojected_columns() -> Result<()> { .unwrap(); let results = df.collect().await.unwrap(); - #[rustfmt::skip] - let expected = ["+-----+", - "| a |", - 
"+-----+", - "| 100 |", - "| 10 |", - "| 10 |", - "| 1 |", - "+-----+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----+ + | a | + +-----+ + | 100 | + | 10 | + | 10 | + | 1 | + +-----+ + "### + ); Ok(()) } @@ -3018,15 +3068,18 @@ async fn sort_on_distinct_columns() -> Result<()> { .unwrap(); let results = df.collect().await.unwrap(); - #[rustfmt::skip] - let expected = ["+-----+", - "| a |", - "+-----+", - "| 100 |", - "| 10 |", - "| 1 |", - "+-----+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----+ + | a | + +-----+ + | 100 | + | 10 | + | 1 | + +-----+ + "### + ); Ok(()) } @@ -3055,7 +3108,7 @@ async fn sort_on_distinct_unprojected_columns() -> Result<()> { .distinct()? .sort(vec![Sort::new(col("b"), false, true)]) .unwrap_err(); - assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions b must appear in select list"); + assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions b must appear in select list"); Ok(()) } @@ -3073,8 +3126,7 @@ async fn sort_on_ambiguous_column() -> Result<()> { .sort(vec![col("b").sort(true, true)]) .unwrap_err(); - let expected = "Schema error: Ambiguous reference to unqualified field b"; - assert_eq!(err.strip_backtrace(), expected); + assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b"); Ok(()) } @@ -3092,8 +3144,7 @@ async fn group_by_ambiguous_column() -> Result<()> { .aggregate(vec![col("b")], vec![max(col("a"))]) .unwrap_err(); - let expected = "Schema error: Ambiguous reference to unqualified field b"; - assert_eq!(err.strip_backtrace(), expected); + assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b"); Ok(()) } @@ -3111,8 +3162,7 @@ async fn filter_on_ambiguous_column() -> Result<()> { .filter(col("b").eq(lit(1))) .unwrap_err(); - let expected = "Schema error: Ambiguous reference to unqualified field b"; - assert_eq!(err.strip_backtrace(), expected); + assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b"); Ok(()) } @@ -3130,8 +3180,7 @@ async fn select_ambiguous_column() -> Result<()> { .select(vec![col("b")]) .unwrap_err(); - let expected = "Schema error: Ambiguous reference to unqualified field b"; - assert_eq!(err.strip_backtrace(), expected); + assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b"); Ok(()) } @@ -3158,14 +3207,17 @@ async fn filter_with_alias_overwrite() -> Result<()> { .unwrap(); let results = df.collect().await.unwrap(); - #[rustfmt::skip] - let expected = ["+------+", - "| a |", - "+------+", - "| true |", - "| true |", - "+------+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+ + | a | + +------+ + | true | + | true | + +------+ + "### + ); Ok(()) } @@ -3191,16 +3243,19 @@ async fn select_with_alias_overwrite() -> Result<()> { let results = df.collect().await?; - #[rustfmt::skip] - let expected = ["+-------+", - "| a |", - "+-------+", - "| false |", - "| true |", - "| true |", - "| false |", - "+-------+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-------+ + | a | + +-------+ + | false | + | true | + | true | + | false | + +-------+ + "### + ); Ok(()) } @@ -3223,24 +3278,26 @@ async fn 
test_grouping_sets() -> Result<()> { let results = df.collect().await?; - let expected = vec![ - "+-----------+-----+---------------+", - "| a | b | count(test.a) |", - "+-----------+-----+---------------+", - "| | 100 | 1 |", - "| | 10 | 2 |", - "| | 1 | 1 |", - "| abcDEF | | 1 |", - "| abcDEF | 1 | 1 |", - "| abc123 | | 1 |", - "| abc123 | 10 | 1 |", - "| CBAdef | | 1 |", - "| CBAdef | 10 | 1 |", - "| 123AbcDef | | 1 |", - "| 123AbcDef | 100 | 1 |", - "+-----------+-----+---------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------+-----+---------------+ + | a | b | count(test.a) | + +-----------+-----+---------------+ + | | 100 | 1 | + | | 10 | 2 | + | | 1 | 1 | + | abcDEF | | 1 | + | abcDEF | 1 | 1 | + | abc123 | | 1 | + | abc123 | 10 | 1 | + | CBAdef | | 1 | + | CBAdef | 10 | 1 | + | 123AbcDef | | 1 | + | 123AbcDef | 100 | 1 | + +-----------+-----+---------------+ + "### + ); Ok(()) } @@ -3264,23 +3321,25 @@ async fn test_grouping_sets_count() -> Result<()> { let results = df.collect().await?; - let expected = vec![ - "+----+----+-----------------+", - "| c1 | c2 | count(Int32(1)) |", - "+----+----+-----------------+", - "| | 5 | 14 |", - "| | 4 | 23 |", - "| | 3 | 19 |", - "| | 2 | 22 |", - "| | 1 | 22 |", - "| e | | 21 |", - "| d | | 18 |", - "| c | | 21 |", - "| b | | 19 |", - "| a | | 21 |", - "+----+----+-----------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +----+----+-----------------+ + | c1 | c2 | count(Int32(1)) | + +----+----+-----------------+ + | | 5 | 14 | + | | 4 | 23 | + | | 3 | 19 | + | | 2 | 22 | + | | 1 | 22 | + | e | | 21 | + | d | | 18 | + | c | | 21 | + | b | | 19 | + | a | | 21 | + +----+----+-----------------+ + "### + ); Ok(()) } @@ -3311,48 +3370,50 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> { let results = df.collect().await?; - let expected = vec![ - "+----+----+--------+---------------------+", - "| c1 | c2 | sum_c3 | avg_c3 |", - "+----+----+--------+---------------------+", - "| | 5 | -194 | -13.857142857142858 |", - "| | 4 | 29 | 1.2608695652173914 |", - "| | 3 | 395 | 20.789473684210527 |", - "| | 2 | 184 | 8.363636363636363 |", - "| | 1 | 367 | 16.681818181818183 |", - "| e | | 847 | 40.333333333333336 |", - "| e | 5 | -22 | -11.0 |", - "| e | 4 | 261 | 37.285714285714285 |", - "| e | 3 | 192 | 48.0 |", - "| e | 2 | 189 | 37.8 |", - "| e | 1 | 227 | 75.66666666666667 |", - "| d | | 458 | 25.444444444444443 |", - "| d | 5 | -99 | -49.5 |", - "| d | 4 | 162 | 54.0 |", - "| d | 3 | 124 | 41.333333333333336 |", - "| d | 2 | 328 | 109.33333333333333 |", - "| d | 1 | -57 | -8.142857142857142 |", - "| c | | -28 | -1.3333333333333333 |", - "| c | 5 | 24 | 12.0 |", - "| c | 4 | -43 | -10.75 |", - "| c | 3 | 190 | 47.5 |", - "| c | 2 | -389 | -55.57142857142857 |", - "| c | 1 | 190 | 47.5 |", - "| b | | -111 | -5.842105263157895 |", - "| b | 5 | -1 | -0.2 |", - "| b | 4 | -223 | -44.6 |", - "| b | 3 | -84 | -42.0 |", - "| b | 2 | 102 | 25.5 |", - "| b | 1 | 95 | 31.666666666666668 |", - "| a | | -385 | -18.333333333333332 |", - "| a | 5 | -96 | -32.0 |", - "| a | 4 | -128 | -32.0 |", - "| a | 3 | -27 | -4.5 |", - "| a | 2 | -46 | -15.333333333333334 |", - "| a | 1 | -88 | -17.6 |", - "+----+----+--------+---------------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +----+----+--------+---------------------+ + | 
c1 | c2 | sum_c3 | avg_c3 | + +----+----+--------+---------------------+ + | | 5 | -194 | -13.857142857142858 | + | | 4 | 29 | 1.2608695652173914 | + | | 3 | 395 | 20.789473684210527 | + | | 2 | 184 | 8.363636363636363 | + | | 1 | 367 | 16.681818181818183 | + | e | | 847 | 40.333333333333336 | + | e | 5 | -22 | -11.0 | + | e | 4 | 261 | 37.285714285714285 | + | e | 3 | 192 | 48.0 | + | e | 2 | 189 | 37.8 | + | e | 1 | 227 | 75.66666666666667 | + | d | | 458 | 25.444444444444443 | + | d | 5 | -99 | -49.5 | + | d | 4 | 162 | 54.0 | + | d | 3 | 124 | 41.333333333333336 | + | d | 2 | 328 | 109.33333333333333 | + | d | 1 | -57 | -8.142857142857142 | + | c | | -28 | -1.3333333333333333 | + | c | 5 | 24 | 12.0 | + | c | 4 | -43 | -10.75 | + | c | 3 | 190 | 47.5 | + | c | 2 | -389 | -55.57142857142857 | + | c | 1 | 190 | 47.5 | + | b | | -111 | -5.842105263157895 | + | b | 5 | -1 | -0.2 | + | b | 4 | -223 | -44.6 | + | b | 3 | -84 | -42.0 | + | b | 2 | 102 | 25.5 | + | b | 1 | 95 | 31.666666666666668 | + | a | | -385 | -18.333333333333332 | + | a | 5 | -96 | -32.0 | + | a | 4 | -128 | -32.0 | + | a | 3 | -27 | -4.5 | + | a | 2 | -46 | -15.333333333333334 | + | a | 1 | -88 | -17.6 | + +----+----+--------+---------------------+ + "### + ); Ok(()) } @@ -3384,31 +3445,30 @@ async fn join_with_alias_filter() -> Result<()> { ])?; let optimized_plan = df.clone().into_optimized_plan()?; - let expected = vec![ - "Projection: t1.a, t2.a, t1.b, t1.c, t2.b, t2.c [a:UInt32, a:UInt32, b:Utf8, c:Int32, b:Utf8, c:Int32]", - " Inner Join: t1.a + UInt32(3) = t2.a + UInt32(1) [a:UInt32, b:Utf8, c:Int32, a:UInt32, b:Utf8, c:Int32]", - " TableScan: t1 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", - " TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", - ]; - let formatted = optimized_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r###" + Projection: t1.a, t2.a, t1.b, t1.c, t2.b, t2.c [a:UInt32, a:UInt32, b:Utf8, c:Int32, b:Utf8, c:Int32] + Inner Join: t1.a + UInt32(3) = t2.a + UInt32(1) [a:UInt32, b:Utf8, c:Int32, a:UInt32, b:Utf8, c:Int32] + TableScan: t1 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] + TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] + "### ); let results = df.collect().await?; - let expected: Vec<&str> = vec![ - "+----+----+---+----+---+---+", - "| a | a | b | c | b | c |", - "+----+----+---+----+---+---+", - "| 11 | 13 | c | 30 | c | 3 |", - "| 1 | 3 | a | 10 | a | 1 |", - "+----+----+---+----+---+---+", - ]; - - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+----+---+----+---+---+ + | a | a | b | c | b | c | + +----+----+---+----+---+---+ + | 1 | 3 | a | 10 | a | 1 | + | 11 | 13 | c | 30 | c | 3 | + +----+----+---+----+---+---+ + "### + ); Ok(()) } @@ -3429,32 +3489,34 @@ async fn right_semi_with_alias_filter() -> Result<()> { .join(t2, JoinType::RightSemi, &[], &[], Some(filter))? 
.select(vec![col("t2.a"), col("t2.b"), col("t2.c")])?; let optimized_plan = df.clone().into_optimized_plan()?; - let expected = vec![ - "RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32]", - " Projection: t1.a [a:UInt32]", - " Filter: t1.c > Int32(1) [a:UInt32, c:Int32]", - " TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]", - " Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]", - " TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", - ]; let formatted = optimized_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r###" + RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32] + Projection: t1.a [a:UInt32] + Filter: t1.c > Int32(1) [a:UInt32, c:Int32] + TableScan: t1 projection=[a, c] [a:UInt32, c:Int32] + Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32] + TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] + "### ); let results = df.collect().await?; - let expected: Vec<&str> = vec![ - "+-----+---+---+", - "| a | b | c |", - "+-----+---+---+", - "| 10 | b | 2 |", - "| 100 | d | 4 |", - "+-----+---+---+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +-----+---+---+ + | a | b | c | + +-----+---+---+ + | 10 | b | 2 | + | 100 | d | 4 | + +-----+---+---+ + "### + ); + Ok(()) } @@ -3474,31 +3536,33 @@ async fn right_anti_filter_push_down() -> Result<()> { .join(t2, JoinType::RightAnti, &[], &[], Some(filter))? .select(vec![col("t2.a"), col("t2.b"), col("t2.c")])?; let optimized_plan = df.clone().into_optimized_plan()?; - let expected = vec![ - "RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]", - " Projection: t1.a [a:UInt32]", - " Filter: t1.c > Int32(1) [a:UInt32, c:Int32]", - " TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]", - " TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]", - ]; let formatted = optimized_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r###" + RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32] + Projection: t1.a [a:UInt32] + Filter: t1.c > Int32(1) [a:UInt32, c:Int32] + TableScan: t1 projection=[a, c] [a:UInt32, c:Int32] + TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32] + "### ); let results = df.collect().await?; - let expected: Vec<&str> = vec![ - "+----+---+---+", - "| a | b | c |", - "+----+---+---+", - "| 13 | c | 3 |", - "| 3 | a | 1 |", - "+----+---+---+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+---+---+ + | a | b | c | + +----+---+---+ + | 13 | c | 3 | + | 3 | a | 1 | + +----+---+---+ + "### + ); + Ok(()) } @@ -3507,33 +3571,39 @@ async fn unnest_columns() -> Result<()> { const NUM_ROWS: usize = 4; let df = table_with_nested_types(NUM_ROWS).await?; let results = df.collect().await?; - let expected = ["+----------+------------------------------------------------+--------------------+", - "| shape_id | points | tags |", - "+----------+------------------------------------------------+--------------------+", - "| 1 | [{x: -3, y: -4}, {x: -3, y: 6}, 
{x: 2, y: -2}] | [tag1] |", - "| 2 | | [tag1, tag2] |", - "| 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | [tag1, tag2, tag3] |", - "+----------+------------------------------------------------+--------------------+"]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+------------------------------------------------+--------------------+ + | shape_id | points | tags | + +----------+------------------------------------------------+--------------------+ + | 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | [tag1] | + | 2 | | [tag1, tag2] | + | 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | [tag1, tag2, tag3] | + +----------+------------------------------------------------+--------------------+ + "### + ); // Unnest tags let df = table_with_nested_types(NUM_ROWS).await?; let results = df.unnest_columns(&["tags"])?.collect().await?; - let expected = [ - "+----------+------------------------------------------------+------+", - "| shape_id | points | tags |", - "+----------+------------------------------------------------+------+", - "| 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | tag1 |", - "| 2 | | tag1 |", - "| 2 | | tag2 |", - "| 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag1 |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag2 |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag3 |", - "+----------+------------------------------------------------+------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+------------------------------------------------+------+ + | shape_id | points | tags | + +----------+------------------------------------------------+------+ + | 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | tag1 | + | 2 | | tag1 | + | 2 | | tag2 | + | 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag1 | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag2 | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag3 | + +----------+------------------------------------------------+------+ + "### + ); // Test aggregate results for tags. 
let df = table_with_nested_types(NUM_ROWS).await?; @@ -3543,21 +3613,23 @@ async fn unnest_columns() -> Result<()> { // Unnest points let df = table_with_nested_types(NUM_ROWS).await?; let results = df.unnest_columns(&["points"])?.collect().await?; - let expected = [ - "+----------+-----------------+--------------------+", - "| shape_id | points | tags |", - "+----------+-----------------+--------------------+", - "| 1 | {x: -3, y: -4} | [tag1] |", - "| 1 | {x: -3, y: 6} | [tag1] |", - "| 1 | {x: 2, y: -2} | [tag1] |", - "| 2 | | [tag1, tag2] |", - "| 3 | {x: -10, y: -4} | |", - "| 3 | {x: -9, y: 2} | |", - "| 4 | {x: -3, y: 5} | [tag1, tag2, tag3] |", - "| 4 | {x: 2, y: -1} | [tag1, tag2, tag3] |", - "+----------+-----------------+--------------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-----------------+--------------------+ + | shape_id | points | tags | + +----------+-----------------+--------------------+ + | 1 | {x: -3, y: -4} | [tag1] | + | 1 | {x: -3, y: 6} | [tag1] | + | 1 | {x: 2, y: -2} | [tag1] | + | 2 | | [tag1, tag2] | + | 3 | {x: -10, y: -4} | | + | 3 | {x: -9, y: 2} | | + | 4 | {x: -3, y: 5} | [tag1, tag2, tag3] | + | 4 | {x: 2, y: -1} | [tag1, tag2, tag3] | + +----------+-----------------+--------------------+ + "### + ); // Test aggregate results for points. let df = table_with_nested_types(NUM_ROWS).await?; @@ -3571,26 +3643,28 @@ async fn unnest_columns() -> Result<()> { .unnest_columns(&["tags"])? .collect() .await?; - let expected = vec![ - "+----------+-----------------+------+", - "| shape_id | points | tags |", - "+----------+-----------------+------+", - "| 1 | {x: -3, y: -4} | tag1 |", - "| 1 | {x: -3, y: 6} | tag1 |", - "| 1 | {x: 2, y: -2} | tag1 |", - "| 2 | | tag1 |", - "| 2 | | tag2 |", - "| 3 | {x: -10, y: -4} | |", - "| 3 | {x: -9, y: 2} | |", - "| 4 | {x: -3, y: 5} | tag1 |", - "| 4 | {x: -3, y: 5} | tag2 |", - "| 4 | {x: -3, y: 5} | tag3 |", - "| 4 | {x: 2, y: -1} | tag1 |", - "| 4 | {x: 2, y: -1} | tag2 |", - "| 4 | {x: 2, y: -1} | tag3 |", - "+----------+-----------------+------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-----------------+------+ + | shape_id | points | tags | + +----------+-----------------+------+ + | 1 | {x: -3, y: -4} | tag1 | + | 1 | {x: -3, y: 6} | tag1 | + | 1 | {x: 2, y: -2} | tag1 | + | 2 | | tag1 | + | 2 | | tag2 | + | 3 | {x: -10, y: -4} | | + | 3 | {x: -9, y: 2} | | + | 4 | {x: -3, y: 5} | tag1 | + | 4 | {x: -3, y: 5} | tag2 | + | 4 | {x: -3, y: 5} | tag3 | + | 4 | {x: 2, y: -1} | tag1 | + | 4 | {x: 2, y: -1} | tag2 | + | 4 | {x: 2, y: -1} | tag3 | + +----------+-----------------+------+ + "### + ); // Test aggregate results for points and tags. 
let df = table_with_nested_types(NUM_ROWS).await?; @@ -3628,16 +3702,18 @@ async fn unnest_dict_encoded_columns() -> Result<()> { .unnest_columns(&["make_array_expr"])?; let results = df.collect().await.unwrap(); - let expected = [ - "+-----------------+---------+", - "| make_array_expr | column1 |", - "+-----------------+---------+", - "| x | x |", - "| y | y |", - "| z | z |", - "+-----------------+---------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------------+---------+ + | make_array_expr | column1 | + +-----------------+---------+ + | x | x | + | y | y | + | z | z | + +-----------------+---------+ + "### + ); // make_array(dict_encoded_string,literal string) let make_array_udf_expr2 = make_array_udf().call(vec![ @@ -3654,19 +3730,21 @@ async fn unnest_dict_encoded_columns() -> Result<()> { .unnest_columns(&["make_array_expr"])?; let results = df.collect().await.unwrap(); - let expected = [ - "+-----------------+---------+", - "| make_array_expr | column1 |", - "+-----------------+---------+", - "| x | x |", - "| fixed_string | x |", - "| y | y |", - "| fixed_string | y |", - "| z | z |", - "| fixed_string | z |", - "+-----------------+---------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------------+---------+ + | make_array_expr | column1 | + +-----------------+---------+ + | x | x | + | fixed_string | x | + | y | y | + | fixed_string | y | + | z | z | + | fixed_string | z | + +-----------------+---------+ + "### + ); Ok(()) } @@ -3674,17 +3752,19 @@ async fn unnest_dict_encoded_columns() -> Result<()> { async fn unnest_column_nulls() -> Result<()> { let df = table_with_lists_and_nulls().await?; let results = df.clone().collect().await?; - let expected = [ - "+--------+----+", - "| list | id |", - "+--------+----+", - "| [1, 2] | A |", - "| | B |", - "| [] | C |", - "| [3] | D |", - "+--------+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +--------+----+ + | list | id | + +--------+----+ + | [1, 2] | A | + | | B | + | [] | C | + | [3] | D | + +--------+----+ + "### + ); // Unnest, preserving nulls (row with B is preserved) let options = UnnestOptions::new().with_preserve_nulls(true); @@ -3694,33 +3774,37 @@ async fn unnest_column_nulls() -> Result<()> { .unnest_columns_with_options(&["list"], options)? .collect() .await?; - let expected = [ - "+------+----+", - "| list | id |", - "+------+----+", - "| 1 | A |", - "| 2 | A |", - "| | B |", - "| 3 | D |", - "+------+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+----+ + | list | id | + +------+----+ + | 1 | A | + | 2 | A | + | | B | + | 3 | D | + +------+----+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(false); let results = df .unnest_columns_with_options(&["list"], options)? 
.collect() .await?; - let expected = [ - "+------+----+", - "| list | id |", - "+------+----+", - "| 1 | A |", - "| 2 | A |", - "| 3 | D |", - "+------+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+----+ + | list | id | + +------+----+ + | 1 | A | + | 2 | A | + | 3 | D | + +------+----+ + "### + ); Ok(()) } @@ -3734,19 +3818,21 @@ async fn unnest_fixed_list() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = [ - "+----------+----------------+", - "| shape_id | tags |", - "+----------+----------------+", - "| 1 | |", - "| 2 | [tag21, tag22] |", - "| 3 | [tag31, tag32] |", - "| 4 | |", - "| 5 | [tag51, tag52] |", - "| 6 | [tag61, tag62] |", - "+----------+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+----------------+ + | shape_id | tags | + +----------+----------------+ + | 1 | | + | 2 | [tag21, tag22] | + | 3 | [tag31, tag32] | + | 4 | | + | 5 | [tag51, tag52] | + | 6 | [tag61, tag62] | + +----------+----------------+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(true); @@ -3754,23 +3840,25 @@ async fn unnest_fixed_list() -> Result<()> { .unnest_columns_with_options(&["tags"], options)? .collect() .await?; - let expected = vec![ - "+----------+-------+", - "| shape_id | tags |", - "+----------+-------+", - "| 1 | |", - "| 2 | tag21 |", - "| 2 | tag22 |", - "| 3 | tag31 |", - "| 3 | tag32 |", - "| 4 | |", - "| 5 | tag51 |", - "| 5 | tag52 |", - "| 6 | tag61 |", - "| 6 | tag62 |", - "+----------+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-------+ + | shape_id | tags | + +----------+-------+ + | 1 | | + | 2 | tag21 | + | 2 | tag22 | + | 3 | tag31 | + | 3 | tag32 | + | 4 | | + | 5 | tag51 | + | 5 | tag52 | + | 6 | tag61 | + | 6 | tag62 | + +----------+-------+ + "### + ); Ok(()) } @@ -3784,19 +3872,21 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = [ - "+----------+----------------+", - "| shape_id | tags |", - "+----------+----------------+", - "| 1 | |", - "| 2 | [tag21, tag22] |", - "| 3 | [tag31, tag32] |", - "| 4 | |", - "| 5 | [tag51, tag52] |", - "| 6 | [tag61, tag62] |", - "+----------+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+----------------+ + | shape_id | tags | + +----------+----------------+ + | 1 | | + | 2 | [tag21, tag22] | + | 3 | [tag31, tag32] | + | 4 | | + | 5 | [tag51, tag52] | + | 6 | [tag61, tag62] | + +----------+----------------+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(false); @@ -3804,21 +3894,23 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { .unnest_columns_with_options(&["tags"], options)? 
.collect() .await?; - let expected = [ - "+----------+-------+", - "| shape_id | tags |", - "+----------+-------+", - "| 2 | tag21 |", - "| 2 | tag22 |", - "| 3 | tag31 |", - "| 3 | tag32 |", - "| 5 | tag51 |", - "| 5 | tag52 |", - "| 6 | tag61 |", - "| 6 | tag62 |", - "+----------+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-------+ + | shape_id | tags | + +----------+-------+ + | 2 | tag21 | + | 2 | tag22 | + | 3 | tag31 | + | 3 | tag32 | + | 5 | tag51 | + | 5 | tag52 | + | 6 | tag61 | + | 6 | tag62 | + +----------+-------+ + "### + ); Ok(()) } @@ -3851,44 +3943,48 @@ async fn unnest_fixed_list_non_null() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = [ - "+----------+----------------+", - "| shape_id | tags |", - "+----------+----------------+", - "| 1 | [tag11, tag12] |", - "| 2 | [tag21, tag22] |", - "| 3 | [tag31, tag32] |", - "| 4 | [tag41, tag42] |", - "| 5 | [tag51, tag52] |", - "| 6 | [tag61, tag62] |", - "+----------+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+----------------+ + | shape_id | tags | + +----------+----------------+ + | 1 | [tag11, tag12] | + | 2 | [tag21, tag22] | + | 3 | [tag31, tag32] | + | 4 | [tag41, tag42] | + | 5 | [tag51, tag52] | + | 6 | [tag61, tag62] | + +----------+----------------+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(true); let results = df .unnest_columns_with_options(&["tags"], options)? .collect() .await?; - let expected = vec![ - "+----------+-------+", - "| shape_id | tags |", - "+----------+-------+", - "| 1 | tag11 |", - "| 1 | tag12 |", - "| 2 | tag21 |", - "| 2 | tag22 |", - "| 3 | tag31 |", - "| 3 | tag32 |", - "| 4 | tag41 |", - "| 4 | tag42 |", - "| 5 | tag51 |", - "| 5 | tag52 |", - "| 6 | tag61 |", - "| 6 | tag62 |", - "+----------+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-------+ + | shape_id | tags | + +----------+-------+ + | 1 | tag11 | + | 1 | tag12 | + | 2 | tag21 | + | 2 | tag22 | + | 3 | tag31 | + | 3 | tag32 | + | 4 | tag41 | + | 4 | tag42 | + | 5 | tag51 | + | 5 | tag52 | + | 6 | tag61 | + | 6 | tag62 | + +----------+-------+ + "### + ); Ok(()) } @@ -3899,18 +3995,20 @@ async fn unnest_aggregate_columns() -> Result<()> { let df = table_with_nested_types(NUM_ROWS).await?; let results = df.select_columns(&["tags"])?.collect().await?; - let expected = [ - r#"+--------------------+"#, - r#"| tags |"#, - r#"+--------------------+"#, - r#"| [tag1] |"#, - r#"| [tag1, tag2] |"#, - r#"| |"#, - r#"| [tag1, tag2, tag3] |"#, - r#"| [tag1, tag2, tag3] |"#, - r#"+--------------------+"#, - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +--------------------+ + | tags | + +--------------------+ + | | + | [tag1, tag2, tag3] | + | [tag1, tag2, tag3] | + | [tag1, tag2] | + | [tag1] | + +--------------------+ + "### + ); let df = table_with_nested_types(NUM_ROWS).await?; let results = df @@ -3918,14 +4016,16 @@ async fn unnest_aggregate_columns() -> Result<()> { .aggregate(vec![], vec![count(col("tags"))])? 
.collect() .await?; - let expected = [ - r#"+-------------+"#, - r#"| count(tags) |"#, - r#"+-------------+"#, - r#"| 9 |"#, - r#"+-------------+"#, - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +-------------+ + | count(tags) | + +-------------+ + | 9 | + +-------------+ + "### + ); Ok(()) } @@ -3995,22 +4095,24 @@ async fn unnest_array_agg() -> Result<()> { assert!(rb.num_rows() > 0); } - let expected = vec![ - "+----------+--------+", - "| shape_id | tag_id |", - "+----------+--------+", - "| 1 | 11 |", - "| 1 | 12 |", - "| 1 | 13 |", - "| 2 | 21 |", - "| 2 | 22 |", - "| 2 | 23 |", - "| 3 | 31 |", - "| 3 | 32 |", - "| 3 | 33 |", - "+----------+--------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------+ + | shape_id | tag_id | + +----------+--------+ + | 1 | 11 | + | 1 | 12 | + | 1 | 13 | + | 2 | 21 | + | 2 | 22 | + | 2 | 23 | + | 3 | 31 | + | 3 | 32 | + | 3 | 33 | + +----------+--------+ + "### + ); // Doing an `array_agg` by `shape_id` produces: let results = df @@ -4021,16 +4123,18 @@ async fn unnest_array_agg() -> Result<()> { )? .collect() .await?; - let expected = [ - "+----------+--------------+", - "| shape_id | tag_id |", - "+----------+--------------+", - "| 1 | [11, 12, 13] |", - "| 2 | [21, 22, 23] |", - "| 3 | [31, 32, 33] |", - "+----------+--------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------------+ + | shape_id | tag_id | + +----------+--------------+ + | 1 | [11, 12, 13] | + | 2 | [21, 22, 23] | + | 3 | [31, 32, 33] | + +----------+--------------+ + "### + ); // Unnesting again should produce the original batch. let results = ctx @@ -4043,22 +4147,24 @@ async fn unnest_array_agg() -> Result<()> { .unnest_columns(&["tag_id"])? 
.collect() .await?; - let expected = vec![ - "+----------+--------+", - "| shape_id | tag_id |", - "+----------+--------+", - "| 1 | 11 |", - "| 1 | 12 |", - "| 1 | 13 |", - "| 2 | 21 |", - "| 2 | 22 |", - "| 2 | 23 |", - "| 3 | 31 |", - "| 3 | 32 |", - "| 3 | 33 |", - "+----------+--------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------+ + | shape_id | tag_id | + +----------+--------+ + | 1 | 11 | + | 1 | 12 | + | 1 | 13 | + | 2 | 21 | + | 2 | 22 | + | 2 | 23 | + | 3 | 31 | + | 3 | 32 | + | 3 | 33 | + +----------+--------+ + "### + ); Ok(()) } @@ -4085,22 +4191,24 @@ async fn unnest_with_redundant_columns() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = vec![ - "+----------+--------+", - "| shape_id | tag_id |", - "+----------+--------+", - "| 1 | 11 |", - "| 1 | 12 |", - "| 1 | 13 |", - "| 2 | 21 |", - "| 2 | 22 |", - "| 2 | 23 |", - "| 3 | 31 |", - "| 3 | 32 |", - "| 3 | 33 |", - "+----------+--------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------+ + | shape_id | tag_id | + +----------+--------+ + | 1 | 11 | + | 1 | 12 | + | 1 | 13 | + | 2 | 21 | + | 2 | 22 | + | 2 | 23 | + | 3 | 31 | + | 3 | 32 | + | 3 | 33 | + +----------+--------+ + "### + ); // Doing an `array_agg` by `shape_id` produces: let df = df @@ -4113,37 +4221,38 @@ async fn unnest_with_redundant_columns() -> Result<()> { .select(vec![col("shape_id")])?; let optimized_plan = df.clone().into_optimized_plan()?; - let expected = vec![ - "Projection: shapes.shape_id [shape_id:UInt32]", - " Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N]", - " Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: \"item\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N]", - " TableScan: shapes projection=[shape_id] [shape_id:UInt32]", - ]; let formatted = optimized_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + assert_snapshot!( + actual, + @r###" + Projection: shapes.shape_id [shape_id:UInt32] + Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N] + Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + TableScan: shapes projection=[shape_id] [shape_id:UInt32] + "### ); let results = df.collect().await?; - let expected = [ - "+----------+", - "| shape_id |", - "+----------+", - "| 1 |", - "| 1 |", - "| 1 |", - "| 2 |", - "| 2 |", - "| 2 |", - "| 3 |", - "| 3 |", - "| 3 |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+ + | shape_id | + +----------+ + | 1 | + | 1 | + | 1 | + | 2 | + | 2 | + | 2 | + | 3 | + | 3 | + | 3 | + +----------+ + "### + ); Ok(()) } @@ -4181,22 +4290,24 @@ async fn unnest_multiple_columns() -> Result<()> { // large_list: [null, 1.1], [2.2, 3.3, 4.4], null, [], // fixed_list: null, [1,2], [3,4], null // string: a, b, c, d 
- let expected = [ - "+------+------------+------------+--------+", - "| list | large_list | fixed_list | string |", - "+------+------------+------------+--------+", - "| 1 | | | a |", - "| 2 | 1.1 | | a |", - "| 3 | | | a |", - "| | 2.2 | 1 | b |", - "| | 3.3 | 2 | b |", - "| | 4.4 | | b |", - "| | | 3 | c |", - "| | | 4 | c |", - "| | | | d |", - "+------+------------+------------+--------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+------------+------------+--------+ + | list | large_list | fixed_list | string | + +------+------------+------------+--------+ + | 1 | | | a | + | 2 | 1.1 | | a | + | 3 | | | a | + | | 2.2 | 1 | b | + | | 3.3 | 2 | b | + | | 4.4 | | b | + | | | 3 | c | + | | | 4 | c | + | | | | d | + +------+------------+------------+--------+ + "### + ); // Test with `preserve_nulls = false`` let results = df @@ -4210,21 +4321,23 @@ async fn unnest_multiple_columns() -> Result<()> { // large_list: [null, 1.1], [2.2, 3.3, 4.4], null, [], // fixed_list: null, [1,2], [3,4], null // string: a, b, c, d - let expected = [ - "+------+------------+------------+--------+", - "| list | large_list | fixed_list | string |", - "+------+------------+------------+--------+", - "| 1 | | | a |", - "| 2 | 1.1 | | a |", - "| 3 | | | a |", - "| | 2.2 | 1 | b |", - "| | 3.3 | 2 | b |", - "| | 4.4 | | b |", - "| | | 3 | c |", - "| | | 4 | c |", - "+------+------------+------------+--------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+------------+------------+--------+ + | list | large_list | fixed_list | string | + +------+------------+------------+--------+ + | 1 | | | a | + | 2 | 1.1 | | a | + | 3 | | | a | + | | 2.2 | 1 | b | + | | 3.3 | 2 | b | + | | 4.4 | | b | + | | | 3 | c | + | | | 4 | c | + +------+------------+------------+--------+ + "### + ); Ok(()) } @@ -4250,18 +4363,18 @@ async fn unnest_non_nullable_list() -> Result<()> { .collect() .await?; - // Unnesting may produce NULLs even if the list is non-nullable. 
- #[rustfmt::skip] - let expected = [ - "+----+", - "| c1 |", - "+----+", - "| 1 |", - "| 2 |", - "| |", - "+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +----+ + | c1 | + +----+ + | 1 | + | 2 | + | | + +----+ + "### + ); Ok(()) } @@ -4302,22 +4415,24 @@ async fn test_read_batches() -> Result<()> { ]; let df = ctx.read_batches(batches).unwrap(); df.clone().show().await.unwrap(); - let result = df.collect().await?; - let expected = [ - "+----+--------+", - "| id | number |", - "+----+--------+", - "| 1 | 1.12 |", - "| 2 | 3.4 |", - "| 3 | 2.33 |", - "| 4 | 9.1 |", - "| 5 | 6.66 |", - "| 3 | 1.11 |", - "| 4 | 2.22 |", - "| 5 | 3.33 |", - "+----+--------+", - ]; - assert_batches_sorted_eq!(expected, &result); + let results = df.collect().await?; + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+--------+ + | id | number | + +----+--------+ + | 1 | 1.12 | + | 2 | 3.4 | + | 3 | 1.11 | + | 3 | 2.33 | + | 4 | 2.22 | + | 4 | 9.1 | + | 5 | 3.33 | + | 5 | 6.66 | + +----+--------+ + "### + ); Ok(()) } #[tokio::test] @@ -4334,9 +4449,14 @@ async fn test_read_batches_empty() -> Result<()> { let batches = vec![]; let df = ctx.read_batches(batches).unwrap(); df.clone().show().await.unwrap(); - let result = df.collect().await?; - let expected = ["++", "++"]; - assert_batches_sorted_eq!(expected, &result); + let results = df.collect().await?; + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + ++ + ++ + "### + ); Ok(()) } @@ -4379,15 +4499,17 @@ async fn consecutive_projection_same_schema() -> Result<()> { .unwrap(); let results = df.collect().await?; - let expected = [ - "+----+----+----+", - "| id | t | t2 |", - "+----+----+----+", - "| 0 | | |", - "| 1 | 10 | 10 |", - "+----+----+----+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+----+----+ + | id | t | t2 | + +----+----+----+ + | 0 | | | + | 1 | 10 | 10 | + +----+----+----+ + "### + ); Ok(()) } @@ -4698,14 +4820,16 @@ async fn test_array_agg() -> Result<()> { let results = df.collect().await?; - let expected = [ - "+-------------------------------------+", - "| array_agg(test.a) |", - "+-------------------------------------+", - "| [abcDEF, abc123, CBAdef, 123AbcDef] |", - "+-------------------------------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-------------------------------------+ + | array_agg(test.a) | + +-------------------------------------+ + | [abcDEF, abc123, CBAdef, 123AbcDef] | + +-------------------------------------+ + "### + ); Ok(()) } @@ -4725,24 +4849,24 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> { let logical_plan = df.logical_plan(); let formatted = logical_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - let expected = vec![ - "Filter: a = $0 [a:Int32]", - " Projection: Int32(1) AS a [a:Int32]", - " EmptyRelation []", - ]; - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + + assert_snapshot!( + actual, + @r###" + Filter: a = $0 [a:Int32] + Projection: Int32(1) AS a [a:Int32] + EmptyRelation [] + "### ); // Executing LogicalPlans with placeholders that don't have bound values // should fail. 
let results = df.collect().await; let err_msg = results.unwrap_err().strip_backtrace(); - assert_eq!( + assert_snapshot!( err_msg, - "Execution error: Placeholder '$0' was not provided a value for execution." + @"Execution error: Placeholder '$0' was not provided a value for execution." ); // Providing a parameter value should resolve the error @@ -4758,25 +4882,25 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> { let logical_plan = df.logical_plan(); let formatted = logical_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - let expected = vec![ - "Filter: a = Int32(3) [a:Int32]", - " Projection: Int32(1) AS a [a:Int32]", - " EmptyRelation []", - ]; - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + + assert_snapshot!( + actual, + @r###" + Filter: a = Int32(3) [a:Int32] + Projection: Int32(1) AS a [a:Int32] + EmptyRelation [] + "### ); // N.B., the test is basically `SELECT 1 as a WHERE a = 3;` which returns no results. - #[rustfmt::skip] - let expected = [ - "++", - "++" - ]; - - assert_batches_eq!(expected, &df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.collect().await.unwrap()), + @r###" + ++ + ++ + "### + ); Ok(()) } @@ -4789,26 +4913,23 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> { let df = ctx.read_empty().unwrap().select_exprs(&["$1"]).unwrap(); let logical_plan = df.logical_plan(); let formatted = logical_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - - #[rustfmt::skip] - let expected = vec![ - "Projection: $1 [$1:Null;N]", - " EmptyRelation []" - ]; - - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + + assert_snapshot!( + actual, + @r###" + Projection: $1 [$1:Null;N] + EmptyRelation [] + "### ); // Executing LogicalPlans with placeholders that don't have bound values // should fail. let results = df.collect().await; let err_msg = results.unwrap_err().strip_backtrace(); - assert_eq!( + assert_snapshot!( err_msg, - "Execution error: Placeholder '$1' was not provided a value for execution." + @"Execution error: Placeholder '$1' was not provided a value for execution." 
); // Providing a parameter value should resolve the error @@ -4822,26 +4943,26 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> { let logical_plan = df.logical_plan(); let formatted = logical_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - let expected = vec![ - "Projection: Int32(3) AS $1 [$1:Null;N]", - " EmptyRelation []", - ]; - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + + assert_snapshot!( + actual, + @r###" + Projection: Int32(3) AS $1 [$1:Null;N] + EmptyRelation [] + "### ); - #[rustfmt::skip] - let expected = [ - "+----+", - "| $1 |", - "+----+", - "| 3 |", - "+----+" - ]; - - assert_batches_eq!(expected, &df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.collect().await.unwrap()), + @r###" + +----+ + | $1 | + +----+ + | 3 | + +----+ + "### + ); Ok(()) } @@ -4861,24 +4982,24 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> { let logical_plan = df.logical_plan(); let formatted = logical_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - let expected = vec![ - "Filter: a LIKE $1 [a:Utf8]", - " Projection: Utf8(\"foo\") AS a [a:Utf8]", - " EmptyRelation []", - ]; - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + + assert_snapshot!( + actual, + @r###" + Filter: a LIKE $1 [a:Utf8] + Projection: Utf8("foo") AS a [a:Utf8] + EmptyRelation [] + "### ); // Executing LogicalPlans with placeholders that don't have bound values // should fail. let results = df.collect().await; let err_msg = results.unwrap_err().strip_backtrace(); - assert_eq!( + assert_snapshot!( err_msg, - "Execution error: Placeholder '$1' was not provided a value for execution." + @"Execution error: Placeholder '$1' was not provided a value for execution." ); // Providing a parameter value should resolve the error @@ -4894,27 +5015,27 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> { let logical_plan = df.logical_plan(); let formatted = logical_plan.display_indent_schema().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - let expected = vec![ - "Filter: a LIKE Utf8(\"f%\") [a:Utf8]", - " Projection: Utf8(\"foo\") AS a [a:Utf8]", - " EmptyRelation []", - ]; - assert_eq!( - expected, actual, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + let actual = formatted.trim(); + + assert_snapshot!( + actual, + @r###" + Filter: a LIKE Utf8("f%") [a:Utf8] + Projection: Utf8("foo") AS a [a:Utf8] + EmptyRelation [] + "### ); - #[rustfmt::skip] - let expected = [ - "+-----+", - "| a |", - "+-----+", - "| foo |", - "+-----+" - ]; - - assert_batches_eq!(expected, &df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.collect().await.unwrap()), + @r###" + +-----+ + | a | + +-----+ + | foo | + +-----+ + "### + ); Ok(()) } @@ -4969,9 +5090,16 @@ async fn write_partitioned_parquet_results() -> Result<()> { // Check that the c2 column is gone and that c1 is abc. 
let results = filter_df.collect().await?; - let expected = ["+-----+", "| c1 |", "+-----+", "| abc |", "+-----+"]; - - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----+ + | c1 | + +-----+ + | abc | + +-----+ + "### + ); // Read the entire set of parquet files let df = ctx @@ -4984,16 +5112,17 @@ async fn write_partitioned_parquet_results() -> Result<()> { // Check that the df has the entire set of data let results = df.collect().await?; - let expected = [ - "+-----+-----+", - "| c1 | c2 |", - "+-----+-----+", - "| abc | 123 |", - "| def | 456 |", - "+-----+-----+", - ]; - - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +-----+-----+ + | c1 | c2 | + +-----+-----+ + | abc | 123 | + | def | 456 | + +-----+-----+ + "### + ); Ok(()) } @@ -5113,45 +5242,51 @@ async fn sparse_union_is_null() { let df = ctx.table("union_batch").await.unwrap(); // view_all - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {A=} |", - "| {B=3.2} |", - "| {B=} |", - "| {C=a} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&df.clone().collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {A=} | + | {B=3.2} | + | {B=} | + | {C=a} | + | {C=} | + +----------+ + "### + ); // filter where is null let result_df = df.clone().filter(col("my_union").is_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=} |", - "| {B=} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=} | + | {B=} | + | {C=} | + +----------+ + "### + ); // filter where is not null let result_df = df.filter(col("my_union").is_not_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {B=3.2} |", - "| {C=a} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {B=3.2} | + | {C=a} | + +----------+ + "### + ); } #[tokio::test] @@ -5190,45 +5325,51 @@ async fn dense_union_is_null() { let df = ctx.table("union_batch").await.unwrap(); // view_all - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {A=} |", - "| {B=3.2} |", - "| {B=} |", - "| {C=a} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&df.clone().collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {A=} | + | {B=3.2} | + | {B=} | + | {C=a} | + | {C=} | + +----------+ + "### + ); // filter where is null let result_df = df.clone().filter(col("my_union").is_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=} |", - "| {B=} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + 
+----------+ + | my_union | + +----------+ + | {A=} | + | {B=} | + | {C=} | + +----------+ + "### + ); // filter where is not null let result_df = df.filter(col("my_union").is_not_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {B=3.2} |", - "| {C=a} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {B=3.2} | + | {C=a} | + +----------+ + "### + ); } #[tokio::test] @@ -5257,32 +5398,37 @@ async fn boolean_dictionary_as_filter() { let df = ctx.table("dict_batch").await.unwrap(); // view_all - let expected = [ - "+---------+", - "| my_dict |", - "+---------+", - "| true |", - "| true |", - "| false |", - "| |", - "| false |", - "| true |", - "| false |", - "+---------+", - ]; - assert_batches_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.clone().collect().await.unwrap()), + @r###" + +---------+ + | my_dict | + +---------+ + | true | + | true | + | false | + | | + | false | + | true | + | false | + +---------+ + "### + ); let result_df = df.clone().filter(col("my_dict")).unwrap(); - let expected = [ - "+---------+", - "| my_dict |", - "+---------+", - "| true |", - "| true |", - "| true |", - "+---------+", - ]; - assert_batches_eq!(expected, &result_df.collect().await.unwrap()); + + assert_snapshot!( + batches_to_string(&result_df.collect().await.unwrap()), + @r###" + +---------+ + | my_dict | + +---------+ + | true | + | true | + | true | + +---------+ + "### + ); // test nested dictionary let keys = vec![0, 2]; // 0 -> true, 2 -> false @@ -5310,27 +5456,29 @@ async fn boolean_dictionary_as_filter() { let df = ctx.table("nested_dict_batch").await.unwrap(); // view_all - let expected = [ - "+----------------+", - "| my_nested_dict |", - "+----------------+", - "| true |", - "| false |", - "+----------------+", - ]; - - assert_batches_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.clone().collect().await.unwrap()), + @r###" + +----------------+ + | my_nested_dict | + +----------------+ + | true | + | false | + +----------------+ + "### + ); let result_df = df.clone().filter(col("my_nested_dict")).unwrap(); - let expected = [ - "+----------------+", - "| my_nested_dict |", - "+----------------+", - "| true |", - "+----------------+", - ]; - - assert_batches_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&result_df.collect().await.unwrap()), + @r###" + +----------------+ + | my_nested_dict | + +----------------+ + | true | + +----------------+ + "### + ); } #[tokio::test] @@ -5343,32 +5491,51 @@ async fn test_alias() -> Result<()> { df.schema().columns().iter().for_each(|c| { assert_eq!(c.relation, Some("table_alias".into())); }); - let expected = "SubqueryAlias: table_alias [a:Utf8, b:Int32, one:Int32]\ - \n Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]\ - \n TableScan: test [a:Utf8, b:Int32]"; + let plan = df .clone() .into_unoptimized_plan() .display_indent_schema() .to_string(); - assert_eq!(plan, expected); + assert_snapshot!(plan, @r###" + SubqueryAlias: table_alias [a:Utf8, b:Int32, one:Int32] + Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32] + TableScan: test [a:Utf8, b:Int32] + "###); // Select over the aliased DataFrame let df 
= df.select(vec![ col("table_alias.a"), col("b") + col("table_alias.one"), ])?; - let expected = [ - "+-----------+---------------------------------+", - "| a | table_alias.b + table_alias.one |", - "+-----------+---------------------------------+", - "| abcDEF | 2 |", - "| abc123 | 11 |", - "| CBAdef | 11 |", - "| 123AbcDef | 101 |", - "+-----------+---------------------------------+", - ]; - assert_batches_sorted_eq!(expected, &df.collect().await?); + assert_snapshot!( + batches_to_sort_string(&df.collect().await.unwrap()), + @r###" + +-----------+---------------------------------+ + | a | table_alias.b + table_alias.one | + +-----------+---------------------------------+ + | 123AbcDef | 101 | + | CBAdef | 11 | + | abc123 | 11 | + | abcDEF | 2 | + +-----------+---------------------------------+ + "### + ); + Ok(()) +} + +#[tokio::test] +async fn test_alias_with_metadata() -> Result<()> { + let mut metadata = HashMap::new(); + metadata.insert(String::from("k"), String::from("v")); + let df = create_test_table("test") + .await? + .select(vec![col("a").alias_with_metadata("b", Some(metadata))])? + .alias("table_alias")?; + let df = df.select(vec![col("table_alias.b")])?; + let schema = df.schema(); + let metadata = schema.field(0).metadata(); + assert_eq!(metadata.get("k"), Some(&String::from("v"))); Ok(()) } @@ -5379,45 +5546,49 @@ async fn test_alias_self_join() -> Result<()> { let left = create_test_table("t1").await?; let right = left.clone().alias("t2")?; let joined = left.join(right, JoinType::Full, &["a"], &["a"], None)?; - let expected = [ - "+-----------+-----+-----------+-----+", - "| a | b | a | b |", - "+-----------+-----+-----------+-----+", - "| abcDEF | 1 | abcDEF | 1 |", - "| abc123 | 10 | abc123 | 10 |", - "| CBAdef | 10 | CBAdef | 10 |", - "| 123AbcDef | 100 | 123AbcDef | 100 |", - "+-----------+-----+-----------+-----+", - ]; - assert_batches_sorted_eq!(expected, &joined.collect().await?); + assert_snapshot!( + batches_to_sort_string(&joined.collect().await.unwrap()), + @r###" + +-----------+-----+-----------+-----+ + | a | b | a | b | + +-----------+-----+-----------+-----+ + | 123AbcDef | 100 | 123AbcDef | 100 | + | CBAdef | 10 | CBAdef | 10 | + | abc123 | 10 | abc123 | 10 | + | abcDEF | 1 | abcDEF | 1 | + +-----------+-----+-----------+-----+ + "### + ); Ok(()) } #[tokio::test] async fn test_alias_empty() -> Result<()> { let df = create_test_table("test").await?.alias("")?; - let expected = "SubqueryAlias: [a:Utf8, b:Int32]\ - \n TableScan: test [a:Utf8, b:Int32]"; let plan = df .clone() .into_unoptimized_plan() .display_indent_schema() .to_string(); - assert_eq!(plan, expected); - let expected = [ - "+-----------+-----+", - "| a | b |", - "+-----------+-----+", - "| abcDEF | 1 |", - "| abc123 | 10 |", - "| CBAdef | 10 |", - "| 123AbcDef | 100 |", - "+-----------+-----+", - ]; - assert_batches_sorted_eq!( - expected, - &df.select(vec![col("a"), col("b")])?.collect().await? + assert_snapshot!(plan, @r###" + SubqueryAlias: [a:Utf8, b:Int32] + TableScan: test [a:Utf8, b:Int32] + "###); + + assert_snapshot!( + batches_to_sort_string(&df.select(vec![col("a"), col("b")])?.collect().await.unwrap()), + @r###" + +-----------+-----+ + | a | b | + +-----------+-----+ + | 123AbcDef | 100 | + | CBAdef | 10 | + | abc123 | 10 | + | abcDEF | 1 | + +-----------+-----+ + "### ); + Ok(()) } @@ -5428,39 +5599,43 @@ async fn test_alias_nested() -> Result<()> { .select(vec![col("a"), col("test.b"), lit(1).alias("one")])? .alias("alias1")? 
.alias("alias2")?; - let expected = "SubqueryAlias: alias2 [a:Utf8, b:Int32, one:Int32]\ - \n SubqueryAlias: alias1 [a:Utf8, b:Int32, one:Int32]\ - \n Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]\ - \n TableScan: test projection=[a, b] [a:Utf8, b:Int32]"; + let plan = df .clone() .into_optimized_plan()? .display_indent_schema() .to_string(); - assert_eq!(plan, expected); + assert_snapshot!(plan, @r###" + SubqueryAlias: alias2 [a:Utf8, b:Int32, one:Int32] + SubqueryAlias: alias1 [a:Utf8, b:Int32, one:Int32] + Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32] + TableScan: test projection=[a, b] [a:Utf8, b:Int32] + "###); // Select over the aliased DataFrame let select1 = df .clone() .select(vec![col("alias2.a"), col("b") + col("alias2.one")])?; - let expected = [ - "+-----------+-----------------------+", - "| a | alias2.b + alias2.one |", - "+-----------+-----------------------+", - "| 123AbcDef | 101 |", - "| CBAdef | 11 |", - "| abc123 | 11 |", - "| abcDEF | 2 |", - "+-----------+-----------------------+", - ]; - assert_batches_sorted_eq!(expected, &select1.collect().await?); + + assert_snapshot!( + batches_to_sort_string(&select1.collect().await.unwrap()), + @r###" + +-----------+-----------------------+ + | a | alias2.b + alias2.one | + +-----------+-----------------------+ + | 123AbcDef | 101 | + | CBAdef | 11 | + | abc123 | 11 | + | abcDEF | 2 | + +-----------+-----------------------+ + "### + ); // Only the outermost alias is visible let select2 = df.select(vec![col("alias1.a")]); - assert_eq!( + assert_snapshot!( select2.unwrap_err().strip_backtrace(), - "Schema error: No field named alias1.a. \ - Valid fields are alias2.a, alias2.b, alias2.one." + @"Schema error: No field named alias1.a. Valid fields are alias2.a, alias2.b, alias2.one." ); Ok(()) } @@ -5618,16 +5793,19 @@ async fn test_fill_null() -> Result<()> { )?; let results = df_filled.collect().await?; - let expected = [ - "+---+---------+", - "| a | b |", - "+---+---------+", - "| 1 | x |", - "| 0 | default |", - "| 3 | z |", - "+---+---------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +---+---------+ + | a | b | + +---+---------+ + | 0 | default | + | 1 | x | + | 3 | z | + +---+---------+ + "### + ); + Ok(()) } @@ -5643,32 +5821,35 @@ async fn test_fill_null_all_columns() -> Result<()> { let results = df_filled.clone().collect().await?; - let expected = [ - "+---+---------+", - "| a | b |", - "+---+---------+", - "| 1 | x |", - "| | default |", - "| 3 | z |", - "+---+---------+", - ]; - - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +---+---------+ + | a | b | + +---+---------+ + | | default | + | 1 | x | + | 3 | z | + +---+---------+ + "### + ); // Fill column "a" null values with a value that cannot be cast to Int32. 
let df_filled = df_filled.fill_null(ScalarValue::Int32(Some(0)), vec![])?; let results = df_filled.collect().await?; - let expected = [ - "+---+---------+", - "| a | b |", - "+---+---------+", - "| 1 | x |", - "| 0 | default |", - "| 3 | z |", - "+---+---------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +---+---------+ + | a | b | + +---+---------+ + | 0 | default | + | 1 | x | + | 3 | z | + +---+---------+ + "### + ); Ok(()) } @@ -5735,11 +5916,16 @@ async fn test_insert_into_casting_support() -> Result<()> { .await .unwrap(); - // The result should be the same as the input which is ['a123', 'b456'] - let expected = [ - "+------+", "| a |", "+------+", "| a123 |", "| b456 |", "+------+", - ]; - - assert_batches_eq!(expected, &res); + assert_snapshot!( + batches_to_string(&res), + @r###" + +------+ + | a | + +------+ + | a123 | + | b456 | + +------+ + "### + ); Ok(()) } diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 1025a49ea1e3..46221acfcc9b 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -41,6 +41,7 @@ use datafusion::physical_plan::{collect, displayable, ExecutionPlan}; use datafusion::prelude::{DataFrame, SessionConfig, SessionContext}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::HashMap; +use datafusion_common_runtime::JoinSet; use datafusion_functions_aggregate::sum::sum_udaf; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::PhysicalSortExpr; @@ -50,7 +51,6 @@ use test_utils::{add_empty_batches, StringBatchGenerator}; use rand::rngs::StdRng; use rand::{thread_rng, Rng, SeedableRng}; -use tokio::task::JoinSet; // ======================================================================== // The new aggregation fuzz tests based on [`AggregationFuzzer`] diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs index db61eaef25c9..c608adda5d1c 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs @@ -21,8 +21,8 @@ use std::sync::Arc; use arrow::array::RecordBatch; use arrow::util::pretty::pretty_format_batches; use datafusion_common::{DataFusionError, Result}; +use datafusion_common_runtime::JoinSet; use rand::{thread_rng, Rng}; -use tokio::task::JoinSet; use crate::fuzz_cases::aggregation_fuzzer::{ check_equality_of_batches, diff --git a/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs b/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs index d817e4c7a3b4..3049631d4b3f 100644 --- a/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs @@ -22,8 +22,8 @@ use std::sync::Arc; use arrow::array::{cast::AsArray, Array, OffsetSizeTrait, RecordBatch}; use datafusion::datasource::MemTable; +use datafusion_common_runtime::JoinSet; use std::collections::HashSet; -use tokio::task::JoinSet; use datafusion::prelude::{SessionConfig, SessionContext}; use test_utils::StringBatchGenerator; diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index b12b3be2d435..4a4059db2547 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ 
b/datafusion/core/tests/parquet/custom_reader.rs @@ -23,7 +23,6 @@ use std::time::SystemTime; use arrow::array::{ArrayRef, Int64Array, Int8Array, StringArray}; use arrow::datatypes::{Field, Schema, SchemaBuilder}; use arrow::record_batch::RecordBatch; -use datafusion::assert_batches_sorted_eq; use datafusion::datasource::file_format::parquet::fetch_parquet_metadata; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; @@ -33,11 +32,13 @@ use datafusion::datasource::physical_plan::{ use datafusion::physical_plan::collect; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::prelude::SessionContext; +use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::Result; use bytes::Bytes; use futures::future::BoxFuture; use futures::{FutureExt, TryFutureExt}; +use insta::assert_snapshot; use object_store::memory::InMemory; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; @@ -96,17 +97,15 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { let task_ctx = session_ctx.task_ctx(); let read = collect(parquet_exec, task_ctx).await.unwrap(); - let expected = [ - "+-----+----+----+", - "| c1 | c2 | c3 |", - "+-----+----+----+", - "| Foo | 1 | 10 |", - "| | 2 | 20 |", - "| bar | | |", - "+-----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &read); + assert_snapshot!(batches_to_sort_string(&read), @r" + +-----+----+----+ + | c1 | c2 | c3 | + +-----+----+----+ + | | 2 | 20 | + | Foo | 1 | 10 | + | bar | | | + +-----+----+----+ + "); } #[derive(Debug)] diff --git a/datafusion/core/tests/parquet/schema.rs b/datafusion/core/tests/parquet/schema.rs index e13fbad24426..29afd3970432 100644 --- a/datafusion/core/tests/parquet/schema.rs +++ b/datafusion/core/tests/parquet/schema.rs @@ -21,7 +21,8 @@ use std::{collections::HashMap, fs, path::Path}; use tempfile::TempDir; use super::*; -use datafusion_common::assert_batches_sorted_eq; +use datafusion_common::test_util::batches_to_sort_string; +use insta::assert_snapshot; #[tokio::test] async fn schema_merge_ignores_metadata_by_default() { @@ -57,20 +58,6 @@ async fn schema_merge_ignores_metadata_by_default() { ]; write_files(table_dir.as_path(), schemas); - // can be any order - let expected = [ - "+----+------+", - "| id | name |", - "+----+------+", - "| 1 | test |", - "| 2 | test |", - "| 3 | test |", - "| 0 | test |", - "| 5 | test |", - "| 4 | test |", - "+----+------+", - ]; - // Read the parquet files into a dataframe to confirm results // (no errors) let table_path = table_dir.to_str().unwrap().to_string(); @@ -82,7 +69,18 @@ async fn schema_merge_ignores_metadata_by_default() { .unwrap(); let actual = df.collect().await.unwrap(); - assert_batches_sorted_eq!(expected, &actual); + assert_snapshot!(batches_to_sort_string(&actual), @r" + +----+------+ + | id | name | + +----+------+ + | 0 | test | + | 1 | test | + | 2 | test | + | 3 | test | + | 4 | test | + | 5 | test | + +----+------+ + "); assert_no_metadata(&actual); // also validate it works via SQL interface as well @@ -97,7 +95,18 @@ async fn schema_merge_ignores_metadata_by_default() { .collect() .await .unwrap(); - assert_batches_sorted_eq!(expected, &actual); + assert_snapshot!(batches_to_sort_string(&actual), @r" + +----+------+ + | id | name | + +----+------+ + | 0 | test | + | 1 | test | + | 2 | test | + | 3 | test | + | 4 | test | + | 5 | test | + +----+------+ + "); assert_no_metadata(&actual); } @@ -124,17 +133,6 @@ async fn 
schema_merge_can_preserve_metadata() { ]; write_files(table_dir.as_path(), schemas); - // can be any order - let expected = [ - "+----+------+", - "| id | name |", - "+----+------+", - "| 1 | test |", - "| 2 | test |", - "| 0 | test |", - "+----+------+", - ]; - let mut expected_metadata = make_meta("foo", "bar"); expected_metadata.insert("foo2".into(), "baz".into()); @@ -153,7 +151,15 @@ async fn schema_merge_can_preserve_metadata() { let actual = df.collect().await.unwrap(); - assert_batches_sorted_eq!(expected, &actual); + assert_snapshot!(batches_to_sort_string(&actual), @r" + +----+------+ + | id | name | + +----+------+ + | 0 | test | + | 1 | test | + | 2 | test | + +----+------+ + "); assert_metadata(&actual, &expected_metadata); // also validate it works via SQL interface as well @@ -167,7 +173,15 @@ async fn schema_merge_can_preserve_metadata() { assert_eq!(actual.clone(), expected_metadata); let actual = df.collect().await.unwrap(); - assert_batches_sorted_eq!(expected, &actual); + assert_snapshot!(batches_to_sort_string(&actual), @r" + +----+------+ + | id | name | + +----+------+ + | 0 | test | + | 1 | test | + | 2 | test | + +----+------+ + "); assert_metadata(&actual, &expected_metadata); } diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index bb20246bf9d5..85bc1104795f 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -22,14 +22,15 @@ use arrow::array::{ StringArray, }; use arrow::datatypes::{DataType, Field, Schema}; -use datafusion::assert_batches_sorted_eq; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; use datafusion::physical_plan::collect; use datafusion::prelude::SessionContext; use datafusion::test::object_store::local_unpartitioned_file; +use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::Result; use datafusion_execution::object_store::ObjectStoreUrl; +use insta::assert_snapshot; use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; @@ -70,19 +71,18 @@ async fn multi_parquet_coercion() { let task_ctx = session_ctx.task_ctx(); let read = collect(parquet_exec, task_ctx).await.unwrap(); - let expected = [ - "+-------+----+------+", - "| c1 | c2 | c3 |", - "+-------+----+------+", - "| | | |", - "| | 1 | 10.0 |", - "| | 2 | |", - "| | 2 | 20.0 |", - "| one | 1 | |", - "| three | | |", - "+-------+----+------+", - ]; - assert_batches_sorted_eq!(expected, &read); + assert_snapshot!(batches_to_sort_string(&read), @r" + +-------+----+------+ + | c1 | c2 | c3 | + +-------+----+------+ + | | | | + | | 1 | 10.0 | + | | 2 | | + | | 2 | 20.0 | + | one | 1 | | + | three | | | + +-------+----+------+ + "); } #[tokio::test] @@ -127,19 +127,18 @@ async fn multi_parquet_coercion_projection() { let task_ctx = session_ctx.task_ctx(); let read = collect(parquet_exec, task_ctx).await.unwrap(); - let expected = [ - "+----+-------+------+", - "| c2 | c1 | c3 |", - "+----+-------+------+", - "| | foo | |", - "| | three | |", - "| 1 | baz | 10.0 |", - "| 1 | one | |", - "| 2 | | |", - "| 2 | Boo | 20.0 |", - "+----+-------+------+", - ]; - assert_batches_sorted_eq!(expected, &read); + assert_snapshot!(batches_to_sort_string(&read), @r" + +----+-------+------+ + | c2 | c1 | c3 | + +----+-------+------+ + | | foo | | + | | three | | + | 1 | baz | 10.0 | + | 1 | one | | + | 2 | | | + | 2 | Boo | 20.0 | + +----+-------+------+ + "); } /// Writes 
`batches` to a temporary parquet file diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index b0b5f731063f..abe058df99d0 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -29,7 +29,9 @@ use datafusion_common::Result; use datafusion_common::{JoinSide, JoinType, ScalarValue}; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; -use datafusion_expr::{Operator, ScalarUDF, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, +}; use datafusion_physical_expr::expressions::{ binary, cast, col, BinaryExpr, CaseExpr, CastExpr, Column, Literal, NegativeExpr, }; @@ -57,6 +59,7 @@ use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; +use datafusion_expr_common::columnar_value::ColumnarValue; use itertools::Itertools; /// Mocked UDF @@ -89,6 +92,10 @@ impl ScalarUDFImpl for DummyUDF { fn return_type(&self, _arg_types: &[DataType]) -> Result { Ok(DataType::Int32) } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + panic!("dummy - not implemented") + } } #[test] diff --git a/datafusion/core/tests/serde/mod.rs b/datafusion/core/tests/serde/mod.rs new file mode 100644 index 000000000000..05dde7a54186 --- /dev/null +++ b/datafusion/core/tests/serde/mod.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Ensure `serde` feature from `arrow-schema` crate is re-exported. 
+#[test] +#[cfg(feature = "serde")] +fn ensure_serde_support() { + use datafusion::arrow::datatypes::DataType; + + #[derive(Debug, PartialEq, serde::Serialize, serde::Deserialize)] + struct WrappingStruct(DataType); + + let boolean = WrappingStruct(DataType::Boolean); + + let serialized = serde_json::to_string(&boolean).unwrap(); + assert_eq!("\"Boolean\"", serialized); + + let deserialized = serde_json::from_str(&serialized).unwrap(); + assert_eq!(boolean, deserialized); +} diff --git a/datafusion/core/tests/sql/aggregates.rs b/datafusion/core/tests/sql/aggregates.rs index 7b1f349e15b5..52372e01d41a 100644 --- a/datafusion/core/tests/sql/aggregates.rs +++ b/datafusion/core/tests/sql/aggregates.rs @@ -72,14 +72,13 @@ async fn count_partitioned() -> Result<()> { execute_with_partition("SELECT count(c1), count(c2) FROM test", 4).await?; assert_eq!(results.len(), 1); - let expected = [ - "+----------------+----------------+", - "| count(test.c1) | count(test.c2) |", - "+----------------+----------------+", - "| 40 | 40 |", - "+----------------+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +----------------+----------------+ + | count(test.c1) | count(test.c2) | + +----------------+----------------+ + | 40 | 40 | + +----------------+----------------+ + "); Ok(()) } @@ -88,17 +87,16 @@ async fn count_aggregated() -> Result<()> { let results = execute_with_partition("SELECT c1, count(c2) FROM test GROUP BY c1", 4).await?; - let expected = [ - "+----+----------------+", - "| c1 | count(test.c2) |", - "+----+----------------+", - "| 0 | 10 |", - "| 1 | 10 |", - "| 2 | 10 |", - "| 3 | 10 |", - "+----+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +----+----------------+ + | c1 | count(test.c2) | + +----+----------------+ + | 0 | 10 | + | 1 | 10 | + | 2 | 10 | + | 3 | 10 | + +----+----------------+ + "); Ok(()) } @@ -110,68 +108,67 @@ async fn count_aggregated_cube() -> Result<()> { ) .await?; - let expected = vec![ - "+----+----+----------------+", - "| c1 | c2 | count(test.c3) |", - "+----+----+----------------+", - "| | | 40 |", - "| | 1 | 4 |", - "| | 10 | 4 |", - "| | 2 | 4 |", - "| | 3 | 4 |", - "| | 4 | 4 |", - "| | 5 | 4 |", - "| | 6 | 4 |", - "| | 7 | 4 |", - "| | 8 | 4 |", - "| | 9 | 4 |", - "| 0 | | 10 |", - "| 0 | 1 | 1 |", - "| 0 | 10 | 1 |", - "| 0 | 2 | 1 |", - "| 0 | 3 | 1 |", - "| 0 | 4 | 1 |", - "| 0 | 5 | 1 |", - "| 0 | 6 | 1 |", - "| 0 | 7 | 1 |", - "| 0 | 8 | 1 |", - "| 0 | 9 | 1 |", - "| 1 | | 10 |", - "| 1 | 1 | 1 |", - "| 1 | 10 | 1 |", - "| 1 | 2 | 1 |", - "| 1 | 3 | 1 |", - "| 1 | 4 | 1 |", - "| 1 | 5 | 1 |", - "| 1 | 6 | 1 |", - "| 1 | 7 | 1 |", - "| 1 | 8 | 1 |", - "| 1 | 9 | 1 |", - "| 2 | | 10 |", - "| 2 | 1 | 1 |", - "| 2 | 10 | 1 |", - "| 2 | 2 | 1 |", - "| 2 | 3 | 1 |", - "| 2 | 4 | 1 |", - "| 2 | 5 | 1 |", - "| 2 | 6 | 1 |", - "| 2 | 7 | 1 |", - "| 2 | 8 | 1 |", - "| 2 | 9 | 1 |", - "| 3 | | 10 |", - "| 3 | 1 | 1 |", - "| 3 | 10 | 1 |", - "| 3 | 2 | 1 |", - "| 3 | 3 | 1 |", - "| 3 | 4 | 1 |", - "| 3 | 5 | 1 |", - "| 3 | 6 | 1 |", - "| 3 | 7 | 1 |", - "| 3 | 8 | 1 |", - "| 3 | 9 | 1 |", - "+----+----+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +----+----+----------------+ + | c1 | c2 | count(test.c3) | + +----+----+----------------+ + | | | 40 | + | | 1 | 4 | + | | 10 | 4 | + | | 2 | 4 | + | | 3 | 4 | + | | 4 
| 4 | + | | 5 | 4 | + | | 6 | 4 | + | | 7 | 4 | + | | 8 | 4 | + | | 9 | 4 | + | 0 | | 10 | + | 0 | 1 | 1 | + | 0 | 10 | 1 | + | 0 | 2 | 1 | + | 0 | 3 | 1 | + | 0 | 4 | 1 | + | 0 | 5 | 1 | + | 0 | 6 | 1 | + | 0 | 7 | 1 | + | 0 | 8 | 1 | + | 0 | 9 | 1 | + | 1 | | 10 | + | 1 | 1 | 1 | + | 1 | 10 | 1 | + | 1 | 2 | 1 | + | 1 | 3 | 1 | + | 1 | 4 | 1 | + | 1 | 5 | 1 | + | 1 | 6 | 1 | + | 1 | 7 | 1 | + | 1 | 8 | 1 | + | 1 | 9 | 1 | + | 2 | | 10 | + | 2 | 1 | 1 | + | 2 | 10 | 1 | + | 2 | 2 | 1 | + | 2 | 3 | 1 | + | 2 | 4 | 1 | + | 2 | 5 | 1 | + | 2 | 6 | 1 | + | 2 | 7 | 1 | + | 2 | 8 | 1 | + | 2 | 9 | 1 | + | 3 | | 10 | + | 3 | 1 | 1 | + | 3 | 10 | 1 | + | 3 | 2 | 1 | + | 3 | 3 | 1 | + | 3 | 4 | 1 | + | 3 | 5 | 1 | + | 3 | 6 | 1 | + | 3 | 7 | 1 | + | 3 | 8 | 1 | + | 3 | 9 | 1 | + +----+----+----------------+ + "); Ok(()) } @@ -259,14 +256,15 @@ async fn count_distinct_integers_aggregated_single_partition() -> Result<()> { let results = run_count_distinct_integers_aggregated_scenario(partitions).await?; - let expected = ["+---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+", - "| c_group | count(test.c_uint64) | count(DISTINCT test.c_int8) | count(DISTINCT test.c_int16) | count(DISTINCT test.c_int32) | count(DISTINCT test.c_int64) | count(DISTINCT test.c_uint8) | count(DISTINCT test.c_uint16) | count(DISTINCT test.c_uint32) | count(DISTINCT test.c_uint64) |", - "+---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+", - "| a | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |", - "| b | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |", - "| c | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |", - "+---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+"]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+ + | c_group | count(test.c_uint64) | count(DISTINCT test.c_int8) | count(DISTINCT test.c_int16) | count(DISTINCT test.c_int32) | count(DISTINCT test.c_int64) | count(DISTINCT test.c_uint8) | count(DISTINCT test.c_uint16) | count(DISTINCT test.c_uint32) | count(DISTINCT test.c_uint64) | + +---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+ + | a | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | + | b | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + | c | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | + 
+---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+ + "); Ok(()) } @@ -283,14 +281,15 @@ async fn count_distinct_integers_aggregated_multiple_partitions() -> Result<()> let results = run_count_distinct_integers_aggregated_scenario(partitions).await?; - let expected = ["+---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+", - "| c_group | count(test.c_uint64) | count(DISTINCT test.c_int8) | count(DISTINCT test.c_int16) | count(DISTINCT test.c_int32) | count(DISTINCT test.c_int64) | count(DISTINCT test.c_uint8) | count(DISTINCT test.c_uint16) | count(DISTINCT test.c_uint32) | count(DISTINCT test.c_uint64) |", - "+---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+", - "| a | 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |", - "| b | 5 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |", - "| c | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |", - "+---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+"]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+ + | c_group | count(test.c_uint64) | count(DISTINCT test.c_int8) | count(DISTINCT test.c_int16) | count(DISTINCT test.c_int32) | count(DISTINCT test.c_int64) | count(DISTINCT test.c_uint8) | count(DISTINCT test.c_uint16) | count(DISTINCT test.c_uint32) | count(DISTINCT test.c_uint64) | + +---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+ + | a | 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | + | b | 5 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | + | c | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + +---------+----------------------+-----------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------------+-------------------------------+-------------------------------+ + "); Ok(()) } @@ -308,16 +307,17 @@ async fn test_accumulator_row_accumulator() -> Result<()> { LIMIT 5"; let actual = execute_to_batches(&ctx, sql).await; - let expected = ["+----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+", - "| c1 | 
c2 | min1 | min2 | max1 | max2 | avg1 | min3 | cnt1 | sum1 |", - "+----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+", - "| a | 1 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 774637006 | waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs | 4015442341 | 2437927011.0 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 5 | 6094771121.5 |", - "| a | 2 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 145294611 | ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 | 3717551163 | 2267588664.0 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 3 | 3401364777.0 |", - "| a | 3 | Amn2K87Db5Es3dFQO9cw9cvpAM6h35 | 431948861 | oLZ21P2JEDooxV1pU31cIxQHEeeoLu | 3998790955 | 2225685115.1666665 | Amn2K87Db5Es3dFQO9cw9cvpAM6h35 | 6 | 6676994872.5 |", - "| a | 4 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 466439833 | ydkwycaISlYSlEq3TlkS2m15I2pcp8 | 2502326480 | 1655431654.0 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 4 | 3310812222.5 |", - "| a | 5 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 141047417 | QJYm7YRA3YetcBHI5wkMZeLXVmfuNy | 2496054700 | 1216992989.6666667 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 3 | 1825431770.0 |", - "+----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+"]; - assert_batches_eq!(expected, &actual); + assert_snapshot!(batches_to_sort_string(&actual), @r" + +----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+ + | c1 | c2 | min1 | min2 | max1 | max2 | avg1 | min3 | cnt1 | sum1 | + +----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+ + | a | 1 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 774637006 | waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs | 4015442341 | 2437927011.0 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB | 5 | 6094771121.5 | + | a | 2 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 145294611 | ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 | 3717551163 | 2267588664.0 | b3b9esRhTzFEawbs6XhpKnD9ojutHB | 3 | 3401364777.0 | + | a | 3 | Amn2K87Db5Es3dFQO9cw9cvpAM6h35 | 431948861 | oLZ21P2JEDooxV1pU31cIxQHEeeoLu | 3998790955 | 2225685115.1666665 | Amn2K87Db5Es3dFQO9cw9cvpAM6h35 | 6 | 6676994872.5 | + | a | 4 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 466439833 | ydkwycaISlYSlEq3TlkS2m15I2pcp8 | 2502326480 | 1655431654.0 | KJFcmTVjdkCMv94wYCtfHMFhzyRsmH | 4 | 3310812222.5 | + | a | 5 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 141047417 | QJYm7YRA3YetcBHI5wkMZeLXVmfuNy | 2496054700 | 1216992989.6666667 | MeSTAXq8gVxVjbEjgkvU9YLte0X9uE | 3 | 1825431770.0 | + +----+----+--------------------------------+-----------+--------------------------------+------------+--------------------+--------------------------------+------+--------------+ + "); Ok(()) } diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 03c4ad7c013e..579049692e7d 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -30,10 +30,11 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::ExecutionPlanVisitor; use datafusion::prelude::*; use datafusion::test_util; -use datafusion::{assert_batches_eq, assert_batches_sorted_eq}; use datafusion::{execution::context::SessionContext, physical_plan::displayable}; +use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::utils::get_available_parallelism; use 
datafusion_common::{assert_contains, assert_not_contains}; +use insta::assert_snapshot; use object_store::path::Path; use std::fs::File; use std::io::Write; diff --git a/datafusion/core/tests/sql/path_partition.rs b/datafusion/core/tests/sql/path_partition.rs index c88051d5c9ef..46aecd1dc070 100644 --- a/datafusion/core/tests/sql/path_partition.rs +++ b/datafusion/core/tests/sql/path_partition.rs @@ -28,7 +28,6 @@ use datafusion::datasource::listing::ListingTableUrl; use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; use datafusion::datasource::source::DataSourceExec; use datafusion::{ - assert_batches_sorted_eq, datasource::{ file_format::{csv::CsvFormat, parquet::ParquetFormat}, listing::{ListingOptions, ListingTable, ListingTableConfig}, @@ -40,6 +39,7 @@ use datafusion::{ }; use datafusion_catalog::TableProvider; use datafusion_common::stats::Precision; +use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::ScalarValue; use datafusion_execution::config::SessionConfig; use datafusion_expr::{col, lit, Expr, Operator}; @@ -49,6 +49,7 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{TimeZone, Utc}; use futures::stream::{self, BoxStream}; +use insta::assert_snapshot; use object_store::{ path::Path, GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutOptions, PutResult, @@ -145,16 +146,15 @@ async fn parquet_distinct_partition_col() -> Result<()> { .collect() .await?; - let expected = [ - "+------+-------+-----+", - "| year | month | day |", - "+------+-------+-----+", - "| 2021 | 09 | 09 |", - "| 2021 | 10 | 09 |", - "| 2021 | 10 | 28 |", - "+------+-------+-----+", - ]; - assert_batches_sorted_eq!(expected, &result); + assert_snapshot!(batches_to_sort_string(&result), @r" + +------+-------+-----+ + | year | month | day | + +------+-------+-----+ + | 2021 | 09 | 09 | + | 2021 | 10 | 09 | + | 2021 | 10 | 28 | + +------+-------+-----+ + "); //Test that the number of rows returned by partition column scan and actually reading the parquet file are the same let actual_row_count: usize = ctx .sql("SELECT id from t") @@ -275,18 +275,17 @@ async fn csv_filter_with_file_col() -> Result<()> { .collect() .await?; - let expected = [ - "+----+----+", - "| c1 | c2 |", - "+----+----+", - "| a | 1 |", - "| b | 1 |", - "| b | 5 |", - "| c | 2 |", - "| d | 5 |", - "+----+----+", - ]; - assert_batches_sorted_eq!(expected, &result); + assert_snapshot!(batches_to_sort_string(&result), @r" + +----+----+ + | c1 | c2 | + +----+----+ + | a | 1 | + | b | 1 | + | b | 5 | + | c | 2 | + | d | 5 | + +----+----+ + "); Ok(()) } @@ -313,18 +312,17 @@ async fn csv_filter_with_file_nonstring_col() -> Result<()> { .collect() .await?; - let expected = [ - "+----+----+------------+", - "| c1 | c2 | date |", - "+----+----+------------+", - "| a | 1 | 2021-10-28 |", - "| b | 1 | 2021-10-28 |", - "| b | 5 | 2021-10-28 |", - "| c | 2 | 2021-10-28 |", - "| d | 5 | 2021-10-28 |", - "+----+----+------------+", - ]; - assert_batches_sorted_eq!(expected, &result); + assert_snapshot!(batches_to_sort_string(&result), @r" + +----+----+------------+ + | c1 | c2 | date | + +----+----+------------+ + | a | 1 | 2021-10-28 | + | b | 1 | 2021-10-28 | + | b | 5 | 2021-10-28 | + | c | 2 | 2021-10-28 | + | d | 5 | 2021-10-28 | + +----+----+------------+ + "); Ok(()) } @@ -351,18 +349,17 @@ async fn csv_projection_on_partition() -> Result<()> { .collect() .await?; - let expected = [ - "+----+------------+", - "| c1 | date |", - "+----+------------+", - "| 
a | 2021-10-27 |", - "| b | 2021-10-27 |", - "| b | 2021-10-27 |", - "| c | 2021-10-27 |", - "| d | 2021-10-27 |", - "+----+------------+", - ]; - assert_batches_sorted_eq!(expected, &result); + assert_snapshot!(batches_to_sort_string(&result), @r" + +----+------------+ + | c1 | date | + +----+------------+ + | a | 2021-10-27 | + | b | 2021-10-27 | + | b | 2021-10-27 | + | c | 2021-10-27 | + | d | 2021-10-27 | + +----+------------+ + "); Ok(()) } @@ -390,15 +387,14 @@ async fn csv_grouping_by_partition() -> Result<()> { .collect() .await?; - let expected = [ - "+------------+----------+----------------------+", - "| date | count(*) | count(DISTINCT t.c1) |", - "+------------+----------+----------------------+", - "| 2021-10-26 | 100 | 5 |", - "| 2021-10-27 | 100 | 5 |", - "+------------+----------+----------------------+", - ]; - assert_batches_sorted_eq!(expected, &result); + assert_snapshot!(batches_to_sort_string(&result), @r" + +------------+----------+----------------------+ + | date | count(*) | count(DISTINCT t.c1) | + +------------+----------+----------------------+ + | 2021-10-26 | 100 | 5 | + | 2021-10-27 | 100 | 5 | + +------------+----------+----------------------+ + "); Ok(()) } @@ -430,21 +426,20 @@ async fn parquet_multiple_partitions() -> Result<()> { .collect() .await?; - let expected = [ - "+----+-----+", - "| id | day |", - "+----+-----+", - "| 0 | 09 |", - "| 1 | 09 |", - "| 2 | 09 |", - "| 3 | 09 |", - "| 4 | 09 |", - "| 5 | 09 |", - "| 6 | 09 |", - "| 7 | 09 |", - "+----+-----+", - ]; - assert_batches_sorted_eq!(expected, &result); + assert_snapshot!(batches_to_sort_string(&result), @r" + +----+-----+ + | id | day | + +----+-----+ + | 0 | 09 | + | 1 | 09 | + | 2 | 09 | + | 3 | 09 | + | 4 | 09 | + | 5 | 09 | + | 6 | 09 | + | 7 | 09 | + +----+-----+ + "); Ok(()) } @@ -476,21 +471,20 @@ async fn parquet_multiple_nonstring_partitions() -> Result<()> { .collect() .await?; - let expected = [ - "+----+-----+", - "| id | day |", - "+----+-----+", - "| 0 | 9 |", - "| 1 | 9 |", - "| 2 | 9 |", - "| 3 | 9 |", - "| 4 | 9 |", - "| 5 | 9 |", - "| 6 | 9 |", - "| 7 | 9 |", - "+----+-----+", - ]; - assert_batches_sorted_eq!(expected, &result); + assert_snapshot!(batches_to_sort_string(&result), @r" + +----+-----+ + | id | day | + +----+-----+ + | 0 | 9 | + | 1 | 9 | + | 2 | 9 | + | 3 | 9 | + | 4 | 9 | + | 5 | 9 | + | 6 | 9 | + | 7 | 9 | + +----+-----+ + "); Ok(()) } diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 6e81bf6410c1..f874dd7c0842 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -30,23 +30,22 @@ async fn test_list_query_parameters() -> Result<()> { .with_param_values(vec![ScalarValue::from(3i32)])? 
.collect() .await?; - let expected = vec![ - "+----+----+-------+", - "| c1 | c2 | c3 |", - "+----+----+-------+", - "| 3 | 1 | false |", - "| 3 | 10 | true |", - "| 3 | 2 | true |", - "| 3 | 3 | false |", - "| 3 | 4 | true |", - "| 3 | 5 | false |", - "| 3 | 6 | true |", - "| 3 | 7 | false |", - "| 3 | 8 | true |", - "| 3 | 9 | false |", - "+----+----+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +----+----+-------+ + | c1 | c2 | c3 | + +----+----+-------+ + | 3 | 1 | false | + | 3 | 10 | true | + | 3 | 2 | true | + | 3 | 3 | false | + | 3 | 4 | true | + | 3 | 5 | false | + | 3 | 6 | true | + | 3 | 7 | false | + | 3 | 8 | true | + | 3 | 9 | false | + +----+----+-------+ + "); Ok(()) } @@ -66,33 +65,32 @@ async fn test_named_query_parameters() -> Result<()> { ])? .collect() .await?; - let expected = vec![ - "+----+----+", - "| c1 | c2 |", - "+----+----+", - "| 1 | 1 |", - "| 1 | 2 |", - "| 1 | 3 |", - "| 1 | 4 |", - "| 1 | 5 |", - "| 1 | 6 |", - "| 1 | 7 |", - "| 1 | 8 |", - "| 1 | 9 |", - "| 1 | 10 |", - "| 2 | 1 |", - "| 2 | 2 |", - "| 2 | 3 |", - "| 2 | 4 |", - "| 2 | 5 |", - "| 2 | 6 |", - "| 2 | 7 |", - "| 2 | 8 |", - "| 2 | 9 |", - "| 2 | 10 |", - "+----+----+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +----+----+ + | c1 | c2 | + +----+----+ + | 1 | 1 | + | 1 | 10 | + | 1 | 2 | + | 1 | 3 | + | 1 | 4 | + | 1 | 5 | + | 1 | 6 | + | 1 | 7 | + | 1 | 8 | + | 1 | 9 | + | 2 | 1 | + | 2 | 10 | + | 2 | 2 | + | 2 | 3 | + | 2 | 4 | + | 2 | 5 | + | 2 | 6 | + | 2 | 7 | + | 2 | 8 | + | 2 | 9 | + +----+----+ + "); Ok(()) } @@ -114,33 +112,32 @@ async fn test_prepare_statement() -> Result<()> { let dataframe = dataframe.with_param_values(param_values)?; let results = dataframe.collect().await?; - let expected = vec![ - "+----+----+", - "| c1 | c2 |", - "+----+----+", - "| 1 | 1 |", - "| 1 | 10 |", - "| 1 | 2 |", - "| 1 | 3 |", - "| 1 | 4 |", - "| 1 | 5 |", - "| 1 | 6 |", - "| 1 | 7 |", - "| 1 | 8 |", - "| 1 | 9 |", - "| 2 | 1 |", - "| 2 | 10 |", - "| 2 | 2 |", - "| 2 | 3 |", - "| 2 | 4 |", - "| 2 | 5 |", - "| 2 | 6 |", - "| 2 | 7 |", - "| 2 | 8 |", - "| 2 | 9 |", - "+----+----+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +----+----+ + | c1 | c2 | + +----+----+ + | 1 | 1 | + | 1 | 10 | + | 1 | 2 | + | 1 | 3 | + | 1 | 4 | + | 1 | 5 | + | 1 | 6 | + | 1 | 7 | + | 1 | 8 | + | 1 | 9 | + | 2 | 1 | + | 2 | 10 | + | 2 | 2 | + | 2 | 3 | + | 2 | 4 | + | 2 | 5 | + | 2 | 6 | + | 2 | 7 | + | 2 | 8 | + | 2 | 9 | + +----+----+ + "); Ok(()) } @@ -164,14 +161,13 @@ async fn prepared_statement_type_coercion() -> Result<()> { ])? .collect() .await?; - let expected = [ - "+--------+----------+", - "| signed | unsigned |", - "+--------+----------+", - "| -1 | 1 |", - "+--------+----------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +--------+----------+ + | signed | unsigned | + +--------+----------+ + | -1 | 1 | + +--------+----------+ + "); Ok(()) } @@ -194,14 +190,13 @@ async fn test_parameter_type_coercion() -> Result<()> { ("str", ScalarValue::from("1")), ])? 
.collect().await?; - let expected = [ - "+--------+----------+", - "| signed | unsigned |", - "+--------+----------+", - "| -1 | 1 |", - "+--------+----------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +--------+----------+ + | signed | unsigned | + +--------+----------+ + | -1 | 1 | + +--------+----------+ + "); Ok(()) } @@ -263,14 +258,13 @@ async fn test_positional_parameter_not_bound() -> Result<()> { .collect() .await?; - let expected = [ - "+--------+----------+", - "| signed | unsigned |", - "+--------+----------+", - "| -1 | 1 |", - "+--------+----------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +--------+----------+ + | signed | unsigned | + +--------+----------+ + | -1 | 1 | + +--------+----------+ + "); Ok(()) } @@ -309,14 +303,13 @@ async fn test_named_parameter_not_bound() -> Result<()> { .collect() .await?; - let expected = [ - "+--------+----------+", - "| signed | unsigned |", - "+--------+----------+", - "| -1 | 1 |", - "+--------+----------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!(batches_to_sort_string(&results), @r" + +--------+----------+ + | signed | unsigned | + +--------+----------+ + | -1 | 1 | + +--------+----------+ + "); Ok(()) } diff --git a/datafusion/core/tests/user_defined/expr_planner.rs b/datafusion/core/tests/user_defined/expr_planner.rs index 75d890359ba8..1fc6d14c5b22 100644 --- a/datafusion/core/tests/user_defined/expr_planner.rs +++ b/datafusion/core/tests/user_defined/expr_planner.rs @@ -16,9 +16,10 @@ // under the License. use arrow::array::RecordBatch; +use datafusion::common::test_util::batches_to_string; use std::sync::Arc; -use datafusion::common::{assert_batches_eq, DFSchema}; +use datafusion::common::DFSchema; use datafusion::error::Result; use datafusion::execution::FunctionRegistry; use datafusion::logical_expr::Operator; @@ -76,27 +77,25 @@ async fn plan_and_collect(sql: &str) -> Result> { #[tokio::test] async fn test_custom_operators_arrow() { let actual = plan_and_collect("select 'foo'->'bar';").await.unwrap(); - let expected = [ - "+----------------------------+", - "| Utf8(\"foo\") || Utf8(\"bar\") |", - "+----------------------------+", - "| foobar |", - "+----------------------------+", - ]; - assert_batches_eq!(&expected, &actual); + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +----------------------------+ + | Utf8("foo") || Utf8("bar") | + +----------------------------+ + | foobar | + +----------------------------+ + "###); } #[tokio::test] async fn test_custom_operators_long_arrow() { let actual = plan_and_collect("select 1->>2;").await.unwrap(); - let expected = [ - "+---------------------+", - "| Int64(1) + Int64(2) |", - "+---------------------+", - "| 3 |", - "+---------------------+", - ]; - assert_batches_eq!(&expected, &actual); + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---------------------+ + | Int64(1) + Int64(2) | + +---------------------+ + | 3 | + +---------------------+ + "###); } #[tokio::test] @@ -104,14 +103,13 @@ async fn test_question_select() { let actual = plan_and_collect("select a ? 2 from (select 1 as a);") .await .unwrap(); - let expected = [ - "+--------------+", - "| a ? Int64(2) |", - "+--------------+", - "| true |", - "+--------------+", - ]; - assert_batches_eq!(&expected, &actual); + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +--------------+ + | a ? 
Int64(2) | + +--------------+ + | true | + +--------------+ + "###); } #[tokio::test] @@ -119,6 +117,11 @@ async fn test_question_filter() { let actual = plan_and_collect("select a from (select 1 as a) where a ? 2;") .await .unwrap(); - let expected = ["+---+", "| a |", "+---+", "| 1 |", "+---+"]; - assert_batches_eq!(&expected, &actual); + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+ + | a | + +---+ + | 1 | + +---+ + "###); } diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 7cda6d410f4e..5cbb05f290a7 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -30,6 +30,7 @@ use arrow::array::{ }; use arrow::datatypes::{Fields, Schema}; +use datafusion::common::test_util::batches_to_string; use datafusion::dataframe::DataFrame; use datafusion::datasource::MemTable; use datafusion::test_util::plan_and_collect; @@ -39,7 +40,6 @@ use datafusion::{ datatypes::{DataType, Field, Float64Type, TimeUnit, TimestampNanosecondType}, record_batch::RecordBatch, }, - assert_batches_eq, error::Result, logical_expr::{ AccumulatorFactoryFunction, AggregateUDF, Signature, TypeSignature, Volatility, @@ -48,7 +48,8 @@ use datafusion::{ prelude::SessionContext, scalar::ScalarValue, }; -use datafusion_common::{assert_contains, cast::as_primitive_array, exec_err}; +use datafusion_common::assert_contains; +use datafusion_common::{cast::as_primitive_array, exec_err}; use datafusion_expr::{ col, create_udaf, function::AccumulatorArgs, AggregateUDFImpl, GroupsAccumulator, LogicalPlanBuilder, SimpleAggregateUDF, @@ -60,18 +61,20 @@ use datafusion_functions_aggregate::average::AvgAccumulator; async fn test_setup() { let TestContext { ctx, test_state: _ } = TestContext::new(); let sql = "SELECT * from t order by time"; - let expected = [ - "+-------+----------------------------+", - "| value | time |", - "+-------+----------------------------+", - "| 2.0 | 1970-01-01T00:00:00.000002 |", - "| 3.0 | 1970-01-01T00:00:00.000003 |", - "| 1.0 | 1970-01-01T00:00:00.000004 |", - "| 5.0 | 1970-01-01T00:00:00.000005 |", - "| 5.0 | 1970-01-01T00:00:00.000005 |", - "+-------+----------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +-------+----------------------------+ + | value | time | + +-------+----------------------------+ + | 2.0 | 1970-01-01T00:00:00.000002 | + | 3.0 | 1970-01-01T00:00:00.000003 | + | 1.0 | 1970-01-01T00:00:00.000004 | + | 5.0 | 1970-01-01T00:00:00.000005 | + | 5.0 | 1970-01-01T00:00:00.000005 | + +-------+----------------------------+ + "###); } /// Basic user defined aggregate @@ -80,14 +83,17 @@ async fn test_udaf() { let TestContext { ctx, test_state } = TestContext::new(); assert!(!test_state.update_batch()); let sql = "SELECT time_sum(time) from t"; - let expected = [ - "+----------------------------+", - "| time_sum(t.time) |", - "+----------------------------+", - "| 1970-01-01T00:00:00.000019 |", - "+----------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +----------------------------+ + | time_sum(t.time) | + +----------------------------+ + | 1970-01-01T00:00:00.000019 | + 
+----------------------------+ + "###); + // normal aggregates call update_batch assert!(test_state.update_batch()); assert!(!test_state.retract_batch()); @@ -98,18 +104,21 @@ async fn test_udaf() { async fn test_udaf_as_window() { let TestContext { ctx, test_state } = TestContext::new(); let sql = "SELECT time_sum(time) OVER() as time_sum from t"; - let expected = [ - "+----------------------------+", - "| time_sum |", - "+----------------------------+", - "| 1970-01-01T00:00:00.000019 |", - "| 1970-01-01T00:00:00.000019 |", - "| 1970-01-01T00:00:00.000019 |", - "| 1970-01-01T00:00:00.000019 |", - "| 1970-01-01T00:00:00.000019 |", - "+----------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +----------------------------+ + | time_sum | + +----------------------------+ + | 1970-01-01T00:00:00.000019 | + | 1970-01-01T00:00:00.000019 | + | 1970-01-01T00:00:00.000019 | + | 1970-01-01T00:00:00.000019 | + | 1970-01-01T00:00:00.000019 | + +----------------------------+ + "###); + // aggregate over the entire window function call update_batch assert!(test_state.update_batch()); assert!(!test_state.retract_batch()); @@ -120,18 +129,21 @@ async fn test_udaf_as_window() { async fn test_udaf_as_window_with_frame() { let TestContext { ctx, test_state } = TestContext::new(); let sql = "SELECT time_sum(time) OVER(ORDER BY time ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as time_sum from t"; - let expected = [ - "+----------------------------+", - "| time_sum |", - "+----------------------------+", - "| 1970-01-01T00:00:00.000005 |", - "| 1970-01-01T00:00:00.000009 |", - "| 1970-01-01T00:00:00.000012 |", - "| 1970-01-01T00:00:00.000014 |", - "| 1970-01-01T00:00:00.000010 |", - "+----------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +----------------------------+ + | time_sum | + +----------------------------+ + | 1970-01-01T00:00:00.000005 | + | 1970-01-01T00:00:00.000009 | + | 1970-01-01T00:00:00.000012 | + | 1970-01-01T00:00:00.000014 | + | 1970-01-01T00:00:00.000010 | + +----------------------------+ + "###); + // user defined aggregates with window frame should be calling retract batch assert!(test_state.update_batch()); assert!(test_state.retract_batch()); @@ -155,14 +167,16 @@ async fn test_udaf_as_window_with_frame_without_retract_batch() { async fn test_udaf_returning_struct() { let TestContext { ctx, test_state: _ } = TestContext::new(); let sql = "SELECT first(value, time) from t"; - let expected = [ - "+------------------------------------------------+", - "| first(t.value,t.time) |", - "+------------------------------------------------+", - "| {value: 2.0, time: 1970-01-01T00:00:00.000002} |", - "+------------------------------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +------------------------------------------------+ + | first(t.value,t.time) | + +------------------------------------------------+ + | {value: 2.0, time: 1970-01-01T00:00:00.000002} | + +------------------------------------------------+ + "###); } /// Demonstrate extracting the fields from a structure using a subquery @@ -170,14 +184,16 
@@ async fn test_udaf_returning_struct() { async fn test_udaf_returning_struct_subquery() { let TestContext { ctx, test_state: _ } = TestContext::new(); let sql = "select sq.first['value'], sq.first['time'] from (SELECT first(value, time) as first from t) as sq"; - let expected = [ - "+-----------------+----------------------------+", - "| sq.first[value] | sq.first[time] |", - "+-----------------+----------------------------+", - "| 2.0 | 1970-01-01T00:00:00.000002 |", - "+-----------------+----------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +-----------------+----------------------------+ + | sq.first[value] | sq.first[time] | + +-----------------+----------------------------+ + | 2.0 | 1970-01-01T00:00:00.000002 | + +-----------------+----------------------------+ + "###); } #[tokio::test] @@ -189,26 +205,29 @@ async fn test_udaf_shadows_builtin_fn() { let sql = "SELECT sum(arrow_cast(time, 'Int64')) from t"; // compute with builtin `sum` aggregator - let expected = [ - "+---------------------------------------+", - "| sum(arrow_cast(t.time,Utf8(\"Int64\"))) |", - "+---------------------------------------+", - "| 19000 |", - "+---------------------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---------------------------------------+ + | sum(arrow_cast(t.time,Utf8("Int64"))) | + +---------------------------------------+ + | 19000 | + +---------------------------------------+ + "###); // Register `TimeSum` with name `sum`. 
This will shadow the builtin one - let sql = "SELECT sum(time) from t"; TimeSum::register(&mut ctx, test_state.clone(), "sum"); - let expected = [ - "+----------------------------+", - "| sum(t.time) |", - "+----------------------------+", - "| 1970-01-01T00:00:00.000019 |", - "+----------------------------+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + let sql = "SELECT sum(time) from t"; + + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +----------------------------+ + | sum(t.time) | + +----------------------------+ + | 1970-01-01T00:00:00.000019 | + +----------------------------+ + "###); } async fn execute(ctx: &SessionContext, sql: &str) -> Result> { @@ -248,14 +267,13 @@ async fn simple_udaf() -> Result<()> { let result = ctx.sql("SELECT MY_AVG(a) FROM t").await?.collect().await?; - let expected = [ - "+-------------+", - "| my_avg(t.a) |", - "+-------------+", - "| 3.0 |", - "+-------------+", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +-------------+ + | my_avg(t.a) | + +-------------+ + | 3.0 | + +-------------+ + "###); Ok(()) } @@ -315,14 +333,13 @@ async fn case_sensitive_identifiers_user_defined_aggregates() -> Result<()> { .collect() .await?; - let expected = [ - "+-------------+", - "| MY_AVG(t.i) |", - "+-------------+", - "| 1.0 |", - "+-------------+", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +-------------+ + | MY_AVG(t.i) | + +-------------+ + | 1.0 | + +-------------+ + "###); Ok(()) } @@ -346,19 +363,25 @@ async fn test_user_defined_functions_with_alias() -> Result<()> { ctx.register_udaf(my_avg); - let expected = [ - "+------------+", - "| dummy(t.i) |", - "+------------+", - "| 1.0 |", - "+------------+", - ]; - let result = plan_and_collect(&ctx, "SELECT dummy(i) FROM t").await?; - assert_batches_eq!(expected, &result); + + insta::assert_snapshot!(batches_to_string(&result), @r###" + +------------+ + | dummy(t.i) | + +------------+ + | 1.0 | + +------------+ + "###); let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?; - assert_batches_eq!(expected, &alias_result); + + insta::assert_snapshot!(batches_to_string(&alias_result), @r###" + +------------+ + | dummy(t.i) | + +------------+ + | 1.0 | + +------------+ + "###); Ok(()) } @@ -418,14 +441,14 @@ async fn test_parameterized_aggregate_udf() -> Result<()> { ); let actual = DataFrame::new(ctx.state(), plan).collect().await?; - let expected = [ - "+------+---+---+", - "| text | a | b |", - "+------+---+---+", - "| foo | 1 | 2 |", - "+------+---+---+", - ]; - assert_batches_eq!(expected, &actual); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +------+---+---+ + | text | a | b | + +------+---+---+ + | foo | 1 | 2 | + +------+---+---+ + "###); ctx.deregister_table("t")?; Ok(()) diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index 915d61712074..e46940e63154 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -155,27 +155,25 @@ const QUERY2: &str = "SELECT 42, arrow_typeof(42)"; // Run the query using the specified execution context and compare it // to the known result async fn run_and_compare_query(ctx: SessionContext, description: &str) -> Result<()> { - let expected = vec![ - 
"+-------------+---------+", - "| customer_id | revenue |", - "+-------------+---------+", - "| paul | 300 |", - "| jorge | 200 |", - "| andy | 150 |", - "+-------------+---------+", - ]; - let s = exec_sql(&ctx, QUERY).await?; - let actual = s.lines().collect::>(); + let actual = s.lines().collect::>().join("\n"); + + insta::allow_duplicates! { + insta::with_settings!({ + description => description, + }, { + insta::assert_snapshot!(actual, @r###" + +-------------+---------+ + | customer_id | revenue | + +-------------+---------+ + | paul | 300 | + | jorge | 200 | + | andy | 150 | + +-------------+---------+ + "###); + }); + } - assert_eq!( - expected, - actual, - "output mismatch for {}. Expectedn\n{}Actual:\n{}", - description, - expected.join("\n"), - s - ); Ok(()) } @@ -185,25 +183,21 @@ async fn run_and_compare_query_with_analyzer_rule( ctx: SessionContext, description: &str, ) -> Result<()> { - let expected = vec![ - "+------------+--------------------------+", - "| UInt64(42) | arrow_typeof(UInt64(42)) |", - "+------------+--------------------------+", - "| 42 | UInt64 |", - "+------------+--------------------------+", - ]; - let s = exec_sql(&ctx, QUERY2).await?; - let actual = s.lines().collect::>(); + let actual = s.lines().collect::>().join("\n"); + + insta::with_settings!({ + description => description, + }, { + insta::assert_snapshot!(actual, @r###" + +------------+--------------------------+ + | UInt64(42) | arrow_typeof(UInt64(42)) | + +------------+--------------------------+ + | 42 | UInt64 | + +------------+--------------------------+ + "###); + }); - assert_eq!( - expected, - actual, - "output mismatch for {}. Expectedn\n{}Actual:\n{}", - description, - expected.join("\n"), - s - ); Ok(()) } @@ -213,27 +207,23 @@ async fn run_and_compare_query_with_auto_schemas( ctx: SessionContext, description: &str, ) -> Result<()> { - let expected = vec![ - "+----------+----------+", - "| column_1 | column_2 |", - "+----------+----------+", - "| andrew | 100 |", - "| jorge | 200 |", - "| andy | 150 |", - "+----------+----------+", - ]; - let s = exec_sql(&ctx, QUERY1).await?; - let actual = s.lines().collect::>(); + let actual = s.lines().collect::>().join("\n"); + + insta::with_settings!({ + description => description, + }, { + insta::assert_snapshot!(actual, @r###" + +----------+----------+ + | column_1 | column_2 | + +----------+----------+ + | andrew | 100 | + | jorge | 200 | + | andy | 150 | + +----------+----------+ + "###); + }); - assert_eq!( - expected, - actual, - "output mismatch for {}. 
Expectedn\n{}Actual:\n{}", - description, - expected.join("\n"), - s - ); Ok(()) } diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index 57bac9c6dfca..264bd6b66a60 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -26,6 +26,7 @@ use arrow::array::{ }; use arrow::compute::kernels::numeric::add; use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::common::test_util::batches_to_string; use datafusion::execution::context::{FunctionFactory, RegisterFunction, SessionState}; use datafusion::prelude::*; use datafusion::{execution::registry::FunctionRegistry, test_util}; @@ -57,14 +58,15 @@ async fn csv_query_custom_udf_with_cast() -> Result<()> { register_aggregate_csv(&ctx).await?; let sql = "SELECT avg(custom_sqrt(c11)) FROM aggregate_test_100"; let actual = plan_and_collect(&ctx, sql).await.unwrap(); - let expected = [ - "+------------------------------------------+", - "| avg(custom_sqrt(aggregate_test_100.c11)) |", - "+------------------------------------------+", - "| 0.6584408483418835 |", - "+------------------------------------------+", - ]; - assert_batches_eq!(&expected, &actual); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +------------------------------------------+ + | avg(custom_sqrt(aggregate_test_100.c11)) | + +------------------------------------------+ + | 0.6584408483418835 | + +------------------------------------------+ + "###); + Ok(()) } @@ -75,14 +77,15 @@ async fn csv_query_avg_sqrt() -> Result<()> { // Note it is a different column (c12) than above (c11) let sql = "SELECT avg(custom_sqrt(c12)) FROM aggregate_test_100"; let actual = plan_and_collect(&ctx, sql).await.unwrap(); - let expected = [ - "+------------------------------------------+", - "| avg(custom_sqrt(aggregate_test_100.c12)) |", - "+------------------------------------------+", - "| 0.6706002946036459 |", - "+------------------------------------------+", - ]; - assert_batches_eq!(&expected, &actual); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +------------------------------------------+ + | avg(custom_sqrt(aggregate_test_100.c12)) | + +------------------------------------------+ + | 0.6706002946036459 | + +------------------------------------------+ + "###); + Ok(()) } @@ -146,17 +149,16 @@ async fn scalar_udf() -> Result<()> { let result = DataFrame::new(ctx.state(), plan).collect().await?; - let expected = [ - "+-----+-----+-----------------+", - "| a | b | my_add(t.a,t.b) |", - "+-----+-----+-----------------+", - "| 1 | 2 | 3 |", - "| 10 | 12 | 22 |", - "| 10 | 12 | 22 |", - "| 100 | 120 | 220 |", - "+-----+-----+-----------------+", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +-----+-----+-----------------+ + | a | b | my_add(t.a,t.b) | + +-----+-----+-----------------+ + | 1 | 2 | 3 | + | 10 | 12 | 22 | + | 10 | 12 | 22 | + | 100 | 120 | 220 | + +-----+-----+-----------------+ + "###); let batch = &result[0]; let a = as_int32_array(batch.column(0))?; @@ -272,34 +274,32 @@ async fn scalar_udf_zero_params() -> Result<()> { ctx.register_udf(ScalarUDF::from(get_100_udf)); let result = plan_and_collect(&ctx, "select get_100() a from t").await?; - let expected = [ - "+-----+", // - "| a |", // - "+-----+", // - "| 100 |", // - "| 100 |", // - "| 100 |", // - "| 100 |", // - 
"+-----+", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +-----+ + | a | + +-----+ + | 100 | + | 100 | + | 100 | + | 100 | + +-----+ + "###); let result = plan_and_collect(&ctx, "select get_100() a").await?; - let expected = [ - "+-----+", // - "| a |", // - "+-----+", // - "| 100 |", // - "+-----+", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +-----+ + | a | + +-----+ + | 100 | + +-----+ + "###); let result = plan_and_collect(&ctx, "select get_100() from t where a=999").await?; - let expected = [ - "++", // - "++", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + ++ + ++ + "###); + Ok(()) } @@ -325,14 +325,14 @@ async fn scalar_udf_override_built_in_scalar_function() -> Result<()> { // Make sure that the UDF is used instead of the built-in function let result = plan_and_collect(&ctx, "select abs(a) a from t").await?; - let expected = [ - "+---+", // - "| a |", // - "+---+", // - "| 1 |", // - "+---+", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +---+ + | a | + +---+ + | 1 | + +---+ + "###); + Ok(()) } @@ -427,14 +427,13 @@ async fn case_sensitive_identifiers_user_defined_functions() -> Result<()> { // Can call it if you put quotes let result = plan_and_collect(&ctx, "SELECT \"MY_FUNC\"(i) FROM t").await?; - let expected = [ - "+--------------+", - "| MY_FUNC(t.i) |", - "+--------------+", - "| 1 |", - "+--------------+", - ]; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +--------------+ + | MY_FUNC(t.i) | + +--------------+ + | 1 | + +--------------+ + "###); Ok(()) } @@ -464,18 +463,23 @@ async fn test_user_defined_functions_with_alias() -> Result<()> { ctx.register_udf(udf); - let expected = [ - "+------------+", - "| dummy(t.i) |", - "+------------+", - "| 1 |", - "+------------+", - ]; let result = plan_and_collect(&ctx, "SELECT dummy(i) FROM t").await?; - assert_batches_eq!(expected, &result); + insta::assert_snapshot!(batches_to_string(&result), @r###" + +------------+ + | dummy(t.i) | + +------------+ + | 1 | + +------------+ + "###); let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?; - assert_batches_eq!(expected, &alias_result); + insta::assert_snapshot!(batches_to_string(&alias_result), @r###" + +------------+ + | dummy(t.i) | + +------------+ + | 1 | + +------------+ + "###); Ok(()) } @@ -679,6 +683,10 @@ impl ScalarUDFImpl for CastToI64UDF { Ok(DataType::Int64) } + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + panic!("dummy - not implemented") + } + // Demonstrate simplifying a UDF fn simplify( &self, @@ -795,7 +803,7 @@ impl ScalarUDFImpl for TakeUDF { &self.signature } fn return_type(&self, _args: &[DataType]) -> Result { - not_impl_err!("Not called because the return_type_from_exprs is implemented") + not_impl_err!("Not called because the return_type_from_args is implemented") } /// This function returns the type of the first or second argument based on @@ -946,6 +954,10 @@ impl ScalarUDFImpl for ScalarFunctionWrapper { Ok(self.return_type.clone()) } + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + panic!("dummy - not implemented") + } + fn simplify( &self, args: Vec, diff --git a/datafusion/core/tests/user_defined/user_defined_table_functions.rs 
b/datafusion/core/tests/user_defined/user_defined_table_functions.rs index 618f0590ab3d..e4aff0b00705 100644 --- a/datafusion/core/tests/user_defined/user_defined_table_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_table_functions.rs @@ -26,6 +26,7 @@ use arrow::csv::ReaderBuilder; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::test_util::batches_to_string; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::TableProvider; use datafusion::error::Result; @@ -34,7 +35,7 @@ use datafusion::physical_plan::{collect, ExecutionPlan}; use datafusion::prelude::SessionContext; use datafusion_catalog::Session; use datafusion_catalog::TableFunctionImpl; -use datafusion_common::{assert_batches_eq, DFSchema, ScalarValue}; +use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, Projection, TableType}; use async_trait::async_trait; @@ -54,17 +55,17 @@ async fn test_simple_read_csv_udtf() -> Result<()> { .collect() .await?; - let excepted = [ - "+-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+", - "| n_nationkey | n_name | n_regionkey | n_comment |", - "+-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+", - "| 1 | ARGENTINA | 1 | al foxes promise slyly according to the regular accounts. bold requests alon |", - "| 2 | BRAZIL | 1 | y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special |", - "| 3 | CANADA | 1 | eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold |", - "| 4 | EGYPT | 4 | y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d |", - "| 5 | ETHIOPIA | 0 | ven packages wake quickly. regu |", - "+-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+", ]; - assert_batches_eq!(excepted, &rbs); + insta::assert_snapshot!(batches_to_string(&rbs), @r###" + +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+ + | n_nationkey | n_name | n_regionkey | n_comment | + +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+ + | 1 | ARGENTINA | 1 | al foxes promise slyly according to the regular accounts. bold requests alon | + | 2 | BRAZIL | 1 | y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special | + | 3 | CANADA | 1 | eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold | + | 4 | EGYPT | 4 | y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d | + | 5 | ETHIOPIA | 0 | ven packages wake quickly. regu | + +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+ + "###); // just run, return all rows let rbs = ctx @@ -72,23 +73,23 @@ async fn test_simple_read_csv_udtf() -> Result<()> { .await? 
.collect() .await?; - let excepted = [ - "+-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+", - "| n_nationkey | n_name | n_regionkey | n_comment |", - "+-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+", - "| 1 | ARGENTINA | 1 | al foxes promise slyly according to the regular accounts. bold requests alon |", - "| 2 | BRAZIL | 1 | y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special |", - "| 3 | CANADA | 1 | eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold |", - "| 4 | EGYPT | 4 | y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d |", - "| 5 | ETHIOPIA | 0 | ven packages wake quickly. regu |", - "| 6 | FRANCE | 3 | refully final requests. regular, ironi |", - "| 7 | GERMANY | 3 | l platelets. regular accounts x-ray: unusual, regular acco |", - "| 8 | INDIA | 2 | ss excuses cajole slyly across the packages. deposits print aroun |", - "| 9 | INDONESIA | 2 | slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull |", - "| 10 | IRAN | 4 | efully alongside of the slyly final dependencies. |", - "+-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+" - ]; - assert_batches_eq!(excepted, &rbs); + + insta::assert_snapshot!(batches_to_string(&rbs), @r###" + +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+ + | n_nationkey | n_name | n_regionkey | n_comment | + +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+ + | 1 | ARGENTINA | 1 | al foxes promise slyly according to the regular accounts. bold requests alon | + | 2 | BRAZIL | 1 | y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special | + | 3 | CANADA | 1 | eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold | + | 4 | EGYPT | 4 | y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d | + | 5 | ETHIOPIA | 0 | ven packages wake quickly. regu | + | 6 | FRANCE | 3 | refully final requests. regular, ironi | + | 7 | GERMANY | 3 | l platelets. regular accounts x-ray: unusual, regular acco | + | 8 | INDIA | 2 | ss excuses cajole slyly across the packages. deposits print aroun | + | 9 | INDONESIA | 2 | slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull | + | 10 | IRAN | 4 | efully alongside of the slyly final dependencies. 
| + +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+ + "###); Ok(()) } diff --git a/datafusion/core/tests/user_defined/user_defined_window_functions.rs b/datafusion/core/tests/user_defined/user_defined_window_functions.rs index 9acd17493da4..28394f0b9dfa 100644 --- a/datafusion/core/tests/user_defined/user_defined_window_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_window_functions.rs @@ -20,8 +20,9 @@ use arrow::array::{ArrayRef, AsArray, Int64Array, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; -use datafusion::{assert_batches_eq, prelude::SessionContext}; -use datafusion_common::{Result, ScalarValue}; +use datafusion::common::test_util::batches_to_string; +use datafusion::common::{Result, ScalarValue}; +use datafusion::prelude::SessionContext; use datafusion_expr::{ PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDF, WindowUDFImpl, }; @@ -57,30 +58,30 @@ const BOUNDED_WINDOW_QUERY: &str = odd_counter(val) OVER (PARTITION BY x ORDER BY y ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) \ from t ORDER BY x, y"; -/// Test to show the contents of the setup #[tokio::test] async fn test_setup() { let test_state = TestState::new(); let TestContext { ctx, test_state: _ } = TestContext::new(test_state); let sql = "SELECT * from t order by x, y"; - let expected = vec![ - "+---+---+-----+", - "| x | y | val |", - "+---+---+-----+", - "| 1 | a | 0 |", - "| 1 | b | 1 |", - "| 1 | c | 2 |", - "| 2 | d | 3 |", - "| 2 | e | 4 |", - "| 2 | f | 5 |", - "| 2 | g | 6 |", - "| 2 | h | 6 |", - "| 2 | i | 6 |", - "| 2 | j | 6 |", - "+---+---+-----+", - ]; - assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + let actual = execute(&ctx, sql).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+ + | x | y | val | + +---+---+-----+ + | 1 | a | 0 | + | 1 | b | 1 | + | 1 | c | 2 | + | 2 | d | 3 | + | 2 | e | 4 | + | 2 | f | 5 | + | 2 | g | 6 | + | 2 | h | 6 | + | 2 | i | 6 | + | 2 | j | 6 | + +---+---+-----+ + "###); } /// Basic user defined window function @@ -89,26 +90,25 @@ async fn test_udwf() { let test_state = TestState::new(); let TestContext { ctx, test_state } = TestContext::new(test_state); - let expected = vec![ - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 1 |", - "| 1 | b | 1 | 1 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 2 |", - "| 2 | e | 4 | 2 |", - "| 2 | f | 5 | 2 |", - "| 2 | g | 6 | 2 |", - "| 2 | h | 6 | 2 |", - "| 2 | i | 6 | 2 |", - "| 2 | j | 6 | 2 |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | x | y | val | 
odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 2 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + "###); + // evaluated on two distinct batches assert_eq!(test_state.evaluate_all_called(), 2); } @@ -133,28 +133,26 @@ async fn test_udwf_with_alias() { let test_state = TestState::new(); let TestContext { ctx, .. } = TestContext::new(test_state); - let expected = vec![ - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 1 |", - "| 1 | b | 1 | 1 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 2 |", - "| 2 | e | 4 | 2 |", - "| 2 | f | 5 | 2 |", - "| 2 | g | 6 | 2 |", - "| 2 | h | 6 | 2 |", - "| 2 | i | 6 | 2 |", - "| 2 | j | 6 | 2 |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, UNBOUNDED_WINDOW_QUERY_WITH_ALIAS) - .await - .unwrap() - ); + let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY_WITH_ALIAS) + .await + .unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 2 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + "###); } /// Basic user defined window function with bounded window @@ -164,26 +162,25 @@ async fn test_udwf_bounded_window_ignores_frame() { let TestContext { ctx, test_state } = TestContext::new(test_state); // Since the UDWF doesn't say it needs the window frame, the frame is ignored - let expected = vec![ - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 1 |", - "| 1 | b | 1 | 1 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 2 |", - "| 2 | e | 4 | 2 |", - "| 2 | f | 5 | 2 |", - "| 2 | g | 6 | 2 |", - 
"| 2 | h | 6 | 2 |", - "| 2 | i | 6 | 2 |", - "| 2 | j | 6 | 2 |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 2 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "###); + // evaluated on 2 distinct batches (when x=1 and x=2) assert_eq!(test_state.evaluate_called(), 0); assert_eq!(test_state.evaluate_all_called(), 2); @@ -195,26 +192,25 @@ async fn test_udwf_bounded_window() { let test_state = TestState::new().with_uses_window_frame(); let TestContext { ctx, test_state } = TestContext::new(test_state); - let expected = vec![ - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 1 |", - "| 1 | b | 1 | 1 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 1 |", - "| 2 | e | 4 | 2 |", - "| 2 | f | 5 | 1 |", - "| 2 | g | 6 | 1 |", - "| 2 | h | 6 | 0 |", - "| 2 | i | 6 | 0 |", - "| 2 | j | 6 | 0 |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 1 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 1 | + | 2 | g | 6 | 1 | + | 2 | h | 6 | 0 | + | 2 | i | 6 | 0 | + | 2 | j | 6 | 0 | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "###); + // Evaluate is called for each input rows assert_eq!(test_state.evaluate_called(), 10); assert_eq!(test_state.evaluate_all_called(), 0); @@ -228,26 +224,25 @@ async fn test_stateful_udwf() { .with_uses_window_frame(); let TestContext { ctx, test_state } = TestContext::new(test_state); - let expected = vec![ - 
"+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 0 |", - "| 1 | b | 1 | 1 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 1 |", - "| 2 | e | 4 | 1 |", - "| 2 | f | 5 | 2 |", - "| 2 | g | 6 | 2 |", - "| 2 | h | 6 | 2 |", - "| 2 | i | 6 | 2 |", - "| 2 | j | 6 | 2 |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 0 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 1 | + | 2 | e | 4 | 1 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + "###); + assert_eq!(test_state.evaluate_called(), 10); assert_eq!(test_state.evaluate_all_called(), 0); } @@ -260,26 +255,25 @@ async fn test_stateful_udwf_bounded_window() { .with_uses_window_frame(); let TestContext { ctx, test_state } = TestContext::new(test_state); - let expected = vec![ - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 1 |", - "| 1 | b | 1 | 1 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 1 |", - "| 2 | e | 4 | 2 |", - "| 2 | f | 5 | 1 |", - "| 2 | g | 6 | 1 |", - "| 2 | h | 6 | 0 |", - "| 2 | i | 6 | 0 |", - "| 2 | j | 6 | 0 |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 1 | + | 2 | e | 4 | 2 | + | 2 | f 
| 5 | 1 | + | 2 | g | 6 | 1 | + | 2 | h | 6 | 0 | + | 2 | i | 6 | 0 | + | 2 | j | 6 | 0 | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "###); + // Evaluate and update_state is called for each input row assert_eq!(test_state.evaluate_called(), 10); assert_eq!(test_state.evaluate_all_called(), 0); @@ -291,26 +285,25 @@ async fn test_udwf_query_include_rank() { let test_state = TestState::new().with_include_rank(); let TestContext { ctx, test_state } = TestContext::new(test_state); - let expected = vec![ - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 3 |", - "| 1 | b | 1 | 2 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 7 |", - "| 2 | e | 4 | 6 |", - "| 2 | f | 5 | 5 |", - "| 2 | g | 6 | 4 |", - "| 2 | h | 6 | 3 |", - "| 2 | i | 6 | 2 |", - "| 2 | j | 6 | 1 |", - "+---+---+-----+-----------------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 3 | + | 1 | b | 1 | 2 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 7 | + | 2 | e | 4 | 6 | + | 2 | f | 5 | 5 | + | 2 | g | 6 | 4 | + | 2 | h | 6 | 3 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 1 | + +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ + "###); + assert_eq!(test_state.evaluate_called(), 0); assert_eq!(test_state.evaluate_all_called(), 0); // evaluated on 2 distinct batches (when x=1 and x=2) @@ -323,26 +316,25 @@ async fn test_udwf_bounded_query_include_rank() { let test_state = TestState::new().with_include_rank(); let TestContext { ctx, test_state } = TestContext::new(test_state); - let expected = vec![ - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 3 |", - "| 1 | b | 1 | 2 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 7 |", - "| 2 | e | 4 | 6 |", - "| 2 | f | 5 | 5 |", - "| 2 | g | 6 | 4 |", - "| 2 | h | 6 | 3 |", - "| 2 | i | 6 | 2 |", - "| 2 | j | 6 | 1 |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - 
&execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 3 | + | 1 | b | 1 | 2 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 7 | + | 2 | e | 4 | 6 | + | 2 | f | 5 | 5 | + | 2 | g | 6 | 4 | + | 2 | h | 6 | 3 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 1 | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "###); + assert_eq!(test_state.evaluate_called(), 0); assert_eq!(test_state.evaluate_all_called(), 0); // evaluated on 2 distinct batches (when x=1 and x=2) @@ -357,26 +349,25 @@ async fn test_udwf_bounded_window_returns_null() { .with_null_for_zero(); let TestContext { ctx, test_state } = TestContext::new(test_state); - let expected = vec![ - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - "| 1 | a | 0 | 1 |", - "| 1 | b | 1 | 1 |", - "| 1 | c | 2 | 1 |", - "| 2 | d | 3 | 1 |", - "| 2 | e | 4 | 2 |", - "| 2 | f | 5 | 1 |", - "| 2 | g | 6 | 1 |", - "| 2 | h | 6 | |", - "| 2 | i | 6 | |", - "| 2 | j | 6 | |", - "+---+---+-----+--------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!( - expected, - &execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap() - ); + let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap(); + + insta::assert_snapshot!(batches_to_string(&actual), @r###" + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 1 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 1 | + | 2 | g | 6 | 1 | + | 2 | h | 6 | | + | 2 | i | 6 | | + | 2 | j | 6 | | + +---+---+-----+--------------------------------------------------------------------------------------------------------------+ + "###); + // Evaluate is called for each input rows assert_eq!(test_state.evaluate_called(), 10); assert_eq!(test_state.evaluate_all_called(), 0); diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs index 7d00b14e5119..71996f3f0eaa 100644 --- a/datafusion/datasource-avro/src/mod.rs +++ b/datafusion/datasource-avro/src/mod.rs @@ -20,6 +20,9 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / 
cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] //! An [Avro](https://avro.apache.org/) based [`FileSource`](datafusion_datasource::file::FileSource) implementation and related functionality. diff --git a/datafusion/datasource-csv/Cargo.toml b/datafusion/datasource-csv/Cargo.toml index 689531758cad..b95c51cbbeff 100644 --- a/datafusion/datasource-csv/Cargo.toml +++ b/datafusion/datasource-csv/Cargo.toml @@ -44,13 +44,9 @@ datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } futures = { workspace = true } -itertools = { workspace = true } -log = { workspace = true } object_store = { workspace = true } -rand = { workspace = true } regex = { workspace = true } tokio = { workspace = true } -url = { workspace = true } [lints] workspace = true diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs index cab561d163b3..522cb12db0c7 100644 --- a/datafusion/datasource-csv/src/file_format.rs +++ b/datafusion/datasource-csv/src/file_format.rs @@ -666,8 +666,8 @@ impl DisplayAs for CsvSink { write!(f, ")") } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + writeln!(f, "format: csv")?; + write!(f, "file={}", &self.config.original_url) } } } diff --git a/datafusion/datasource-csv/src/mod.rs b/datafusion/datasource-csv/src/mod.rs index 4117d1fee5fc..90c3689cd1c0 100644 --- a/datafusion/datasource-csv/src/mod.rs +++ b/datafusion/datasource-csv/src/mod.rs @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] + pub mod file_format; pub mod source; diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index bb584433d1a4..b9d974c88484 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -35,6 +35,7 @@ use arrow::csv; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, DataFusionError, Result, Statistics}; +use datafusion_common_runtime::JoinSet; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; @@ -52,7 +53,6 @@ use futures::{StreamExt, TryStreamExt}; use object_store::buffered::BufWriter; use object_store::{GetOptions, GetResultPayload, ObjectStore}; use tokio::io::AsyncWriteExt; -use tokio::task::JoinSet; use crate::file_format::CsvDecoder; diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index 2df49b535128..9b6d5925fe81 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -325,8 +325,8 @@ impl DisplayAs for JsonSink { write!(f, ")") } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + writeln!(f, "format: json")?; + write!(f, "file={}", &self.config.original_url) } } } diff --git a/datafusion/datasource-json/src/mod.rs b/datafusion/datasource-json/src/mod.rs index 35dabfa109fc..18bb8792c3ff 100644 --- a/datafusion/datasource-json/src/mod.rs +++ b/datafusion/datasource-json/src/mod.rs @@ -15,6 +15,10 @@ // specific language governing 
permissions and limitations // under the License. +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] + pub mod file_format; pub mod source; diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs index 249593587b82..4605ad3d948d 100644 --- a/datafusion/datasource-json/src/source.rs +++ b/datafusion/datasource-json/src/source.rs @@ -25,6 +25,7 @@ use std::task::Poll; use crate::file_format::JsonDecoder; use datafusion_common::error::{DataFusionError, Result}; +use datafusion_common_runtime::JoinSet; use datafusion_datasource::decoder::{deserialize_stream, DecoderDeserializer}; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_meta::FileMeta; @@ -51,7 +52,6 @@ use futures::{StreamExt, TryStreamExt}; use object_store::buffered::BufWriter; use object_store::{GetOptions, GetResultPayload, ObjectStore}; use tokio::io::AsyncWriteExt; -use tokio::task::JoinSet; /// Execution plan for scanning NdJson data source #[derive(Debug, Clone)] diff --git a/datafusion/datasource-parquet/Cargo.toml b/datafusion/datasource-parquet/Cargo.toml index d84487bba9bd..8aa041b7a4a7 100644 --- a/datafusion/datasource-parquet/Cargo.toml +++ b/datafusion/datasource-parquet/Cargo.toml @@ -52,7 +52,7 @@ object_store = { workspace = true } parking_lot = { workspace = true } parquet = { workspace = true } rand = { workspace = true } -tokio = { workspace = true, features = ["fs"] } +tokio = { workspace = true } [dev-dependencies] chrono = { workspace = true } diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 48761d85e708..8a78407c64d1 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -45,7 +45,7 @@ use datafusion_common::{ DataFusionError, GetExt, Result, DEFAULT_PARQUET_EXTENSION, }; use datafusion_common::{HashMap, Statistics}; -use datafusion_common_runtime::SpawnedTask; +use datafusion_common_runtime::{JoinSet, SpawnedTask}; use datafusion_datasource::display::FileGroupDisplay; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; @@ -82,7 +82,6 @@ use parquet::file::writer::SerializedFileWriter; use parquet::format::FileMetaData; use tokio::io::{AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::task::JoinSet; use crate::can_expr_be_pushed_down_with_schemas; use crate::source::ParquetSource; @@ -465,8 +464,114 @@ impl FileFormat for ParquetFormat { } } +/// Apply necessary schema type coercions to make file schema match table schema. +/// +/// This function performs two main types of transformations in a single pass: +/// 1. Binary types to string types conversion - Converts binary data types to their +/// corresponding string types when the table schema expects string data +/// 2. 
Regular to view types conversion - Converts standard string/binary types to +/// view types when the table schema uses view types +/// +/// # Arguments +/// * `table_schema` - The table schema containing the desired types +/// * `file_schema` - The file schema to be transformed +/// +/// # Returns +/// * `Some(Schema)` - If any transformations were applied, returns the transformed schema +/// * `None` - If no transformations were needed +pub fn apply_file_schema_type_coercions( + table_schema: &Schema, + file_schema: &Schema, +) -> Option { + let mut needs_view_transform = false; + let mut needs_string_transform = false; + + // Create a mapping of table field names to their data types for fast lookup + // and simultaneously check if we need any transformations + let table_fields: HashMap<_, _> = table_schema + .fields() + .iter() + .map(|f| { + let dt = f.data_type(); + // Check if we need view type transformation + if matches!(dt, &DataType::Utf8View | &DataType::BinaryView) { + needs_view_transform = true; + } + // Check if we need string type transformation + if matches!( + dt, + &DataType::Utf8 | &DataType::LargeUtf8 | &DataType::Utf8View + ) { + needs_string_transform = true; + } + + (f.name(), dt) + }) + .collect(); + + // Early return if no transformation needed + if !needs_view_transform && !needs_string_transform { + return None; + } + + let transformed_fields: Vec> = file_schema + .fields() + .iter() + .map(|field| { + let field_name = field.name(); + let field_type = field.data_type(); + + // Look up the corresponding field type in the table schema + if let Some(table_type) = table_fields.get(field_name) { + match (table_type, field_type) { + // table schema uses string type, coerce the file schema to use string type + ( + &DataType::Utf8, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView, + ) => { + return field_with_new_type(field, DataType::Utf8); + } + // table schema uses large string type, coerce the file schema to use large string type + ( + &DataType::LargeUtf8, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView, + ) => { + return field_with_new_type(field, DataType::LargeUtf8); + } + // table schema uses string view type, coerce the file schema to use view type + ( + &DataType::Utf8View, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView, + ) => { + return field_with_new_type(field, DataType::Utf8View); + } + // Handle view type conversions + (&DataType::Utf8View, DataType::Utf8 | DataType::LargeUtf8) => { + return field_with_new_type(field, DataType::Utf8View); + } + (&DataType::BinaryView, DataType::Binary | DataType::LargeBinary) => { + return field_with_new_type(field, DataType::BinaryView); + } + _ => {} + } + } + + // If no transformation is needed, keep the original field + Arc::clone(field) + }) + .collect(); + + Some(Schema::new_with_metadata( + transformed_fields, + file_schema.metadata.clone(), + )) +} + /// Coerces the file schema if the table schema uses a view type. -#[cfg(not(target_arch = "wasm32"))] +#[deprecated( + since = "47.0.0", + note = "Use `apply_file_schema_type_coercions` instead" +)] pub fn coerce_file_schema_to_view_type( table_schema: &Schema, file_schema: &Schema, @@ -516,7 +621,10 @@ pub fn coerce_file_schema_to_view_type( /// If the table schema uses a string type, coerce the file schema to use a string type. 
///
/// See [ParquetFormat::binary_as_string] for details
-#[cfg(not(target_arch = "wasm32"))]
+#[deprecated(
+    since = "47.0.0",
+    note = "Use `apply_file_schema_type_coercions` instead"
+)]
 pub fn coerce_file_schema_to_string_type(
     table_schema: &Schema,
     file_schema: &Schema,
@@ -688,10 +796,34 @@ pub async fn fetch_statistics(
     statistics_from_parquet_meta_calc(&metadata, table_schema)
 }
-/// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using ['StatisticsConverter`]
+/// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using [`StatisticsConverter`]
 ///
 /// The statistics are calculated for each column in the table schema
 /// using the row group statistics in the parquet metadata.
+///
+/// # Key behaviors:
+///
+/// 1. Extracts row counts and byte sizes from all row groups
+/// 2. Applies schema type coercions to align file schema with table schema
+/// 3. Collects and aggregates statistics across row groups when available
+///
+/// # When there are no statistics:
+///
+/// If the Parquet file doesn't contain any statistics (has_statistics is false), the function returns a Statistics object with:
+/// - Exact row count
+/// - Exact byte size
+/// - All column statistics marked as unknown via Statistics::unknown_column(&table_schema)
+/// # When only some columns have statistics:
+///
+/// For columns with statistics:
+/// - Min/max values are properly extracted and represented as Precision::Exact
+/// - Null counts are calculated by summing across row groups
+///
+/// For columns without statistics,
+/// - For min/max, there are two situations:
+///   1. The column isn't in arrow schema, then min/max values are set to Precision::Absent
+///   2. The column is in arrow schema, but not in parquet schema due to schema evolution, min/max values are set to Precision::Exact(null)
+/// - Null counts are set to Precision::Exact(num_rows) (conservatively assuming all values could be null)
 pub fn statistics_from_parquet_meta_calc(
     metadata: &ParquetMetaData,
     table_schema: SchemaRef,
@@ -707,9 +839,10 @@ pub fn statistics_from_parquet_meta_calc(
         total_byte_size += row_group_meta.total_byte_size() as usize;
         if !has_statistics {
-            row_group_meta.columns().iter().for_each(|column| {
-                has_statistics = column.statistics().is_some();
-            });
+            has_statistics = row_group_meta
+                .columns()
+                .iter()
+                .any(|column| column.statistics().is_some());
         }
     }
     statistics.num_rows = Precision::Exact(num_rows);
@@ -720,11 +853,8 @@ pub fn statistics_from_parquet_meta_calc(
         file_metadata.schema_descr(),
         file_metadata.key_value_metadata(),
     )?;
-    if let Some(merged) = coerce_file_schema_to_string_type(&table_schema, &file_schema) {
-        file_schema = merged;
-    }
-    if let Some(merged) = coerce_file_schema_to_view_type(&table_schema, &file_schema) {
+    if let Some(merged) = apply_file_schema_type_coercions(&table_schema, &file_schema) {
         file_schema = merged;
     }
diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs
index fb1f2d55169f..cecee0031715 100644
--- a/datafusion/datasource-parquet/src/mod.rs
+++ b/datafusion/datasource-parquet/src/mod.rs
@@ -15,6 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
+// Make sure fast / cheap clones on Arc are explicit:
+// https://github.com/apache/datafusion/issues/11143
+#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+
 //! 
[`ParquetExec`] FileSource for reading Parquet files pub mod access_plan; diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 3c623f558e43..732fef47d5a7 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -19,14 +19,11 @@ use std::sync::Arc; -use crate::file_format::{ - coerce_file_schema_to_string_type, coerce_file_schema_to_view_type, -}; use crate::page_filter::PagePruningAccessPlanFilter; use crate::row_group_filter::RowGroupAccessPlanFilter; use crate::{ - row_filter, should_enable_page_index, ParquetAccessPlan, ParquetFileMetrics, - ParquetFileReaderFactory, + apply_file_schema_type_coercions, row_filter, should_enable_page_index, + ParquetAccessPlan, ParquetFileMetrics, ParquetFileReaderFactory, }; use datafusion_datasource::file_meta::FileMeta; use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; @@ -107,6 +104,7 @@ impl FileOpener for ParquetOpener { let projected_schema = SchemaRef::from(self.table_schema.project(&self.projection)?); + let schema_adapter_factory = Arc::clone(&self.schema_adapter_factory); let schema_adapter = self .schema_adapter_factory .create(projected_schema, Arc::clone(&self.table_schema)); @@ -131,14 +129,8 @@ impl FileOpener for ParquetOpener { ArrowReaderMetadata::load_async(&mut reader, options.clone()).await?; let mut schema = Arc::clone(metadata.schema()); - if let Some(merged) = - coerce_file_schema_to_string_type(&table_schema, &schema) - { - schema = Arc::new(merged); - } - // read with view types - if let Some(merged) = coerce_file_schema_to_view_type(&table_schema, &schema) + if let Some(merged) = apply_file_schema_type_coercions(&table_schema, &schema) { schema = Arc::new(merged); } @@ -173,7 +165,7 @@ impl FileOpener for ParquetOpener { builder.metadata(), reorder_predicates, &file_metrics, - Arc::clone(&schema_mapping), + &schema_adapter_factory, ); match row_filter { diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs index 39fcecf37c6d..da6bf114d71d 100644 --- a/datafusion/datasource-parquet/src/row_filter.rs +++ b/datafusion/datasource-parquet/src/row_filter.rs @@ -64,7 +64,7 @@ use std::collections::BTreeSet; use std::sync::Arc; use arrow::array::BooleanArray; -use arrow::datatypes::{DataType, Schema}; +use arrow::datatypes::{DataType, Schema, SchemaRef}; use arrow::error::{ArrowError, Result as ArrowResult}; use arrow::record_batch::RecordBatch; use parquet::arrow::arrow_reader::{ArrowPredicate, RowFilter}; @@ -72,12 +72,10 @@ use parquet::arrow::ProjectionMask; use parquet::file::metadata::ParquetMetaData; use datafusion_common::cast::as_boolean_array; -use datafusion_common::tree_node::{ - Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, -}; -use datafusion_common::{arrow_datafusion_err, DataFusionError, Result, ScalarValue}; -use datafusion_datasource::schema_adapter::SchemaMapper; -use datafusion_physical_expr::expressions::{Column, Literal}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion_common::Result; +use datafusion_datasource::schema_adapter::{SchemaAdapterFactory, SchemaMapper}; +use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::reassign_predicate_columns; use datafusion_physical_expr::{split_conjunction, PhysicalExpr}; @@ -102,8 +100,6 @@ pub(crate) struct DatafusionArrowPredicate { /// Path to the columns in the parquet 
schema required to evaluate the
    /// expression
    projection_mask: ProjectionMask,
-    /// Columns required to evaluate the expression in the arrow schema
-    projection: Vec,
    /// how many rows were filtered out by this predicate
    rows_pruned: metrics::Count,
    /// how many rows passed this predicate
@@ -111,34 +107,24 @@ pub(crate) struct DatafusionArrowPredicate {
    /// how long was spent evaluating this predicate
    time: metrics::Time,
    /// used to perform type coercion while filtering rows
-    schema_mapping: Arc,
+    schema_mapper: Arc,
 }
 impl DatafusionArrowPredicate {
    /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate`
    pub fn try_new(
        candidate: FilterCandidate,
-        schema: &Schema,
        metadata: &ParquetMetaData,
        rows_pruned: metrics::Count,
        rows_matched: metrics::Count,
        time: metrics::Time,
-        schema_mapping: Arc,
    ) -> Result {
-        let schema = Arc::new(schema.project(&candidate.projection)?);
-        let physical_expr = reassign_predicate_columns(candidate.expr, &schema, true)?;
-
-        // ArrowPredicate::evaluate is passed columns in the order they appear in the file
-        // If the predicate has multiple columns, we therefore must project the columns based
-        // on the order they appear in the file
-        let projection = match candidate.projection.len() {
-            0 | 1 => vec![],
-            2.. => remap_projection(&candidate.projection),
-        };
+        let projected_schema = Arc::clone(&candidate.filter_schema);
+        let physical_expr =
+            reassign_predicate_columns(candidate.expr, &projected_schema, true)?;
        Ok(Self {
            physical_expr,
-            projection,
            projection_mask: ProjectionMask::roots(
                metadata.file_metadata().schema_descr(),
                candidate.projection,
@@ -146,7 +132,7 @@ impl DatafusionArrowPredicate {
            rows_pruned,
            rows_matched,
            time,
-            schema_mapping,
+            schema_mapper: candidate.schema_mapper,
        })
    }
 }
@@ -156,12 +142,8 @@ impl ArrowPredicate for DatafusionArrowPredicate {
        &self.projection_mask
    }
-    fn evaluate(&mut self, mut batch: RecordBatch) -> ArrowResult {
-        if !self.projection.is_empty() {
-            batch = batch.project(&self.projection)?;
-        };
-
-        let batch = self.schema_mapping.map_partial_batch(batch)?;
+    fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult {
+        let batch = self.schema_mapper.map_batch(batch)?;
        // scoped timer updates on drop
        let mut timer = self.time.timer();
@@ -194,9 +176,22 @@ impl ArrowPredicate for DatafusionArrowPredicate {
 /// See the module level documentation for more information.
 pub(crate) struct FilterCandidate {
    expr: Arc,
+    /// Estimate for the total number of bytes that will need to be processed
+    /// to evaluate this filter. This is used to estimate the cost of evaluating
+    /// the filter and to order the filters when `reorder_predicates` is true.
+    /// This is generated by summing the compressed size of all columns that the filter references.
    required_bytes: usize,
+    /// Can this filter use an index (e.g. a page index) to prune rows?
    can_use_index: bool,
+    /// The projection to read from the file schema to get the columns
+    /// required to pass through a `SchemaMapper` to the table schema
+    /// upon which we then evaluate the filter expression.
    projection: Vec,
+    /// A `SchemaMapper` used to map batches read from the file schema to
+    /// the filter's projection of the table schema.
    schema_mapper: Arc,
+    /// The projected table schema that this filter references
+    filter_schema: SchemaRef,
 }
 /// Helper to build a `FilterCandidate`.
@@ -220,41 +215,40 @@ pub(crate) struct FilterCandidate {
 /// but old files do not have the columns. 
/// /// When a file is missing a column from the table schema, the value of the -/// missing column is filled in with `NULL` via a `SchemaAdapter`. +/// missing column is filled in by a `SchemaAdapter` (by default as `NULL`). /// /// When a predicate is pushed down to the parquet reader, the predicate is -/// evaluated in the context of the file schema. If the predicate references a -/// column that is in the table schema but not in the file schema, the column -/// reference must be rewritten to a literal expression that represents the -/// `NULL` value that would be produced by the `SchemaAdapter`. -/// -/// For example, if: -/// * The table schema is `id, name, address` -/// * The file schema is `id, name` (missing the `address` column) -/// * predicate is `address = 'foo'` -/// -/// When evaluating the predicate as a filter on the parquet file, the predicate -/// must be rewritten to `NULL = 'foo'` as the `address` column will be filled -/// in with `NULL` values during the rest of the evaluation. -struct FilterCandidateBuilder<'a> { +/// evaluated in the context of the file schema. +/// For each predicate we build a filter schema which is the projection of the table +/// schema that contains only the columns that this filter references. +/// If any columns from the file schema are missing from a particular file they are +/// added by the `SchemaAdapter`, by default as `NULL`. +struct FilterCandidateBuilder { expr: Arc, - /// The schema of this parquet file - file_schema: &'a Schema, + /// The schema of this parquet file. + /// Columns may have different types from the table schema and there may be + /// columns in the file schema that are not in the table schema or columns that + /// are in the table schema that are not in the file schema. + file_schema: SchemaRef, /// The schema of the table (merged schema) -- columns may be in different /// order than in the file and have columns that are not in the file schema - table_schema: &'a Schema, + table_schema: SchemaRef, + /// A `SchemaAdapterFactory` used to map the file schema to the table schema. + schema_adapter_factory: Arc, } -impl<'a> FilterCandidateBuilder<'a> { +impl FilterCandidateBuilder { pub fn new( expr: Arc, - file_schema: &'a Schema, - table_schema: &'a Schema, + file_schema: Arc, + table_schema: Arc, + schema_adapter_factory: Arc, ) -> Self { Self { expr, file_schema, table_schema, + schema_adapter_factory, } } @@ -266,20 +260,32 @@ impl<'a> FilterCandidateBuilder<'a> { /// * `Ok(None)` if the expression cannot be used as an ArrowFilter /// * `Err(e)` if an error occurs while building the candidate pub fn build(self, metadata: &ParquetMetaData) -> Result> { - let Some((required_indices, rewritten_expr)) = - pushdown_columns(self.expr, self.file_schema, self.table_schema)? + let Some(required_indices_into_table_schema) = + pushdown_columns(&self.expr, &self.table_schema)? 
else { return Ok(None); }; - let required_bytes = size_of_columns(&required_indices, metadata)?; - let can_use_index = columns_sorted(&required_indices, metadata)?; + let projected_table_schema = Arc::new( + self.table_schema + .project(&required_indices_into_table_schema)?, + ); + + let (schema_mapper, projection_into_file_schema) = self + .schema_adapter_factory + .create(Arc::clone(&projected_table_schema), self.table_schema) + .map_schema(&self.file_schema)?; + + let required_bytes = size_of_columns(&projection_into_file_schema, metadata)?; + let can_use_index = columns_sorted(&projection_into_file_schema, metadata)?; Ok(Some(FilterCandidate { - expr: rewritten_expr, + expr: self.expr, required_bytes, can_use_index, - projection: required_indices.into_iter().collect(), + projection: projection_into_file_schema, + schema_mapper: Arc::clone(&schema_mapper), + filter_schema: Arc::clone(&projected_table_schema), })) } } @@ -294,33 +300,29 @@ struct PushdownChecker<'schema> { /// Does the expression reference any columns that are in the table /// schema but not in the file schema? projected_columns: bool, - // the indices of all the columns found within the given expression which exist inside the given - // [`file_schema`] - required_column_indices: BTreeSet, - file_schema: &'schema Schema, + // Indices into the table schema of the columns required to evaluate the expression + required_columns: BTreeSet, table_schema: &'schema Schema, } impl<'schema> PushdownChecker<'schema> { - fn new(file_schema: &'schema Schema, table_schema: &'schema Schema) -> Self { + fn new(table_schema: &'schema Schema) -> Self { Self { non_primitive_columns: false, projected_columns: false, - required_column_indices: BTreeSet::default(), - file_schema, + required_columns: BTreeSet::default(), table_schema, } } fn check_single_column(&mut self, column_name: &str) -> Option { - if let Ok(idx) = self.file_schema.index_of(column_name) { - self.required_column_indices.insert(idx); - - if DataType::is_nested(self.file_schema.field(idx).data_type()) { + if let Ok(idx) = self.table_schema.index_of(column_name) { + self.required_columns.insert(idx); + if DataType::is_nested(self.table_schema.field(idx).data_type()) { self.non_primitive_columns = true; return Some(TreeNodeRecursion::Jump); } - } else if self.table_schema.index_of(column_name).is_err() { + } else { // If the column does not exist in the (un-projected) table schema then // it must be a projected column. self.projected_columns = true; @@ -336,82 +338,40 @@ impl<'schema> PushdownChecker<'schema> { } } -impl TreeNodeRewriter for PushdownChecker<'_> { +impl TreeNodeVisitor<'_> for PushdownChecker<'_> { type Node = Arc; - fn f_down( - &mut self, - node: Arc, - ) -> Result>> { + fn f_down(&mut self, node: &Self::Node) -> Result { if let Some(column) = node.as_any().downcast_ref::() { if let Some(recursion) = self.check_single_column(column.name()) { - return Ok(Transformed::new(node, false, recursion)); + return Ok(recursion); } } - Ok(Transformed::no(node)) - } - - /// After visiting all children, rewrite column references to nulls if - /// they are not in the file schema. - /// We do this because they won't be relevant if they're not in the file schema, since that's - /// the only thing we're dealing with here as this is only used for the parquet pushdown during - /// scanning - fn f_up( - &mut self, - expr: Arc, - ) -> Result>> { - if let Some(column) = expr.as_any().downcast_ref::() { - // if the expression is a column, is it in the file schema? 
- if self.file_schema.field_with_name(column.name()).is_err() { - return self - .table_schema - .field_with_name(column.name()) - .and_then(|field| { - // Replace the column reference with a NULL (using the type from the table schema) - // e.g. `column = 'foo'` is rewritten be transformed to `NULL = 'foo'` - // - // See comments on `FilterCandidateBuilder` for more information - let null_value = ScalarValue::try_from(field.data_type())?; - Ok(Transformed::yes(Arc::new(Literal::new(null_value)) as _)) - }) - // If the column is not in the table schema, should throw the error - .map_err(|e| arrow_datafusion_err!(e)); - } - } - - Ok(Transformed::no(expr)) + Ok(TreeNodeRecursion::Continue) } } -type ProjectionAndExpr = (BTreeSet, Arc); - // Checks if a given expression can be pushed down into `DataSourceExec` as opposed to being evaluated // post-parquet-scan in a `FilterExec`. If it can be pushed down, this returns all the // columns in the given expression so that they can be used in the parquet scanning, along with the // expression rewritten as defined in [`PushdownChecker::f_up`] fn pushdown_columns( - expr: Arc, - file_schema: &Schema, + expr: &Arc, table_schema: &Schema, -) -> Result> { - let mut checker = PushdownChecker::new(file_schema, table_schema); - - let expr = expr.rewrite(&mut checker).data()?; - - Ok((!checker.prevents_pushdown()).then_some((checker.required_column_indices, expr))) +) -> Result>> { + let mut checker = PushdownChecker::new(table_schema); + expr.visit(&mut checker)?; + Ok((!checker.prevents_pushdown()) + .then_some(checker.required_columns.into_iter().collect())) } /// creates a PushdownChecker for a single use to check a given column with the given schemes. Used /// to check preemptively if a column name would prevent pushdowning. /// effectively does the inverse of [`pushdown_columns`] does, but with a single given column /// (instead of traversing the entire tree to determine this) -fn would_column_prevent_pushdown( - column_name: &str, - file_schema: &Schema, - table_schema: &Schema, -) -> bool { - let mut checker = PushdownChecker::new(file_schema, table_schema); +fn would_column_prevent_pushdown(column_name: &str, table_schema: &Schema) -> bool { + let mut checker = PushdownChecker::new(table_schema); // the return of this is only used for [`PushdownChecker::f_down()`], so we can safely ignore // it here. I'm just verifying we know the return type of this so nobody accidentally changes @@ -427,14 +387,13 @@ fn would_column_prevent_pushdown( /// Otherwise, true. pub fn can_expr_be_pushed_down_with_schemas( expr: &datafusion_expr::Expr, - file_schema: &Schema, + _file_schema: &Schema, table_schema: &Schema, ) -> bool { let mut can_be_pushed = true; expr.apply(|expr| match expr { datafusion_expr::Expr::Column(column) => { - can_be_pushed &= - !would_column_prevent_pushdown(column.name(), file_schema, table_schema); + can_be_pushed &= !would_column_prevent_pushdown(column.name(), table_schema); Ok(if can_be_pushed { TreeNodeRecursion::Jump } else { @@ -447,41 +406,12 @@ pub fn can_expr_be_pushed_down_with_schemas( can_be_pushed } -/// Computes the projection required to go from the file's schema order to the projected -/// order expected by this filter -/// -/// Effectively this computes the rank of each element in `src` -fn remap_projection(src: &[usize]) -> Vec { - let len = src.len(); - - // Compute the column mapping from projected order to file order - // i.e. the indices required to sort projected schema into the file schema - // - // e.g. 
projection: [5, 9, 0] -> [2, 0, 1] - let mut sorted_indexes: Vec<_> = (0..len).collect(); - sorted_indexes.sort_unstable_by_key(|x| src[*x]); - - // Compute the mapping from schema order to projected order - // i.e. the indices required to sort file schema into the projected schema - // - // Above we computed the order of the projected schema according to the file - // schema, and so we can use this as the comparator - // - // e.g. sorted_indexes [2, 0, 1] -> [1, 2, 0] - let mut projection: Vec<_> = (0..len).collect(); - projection.sort_unstable_by_key(|x| sorted_indexes[*x]); - projection -} - /// Calculate the total compressed size of all `Column`'s required for /// predicate `Expr`. /// /// This value represents the total amount of IO required to evaluate the /// predicate. -fn size_of_columns( - columns: &BTreeSet, - metadata: &ParquetMetaData, -) -> Result { +fn size_of_columns(columns: &[usize], metadata: &ParquetMetaData) -> Result { let mut total_size = 0; let row_groups = metadata.row_groups(); for idx in columns { @@ -498,10 +428,7 @@ fn size_of_columns( /// /// Sorted columns may be queried more efficiently in the presence of /// a PageIndex. -fn columns_sorted( - _columns: &BTreeSet, - _metadata: &ParquetMetaData, -) -> Result { +fn columns_sorted(_columns: &[usize], _metadata: &ParquetMetaData) -> Result { // TODO How do we know this? Ok(false) } @@ -522,12 +449,12 @@ fn columns_sorted( /// `a = 1` and `c = 3`. pub fn build_row_filter( expr: &Arc, - file_schema: &Schema, - table_schema: &Schema, + file_schema: &SchemaRef, + table_schema: &SchemaRef, metadata: &ParquetMetaData, reorder_predicates: bool, file_metrics: &ParquetFileMetrics, - schema_mapping: Arc, + schema_adapter_factory: &Arc, ) -> Result> { let rows_pruned = &file_metrics.pushdown_rows_pruned; let rows_matched = &file_metrics.pushdown_rows_matched; @@ -541,8 +468,13 @@ pub fn build_row_filter( let mut candidates: Vec = predicates .into_iter() .map(|expr| { - FilterCandidateBuilder::new(Arc::clone(expr), file_schema, table_schema) - .build(metadata) + FilterCandidateBuilder::new( + Arc::clone(expr), + Arc::clone(file_schema), + Arc::clone(table_schema), + Arc::clone(schema_adapter_factory), + ) + .build(metadata) }) .collect::, _>>()? 
.into_iter() @@ -568,12 +500,10 @@ pub fn build_row_filter( .map(|candidate| { DatafusionArrowPredicate::try_new( candidate, - file_schema, metadata, rows_pruned.clone(), rows_matched.clone(), time.clone(), - Arc::clone(&schema_mapping), ) .map(|pred| Box::new(pred) as _) }) @@ -584,19 +514,17 @@ pub fn build_row_filter( #[cfg(test)] mod test { use super::*; - use datafusion_datasource::schema_adapter::{ - DefaultSchemaAdapterFactory, SchemaAdapterFactory, - }; + use datafusion_common::ScalarValue; use arrow::datatypes::{Field, Fields, TimeUnit::Nanosecond}; - use datafusion_expr::{cast, col, lit, Expr}; + use datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory; + use datafusion_expr::{col, Expr}; use datafusion_physical_expr::planner::logical2physical; use datafusion_physical_plan::metrics::{Count, Time}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet::arrow::parquet_to_arrow_schema; use parquet::file::reader::{FileReader, SerializedFileReader}; - use rand::prelude::*; // We should ignore predicate that read non-primitive columns #[test] @@ -616,51 +544,19 @@ mod test { let expr = col("int64_list").is_not_null(); let expr = logical2physical(&expr, &table_schema); - let candidate = FilterCandidateBuilder::new(expr, &table_schema, &table_schema) - .build(metadata) - .expect("building candidate"); - - assert!(candidate.is_none()); - } - - // If a column exists in the table schema but not the file schema it should be rewritten to a null expression - #[test] - fn test_filter_candidate_builder_rewrite_missing_column() { - let testdata = datafusion_common::test_util::parquet_test_data(); - let file = std::fs::File::open(format!("{testdata}/alltypes_plain.parquet")) - .expect("opening file"); - - let reader = SerializedFileReader::new(file).expect("creating reader"); - - let metadata = reader.metadata(); - - let table_schema = - parquet_to_arrow_schema(metadata.file_metadata().schema_descr(), None) - .expect("parsing schema"); - - let file_schema = Schema::new(vec![ - Field::new("bigint_col", DataType::Int64, true), - Field::new("float_col", DataType::Float32, true), - ]); - - // The parquet file with `file_schema` just has `bigint_col` and `float_col` column, and don't have the `int_col` - let expr = col("bigint_col").eq(cast(col("int_col"), DataType::Int64)); - let expr = logical2physical(&expr, &table_schema); - let expected_candidate_expr = - col("bigint_col").eq(cast(lit(ScalarValue::Int32(None)), DataType::Int64)); - let expected_candidate_expr = - logical2physical(&expected_candidate_expr, &table_schema); - - let candidate = FilterCandidateBuilder::new(expr, &file_schema, &table_schema) - .build(metadata) - .expect("building candidate"); + let schema_adapter_factory = Arc::new(DefaultSchemaAdapterFactory); + let table_schema = Arc::new(table_schema.clone()); - assert!(candidate.is_some()); + let candidate = FilterCandidateBuilder::new( + expr, + table_schema.clone(), + table_schema, + schema_adapter_factory, + ) + .build(metadata) + .expect("building candidate"); - assert_eq!( - candidate.unwrap().expr.to_string(), - expected_candidate_expr.to_string() - ); + assert!(candidate.is_none()); } #[test] @@ -682,42 +578,43 @@ mod test { false, )]); - let table_ref = Arc::new(table_schema.clone()); - let schema_adapter = - DefaultSchemaAdapterFactory.create(Arc::clone(&table_ref), table_ref); - let (schema_mapping, _) = schema_adapter - .map_schema(&file_schema) - .expect("creating schema mapping"); - - let mut parquet_reader = 
parquet_reader_builder.build().expect("building reader"); - - // Parquet file is small, we only need 1 record batch - let first_rb = parquet_reader - .next() - .expect("expected record batch") - .expect("expected error free record batch"); - // Test all should fail let expr = col("timestamp_col").lt(Expr::Literal( ScalarValue::TimestampNanosecond(Some(1), Some(Arc::from("UTC"))), )); let expr = logical2physical(&expr, &table_schema); - let candidate = FilterCandidateBuilder::new(expr, &file_schema, &table_schema) - .build(&metadata) - .expect("building candidate") - .expect("candidate expected"); + let schema_adapter_factory = Arc::new(DefaultSchemaAdapterFactory); + let table_schema = Arc::new(table_schema.clone()); + let candidate = FilterCandidateBuilder::new( + expr, + file_schema.clone(), + table_schema.clone(), + schema_adapter_factory, + ) + .build(&metadata) + .expect("building candidate") + .expect("candidate expected"); let mut row_filter = DatafusionArrowPredicate::try_new( candidate, - &file_schema, &metadata, Count::new(), Count::new(), Time::new(), - Arc::clone(&schema_mapping), ) .expect("creating filter predicate"); + let mut parquet_reader = parquet_reader_builder + .with_projection(row_filter.projection().clone()) + .build() + .expect("building reader"); + + // Parquet file is small, we only need 1 record batch + let first_rb = parquet_reader + .next() + .expect("expected record batch") + .expect("expected error free record batch"); + let filtered = row_filter.evaluate(first_rb.clone()); assert!(matches!(filtered, Ok(a) if a == BooleanArray::from(vec![false; 8]))); @@ -726,19 +623,23 @@ mod test { ScalarValue::TimestampNanosecond(Some(0), Some(Arc::from("UTC"))), )); let expr = logical2physical(&expr, &table_schema); - let candidate = FilterCandidateBuilder::new(expr, &file_schema, &table_schema) - .build(&metadata) - .expect("building candidate") - .expect("candidate expected"); + let schema_adapter_factory = Arc::new(DefaultSchemaAdapterFactory); + let candidate = FilterCandidateBuilder::new( + expr, + file_schema, + table_schema, + schema_adapter_factory, + ) + .build(&metadata) + .expect("building candidate") + .expect("candidate expected"); let mut row_filter = DatafusionArrowPredicate::try_new( candidate, - &file_schema, &metadata, Count::new(), Count::new(), Time::new(), - schema_mapping, ) .expect("creating filter predicate"); @@ -746,24 +647,6 @@ mod test { assert!(matches!(filtered, Ok(a) if a == BooleanArray::from(vec![true; 8]))); } - #[test] - fn test_remap_projection() { - let mut rng = thread_rng(); - for _ in 0..100 { - // A random selection of column indexes in arbitrary order - let projection: Vec<_> = (0..100).map(|_| rng.gen()).collect(); - - // File order is the projection sorted - let mut file_order = projection.clone(); - file_order.sort_unstable(); - - let remap = remap_projection(&projection); - // Applying the remapped projection to the file order should yield the original - let remapped: Vec<_> = remap.iter().map(|r| file_order[*r]).collect(); - assert_eq!(projection, remapped) - } - } - #[test] fn nested_data_structures_prevent_pushdown() { let table_schema = get_basic_table_schema(); @@ -803,9 +686,10 @@ mod test { fn basic_expr_doesnt_prevent_pushdown() { let table_schema = get_basic_table_schema(); - let file_schema = Schema::new(vec![Field::new("str_col", DataType::Utf8, true)]); + let file_schema = + Schema::new(vec![Field::new("string_col", DataType::Utf8, true)]); - let expr = col("str_col").is_null(); + let expr = 
col("string_col").is_null(); assert!(can_expr_be_pushed_down_with_schemas( &expr, @@ -819,13 +703,13 @@ mod test { let table_schema = get_basic_table_schema(); let file_schema = Schema::new(vec![ - Field::new("str_col", DataType::Utf8, true), - Field::new("int_col", DataType::UInt64, true), + Field::new("string_col", DataType::Utf8, true), + Field::new("bigint_col", DataType::Int64, true), ]); - let expr = col("str_col") + let expr = col("string_col") .is_not_null() - .or(col("int_col").gt(Expr::Literal(ScalarValue::UInt64(Some(5))))); + .or(col("bigint_col").gt(Expr::Literal(ScalarValue::Int64(Some(5))))); assert!(can_expr_be_pushed_down_with_schemas( &expr, diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 683d62a1df49..47e692cb966d 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -34,6 +34,7 @@ use datafusion_common::config::TableParquetOptions; use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_optimizer::pruning::PruningPredicate; use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; @@ -580,7 +581,7 @@ impl FileSource for ParquetSource { } DisplayFormatType::TreeRender => { if let Some(predicate) = self.predicate() { - writeln!(f, "predicate={predicate}")?; + writeln!(f, "predicate={}", fmt_sql(predicate.as_ref()))?; } Ok(()) } diff --git a/datafusion/datasource-parquet/src/writer.rs b/datafusion/datasource-parquet/src/writer.rs index cfdb057a4bc4..64eb37c81f5d 100644 --- a/datafusion/datasource-parquet/src/writer.rs +++ b/datafusion/datasource-parquet/src/writer.rs @@ -16,6 +16,7 @@ // under the License. use datafusion_common::DataFusionError; +use datafusion_common_runtime::JoinSet; use datafusion_datasource::ListingTableUrl; use datafusion_execution::TaskContext; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; @@ -25,7 +26,6 @@ use object_store::path::Path; use parquet::arrow::AsyncArrowWriter; use parquet::file::properties::WriterProperties; use std::sync::Arc; -use tokio::task::JoinSet; /// Executes a query and writes the results to a partitioned Parquet file. 
pub async fn plan_to_parquet( diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index 473800c7779f..922b36aa5f62 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -46,7 +46,7 @@ async-compression = { version = "0.4.19", features = [ ], optional = true } async-trait = { workspace = true } bytes = { workspace = true } -bzip2 = { version = "0.5.1", optional = true } +bzip2 = { version = "0.5.2", optional = true } chrono = { workspace = true } datafusion-catalog = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } @@ -66,7 +66,7 @@ parquet = { workspace = true, optional = true } rand = { workspace = true } tempfile = { workspace = true, optional = true } tokio = { workspace = true } -tokio-util = { version = "0.7.4", features = ["io"], optional = true } +tokio-util = { version = "0.7.14", features = ["io"], optional = true } url = { workspace = true } xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 91b5f0157739..82308bda7012 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -167,6 +167,9 @@ pub struct FileScanConfig { pub new_lines_in_values: bool, /// File source such as `ParquetSource`, `CsvSource`, `JsonSource`, etc. pub file_source: Arc, + /// Batch size while creating new batches + /// Defaults to [`datafusion_common::config::ExecutionOptions`] batch_size. + pub batch_size: Option, } impl DataSource for FileScanConfig { @@ -176,10 +179,13 @@ impl DataSource for FileScanConfig { context: Arc, ) -> Result { let object_store = context.runtime_env().object_store(&self.object_store_url)?; + let batch_size = self + .batch_size + .unwrap_or_else(|| context.session_config().batch_size()); let source = self .file_source - .with_batch_size(context.session_config().batch_size()) + .with_batch_size(batch_size) .with_schema(Arc::clone(&self.file_schema)) .with_projection(self); @@ -338,6 +344,7 @@ impl FileScanConfig { file_compression_type: FileCompressionType::UNCOMPRESSED, new_lines_in_values: false, file_source: Arc::clone(&file_source), + batch_size: None, }; config = config.with_source(Arc::clone(&file_source)); @@ -492,6 +499,12 @@ impl FileScanConfig { self } + /// Set the batch_size property + pub fn with_batch_size(mut self, batch_size: Option) -> Self { + self.batch_size = batch_size; + self + } + /// Specifies whether newlines in (quoted) values are supported. /// /// Parsing newlines in quoted values may be affected by execution behaviour such as diff --git a/datafusion/datasource/src/file_sink_config.rs b/datafusion/datasource/src/file_sink_config.rs index 6087f930d3fe..279c9d2100ec 100644 --- a/datafusion/datasource/src/file_sink_config.rs +++ b/datafusion/datasource/src/file_sink_config.rs @@ -86,6 +86,8 @@ pub trait FileSink: DataSink { /// The base configurations to provide when creating a physical plan for /// writing to any given file format. 
pub struct FileSinkConfig { + /// The unresolved URL specified by the user + pub original_url: String, /// Object store URL, used to get an ObjectStore instance pub object_store_url: ObjectStoreUrl, /// A vector of [`PartitionedFile`] structs, each representing a file partition diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index 240e3c82bbfc..15e25ca386cf 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -20,6 +20,9 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] //! A table that uses the `ObjectStore` listing capability //! to get the list of files to process. diff --git a/datafusion/datasource/src/schema_adapter.rs b/datafusion/datasource/src/schema_adapter.rs index e3a4ea4918c1..4164cda8cba1 100644 --- a/datafusion/datasource/src/schema_adapter.rs +++ b/datafusion/datasource/src/schema_adapter.rs @@ -96,19 +96,6 @@ pub trait SchemaAdapter: Send + Sync { pub trait SchemaMapper: Debug + Send + Sync { /// Adapts a `RecordBatch` to match the `table_schema` fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result; - - /// Adapts a [`RecordBatch`] that does not have all the columns from the - /// file schema. - /// - /// This method is used, for example, when applying a filter to a subset of - /// the columns as part of `DataFusionArrowPredicate` when `filter_pushdown` - /// is enabled. - /// - /// This method is slower than `map_batch` as it looks up columns by name. - fn map_partial_batch( - &self, - batch: RecordBatch, - ) -> datafusion_common::Result; } /// Default [`SchemaAdapterFactory`] for mapping schemas. @@ -215,11 +202,10 @@ impl SchemaAdapterFactory for DefaultSchemaAdapterFactory { fn create( &self, projected_table_schema: SchemaRef, - table_schema: SchemaRef, + _table_schema: SchemaRef, ) -> Box { Box::new(DefaultSchemaAdapter { projected_table_schema, - table_schema, }) } } @@ -231,12 +217,6 @@ pub(crate) struct DefaultSchemaAdapter { /// The schema for the table, projected to include only the fields being output (projected) by the /// associated ParquetSource projected_table_schema: SchemaRef, - /// The entire table schema for the table we're using this to adapt. - /// - /// This is used to evaluate any filters pushed down into the scan - /// which may refer to columns that are not referred to anywhere - /// else in the plan. - table_schema: SchemaRef, } impl SchemaAdapter for DefaultSchemaAdapter { @@ -290,7 +270,6 @@ impl SchemaAdapter for DefaultSchemaAdapter { Arc::new(SchemaMapping { projected_table_schema: Arc::clone(&self.projected_table_schema), field_mappings, - table_schema: Arc::clone(&self.table_schema), }), projection, )) @@ -300,27 +279,12 @@ impl SchemaAdapter for DefaultSchemaAdapter { /// The SchemaMapping struct holds a mapping from the file schema to the table /// schema and any necessary type conversions. /// -/// Note, because `map_batch` and `map_partial_batch` functions have different -/// needs, this struct holds two schemas: -/// -/// 1. The projected **table** schema -/// 2. 
The full table schema -/// /// [`map_batch`] is used by the ParquetOpener to produce a RecordBatch which /// has the projected schema, since that's the schema which is supposed to come /// out of the execution of this query. Thus `map_batch` uses /// `projected_table_schema` as it can only operate on the projected fields. /// -/// [`map_partial_batch`] is used to create a RecordBatch with a schema that -/// can be used for Parquet predicate pushdown, meaning that it may contain -/// fields which are not in the projected schema (as the fields that parquet -/// pushdown filters operate can be completely distinct from the fields that are -/// projected (output) out of the ParquetSource). `map_partial_batch` thus uses -/// `table_schema` to create the resulting RecordBatch (as it could be operating -/// on any fields in the schema). -/// /// [`map_batch`]: Self::map_batch -/// [`map_partial_batch`]: Self::map_partial_batch #[derive(Debug)] pub struct SchemaMapping { /// The schema of the table. This is the expected schema after conversion @@ -332,18 +296,12 @@ pub struct SchemaMapping { /// They are Options instead of just plain `usize`s because the table could /// have fields that don't exist in the file. field_mappings: Vec>, - /// The entire table schema, as opposed to the projected_table_schema (which - /// only contains the columns that we are projecting out of this query). - /// This contains all fields in the table, regardless of if they will be - /// projected out or not. - table_schema: SchemaRef, } impl SchemaMapper for SchemaMapping { /// Adapts a `RecordBatch` to match the `projected_table_schema` using the stored mapping and - /// conversions. The produced RecordBatch has a schema that contains only the projected - /// columns, so if one needs a RecordBatch with a schema that references columns which are not - /// in the projected, it would be better to use `map_partial_batch` + /// conversions. + /// The produced RecordBatch has a schema that contains only the projected columns. fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result { let batch_rows = batch.num_rows(); let batch_cols = batch.columns().to_vec(); @@ -376,54 +334,4 @@ impl SchemaMapper for SchemaMapping { let record_batch = RecordBatch::try_new_with_options(schema, cols, &options)?; Ok(record_batch) } - - /// Adapts a [`RecordBatch`]'s schema into one that has all the correct output types and only - /// contains the fields that exist in both the file schema and table schema. - /// - /// Unlike `map_batch` this method also preserves the columns that - /// may not appear in the final output (`projected_table_schema`) but may - /// appear in push down predicates - fn map_partial_batch( - &self, - batch: RecordBatch, - ) -> datafusion_common::Result { - let batch_cols = batch.columns().to_vec(); - let schema = batch.schema(); - - // for each field in the batch's schema (which is based on a file, not a table)... - let (cols, fields) = schema - .fields() - .iter() - .zip(batch_cols.iter()) - .flat_map(|(field, batch_col)| { - self.table_schema - // try to get the same field from the table schema that we have stored in self - .field_with_name(field.name()) - // and if we don't have it, that's fine, ignore it. This may occur when we've - // created an external table whose fields are a subset of the fields in this - // file, then tried to read data from the file into this table. 
If that is the - // case here, it's fine to ignore because we don't care about this field - // anyways - .ok() - // but if we do have it, - .map(|table_field| { - // try to cast it into the correct output type. we don't want to ignore this - // error, though, so it's propagated. - cast(batch_col, table_field.data_type()) - // and if that works, return the field and column. - .map(|new_col| (new_col, table_field.clone())) - }) - }) - .collect::, _>>()? - .into_iter() - .unzip::<_, _, Vec<_>, Vec<_>>(); - - // Necessary to handle empty batches - let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); - - let schema = - Arc::new(Schema::new_with_metadata(fields, schema.metadata().clone())); - let record_batch = RecordBatch::try_new_with_options(schema, cols, &options)?; - Ok(record_batch) - } } diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs index 9df5aa993d43..cd002a96683a 100644 --- a/datafusion/datasource/src/statistics.rs +++ b/datafusion/datasource/src/statistics.rs @@ -30,7 +30,7 @@ use arrow::{ compute::SortColumn, row::{Row, Rows}, }; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_datafusion_err, plan_err, DataFusionError, Result}; use datafusion_physical_expr::{expressions::Column, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; @@ -202,10 +202,10 @@ impl MinMaxStatistics { .zip(max_values.column_by_name(column.name())) } .ok_or_else(|| { - DataFusionError::Plan(format!( + plan_datafusion_err!( "missing column in MinMaxStatistics::new: '{}'", column.name() - )) + ) }) }) .collect::>>()? diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index 89e73a8a2b26..0667a043e20f 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -136,7 +136,12 @@ impl ListingTableUrl { } /// Creates a new [`ListingTableUrl`] from a url and optional glob expression - fn try_new(url: Url, glob: Option) -> Result { + /// + /// [`Self::parse`] supports glob expression only for file system paths. + /// However, some applications may want to support glob expression for URLs with a scheme. + /// The application can split the URL into a base URL and a glob expression and use this method + /// to create a [`ListingTableUrl`]. 
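The doc comment above describes the intended pattern: the caller splits a scheme-qualified URL into a base URL plus a glob and hands both to the now-public `try_new`. A minimal sketch of that flow, assuming the `datafusion_datasource::url` module path and a hypothetical S3 location (`Url` and `Pattern` come from the `url` and `glob` crates):

```rust
use datafusion_datasource::url::ListingTableUrl;
use glob::Pattern;
use url::Url;

fn globbed_table_url() -> datafusion_common::Result<ListingTableUrl> {
    // Hypothetical object store location; `parse` only expands globs for
    // file system paths, so the application separates the glob itself.
    let location = "s3://my-bucket/data/**/*.parquet";
    let (base, glob) = location.split_at("s3://my-bucket/data/".len());

    let url = Url::parse(base).expect("valid base URL");
    let glob = Pattern::new(glob).expect("valid glob pattern");

    // Pass the pre-split pieces to the now-public constructor
    ListingTableUrl::try_new(url, Some(glob))
}
```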
+ pub fn try_new(url: Url, glob: Option) -> Result { let prefix = Path::from_url_path(url.path())?; Ok(Self { url, prefix, glob }) } diff --git a/datafusion/datasource/src/write/orchestration.rs b/datafusion/datasource/src/write/orchestration.rs index 1364e7d9f236..0ac1d26c6cc1 100644 --- a/datafusion/datasource/src/write/orchestration.rs +++ b/datafusion/datasource/src/write/orchestration.rs @@ -28,7 +28,7 @@ use datafusion_common::error::Result; use arrow::array::RecordBatch; use datafusion_common::{internal_datafusion_err, internal_err, DataFusionError}; -use datafusion_common_runtime::SpawnedTask; +use datafusion_common_runtime::{JoinSet, SpawnedTask}; use datafusion_execution::TaskContext; use bytes::Bytes; @@ -36,7 +36,6 @@ use futures::join; use object_store::ObjectStore; use tokio::io::{AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc::{self, Receiver}; -use tokio::task::JoinSet; type WriterType = Box; type SerializerType = Arc; diff --git a/datafusion/execution/src/lib.rs b/datafusion/execution/src/lib.rs index a9e3a27f8035..6a0a4b6322ee 100644 --- a/datafusion/execution/src/lib.rs +++ b/datafusion/execution/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! DataFusion execution configuration and runtime structures diff --git a/datafusion/expr-common/src/accumulator.rs b/datafusion/expr-common/src/accumulator.rs index dc1e023d4c3c..3a63c3289481 100644 --- a/datafusion/expr-common/src/accumulator.rs +++ b/datafusion/expr-common/src/accumulator.rs @@ -109,6 +109,7 @@ pub trait Accumulator: Send + Sync + Debug { /// │(AggregateMode::Final) │ state() is called for each /// │ │ group and the resulting /// └─────────────────────────┘ RecordBatches passed to the + /// Final GroupBy via merge_batch() /// ▲ /// │ /// ┌────────────────┴───────────────┐ diff --git a/datafusion/expr-common/src/lib.rs b/datafusion/expr-common/src/lib.rs index ee40038beb21..961670a3b7f4 100644 --- a/datafusion/expr-common/src/lib.rs +++ b/datafusion/expr-common/src/lib.rs @@ -28,7 +28,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] pub mod accumulator; diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 682cc885cd6b..fb559e163bb1 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -1177,26 +1177,6 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; - match (lhs_type, rhs_type) { - // If Utf8View is in any side, we coerce to Utf8. - (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => { - Some(Utf8) - } - // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8. 
- (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8), - // Utf8 coerces to Utf8 - (Utf8, Utf8) => Some(Utf8), - _ => None, - } -} - fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -1327,7 +1307,7 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - regex_comparison_string_coercion(lhs_type, rhs_type) + string_coercion(lhs_type, rhs_type) .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false)) .or_else(|| regex_null_coercion(lhs_type, rhs_type)) } @@ -1802,42 +1782,168 @@ mod tests { Operator::RegexMatch, DataType::Utf8 ); + test_coercion_binary_rule!( + DataType::Utf8, + DataType::Utf8View, + Operator::RegexMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Utf8View, + DataType::Utf8, + Operator::RegexMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Utf8View, + DataType::Utf8View, + Operator::RegexMatch, + DataType::Utf8View + ); test_coercion_binary_rule!( DataType::Utf8, DataType::Utf8, Operator::RegexNotMatch, DataType::Utf8 ); + test_coercion_binary_rule!( + DataType::Utf8View, + DataType::Utf8, + Operator::RegexNotMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Utf8, + DataType::Utf8View, + Operator::RegexNotMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Utf8View, + DataType::Utf8View, + Operator::RegexNotMatch, + DataType::Utf8View + ); test_coercion_binary_rule!( DataType::Utf8, DataType::Utf8, Operator::RegexNotIMatch, DataType::Utf8 ); + test_coercion_binary_rule!( + DataType::Utf8View, + DataType::Utf8, + Operator::RegexNotIMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Utf8, + DataType::Utf8View, + Operator::RegexNotIMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Utf8View, + DataType::Utf8View, + Operator::RegexNotIMatch, + DataType::Utf8View + ); test_coercion_binary_rule!( DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), DataType::Utf8, Operator::RegexMatch, DataType::Utf8 ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), + DataType::Utf8View, + Operator::RegexMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()), + DataType::Utf8, + Operator::RegexMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()), + DataType::Utf8View, + Operator::RegexMatch, + DataType::Utf8View + ); test_coercion_binary_rule!( DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), DataType::Utf8, Operator::RegexIMatch, DataType::Utf8 ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()), + DataType::Utf8, + Operator::RegexIMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), + DataType::Utf8View, + Operator::RegexIMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()), + DataType::Utf8View, + Operator::RegexIMatch, + DataType::Utf8View + ); test_coercion_binary_rule!( DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), DataType::Utf8, Operator::RegexNotMatch, DataType::Utf8 ); + 
test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), + DataType::Utf8View, + Operator::RegexNotMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()), + DataType::Utf8, + Operator::RegexNotMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), + DataType::Utf8View, + Operator::RegexNotMatch, + DataType::Utf8View + ); test_coercion_binary_rule!( DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), DataType::Utf8, Operator::RegexNotIMatch, DataType::Utf8 ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()), + DataType::Utf8, + Operator::RegexNotIMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()), + DataType::Utf8View, + Operator::RegexNotIMatch, + DataType::Utf8View + ); + test_coercion_binary_rule!( + DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()), + DataType::Utf8View, + Operator::RegexNotIMatch, + DataType::Utf8View + ); test_coercion_binary_rule!( DataType::Int16, DataType::Int64, diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 4ab514b72b8b..d8a6687aa8a5 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -64,6 +64,15 @@ use sqlparser::ast::{ /// /// [`ExprFunctionExt`]: crate::expr_fn::ExprFunctionExt /// +/// # Printing Expressions +/// +/// You can print `Expr`s using the the `Debug` trait, `Display` trait, or +/// [`Self::human_display`]. See the [examples](#examples-displaying-exprs) below. +/// +/// If you need SQL to pass to other systems, consider using [`Unparser`]. +/// +/// [`Unparser`]: https://docs.rs/datafusion/latest/datafusion/sql/unparser/struct.Unparser.html +/// /// # Schema Access /// /// See [`ExprSchemable::get_type`] to access the [`DataType`] and nullability @@ -76,9 +85,9 @@ use sqlparser::ast::{ /// `Expr` and [`TreeNode::transform`] can be used to rewrite an expression. See /// the examples below and [`TreeNode`] for more information. /// -/// # Examples +/// # Examples: Creating and Using `Expr`s /// -/// ## Column references and literals +/// ## Column References and Literals /// /// [`Expr::Column`] refer to the values of columns and are often created with /// the [`col`] function. For example to create an expression `c1` referring to @@ -104,6 +113,7 @@ use sqlparser::ast::{ /// // All literals are strongly typed in DataFusion. To make an `i64` 42: /// let expr = lit(42i64); /// assert_eq!(expr, Expr::Literal(ScalarValue::Int64(Some(42)))); +/// assert_eq!(expr, Expr::Literal(ScalarValue::Int64(Some(42)))); /// // To make a (typed) NULL: /// let expr = Expr::Literal(ScalarValue::Int64(None)); /// // to make an (untyped) NULL (the optimizer will coerce this to the correct type): @@ -171,7 +181,51 @@ use sqlparser::ast::{ /// ]); /// ``` /// -/// # Visiting and Rewriting `Expr`s +/// # Examples: Displaying `Exprs` +/// +/// There are three ways to print an `Expr` depending on the usecase. +/// +/// ## Use `Debug` trait +/// +/// Following Rust conventions, the `Debug` implementation prints out the +/// internal structure of the expression, which is useful for debugging. 
+/// +/// ``` +/// # use datafusion_expr::{lit, col}; +/// let expr = col("c1") + lit(42); +/// assert_eq!(format!("{expr:?}"), "BinaryExpr(BinaryExpr { left: Column(Column { relation: None, name: \"c1\" }), op: Plus, right: Literal(Int32(42)) })"); +/// ``` +/// +/// ## Use the `Display` trait (detailed expression) +/// +/// The `Display` implementation prints out the expression in a SQL-like form, +/// but has additional details such as the data type of literals. This is useful +/// for understanding the expression in more detail and is used for the low level +/// [`ExplainFormat::Indent`] explain plan format. +/// +/// [`ExplainFormat::Indent`]: crate::logical_plan::ExplainFormat::Indent +/// +/// ``` +/// # use datafusion_expr::{lit, col}; +/// let expr = col("c1") + lit(42); +/// assert_eq!(format!("{expr}"), "c1 + Int32(42)"); +/// ``` +/// +/// ## Use [`Self::human_display`] (human readable) +/// +/// [`Self::human_display`] prints out the expression in a SQL-like form, optimized +/// for human consumption by end users. It is used for the +/// [`ExplainFormat::Tree`] explain plan format. +/// +/// [`ExplainFormat::Tree`]: crate::logical_plan::ExplainFormat::Tree +/// +///``` +/// # use datafusion_expr::{lit, col}; +/// let expr = col("c1") + lit(42); +/// assert_eq!(format!("{}", expr.human_display()), "c1 + 42"); +/// ``` +/// +/// # Examples: Visiting and Rewriting `Expr`s /// /// Here is an example that finds all literals in an `Expr` tree: /// ``` @@ -391,11 +445,34 @@ impl Unnest { } /// Alias expression -#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[derive(Clone, PartialEq, Eq, Debug)] pub struct Alias { pub expr: Box, pub relation: Option, pub name: String, + pub metadata: Option>, +} + +impl Hash for Alias { + fn hash(&self, state: &mut H) { + self.expr.hash(state); + self.relation.hash(state); + self.name.hash(state); + } +} + +impl PartialOrd for Alias { + fn partial_cmp(&self, other: &Self) -> Option { + let cmp = self.expr.partial_cmp(&other.expr); + let Some(std::cmp::Ordering::Equal) = cmp else { + return cmp; + }; + let cmp = self.relation.partial_cmp(&other.relation); + let Some(std::cmp::Ordering::Equal) = cmp else { + return cmp; + }; + self.name.partial_cmp(&other.name) + } } impl Alias { @@ -409,6 +486,7 @@ impl Alias { expr: Box::new(expr), relation: relation.map(|r| r.into()), name: name.into(), + metadata: None, } } @@ -422,8 +500,17 @@ impl Alias { expr, relation: relation.map(|r| r.into()), name: name.into(), + metadata: None, } } + + pub fn with_metadata( + mut self, + metadata: Option>, + ) -> Self { + self.metadata = metadata; + self + } } /// Binary expression @@ -1128,6 +1215,31 @@ impl Expr { SchemaDisplay(self) } + /// Human readable display formatting for this expression. + /// + /// This function is primarily used in printing the explain tree output, + /// (e.g. `EXPLAIN FORMAT TREE `), providing a readable format to + /// show how expressions are used in physical and logical plans. See the + /// [`Expr`] for other ways to format expressions + /// + /// Note this format is intended for human consumption rather than SQL for + /// other systems. If you need SQL to pass to other systems, consider using + /// [`Unparser`]. 
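Returning to the `Alias` changes earlier in this hunk: the new optional `metadata` field is filled in through the `with_metadata` builder. A small sketch of constructing such an alias directly, assuming the usual `datafusion_expr` exports (the `alias_with_metadata` helper added below wraps the same pattern):

```rust
use std::collections::HashMap;

use datafusion_expr::expr::Alias;
use datafusion_expr::{col, Expr};

fn main() {
    // Illustrative metadata to attach to the aliased output field
    let metadata = HashMap::from([("origin".to_string(), "example".to_string())]);

    let expr = Expr::Alias(
        Alias::new(col("price"), None::<&str>, "cost").with_metadata(Some(metadata)),
    );

    // Display output is unchanged; the metadata surfaces when the expression
    // is converted to an Arrow field via `Expr::to_field`
    assert_eq!(expr.to_string(), "price AS cost");
}
```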
+ /// + /// [`Unparser`]: https://docs.rs/datafusion/latest/datafusion/sql/unparser/struct.Unparser.html + /// + /// # Example + /// ``` + /// # use datafusion_expr::{col, lit}; + /// let expr = col("foo") + lit(42); + /// // For EXPLAIN output: + /// // "foo + 42" + /// println!("{}", expr.human_display()); + /// ``` + pub fn human_display(&self) -> impl Display + '_ { + SqlDisplay(self) + } + /// Returns the qualifier and the schema name of this expression. /// /// Used when the expression forms the output field of a certain plan. @@ -1306,6 +1418,27 @@ impl Expr { } } + /// Return `self AS name` alias expression with metadata + /// + /// The metadata will be attached to the Arrow Schema field when the expression + /// is converted to a field via `Expr.to_field()`. + /// + /// # Example + /// ``` + /// # use datafusion_expr::col; + /// use std::collections::HashMap; + /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]); + /// let expr = col("foo").alias_with_metadata("bar", Some(metadata)); + /// ``` + /// + pub fn alias_with_metadata( + self, + name: impl Into, + metadata: Option>, + ) -> Expr { + Expr::Alias(Alias::new(self, None::<&str>, name.into()).with_metadata(metadata)) + } + /// Return `self AS name` alias expression with a specific qualifier pub fn alias_qualified( self, @@ -1320,6 +1453,28 @@ impl Expr { } } + /// Return `self AS name` alias expression with a specific qualifier and metadata + /// + /// The metadata will be attached to the Arrow Schema field when the expression + /// is converted to a field via `Expr.to_field()`. + /// + /// # Example + /// ``` + /// # use datafusion_expr::col; + /// use std::collections::HashMap; + /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]); + /// let expr = col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata)); + /// ``` + /// + pub fn alias_qualified_with_metadata( + self, + relation: Option>, + name: impl Into, + metadata: Option>, + ) -> Expr { + Expr::Alias(Alias::new(self, relation, name.into()).with_metadata(metadata)) + } + /// Remove an alias from an expression if one exists. /// /// If the expression is not an alias, the expression is returned unchanged. @@ -1771,11 +1926,13 @@ impl NormalizeEq for Expr { expr: self_expr, relation: self_relation, name: self_name, + .. }), Expr::Alias(Alias { expr: other_expr, relation: other_relation, name: other_name, + .. }), ) => { self_name == other_name @@ -2121,6 +2278,7 @@ impl HashNode for Expr { expr: _expr, relation, name, + .. }) => { relation.hash(state); name.hash(state); @@ -2551,6 +2709,187 @@ impl Display for SchemaDisplay<'_> { } } +/// A helper struct for displaying an `Expr` as an SQL-like string. +struct SqlDisplay<'a>(&'a Expr); + +impl Display for SqlDisplay<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self.0 { + Expr::Literal(scalar) => scalar.fmt(f), + Expr::Alias(Alias { name, .. 
}) => write!(f, "{name}"), + Expr::Between(Between { + expr, + negated, + low, + high, + }) => { + if *negated { + write!( + f, + "{} NOT BETWEEN {} AND {}", + SqlDisplay(expr), + SqlDisplay(low), + SqlDisplay(high), + ) + } else { + write!( + f, + "{} BETWEEN {} AND {}", + SqlDisplay(expr), + SqlDisplay(low), + SqlDisplay(high), + ) + } + } + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + write!(f, "{} {op} {}", SqlDisplay(left), SqlDisplay(right),) + } + Expr::Case(Case { + expr, + when_then_expr, + else_expr, + }) => { + write!(f, "CASE ")?; + + if let Some(e) = expr { + write!(f, "{} ", SqlDisplay(e))?; + } + + for (when, then) in when_then_expr { + write!(f, "WHEN {} THEN {} ", SqlDisplay(when), SqlDisplay(then),)?; + } + + if let Some(e) = else_expr { + write!(f, "ELSE {} ", SqlDisplay(e))?; + } + + write!(f, "END") + } + Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) => { + write!(f, "{}", SqlDisplay(expr)) + } + Expr::InList(InList { + expr, + list, + negated, + }) => { + write!( + f, + "{}{} IN {}", + SqlDisplay(expr), + if *negated { " NOT" } else { "" }, + ExprListDisplay::comma_separated(list.as_slice()) + ) + } + Expr::GroupingSet(GroupingSet::Cube(exprs)) => { + write!( + f, + "ROLLUP ({})", + ExprListDisplay::comma_separated(exprs.as_slice()) + ) + } + Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { + write!(f, "GROUPING SETS (")?; + for exprs in lists_of_exprs.iter() { + write!( + f, + "({})", + ExprListDisplay::comma_separated(exprs.as_slice()) + )?; + } + write!(f, ")") + } + Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { + write!( + f, + "ROLLUP ({})", + ExprListDisplay::comma_separated(exprs.as_slice()) + ) + } + Expr::IsNull(expr) => write!(f, "{} IS NULL", SqlDisplay(expr)), + Expr::IsNotNull(expr) => { + write!(f, "{} IS NOT NULL", SqlDisplay(expr)) + } + Expr::IsUnknown(expr) => { + write!(f, "{} IS UNKNOWN", SqlDisplay(expr)) + } + Expr::IsNotUnknown(expr) => { + write!(f, "{} IS NOT UNKNOWN", SqlDisplay(expr)) + } + Expr::IsTrue(expr) => write!(f, "{} IS TRUE", SqlDisplay(expr)), + Expr::IsFalse(expr) => write!(f, "{} IS FALSE", SqlDisplay(expr)), + Expr::IsNotTrue(expr) => { + write!(f, "{} IS NOT TRUE", SqlDisplay(expr)) + } + Expr::IsNotFalse(expr) => { + write!(f, "{} IS NOT FALSE", SqlDisplay(expr)) + } + Expr::Like(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { + write!( + f, + "{} {}{} {}", + SqlDisplay(expr), + if *negated { "NOT " } else { "" }, + if *case_insensitive { "ILIKE" } else { "LIKE" }, + SqlDisplay(pattern), + )?; + + if let Some(char) = escape_char { + write!(f, " CHAR '{char}'")?; + } + + Ok(()) + } + Expr::Negative(expr) => write!(f, "(- {})", SqlDisplay(expr)), + Expr::Not(expr) => write!(f, "NOT {}", SqlDisplay(expr)), + Expr::Unnest(Unnest { expr }) => { + write!(f, "UNNEST({})", SqlDisplay(expr)) + } + Expr::SimilarTo(Like { + negated, + expr, + pattern, + escape_char, + .. + }) => { + write!( + f, + "{} {} {}", + SqlDisplay(expr), + if *negated { + "NOT SIMILAR TO" + } else { + "SIMILAR TO" + }, + SqlDisplay(pattern), + )?; + if let Some(char) = escape_char { + write!(f, " CHAR '{char}'")?; + } + + Ok(()) + } + Expr::AggregateFunction(AggregateFunction { func, params }) => { + match func.human_display(params) { + Ok(name) => { + write!(f, "{name}") + } + Err(e) => { + write!(f, "got error from schema_name {}", e) + } + } + } + _ => write!(f, "{}", self.0), + } + } +} + /// Get schema_name for Vector of expressions /// /// Internal usage. 
Please call `schema_name_from_exprs` instead @@ -2562,6 +2901,38 @@ pub(crate) fn schema_name_from_exprs_comma_separated_without_space( schema_name_from_exprs_inner(exprs, ",") } +/// Formats a list of `&Expr` with a custom separator using SQL display format +pub struct ExprListDisplay<'a> { + exprs: &'a [Expr], + sep: &'a str, +} + +impl<'a> ExprListDisplay<'a> { + /// Create a new display struct with the given expressions and separator + pub fn new(exprs: &'a [Expr], sep: &'a str) -> Self { + Self { exprs, sep } + } + + /// Create a new display struct with comma-space separator + pub fn comma_separated(exprs: &'a [Expr]) -> Self { + Self::new(exprs, ", ") + } +} + +impl Display for ExprListDisplay<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + let mut first = true; + for expr in self.exprs { + if !first { + write!(f, "{}", self.sep)?; + } + write!(f, "{}", SqlDisplay(expr))?; + first = false; + } + Ok(()) + } +} + /// Get schema_name for Vector of expressions pub fn schema_name_from_exprs(exprs: &[Expr]) -> Result { schema_name_from_exprs_inner(exprs, ", ") diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index a8e7fd76d037..966aba7d1195 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -25,6 +25,7 @@ use crate::function::{ AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory, StateFieldsArgs, }; +use crate::select_expr::SelectExpr; use crate::{ conditional_expressions::CaseBuilder, expr::Sort, logical_plan::Subquery, AggregateUDF, Expr, LogicalPlan, Operator, PartitionEvaluator, ScalarFunctionArgs, @@ -37,7 +38,7 @@ use arrow::compute::kernels::cast_utils::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, }; use arrow::datatypes::{DataType, Field}; -use datafusion_common::{plan_err, Column, Result, ScalarValue, TableReference}; +use datafusion_common::{plan_err, Column, Result, ScalarValue, Spans, TableReference}; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; use sqlparser::ast::NullTreatment; @@ -120,21 +121,13 @@ pub fn placeholder(id: impl Into) -> Expr { /// let p = wildcard(); /// assert_eq!(p.to_string(), "*") /// ``` -pub fn wildcard() -> Expr { - #[expect(deprecated)] - Expr::Wildcard { - qualifier: None, - options: Box::new(WildcardOptions::default()), - } +pub fn wildcard() -> SelectExpr { + SelectExpr::Wildcard(WildcardOptions::default()) } /// Create an '*' [`Expr::Wildcard`] expression with the wildcard options -pub fn wildcard_with_options(options: WildcardOptions) -> Expr { - #[expect(deprecated)] - Expr::Wildcard { - qualifier: None, - options: Box::new(options), - } +pub fn wildcard_with_options(options: WildcardOptions) -> SelectExpr { + SelectExpr::Wildcard(options) } /// Create an 't.*' [`Expr::Wildcard`] expression that matches all columns from a specific table @@ -147,24 +140,16 @@ pub fn wildcard_with_options(options: WildcardOptions) -> Expr { /// let p = qualified_wildcard(TableReference::bare("t")); /// assert_eq!(p.to_string(), "t.*") /// ``` -pub fn qualified_wildcard(qualifier: impl Into) -> Expr { - #[expect(deprecated)] - Expr::Wildcard { - qualifier: Some(qualifier.into()), - options: Box::new(WildcardOptions::default()), - } +pub fn qualified_wildcard(qualifier: impl Into) -> SelectExpr { + SelectExpr::QualifiedWildcard(qualifier.into(), WildcardOptions::default()) } /// Create an 't.*' [`Expr::Wildcard`] expression with the wildcard 
options pub fn qualified_wildcard_with_options( qualifier: impl Into, options: WildcardOptions, -) -> Expr { - #[expect(deprecated)] - Expr::Wildcard { - qualifier: Some(qualifier.into()), - options: Box::new(options), - } +) -> SelectExpr { + SelectExpr::QualifiedWildcard(qualifier.into(), options) } /// Return a new expression `left right` @@ -252,6 +237,7 @@ pub fn exists(subquery: Arc) -> Expr { subquery: Subquery { subquery, outer_ref_columns, + spans: Spans::new(), }, negated: false, }) @@ -264,6 +250,7 @@ pub fn not_exists(subquery: Arc) -> Expr { subquery: Subquery { subquery, outer_ref_columns, + spans: Spans::new(), }, negated: true, }) @@ -277,6 +264,7 @@ pub fn in_subquery(expr: Expr, subquery: Arc) -> Expr { Subquery { subquery, outer_ref_columns, + spans: Spans::new(), }, false, )) @@ -290,6 +278,7 @@ pub fn not_in_subquery(expr: Expr, subquery: Arc) -> Expr { Subquery { subquery, outer_ref_columns, + spans: Spans::new(), }, true, )) @@ -301,6 +290,7 @@ pub fn scalar_subquery(subquery: Arc) -> Expr { Expr::ScalarSubquery(Subquery { subquery, outer_ref_columns, + spans: Spans::new(), }) } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 0a14cb5c60a0..a349c83a4934 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -30,7 +30,7 @@ use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ not_impl_err, plan_datafusion_err, plan_err, Column, DataFusionError, ExprSchema, - Result, TableReference, + Result, Spans, TableReference, }; use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer; use datafusion_functions_window_common::field::WindowUDFFieldArgs; @@ -343,7 +343,16 @@ impl ExprSchemable for Expr { fn metadata(&self, schema: &dyn ExprSchema) -> Result> { match self { Expr::Column(c) => Ok(schema.metadata(c)?.clone()), - Expr::Alias(Alias { expr, .. }) => expr.metadata(schema), + Expr::Alias(Alias { expr, metadata, .. }) => { + let mut ret = expr.metadata(schema)?; + if let Some(metadata) = metadata { + if !metadata.is_empty() { + ret.extend(metadata.clone()); + return Ok(ret); + } + } + Ok(ret) + } Expr::Cast(Cast { expr, .. 
}) => expr.metadata(schema), _ => Ok(HashMap::new()), } @@ -608,6 +617,7 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result>, filters: Vec, ) -> Result { - TableScan::try_new(table_name, table_source, projection, filters, None) - .map(LogicalPlan::TableScan) - .map(Self::new) + Self::scan_with_filters_inner(table_name, table_source, projection, filters, None) } /// Convert a table provider into a builder with a TableScan with filter and fetch @@ -478,15 +478,43 @@ impl LogicalPlanBuilder { filters: Vec, fetch: Option, ) -> Result { - TableScan::try_new(table_name, table_source, projection, filters, fetch) - .map(LogicalPlan::TableScan) - .map(Self::new) + Self::scan_with_filters_inner( + table_name, + table_source, + projection, + filters, + fetch, + ) + } + + fn scan_with_filters_inner( + table_name: impl Into, + table_source: Arc, + projection: Option>, + filters: Vec, + fetch: Option, + ) -> Result { + let table_scan = + TableScan::try_new(table_name, table_source, projection, filters, fetch)?; + + // Inline TableScan + if table_scan.filters.is_empty() { + if let Some(p) = table_scan.source.get_logical_plan() { + let sub_plan = p.into_owned(); + // Ensures that the reference to the inlined table remains the + // same, meaning we don't have to change any of the parent nodes + // that reference this table. + return Self::new(sub_plan).alias(table_scan.table_name); + } + } + + Ok(Self::new(LogicalPlan::TableScan(table_scan))) } /// Wrap a plan in a window pub fn window_plan( input: LogicalPlan, - window_exprs: Vec, + window_exprs: impl IntoIterator, ) -> Result { let mut plan = input; let mut groups = group_window_expr_by_sort_keys(window_exprs)?; @@ -520,10 +548,11 @@ impl LogicalPlanBuilder { } Ok(plan) } + /// Apply a projection without alias. pub fn project( self, - expr: impl IntoIterator>, + expr: impl IntoIterator>, ) -> Result { project(Arc::unwrap_or_clone(self.plan), expr).map(Self::new) } @@ -532,7 +561,7 @@ impl LogicalPlanBuilder { /// (true to validate, false to not validate) pub fn project_with_validation( self, - expr: Vec<(impl Into, bool)>, + expr: Vec<(impl Into, bool)>, ) -> Result { project_with_validation(Arc::unwrap_or_clone(self.plan), expr).map(Self::new) } @@ -776,6 +805,7 @@ impl LogicalPlanBuilder { &missing_cols, is_distinct, )?; + let sort_plan = LogicalPlan::Sort(Sort { expr: normalize_sorts(sorts, &plan)?, input: Arc::new(plan), @@ -1211,6 +1241,7 @@ impl LogicalPlanBuilder { Ok(Self::new(LogicalPlan::Explain(Explain { verbose, plan: self.plan, + explain_format: ExplainFormat::Indent, stringified_plans, schema, logical_optimization_succeeded: false, @@ -1655,7 +1686,7 @@ pub fn union_by_name( /// * An invalid expression is used (e.g. a `sort` expression) pub fn project( plan: LogicalPlan, - expr: impl IntoIterator>, + expr: impl IntoIterator>, ) -> Result { project_with_validation(plan, expr.into_iter().map(|e| (e, true))) } @@ -1669,15 +1700,54 @@ pub fn project( /// * An invalid expression is used (e.g. a `sort` expression) fn project_with_validation( plan: LogicalPlan, - expr: impl IntoIterator, bool)>, + expr: impl IntoIterator, bool)>, ) -> Result { let mut projected_expr = vec![]; for (e, validate) in expr { let e = e.into(); match e { - #[expect(deprecated)] - Expr::Wildcard { .. } => projected_expr.push(e), - _ => { + SelectExpr::Wildcard(opt) => { + let expanded = expand_wildcard(plan.schema(), &plan, Some(&opt))?; + + // If there is a REPLACE statement, replace that column with the given + // replace expression. 
Column name remains the same. + let expanded = if let Some(replace) = opt.replace { + replace_columns(expanded, &replace)? + } else { + expanded + }; + + for e in expanded { + if validate { + projected_expr + .push(columnize_expr(normalize_col(e, &plan)?, &plan)?) + } else { + projected_expr.push(e) + } + } + } + SelectExpr::QualifiedWildcard(table_ref, opt) => { + let expanded = + expand_qualified_wildcard(&table_ref, plan.schema(), Some(&opt))?; + + // If there is a REPLACE statement, replace that column with the given + // replace expression. Column name remains the same. + let expanded = if let Some(replace) = opt.replace { + replace_columns(expanded, &replace)? + } else { + expanded + }; + + for e in expanded { + if validate { + projected_expr + .push(columnize_expr(normalize_col(e, &plan)?, &plan)?) + } else { + projected_expr.push(e) + } + } + } + SelectExpr::Expression(e) => { if validate { projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?) } else { @@ -1691,6 +1761,29 @@ fn project_with_validation( Projection::try_new(projected_expr, Arc::new(plan)).map(LogicalPlan::Projection) } +/// If there is a REPLACE statement in the projected expression in the form of +/// "REPLACE (some_column_within_an_expr AS some_column)", this function replaces +/// that column with the given replace expression. Column name remains the same. +/// Multiple REPLACEs are also possible with comma separations. +fn replace_columns( + mut exprs: Vec, + replace: &PlannedReplaceSelectItem, +) -> Result> { + for expr in exprs.iter_mut() { + if let Expr::Column(Column { name, .. }) = expr { + if let Some((_, new_expr)) = replace + .items() + .iter() + .zip(replace.expressions().iter()) + .find(|(item, _)| item.column_name.value == *name) + { + *expr = new_expr.clone().alias(name.clone()) + } + } + } + Ok(exprs) +} + /// Create a SubqueryAlias to wrap a LogicalPlan. pub fn subquery_alias( plan: LogicalPlan, @@ -1809,7 +1902,7 @@ pub fn wrap_projection_for_join_if_necessary( projection.extend(join_key_items); LogicalPlanBuilder::from(input) - .project(projection)? + .project(projection.into_iter().map(SelectExpr::from))? .build()? } else { input diff --git a/datafusion/expr/src/logical_plan/invariants.rs b/datafusion/expr/src/logical_plan/invariants.rs index f4ac33b6afef..d83410bf99c9 100644 --- a/datafusion/expr/src/logical_plan/invariants.rs +++ b/datafusion/expr/src/logical_plan/invariants.rs @@ -112,11 +112,11 @@ fn assert_valid_semantic_plan(plan: &LogicalPlan) -> Result<()> { /// Returns an error if the plan does not have the expected schema. /// Ignores metadata and nullability. 
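Since `project` now expands `SelectExpr::Wildcard` and `SelectExpr::QualifiedWildcard` itself (see `project_with_validation` above), a builder pipeline can push `*` straight into a projection. A sketch under the assumption of an illustrative schema and the in-memory `LogicalTableSource` test source:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::Result;
use datafusion_expr::logical_plan::builder::LogicalTableSource;
use datafusion_expr::{wildcard, LogicalPlanBuilder};

fn select_star() -> Result<()> {
    // Illustrative two-column table
    let schema = Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("name", DataType::Utf8, true),
    ]);
    let source = Arc::new(LogicalTableSource::new(Arc::new(schema)));

    // `wildcard()` now yields a `SelectExpr`, expanded while the plan is built
    let plan = LogicalPlanBuilder::scan("t", source, None)?
        .project(vec![wildcard()])?
        .build()?;

    println!("{}", plan.display_indent());
    Ok(())
}
```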
pub fn assert_expected_schema(schema: &DFSchemaRef, plan: &LogicalPlan) -> Result<()> { - let equivalent = plan.schema().equivalent_names_and_types(schema); + let compatible = plan.schema().has_equivalent_names_and_types(schema); - if !equivalent { + if let Err(e) = compatible { internal_err!( - "Failed due to a difference in schemas, original schema: {:?}, new schema: {:?}", + "Failed due to a difference in schemas: {e}, original schema: {:?}, new schema: {:?}", schema, plan.schema() ) diff --git a/datafusion/expr/src/logical_plan/mod.rs b/datafusion/expr/src/logical_plan/mod.rs index 916b2131be04..a55f4d97b212 100644 --- a/datafusion/expr/src/logical_plan/mod.rs +++ b/datafusion/expr/src/logical_plan/mod.rs @@ -38,9 +38,9 @@ pub use ddl::{ pub use dml::{DmlStatement, WriteOp}; pub use plan::{ projection_schema, Aggregate, Analyze, ColumnUnnestList, DescribeTable, Distinct, - DistinctOn, EmptyRelation, Explain, Extension, FetchType, Filter, Join, - JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, Projection, - RecursiveQuery, Repartition, SkipType, Sort, StringifiedPlan, Subquery, + DistinctOn, EmptyRelation, Explain, ExplainFormat, Extension, FetchType, Filter, + Join, JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, + Projection, RecursiveQuery, Repartition, SkipType, Sort, StringifiedPlan, Subquery, SubqueryAlias, TableScan, ToStringifiedPlan, Union, Unnest, Values, Window, }; pub use statement::{ diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 682342d27b29..641489b5d9ce 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -21,6 +21,7 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{self, Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; +use std::str::FromStr; use std::sync::{Arc, LazyLock}; use super::dml::CopyTo; @@ -38,9 +39,8 @@ use crate::logical_plan::display::{GraphvizVisitor, IndentVisitor}; use crate::logical_plan::extension::UserDefinedLogicalNode; use crate::logical_plan::{DmlStatement, Statement}; use crate::utils::{ - enumerate_grouping_sets, exprlist_len, exprlist_to_fields, find_base_plan, - find_out_reference_exprs, grouping_set_expr_count, grouping_set_to_exprlist, - split_conjunction, + enumerate_grouping_sets, exprlist_to_fields, find_out_reference_exprs, + grouping_set_expr_count, grouping_set_to_exprlist, split_conjunction, }; use crate::{ build_join_schema, expr_vec_fmt, BinaryExpr, CreateMemoryTable, CreateView, Execute, @@ -56,7 +56,7 @@ use datafusion_common::tree_node::{ use datafusion_common::{ aggregate_functional_dependencies, internal_err, plan_err, Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, - FunctionalDependencies, ParamValues, Result, ScalarValue, TableReference, + FunctionalDependencies, ParamValues, Result, ScalarValue, Spans, TableReference, UnnestOptions, }; use indexmap::IndexSet; @@ -940,7 +940,9 @@ impl LogicalPlan { })) } LogicalPlan::Subquery(Subquery { - outer_ref_columns, .. + outer_ref_columns, + spans, + .. }) => { self.assert_no_expressions(expr)?; let input = self.only_input(inputs)?; @@ -948,6 +950,7 @@ impl LogicalPlan { Ok(LogicalPlan::Subquery(Subquery { subquery: Arc::new(subquery), outer_ref_columns: outer_ref_columns.clone(), + spans: spans.clone(), })) } LogicalPlan::SubqueryAlias(SubqueryAlias { alias, .. 
}) => { @@ -1084,6 +1087,7 @@ impl LogicalPlan { Ok(LogicalPlan::Explain(Explain { verbose: e.verbose, plan: Arc::new(input), + explain_format: e.explain_format.clone(), stringified_plans: e.stringified_plans.clone(), schema: Arc::clone(&e.schema), logical_optimization_succeeded: e.logical_optimization_succeeded, @@ -2813,6 +2817,7 @@ impl Union { } } + let mut name_counts: HashMap = HashMap::new(); let union_fields = (0..fields_count) .map(|i| { let fields = inputs @@ -2820,7 +2825,8 @@ impl Union { .map(|input| input.schema().field(i)) .collect::>(); let first_field = fields[0]; - let name = first_field.name(); + let base_name = first_field.name().to_string(); + let data_type = if loose_types { // TODO apply type coercion here, or document why it's better to defer // temporarily use the data type from the left input and later rely on the analyzer to @@ -2843,13 +2849,21 @@ impl Union { )? }; let nullable = fields.iter().any(|field| field.is_nullable()); - let mut field = Field::new(name, data_type.clone(), nullable); + + // Generate unique field name + let name = if let Some(count) = name_counts.get_mut(&base_name) { + *count += 1; + format!("{}_{}", base_name, count) + } else { + name_counts.insert(base_name.clone(), 0); + base_name + }; + + let mut field = Field::new(&name, data_type.clone(), nullable); let field_metadata = intersect_maps(fields.iter().map(|field| field.metadata())); field.set_metadata(field_metadata); - // TODO reusing table reference from the first schema is probably wrong - let table_reference = first_schema.qualified_field(i).0.cloned(); - Ok((table_reference, Arc::new(field))) + Ok((None, Arc::new(field))) }) .collect::>()?; let union_schema_metadata = @@ -2924,12 +2938,167 @@ impl PartialOrd for DescribeTable { } } +/// Output formats for controlling for Explain plans +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum ExplainFormat { + /// Indent mode + /// + /// Example: + /// ```text + /// > explain format indent select x from values (1) t(x); + /// +---------------+-----------------------------------------------------+ + /// | plan_type | plan | + /// +---------------+-----------------------------------------------------+ + /// | logical_plan | SubqueryAlias: t | + /// | | Projection: column1 AS x | + /// | | Values: (Int64(1)) | + /// | physical_plan | ProjectionExec: expr=[column1@0 as x] | + /// | | DataSourceExec: partitions=1, partition_sizes=[1] | + /// | | | + /// +---------------+-----------------------------------------------------+ + /// ``` + Indent, + /// Tree mode + /// + /// Example: + /// ```text + /// > explain format tree select x from values (1) t(x); + /// +---------------+-------------------------------+ + /// | plan_type | plan | + /// +---------------+-------------------------------+ + /// | physical_plan | ┌───────────────────────────┐ | + /// | | │ ProjectionExec │ | + /// | | │ -------------------- │ | + /// | | │ x: column1@0 │ | + /// | | └─────────────┬─────────────┘ | + /// | | ┌─────────────┴─────────────┐ | + /// | | │ DataSourceExec │ | + /// | | │ -------------------- │ | + /// | | │ bytes: 128 │ | + /// | | │ format: memory │ | + /// | | │ rows: 1 │ | + /// | | └───────────────────────────┘ | + /// | | | + /// +---------------+-------------------------------+ + /// ``` + Tree, + /// Postgres Json mode + /// + /// A displayable structure that produces plan in postgresql JSON format. 
+ /// + /// Users can use this format to visualize the plan in existing plan + /// visualization tools, for example [dalibo](https://explain.dalibo.com/) + /// + /// Example: + /// ```text + /// > explain format pgjson select x from values (1) t(x); + /// +--------------+--------------------------------------+ + /// | plan_type | plan | + /// +--------------+--------------------------------------+ + /// | logical_plan | [ | + /// | | { | + /// | | "Plan": { | + /// | | "Alias": "t", | + /// | | "Node Type": "Subquery", | + /// | | "Output": [ | + /// | | "x" | + /// | | ], | + /// | | "Plans": [ | + /// | | { | + /// | | "Expressions": [ | + /// | | "column1 AS x" | + /// | | ], | + /// | | "Node Type": "Projection", | + /// | | "Output": [ | + /// | | "x" | + /// | | ], | + /// | | "Plans": [ | + /// | | { | + /// | | "Node Type": "Values", | + /// | | "Output": [ | + /// | | "column1" | + /// | | ], | + /// | | "Plans": [], | + /// | | "Values": "(Int64(1))" | + /// | | } | + /// | | ] | + /// | | } | + /// | | ] | + /// | | } | + /// | | } | + /// | | ] | + /// +--------------+--------------------------------------+ + /// ``` + PostgresJSON, + /// Graphviz mode + /// + /// Example: + /// ```text + /// > explain format graphviz select x from values (1) t(x); + /// +--------------+------------------------------------------------------------------------+ + /// | plan_type | plan | + /// +--------------+------------------------------------------------------------------------+ + /// | logical_plan | | + /// | | // Begin DataFusion GraphViz Plan, | + /// | | // display it online here: https://dreampuf.github.io/GraphvizOnline | + /// | | | + /// | | digraph { | + /// | | subgraph cluster_1 | + /// | | { | + /// | | graph[label="LogicalPlan"] | + /// | | 2[shape=box label="SubqueryAlias: t"] | + /// | | 3[shape=box label="Projection: column1 AS x"] | + /// | | 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back] | + /// | | 4[shape=box label="Values: (Int64(1))"] | + /// | | 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back] | + /// | | } | + /// | | subgraph cluster_5 | + /// | | { | + /// | | graph[label="Detailed LogicalPlan"] | + /// | | 6[shape=box label="SubqueryAlias: t\nSchema: [x:Int64;N]"] | + /// | | 7[shape=box label="Projection: column1 AS x\nSchema: [x:Int64;N]"] | + /// | | 6 -> 7 [arrowhead=none, arrowtail=normal, dir=back] | + /// | | 8[shape=box label="Values: (Int64(1))\nSchema: [column1:Int64;N]"] | + /// | | 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back] | + /// | | } | + /// | | } | + /// | | // End DataFusion GraphViz Plan | + /// | | | + /// +--------------+------------------------------------------------------------------------+ + /// ``` + Graphviz, +} + +/// Implement parsing strings to `ExplainFormat` +impl FromStr for ExplainFormat { + type Err = DataFusionError; + + fn from_str(format: &str) -> std::result::Result { + match format.to_lowercase().as_str() { + "indent" => Ok(ExplainFormat::Indent), + "tree" => Ok(ExplainFormat::Tree), + "pgjson" => Ok(ExplainFormat::PostgresJSON), + "graphviz" => Ok(ExplainFormat::Graphviz), + _ => { + plan_err!("Invalid explain format. Expected 'indent', 'tree', 'pgjson' or 'graphviz'. 
Got '{format}'") + } + } + } +} + /// Produces a relation with string representations of /// various parts of the plan +/// +/// See [the documentation] for more information +/// +/// [the documentation]: https://datafusion.apache.org/user-guide/sql/explain.html #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Explain { /// Should extra (detailed, intermediate plans) be included? pub verbose: bool, + /// Output format for explain, if specified. + /// If none, defaults to `text` + pub explain_format: ExplainFormat, /// The logical plan that is being EXPLAIN'd pub plan: Arc, /// Represent the various stages plans have gone through @@ -3494,11 +3663,10 @@ fn calc_func_dependencies_for_project( .flatten() .collect::>(); - let len = exprlist_len(exprs, input.schema(), Some(find_base_plan(input).schema()))?; Ok(input .schema() .functional_dependencies() - .project_functional_dependencies(&proj_indices, len)) + .project_functional_dependencies(&proj_indices, exprs.len())) } /// Sorts its input according to a list of sort expressions. @@ -3617,6 +3785,8 @@ pub struct Subquery { pub subquery: Arc, /// The outer references used in the subquery pub outer_ref_columns: Vec, + /// Span information for subquery projection columns + pub spans: Spans, } impl Normalizeable for Subquery { @@ -3651,6 +3821,7 @@ impl Subquery { Subquery { subquery: plan, outer_ref_columns: self.outer_ref_columns.clone(), + spans: Spans::new(), } } } diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index dfc18c74c70a..7f6e1e025387 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -159,10 +159,12 @@ impl TreeNode for LogicalPlan { LogicalPlan::Subquery(Subquery { subquery, outer_ref_columns, + spans, }) => subquery.map_elements(f)?.update_data(|subquery| { LogicalPlan::Subquery(Subquery { subquery, outer_ref_columns, + spans, }) }), LogicalPlan::SubqueryAlias(SubqueryAlias { @@ -202,6 +204,7 @@ impl TreeNode for LogicalPlan { .update_data(LogicalPlan::Distinct), LogicalPlan::Explain(Explain { verbose, + explain_format: format, plan, stringified_plans, schema, @@ -209,6 +212,7 @@ impl TreeNode for LogicalPlan { }) => plan.map_elements(f)?.update_data(|plan| { LogicalPlan::Explain(Explain { verbose, + explain_format: format, plan, stringified_plans, schema, diff --git a/datafusion/expr/src/select_expr.rs b/datafusion/expr/src/select_expr.rs new file mode 100644 index 000000000000..039df20f397b --- /dev/null +++ b/datafusion/expr/src/select_expr.rs @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
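The `FromStr` implementation above is what lets `explain format <keyword>` map a user-supplied string onto the enum. A quick sketch of its behaviour, assuming the `datafusion_expr::logical_plan::ExplainFormat` re-export introduced in this patch:

```rust
use std::str::FromStr;

use datafusion_expr::logical_plan::ExplainFormat;

fn main() {
    // Parsing is case-insensitive
    assert_eq!(ExplainFormat::from_str("TREE").unwrap(), ExplainFormat::Tree);
    assert_eq!(
        ExplainFormat::from_str("pgjson").unwrap(),
        ExplainFormat::PostgresJSON
    );

    // Unknown keywords surface a plan error
    assert!(ExplainFormat::from_str("yaml").is_err());
}
```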
+ +use std::fmt; + +use arrow::datatypes::FieldRef; +use datafusion_common::{Column, TableReference}; + +use crate::{expr::WildcardOptions, Expr}; + +/// Represents a SELECT expression in a SQL query. +/// +/// `SelectExpr` supports three types of expressions commonly found in the SELECT clause: +/// +/// * Wildcard (`*`) - Selects all columns +/// * Qualified wildcard (`table.*`) - Selects all columns from a specific table +/// * Regular expression - Any other expression like columns, functions, literals etc. +/// +/// This enum is typically used when you need to handle wildcards. After expanding `*` in the query, +/// you can use `Expr` for all other expressions. +/// +/// # Examples +/// +/// ``` +/// use datafusion_expr::col; +/// use datafusion_expr::expr::WildcardOptions; +/// use datafusion_expr::select_expr::SelectExpr; +/// +/// // SELECT * +/// let wildcard = SelectExpr::Wildcard(WildcardOptions::default()); +/// +/// // SELECT mytable.* +/// let qualified = SelectExpr::QualifiedWildcard( +/// "mytable".into(), +/// WildcardOptions::default() +/// ); +/// +/// // SELECT col1 +/// let expr = SelectExpr::Expression(col("col1").into()); +/// ``` +#[derive(Clone, Debug)] +pub enum SelectExpr { + /// Represents a wildcard (`*`) that selects all columns from all tables. + /// The `WildcardOptions` control additional behavior like exclusions. + Wildcard(WildcardOptions), + + /// Represents a qualified wildcard (`table.*`) that selects all columns from a specific table. + /// The `TableReference` specifies the table and `WildcardOptions` control additional behavior. + QualifiedWildcard(TableReference, WildcardOptions), + + /// Represents any other valid SELECT expression like column references, + /// function calls, literals, etc. + Expression(Expr), +} + +impl fmt::Display for SelectExpr { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + SelectExpr::Wildcard(opt) => write!(f, "*{opt}"), + SelectExpr::QualifiedWildcard(table, opt) => write!(f, "{table}.*{opt}"), + SelectExpr::Expression(expr) => write!(f, "{expr}"), + } + } +} + +impl From for SelectExpr { + fn from(expr: Expr) -> Self { + SelectExpr::Expression(expr) + } +} + +/// Create an [`SelectExpr::Expression`] from a [`Column`] +impl From for SelectExpr { + fn from(value: Column) -> Self { + Expr::Column(value).into() + } +} + +/// Create an [`SelectExpr::Expression`] from an optional qualifier and a [`FieldRef`]. This is +/// useful for creating [`SelectExpr::Expression`] from a `DFSchema`. 
+/// +/// See example on [`Expr`] +impl<'a> From<(Option<&'a TableReference>, &'a FieldRef)> for SelectExpr { + fn from(value: (Option<&'a TableReference>, &'a FieldRef)) -> Self { + Expr::from(Column::from(value)).into() + } +} diff --git a/datafusion/expr/src/tree_node.rs b/datafusion/expr/src/tree_node.rs index 49cc79c60a27..f20dab7e165f 100644 --- a/datafusion/expr/src/tree_node.rs +++ b/datafusion/expr/src/tree_node.rs @@ -132,7 +132,10 @@ impl TreeNode for Expr { expr, relation, name, - }) => f(*expr)?.update_data(|e| e.alias_qualified(relation, name)), + metadata, + }) => f(*expr)?.update_data(|e| { + e.alias_qualified_with_metadata(relation, name, metadata) + }), Expr::InSubquery(InSubquery { expr, subquery, diff --git a/datafusion/expr/src/type_coercion/mod.rs b/datafusion/expr/src/type_coercion/mod.rs index 3a5c65fb46ee..4fc150ef2996 100644 --- a/datafusion/expr/src/type_coercion/mod.rs +++ b/datafusion/expr/src/type_coercion/mod.rs @@ -79,9 +79,12 @@ pub fn is_datetime(dt: &DataType) -> bool { ) } -/// Determine whether the given data type `dt` is a `Utf8` or `LargeUtf8`. -pub fn is_utf8_or_large_utf8(dt: &DataType) -> bool { - matches!(dt, DataType::Utf8 | DataType::LargeUtf8) +/// Determine whether the given data type `dt` is a `Utf8` or `Utf8View` or `LargeUtf8`. +pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool { + matches!( + dt, + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 + ) } /// Determine whether the given data type `dt` is a `Decimal`. diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index f9039cea2edc..b75e8fd3cd3c 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -31,7 +31,7 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use crate::expr::{ schema_name_from_exprs, schema_name_from_exprs_comma_separated_without_space, - schema_name_from_sorts, AggregateFunction, AggregateFunctionParams, + schema_name_from_sorts, AggregateFunction, AggregateFunctionParams, ExprListDisplay, WindowFunctionParams, }; use crate::function::{ @@ -175,6 +175,13 @@ impl AggregateUDF { self.inner.schema_name(params) } + /// Returns a human readable expression. + /// + /// See [`Expr::human_display`] for details. + pub fn human_display(&self, params: &AggregateFunctionParams) -> Result { + self.inner.human_display(params) + } + pub fn window_function_schema_name( &self, params: &WindowFunctionParams, @@ -452,6 +459,45 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { Ok(schema_name) } + /// Returns a human readable expression. + /// + /// See [`Expr::human_display`] for details. + fn human_display(&self, params: &AggregateFunctionParams) -> Result { + let AggregateFunctionParams { + args, + distinct, + filter, + order_by, + null_treatment, + } = params; + + let mut schema_name = String::new(); + + schema_name.write_fmt(format_args!( + "{}({}{})", + self.name(), + if *distinct { "DISTINCT " } else { "" }, + ExprListDisplay::comma_separated(args.as_slice()) + ))?; + + if let Some(null_treatment) = null_treatment { + schema_name.write_fmt(format_args!(" {}", null_treatment))?; + } + + if let Some(filter) = filter { + schema_name.write_fmt(format_args!(" FILTER (WHERE {filter})"))?; + }; + + if let Some(order_by) = order_by { + schema_name.write_fmt(format_args!( + " ORDER BY [{}]", + schema_name_from_sorts(order_by)? 
+ ))?; + }; + + Ok(schema_name) + } + /// Returns the name of the column this expression would create /// /// See [`Expr::schema_name`] for details diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index d724209d9dac..9b2400774a3d 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -20,9 +20,7 @@ use crate::expr::schema_name_from_exprs_comma_separated_without_space; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; -use crate::{ - ColumnarValue, Documentation, Expr, ScalarFunctionImplementation, Signature, -}; +use crate::{ColumnarValue, Documentation, Expr, Signature}; use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, ExprSchema, Result, ScalarValue}; use datafusion_expr_common::interval_arithmetic::Interval; @@ -172,7 +170,7 @@ impl ScalarUDF { /// /// # Notes /// - /// If a function implement [`ScalarUDFImpl::return_type_from_exprs`], + /// If a function implement [`ScalarUDFImpl::return_type_from_args`], /// its [`ScalarUDFImpl::return_type`] should raise an error. /// /// See [`ScalarUDFImpl::return_type`] for more details. @@ -180,22 +178,6 @@ impl ScalarUDF { self.inner.return_type(arg_types) } - /// The datatype this function returns given the input argument input types. - /// This function is used when the input arguments are [`Expr`]s. - /// - /// - /// See [`ScalarUDFImpl::return_type_from_exprs`] for more details. - #[allow(deprecated)] - pub fn return_type_from_exprs( - &self, - args: &[Expr], - schema: &dyn ExprSchema, - arg_types: &[DataType], - ) -> Result { - // If the implementation provides a return_type_from_exprs, use it - self.inner.return_type_from_exprs(args, schema, arg_types) - } - /// Return the datatype this function returns given the input argument types. /// /// See [`ScalarUDFImpl::return_type_from_args`] for more details. @@ -214,27 +196,11 @@ impl ScalarUDF { self.inner.simplify(args, info) } - #[deprecated(since = "42.1.0", note = "Use `invoke_with_args` instead")] - pub fn invoke(&self, args: &[ColumnarValue]) -> Result { - #[allow(deprecated)] - self.inner.invoke(args) - } - #[allow(deprecated)] pub fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool { self.inner.is_nullable(args, schema) } - #[deprecated(since = "46.0.0", note = "Use `invoke_with_args` instead")] - pub fn invoke_batch( - &self, - args: &[ColumnarValue], - number_rows: usize, - ) -> Result { - #[allow(deprecated)] - self.inner.invoke_batch(args, number_rows) - } - /// Invoke the function on `args`, returning the appropriate result. /// /// See [`ScalarUDFImpl::invoke_with_args`] for details. @@ -242,25 +208,6 @@ impl ScalarUDF { self.inner.invoke_with_args(args) } - /// Invoke the function without `args` but number of rows, returning the appropriate result. - /// - /// Note: This method is deprecated and will be removed in future releases. - /// User defined functions should implement [`Self::invoke_with_args`] instead. 
- #[deprecated(since = "42.1.0", note = "Use `invoke_with_args` instead")] - pub fn invoke_no_args(&self, number_rows: usize) -> Result { - #[allow(deprecated)] - self.inner.invoke_no_args(number_rows) - } - - /// Returns a `ScalarFunctionImplementation` that can invoke the function - /// during execution - #[deprecated(since = "42.0.0", note = "Use `invoke_with_args` instead")] - pub fn fun(&self) -> ScalarFunctionImplementation { - let captured = Arc::clone(&self.inner); - #[allow(deprecated)] - Arc::new(move |args| captured.invoke(args)) - } - /// Get the circuits of inner implementation pub fn short_circuits(&self) -> bool { self.inner.short_circuits() @@ -351,7 +298,7 @@ pub struct ScalarFunctionArgs<'a> { pub args: Vec, /// The number of rows in record batch being evaluated pub number_rows: usize, - /// The return type of the scalar function returned (from `return_type` or `return_type_from_exprs`) + /// The return type of the scalar function returned (from `return_type` or `return_type_from_args`) /// when creating the physical expression from the logical expression pub return_type: &'a DataType, } @@ -540,16 +487,6 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal fn return_type(&self, arg_types: &[DataType]) -> Result; - #[deprecated(since = "45.0.0", note = "Use `return_type_from_args` instead")] - fn return_type_from_exprs( - &self, - _args: &[Expr], - _schema: &dyn ExprSchema, - arg_types: &[DataType], - ) -> Result { - self.return_type(arg_types) - } - /// What type will be returned by this function, given the arguments? /// /// By default, this function calls [`Self::return_type`] with the @@ -594,47 +531,6 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { true } - /// Invoke the function on `args`, returning the appropriate result - /// - /// Note: This method is deprecated and will be removed in future releases. - /// User defined functions should implement [`Self::invoke_with_args`] instead. - #[deprecated(since = "42.1.0", note = "Use `invoke_with_args` instead")] - fn invoke(&self, _args: &[ColumnarValue]) -> Result { - not_impl_err!( - "Function {} does not implement invoke but called", - self.name() - ) - } - - /// Invoke the function with `args` and the number of rows, - /// returning the appropriate result. - /// - /// Note: See notes on [`Self::invoke_with_args`] - /// - /// Note: This method is deprecated and will be removed in future releases. - /// User defined functions should implement [`Self::invoke_with_args`] instead. - /// - /// See for more details. - #[deprecated(since = "46.0.0", note = "Use `invoke_with_args` instead")] - fn invoke_batch( - &self, - args: &[ColumnarValue], - number_rows: usize, - ) -> Result { - match args.is_empty() { - true => - { - #[allow(deprecated)] - self.invoke_no_args(number_rows) - } - false => - { - #[allow(deprecated)] - self.invoke(args) - } - } - } - /// Invoke the function returning the appropriate result. /// /// # Performance @@ -645,23 +541,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// /// [`ColumnarValue::values_to_arrays`] can be used to convert the arguments /// to arrays, which will likely be simpler code, but be slower. - fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - #[allow(deprecated)] - self.invoke_batch(&args.args, args.number_rows) - } - - /// Invoke the function without `args`, instead the number of rows are provided, - /// returning the appropriate result. 
- /// - /// Note: This method is deprecated and will be removed in future releases. - /// User defined functions should implement [`Self::invoke_with_args`] instead. - #[deprecated(since = "42.1.0", note = "Use `invoke_with_args` instead")] - fn invoke_no_args(&self, _number_rows: usize) -> Result { - not_impl_err!( - "Function {} does not implement invoke_no_args but called", - self.name() - ) - } + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result; /// Returns any aliases (alternate names) for this function. /// @@ -889,16 +769,6 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { &self.aliases } - #[allow(deprecated)] - fn return_type_from_exprs( - &self, - args: &[Expr], - schema: &dyn ExprSchema, - arg_types: &[DataType], - ) -> Result { - self.inner.return_type_from_exprs(args, schema, arg_types) - } - fn return_type_from_args(&self, args: ReturnTypeArgs) -> Result { self.inner.return_type_from_args(args) } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 3404cce17188..552ce1502d46 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -19,7 +19,6 @@ use std::cmp::Ordering; use std::collections::{BTreeSet, HashSet}; -use std::ops::Deref; use std::sync::Arc; use crate::expr::{Alias, Sort, WildcardOptions, WindowFunction, WindowFunctionParams}; @@ -576,7 +575,7 @@ pub fn compare_sort_expr( /// Group a slice of window expression expr by their order by expressions pub fn group_window_expr_by_sort_keys( - window_expr: Vec, + window_expr: impl IntoIterator, ) -> Result)>> { let mut result = vec![]; window_expr.into_iter().try_for_each(|expr| match &expr { @@ -696,165 +695,11 @@ pub fn exprlist_to_fields<'a>( plan: &LogicalPlan, ) -> Result, Arc)>> { // Look for exact match in plan's output schema - let wildcard_schema = find_base_plan(plan).schema(); let input_schema = plan.schema(); - let result = exprs - .into_iter() - .map(|e| match e { - #[expect(deprecated)] - Expr::Wildcard { qualifier, options } => match qualifier { - None => { - let mut excluded = exclude_using_columns(plan)?; - excluded.extend(get_excluded_columns( - options.exclude.as_ref(), - options.except.as_ref(), - wildcard_schema, - None, - )?); - Ok(wildcard_schema - .iter() - .filter(|(q, f)| { - !excluded.contains(&Column::new(q.cloned(), f.name())) - }) - .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .collect::>()) - } - Some(qualifier) => { - let excluded: Vec = get_excluded_columns( - options.exclude.as_ref(), - options.except.as_ref(), - wildcard_schema, - Some(qualifier), - )? - .into_iter() - .map(|c| c.flat_name()) - .collect(); - Ok(wildcard_schema - .fields_with_qualified(qualifier) - .into_iter() - .filter_map(|field| { - let flat_name = format!("{}.{}", qualifier, field.name()); - if excluded.contains(&flat_name) { - None - } else { - Some(( - Some(qualifier.clone()), - Arc::new(field.to_owned()), - )) - } - }) - .collect::>()) - } - }, - _ => Ok(vec![e.to_field(input_schema)?]), - }) - .collect::>>()? - .into_iter() - .flatten() - .collect(); - Ok(result) -} - -/// Find the suitable base plan to expand the wildcard expression recursively. -/// When planning [LogicalPlan::Window] and [LogicalPlan::Aggregate], we will generate -/// an intermediate plan based on the relation plan (e.g. [LogicalPlan::TableScan], [LogicalPlan::Subquery], ...). -/// If we expand a wildcard expression basing the intermediate plan, we could get some duplicate fields. 
-pub fn find_base_plan(input: &LogicalPlan) -> &LogicalPlan { - match input { - LogicalPlan::Window(window) => find_base_plan(&window.input), - LogicalPlan::Aggregate(agg) => find_base_plan(&agg.input), - // [SqlToRel::try_process_unnest] will convert Expr(Unnest(Expr)) to Projection/Unnest/Projection - // We should expand the wildcard expression based on the input plan of the inner Projection. - LogicalPlan::Unnest(unnest) => { - if let LogicalPlan::Projection(projection) = unnest.input.deref() { - find_base_plan(&projection.input) - } else { - input - } - } - LogicalPlan::Filter(filter) => { - if filter.having { - // If a filter is used for a having clause, its input plan is an aggregation. - // We should expand the wildcard expression based on the aggregation's input plan. - find_base_plan(&filter.input) - } else { - input - } - } - _ => input, - } -} - -/// Count the number of real fields. We should expand the wildcard expression to get the actual number. -pub fn exprlist_len( - exprs: &[Expr], - schema: &DFSchemaRef, - wildcard_schema: Option<&DFSchemaRef>, -) -> Result { exprs - .iter() - .map(|e| match e { - #[expect(deprecated)] - Expr::Wildcard { - qualifier: None, - options, - } => { - let excluded = get_excluded_columns( - options.exclude.as_ref(), - options.except.as_ref(), - wildcard_schema.unwrap_or(schema), - None, - )? - .into_iter() - .collect::>(); - Ok( - get_exprs_except_skipped(wildcard_schema.unwrap_or(schema), excluded) - .len(), - ) - } - #[expect(deprecated)] - Expr::Wildcard { - qualifier: Some(qualifier), - options, - } => { - let related_wildcard_schema = wildcard_schema.as_ref().map_or_else( - || Ok(Arc::clone(schema)), - |schema| { - // Eliminate the fields coming from other tables. - let qualified_fields = schema - .fields() - .iter() - .enumerate() - .filter_map(|(idx, field)| { - let (maybe_table_ref, _) = schema.qualified_field(idx); - if maybe_table_ref.is_none_or(|q| q == qualifier) { - Some((maybe_table_ref.cloned(), Arc::clone(field))) - } else { - None - } - }) - .collect::>(); - let metadata = schema.metadata().clone(); - DFSchema::new_with_metadata(qualified_fields, metadata) - .map(Arc::new) - }, - )?; - let excluded = get_excluded_columns( - options.exclude.as_ref(), - options.except.as_ref(), - related_wildcard_schema.as_ref(), - Some(qualifier), - )? - .into_iter() - .collect::>(); - Ok( - get_exprs_except_skipped(related_wildcard_schema.as_ref(), excluded) - .len(), - ) - } - _ => Ok(1), - }) - .sum() + .into_iter() + .map(|e| e.to_field(input_schema)) + .collect() } /// Convert an expression into Column expression if it's already provided as input plan. diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs index 82b33650523b..8771b25137cf 100644 --- a/datafusion/expr/src/window_frame.rs +++ b/datafusion/expr/src/window_frame.rs @@ -29,7 +29,7 @@ use std::fmt::{self, Formatter}; use std::hash::Hash; use datafusion_common::{plan_err, sql_err, DataFusionError, Result, ScalarValue}; -use sqlparser::ast; +use sqlparser::ast::{self, ValueWithSpan}; use sqlparser::parser::ParserError::ParserError; /// The frame specification determines which output rows are read by an aggregate @@ -368,7 +368,7 @@ fn convert_frame_bound_to_scalar_value( match units { // For ROWS and GROUPS we are sure that the ScalarValue must be a non-negative integer ... 
ast::WindowFrameUnits::Rows | ast::WindowFrameUnits::Groups => match v { - ast::Expr::Value(ast::Value::Number(value, false)) => { + ast::Expr::Value(ValueWithSpan{value: ast::Value::Number(value, false), span: _}) => { Ok(ScalarValue::try_from_string(value, &DataType::UInt64)?) }, ast::Expr::Interval(ast::Interval { @@ -379,7 +379,7 @@ fn convert_frame_bound_to_scalar_value( fractional_seconds_precision: None, }) => { let value = match *value { - ast::Expr::Value(ast::Value::SingleQuotedString(item)) => item, + ast::Expr::Value(ValueWithSpan{value: ast::Value::SingleQuotedString(item), span: _}) => item, e => { return sql_err!(ParserError(format!( "INTERVAL expression cannot be {e:?}" @@ -395,14 +395,14 @@ fn convert_frame_bound_to_scalar_value( // ... instead for RANGE it could be anything depending on the type of the ORDER BY clause, // so we use a ScalarValue::Utf8. ast::WindowFrameUnits::Range => Ok(ScalarValue::Utf8(Some(match v { - ast::Expr::Value(ast::Value::Number(value, false)) => value, + ast::Expr::Value(ValueWithSpan{value: ast::Value::Number(value, false), span: _}) => value, ast::Expr::Interval(ast::Interval { value, leading_field, .. }) => { let result = match *value { - ast::Expr::Value(ast::Value::SingleQuotedString(item)) => item, + ast::Expr::Value(ValueWithSpan{value: ast::Value::SingleQuotedString(item), span: _}) => item, e => { return sql_err!(ParserError(format!( "INTERVAL expression cannot be {e:?}" @@ -514,10 +514,10 @@ mod tests { let window_frame = ast::WindowFrame { units: ast::WindowFrameUnits::Rows, start_bound: ast::WindowFrameBound::Preceding(Some(Box::new( - ast::Expr::Value(ast::Value::Number("2".to_string(), false)), + ast::Expr::value(ast::Value::Number("2".to_string(), false)), ))), end_bound: Some(ast::WindowFrameBound::Preceding(Some(Box::new( - ast::Expr::Value(ast::Value::Number("1".to_string(), false)), + ast::Expr::value(ast::Value::Number("1".to_string(), false)), )))), }; @@ -575,10 +575,9 @@ mod tests { test_bound!(Range, None, ScalarValue::Null); // Number - let number = Some(Box::new(ast::Expr::Value(ast::Value::Number( - "42".to_string(), - false, - )))); + let number = Some(Box::new(ast::Expr::Value( + ast::Value::Number("42".to_string(), false).into(), + ))); test_bound!(Rows, number.clone(), ScalarValue::UInt64(Some(42))); test_bound!(Groups, number.clone(), ScalarValue::UInt64(Some(42))); test_bound!( @@ -589,9 +588,9 @@ mod tests { // Interval let number = Some(Box::new(ast::Expr::Interval(ast::Interval { - value: Box::new(ast::Expr::Value(ast::Value::SingleQuotedString( - "1".to_string(), - ))), + value: Box::new(ast::Expr::Value( + ast::Value::SingleQuotedString("1".to_string()).into(), + )), leading_field: Some(ast::DateTimeField::Day), fractional_seconds_precision: None, last_field: None, diff --git a/datafusion/ffi/src/catalog_provider.rs b/datafusion/ffi/src/catalog_provider.rs new file mode 100644 index 000000000000..0886d4749d72 --- /dev/null +++ b/datafusion/ffi/src/catalog_provider.rs @@ -0,0 +1,338 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, ffi::c_void, sync::Arc}; + +use abi_stable::{ + std_types::{ROption, RResult, RString, RVec}, + StableAbi, +}; +use datafusion::catalog::{CatalogProvider, SchemaProvider}; +use tokio::runtime::Handle; + +use crate::{ + df_result, rresult_return, + schema_provider::{FFI_SchemaProvider, ForeignSchemaProvider}, +}; + +use datafusion::error::Result; + +/// A stable struct for sharing [`CatalogProvider`] across FFI boundaries. +#[repr(C)] +#[derive(Debug, StableAbi)] +#[allow(non_camel_case_types)] +pub struct FFI_CatalogProvider { + pub schema_names: unsafe extern "C" fn(provider: &Self) -> RVec, + + pub schema: unsafe extern "C" fn( + provider: &Self, + name: RString, + ) -> ROption, + + pub register_schema: + unsafe extern "C" fn( + provider: &Self, + name: RString, + schema: &FFI_SchemaProvider, + ) -> RResult, RString>, + + pub deregister_schema: + unsafe extern "C" fn( + provider: &Self, + name: RString, + cascade: bool, + ) -> RResult, RString>, + + /// Used to create a clone on the provider of the execution plan. This should + /// only need to be called by the receiver of the plan. + pub clone: unsafe extern "C" fn(plan: &Self) -> Self, + + /// Release the memory of the private data when it is no longer being used. + pub release: unsafe extern "C" fn(arg: &mut Self), + + /// Return the major DataFusion version number of this provider. + pub version: unsafe extern "C" fn() -> u64, + + /// Internal data. This is only to be accessed by the provider of the plan. + /// A [`ForeignCatalogProvider`] should never attempt to access this data. 
+ pub private_data: *mut c_void, +} + +unsafe impl Send for FFI_CatalogProvider {} +unsafe impl Sync for FFI_CatalogProvider {} + +struct ProviderPrivateData { + provider: Arc, + runtime: Option, +} + +impl FFI_CatalogProvider { + unsafe fn inner(&self) -> &Arc { + let private_data = self.private_data as *const ProviderPrivateData; + &(*private_data).provider + } + + unsafe fn runtime(&self) -> Option { + let private_data = self.private_data as *const ProviderPrivateData; + (*private_data).runtime.clone() + } +} + +unsafe extern "C" fn schema_names_fn_wrapper( + provider: &FFI_CatalogProvider, +) -> RVec { + let names = provider.inner().schema_names(); + names.into_iter().map(|s| s.into()).collect() +} + +unsafe extern "C" fn schema_fn_wrapper( + provider: &FFI_CatalogProvider, + name: RString, +) -> ROption { + let maybe_schema = provider.inner().schema(name.as_str()); + maybe_schema + .map(|schema| FFI_SchemaProvider::new(schema, provider.runtime())) + .into() +} + +unsafe extern "C" fn register_schema_fn_wrapper( + provider: &FFI_CatalogProvider, + name: RString, + schema: &FFI_SchemaProvider, +) -> RResult, RString> { + let runtime = provider.runtime(); + let provider = provider.inner(); + let schema = Arc::new(ForeignSchemaProvider::from(schema)); + + let returned_schema = + rresult_return!(provider.register_schema(name.as_str(), schema)) + .map(|schema| FFI_SchemaProvider::new(schema, runtime)) + .into(); + + RResult::ROk(returned_schema) +} + +unsafe extern "C" fn deregister_schema_fn_wrapper( + provider: &FFI_CatalogProvider, + name: RString, + cascade: bool, +) -> RResult, RString> { + let runtime = provider.runtime(); + let provider = provider.inner(); + + let maybe_schema = + rresult_return!(provider.deregister_schema(name.as_str(), cascade)); + + RResult::ROk( + maybe_schema + .map(|schema| FFI_SchemaProvider::new(schema, runtime)) + .into(), + ) +} + +unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_CatalogProvider) { + let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData); + drop(private_data); +} + +unsafe extern "C" fn clone_fn_wrapper( + provider: &FFI_CatalogProvider, +) -> FFI_CatalogProvider { + let old_private_data = provider.private_data as *const ProviderPrivateData; + let runtime = (*old_private_data).runtime.clone(); + + let private_data = Box::into_raw(Box::new(ProviderPrivateData { + provider: Arc::clone(&(*old_private_data).provider), + runtime, + })) as *mut c_void; + + FFI_CatalogProvider { + schema_names: schema_names_fn_wrapper, + schema: schema_fn_wrapper, + register_schema: register_schema_fn_wrapper, + deregister_schema: deregister_schema_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + version: super::version, + private_data, + } +} + +impl Drop for FFI_CatalogProvider { + fn drop(&mut self) { + unsafe { (self.release)(self) } + } +} + +impl FFI_CatalogProvider { + /// Creates a new [`FFI_CatalogProvider`]. 
+ pub fn new( + provider: Arc, + runtime: Option, + ) -> Self { + let private_data = Box::new(ProviderPrivateData { provider, runtime }); + + Self { + schema_names: schema_names_fn_wrapper, + schema: schema_fn_wrapper, + register_schema: register_schema_fn_wrapper, + deregister_schema: deregister_schema_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + version: super::version, + private_data: Box::into_raw(private_data) as *mut c_void, + } + } +} + +/// This wrapper struct exists on the receiver side of the FFI interface, so it has +/// no guarantees about being able to access the data in `private_data`. Any functions +/// defined on this struct must only use the stable functions provided in +/// FFI_CatalogProvider to interact with the foreign table provider. +#[derive(Debug)] +pub struct ForeignCatalogProvider(FFI_CatalogProvider); + +unsafe impl Send for ForeignCatalogProvider {} +unsafe impl Sync for ForeignCatalogProvider {} + +impl From<&FFI_CatalogProvider> for ForeignCatalogProvider { + fn from(provider: &FFI_CatalogProvider) -> Self { + Self(provider.clone()) + } +} + +impl Clone for FFI_CatalogProvider { + fn clone(&self) -> Self { + unsafe { (self.clone)(self) } + } +} + +impl CatalogProvider for ForeignCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + unsafe { + (self.0.schema_names)(&self.0) + .into_iter() + .map(|s| s.into()) + .collect() + } + } + + fn schema(&self, name: &str) -> Option> { + unsafe { + let maybe_provider: Option = + (self.0.schema)(&self.0, name.into()).into(); + + maybe_provider.map(|provider| { + Arc::new(ForeignSchemaProvider(provider)) as Arc + }) + } + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> Result>> { + unsafe { + let schema = match schema.as_any().downcast_ref::() { + Some(s) => &s.0, + None => &FFI_SchemaProvider::new(schema, None), + }; + let returned_schema: Option = + df_result!((self.0.register_schema)(&self.0, name.into(), schema))? + .into(); + + Ok(returned_schema + .map(|s| Arc::new(ForeignSchemaProvider(s)) as Arc)) + } + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> Result>> { + unsafe { + let returned_schema: Option = + df_result!((self.0.deregister_schema)(&self.0, name.into(), cascade))? 
+ .into(); + + Ok(returned_schema + .map(|s| Arc::new(ForeignSchemaProvider(s)) as Arc)) + } + } +} + +#[cfg(test)] +mod tests { + use datafusion::catalog::{MemoryCatalogProvider, MemorySchemaProvider}; + + use super::*; + + #[test] + fn test_round_trip_ffi_catalog_provider() { + let prior_schema = Arc::new(MemorySchemaProvider::new()); + + let catalog = Arc::new(MemoryCatalogProvider::new()); + assert!(catalog + .as_ref() + .register_schema("prior_schema", prior_schema) + .unwrap() + .is_none()); + + let ffi_catalog = FFI_CatalogProvider::new(catalog, None); + + let foreign_catalog: ForeignCatalogProvider = (&ffi_catalog).into(); + + let prior_schema_names = foreign_catalog.schema_names(); + assert_eq!(prior_schema_names.len(), 1); + assert_eq!(prior_schema_names[0], "prior_schema"); + + // Replace an existing schema with one of the same name + let returned_schema = foreign_catalog + .register_schema("prior_schema", Arc::new(MemorySchemaProvider::new())) + .expect("Unable to register schema"); + assert!(returned_schema.is_some()); + assert_eq!(foreign_catalog.schema_names().len(), 1); + + // Add a new schema name + let returned_schema = foreign_catalog + .register_schema("second_schema", Arc::new(MemorySchemaProvider::new())) + .expect("Unable to register schema"); + assert!(returned_schema.is_none()); + assert_eq!(foreign_catalog.schema_names().len(), 2); + + // Remove a schema + let returned_schema = foreign_catalog + .deregister_schema("prior_schema", false) + .expect("Unable to deregister schema"); + assert!(returned_schema.is_some()); + assert_eq!(foreign_catalog.schema_names().len(), 1); + + // Retrieve non-existant schema + let returned_schema = foreign_catalog.schema("prior_schema"); + assert!(returned_schema.is_none()); + + // Retrieve valid schema + let returned_schema = foreign_catalog.schema("second_schema"); + assert!(returned_schema.is_some()); + } +} diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs index 4eabf91d892a..877129fc5bb1 100644 --- a/datafusion/ffi/src/lib.rs +++ b/datafusion/ffi/src/lib.rs @@ -20,14 +20,17 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] pub mod arrow_wrappers; +pub mod catalog_provider; pub mod execution_plan; pub mod insert_op; pub mod plan_properties; pub mod record_batch_stream; +pub mod schema_provider; pub mod session_config; pub mod table_provider; pub mod table_source; diff --git a/datafusion/ffi/src/schema_provider.rs b/datafusion/ffi/src/schema_provider.rs new file mode 100644 index 000000000000..6e5a590e1a09 --- /dev/null +++ b/datafusion/ffi/src/schema_provider.rs @@ -0,0 +1,385 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, ffi::c_void, sync::Arc}; + +use abi_stable::{ + std_types::{ROption, RResult, RString, RVec}, + StableAbi, +}; +use async_ffi::{FfiFuture, FutureExt}; +use async_trait::async_trait; +use datafusion::{ + catalog::{SchemaProvider, TableProvider}, + error::DataFusionError, +}; +use tokio::runtime::Handle; + +use crate::{ + df_result, rresult_return, + table_provider::{FFI_TableProvider, ForeignTableProvider}, +}; + +use datafusion::error::Result; + +/// A stable struct for sharing [`SchemaProvider`] across FFI boundaries. +#[repr(C)] +#[derive(Debug, StableAbi)] +#[allow(non_camel_case_types)] +pub struct FFI_SchemaProvider { + pub owner_name: ROption, + + pub table_names: unsafe extern "C" fn(provider: &Self) -> RVec, + + pub table: unsafe extern "C" fn( + provider: &Self, + name: RString, + ) -> FfiFuture< + RResult, RString>, + >, + + pub register_table: + unsafe extern "C" fn( + provider: &Self, + name: RString, + table: FFI_TableProvider, + ) -> RResult, RString>, + + pub deregister_table: + unsafe extern "C" fn( + provider: &Self, + name: RString, + ) -> RResult, RString>, + + pub table_exist: unsafe extern "C" fn(provider: &Self, name: RString) -> bool, + + /// Used to create a clone on the provider of the execution plan. This should + /// only need to be called by the receiver of the plan. + pub clone: unsafe extern "C" fn(plan: &Self) -> Self, + + /// Release the memory of the private data when it is no longer being used. + pub release: unsafe extern "C" fn(arg: &mut Self), + + /// Return the major DataFusion version number of this provider. + pub version: unsafe extern "C" fn() -> u64, + + /// Internal data. This is only to be accessed by the provider of the plan. + /// A [`ForeignSchemaProvider`] should never attempt to access this data. 
+ pub private_data: *mut c_void, +} + +unsafe impl Send for FFI_SchemaProvider {} +unsafe impl Sync for FFI_SchemaProvider {} + +struct ProviderPrivateData { + provider: Arc, + runtime: Option, +} + +impl FFI_SchemaProvider { + unsafe fn inner(&self) -> &Arc { + let private_data = self.private_data as *const ProviderPrivateData; + &(*private_data).provider + } + + unsafe fn runtime(&self) -> Option { + let private_data = self.private_data as *const ProviderPrivateData; + (*private_data).runtime.clone() + } +} + +unsafe extern "C" fn table_names_fn_wrapper( + provider: &FFI_SchemaProvider, +) -> RVec { + let provider = provider.inner(); + + let table_names = provider.table_names(); + table_names.into_iter().map(|s| s.into()).collect() +} + +unsafe extern "C" fn table_fn_wrapper( + provider: &FFI_SchemaProvider, + name: RString, +) -> FfiFuture, RString>> { + let runtime = provider.runtime(); + let provider = Arc::clone(provider.inner()); + + async move { + let table = rresult_return!(provider.table(name.as_str()).await) + .map(|t| FFI_TableProvider::new(t, true, runtime)) + .into(); + + RResult::ROk(table) + } + .into_ffi() +} + +unsafe extern "C" fn register_table_fn_wrapper( + provider: &FFI_SchemaProvider, + name: RString, + table: FFI_TableProvider, +) -> RResult, RString> { + let runtime = provider.runtime(); + let provider = provider.inner(); + + let table = Arc::new(ForeignTableProvider(table)); + + let returned_table = rresult_return!(provider.register_table(name.into(), table)) + .map(|t| FFI_TableProvider::new(t, true, runtime)); + + RResult::ROk(returned_table.into()) +} + +unsafe extern "C" fn deregister_table_fn_wrapper( + provider: &FFI_SchemaProvider, + name: RString, +) -> RResult, RString> { + let runtime = provider.runtime(); + let provider = provider.inner(); + + let returned_table = rresult_return!(provider.deregister_table(name.as_str())) + .map(|t| FFI_TableProvider::new(t, true, runtime)); + + RResult::ROk(returned_table.into()) +} + +unsafe extern "C" fn table_exist_fn_wrapper( + provider: &FFI_SchemaProvider, + name: RString, +) -> bool { + provider.inner().table_exist(name.as_str()) +} + +unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_SchemaProvider) { + let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData); + drop(private_data); +} + +unsafe extern "C" fn clone_fn_wrapper( + provider: &FFI_SchemaProvider, +) -> FFI_SchemaProvider { + let old_private_data = provider.private_data as *const ProviderPrivateData; + let runtime = (*old_private_data).runtime.clone(); + + let private_data = Box::into_raw(Box::new(ProviderPrivateData { + provider: Arc::clone(&(*old_private_data).provider), + runtime, + })) as *mut c_void; + + FFI_SchemaProvider { + owner_name: provider.owner_name.clone(), + table_names: table_names_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + version: super::version, + private_data, + table: table_fn_wrapper, + register_table: register_table_fn_wrapper, + deregister_table: deregister_table_fn_wrapper, + table_exist: table_exist_fn_wrapper, + } +} + +impl Drop for FFI_SchemaProvider { + fn drop(&mut self) { + unsafe { (self.release)(self) } + } +} + +impl FFI_SchemaProvider { + /// Creates a new [`FFI_SchemaProvider`]. 
+ pub fn new( + provider: Arc, + runtime: Option, + ) -> Self { + let owner_name = provider.owner_name().map(|s| s.into()).into(); + let private_data = Box::new(ProviderPrivateData { provider, runtime }); + + Self { + owner_name, + table_names: table_names_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + version: super::version, + private_data: Box::into_raw(private_data) as *mut c_void, + table: table_fn_wrapper, + register_table: register_table_fn_wrapper, + deregister_table: deregister_table_fn_wrapper, + table_exist: table_exist_fn_wrapper, + } + } +} + +/// This wrapper struct exists on the receiver side of the FFI interface, so it has +/// no guarantees about being able to access the data in `private_data`. Any functions +/// defined on this struct must only use the stable functions provided in +/// FFI_SchemaProvider to interact with the foreign table provider. +#[derive(Debug)] +pub struct ForeignSchemaProvider(pub FFI_SchemaProvider); + +unsafe impl Send for ForeignSchemaProvider {} +unsafe impl Sync for ForeignSchemaProvider {} + +impl From<&FFI_SchemaProvider> for ForeignSchemaProvider { + fn from(provider: &FFI_SchemaProvider) -> Self { + Self(provider.clone()) + } +} + +impl Clone for FFI_SchemaProvider { + fn clone(&self) -> Self { + unsafe { (self.clone)(self) } + } +} + +#[async_trait] +impl SchemaProvider for ForeignSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn owner_name(&self) -> Option<&str> { + let name: Option<&RString> = self.0.owner_name.as_ref().into(); + name.map(|s| s.as_str()) + } + + fn table_names(&self) -> Vec { + unsafe { + (self.0.table_names)(&self.0) + .into_iter() + .map(|s| s.into()) + .collect() + } + } + + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError> { + unsafe { + let table: Option = + df_result!((self.0.table)(&self.0, name.into()).await)?.into(); + + let table = table.as_ref().map(|t| { + Arc::new(ForeignTableProvider::from(t)) as Arc + }); + + Ok(table) + } + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> Result>> { + unsafe { + let ffi_table = match table.as_any().downcast_ref::() { + Some(t) => t.0.clone(), + None => FFI_TableProvider::new(table, true, None), + }; + + let returned_provider: Option = + df_result!((self.0.register_table)(&self.0, name.into(), ffi_table))? + .into(); + + Ok(returned_provider + .map(|t| Arc::new(ForeignTableProvider(t)) as Arc)) + } + } + + fn deregister_table(&self, name: &str) -> Result>> { + let returned_provider: Option = unsafe { + df_result!((self.0.deregister_table)(&self.0, name.into()))?.into() + }; + + Ok(returned_provider + .map(|t| Arc::new(ForeignTableProvider(t)) as Arc)) + } + + /// Returns true if table exist in the schema provider, false otherwise. 
+ fn table_exist(&self, name: &str) -> bool { + unsafe { (self.0.table_exist)(&self.0, name.into()) } + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::Schema; + use datafusion::{catalog::MemorySchemaProvider, datasource::empty::EmptyTable}; + + use super::*; + + fn empty_table() -> Arc { + Arc::new(EmptyTable::new(Arc::new(Schema::empty()))) + } + + #[tokio::test] + async fn test_round_trip_ffi_schema_provider() { + let schema_provider = Arc::new(MemorySchemaProvider::new()); + assert!(schema_provider + .as_ref() + .register_table("prior_table".to_string(), empty_table()) + .unwrap() + .is_none()); + + let ffi_schema_provider = FFI_SchemaProvider::new(schema_provider, None); + + let foreign_schema_provider: ForeignSchemaProvider = + (&ffi_schema_provider).into(); + + let prior_table_names = foreign_schema_provider.table_names(); + assert_eq!(prior_table_names.len(), 1); + assert_eq!(prior_table_names[0], "prior_table"); + + // Replace an existing table with one of the same name generates an error + let returned_schema = foreign_schema_provider + .register_table("prior_table".to_string(), empty_table()); + assert!(returned_schema.is_err()); + assert_eq!(foreign_schema_provider.table_names().len(), 1); + + // Add a new table + let returned_schema = foreign_schema_provider + .register_table("second_table".to_string(), empty_table()) + .expect("Unable to register table"); + assert!(returned_schema.is_none()); + assert_eq!(foreign_schema_provider.table_names().len(), 2); + + // Remove a table + let returned_schema = foreign_schema_provider + .deregister_table("prior_table") + .expect("Unable to deregister table"); + assert!(returned_schema.is_some()); + assert_eq!(foreign_schema_provider.table_names().len(), 1); + + // Retrieve non-existant table + let returned_schema = foreign_schema_provider + .table("prior_table") + .await + .expect("Unable to query table"); + assert!(returned_schema.is_none()); + assert!(!foreign_schema_provider.table_exist("prior_table")); + + // Retrieve valid table + let returned_schema = foreign_schema_provider + .table("second_table") + .await + .expect("Unable to query table"); + assert!(returned_schema.is_some()); + assert!(foreign_schema_provider.table_exist("second_table")); + } +} diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 0b4080abcb55..a7391a85031e 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -382,7 +382,7 @@ impl FFI_TableProvider { /// defined on this struct must only use the stable functions provided in /// FFI_TableProvider to interact with the foreign table provider. #[derive(Debug)] -pub struct ForeignTableProvider(FFI_TableProvider); +pub struct ForeignTableProvider(pub FFI_TableProvider); unsafe impl Send for ForeignTableProvider {} unsafe impl Sync for ForeignTableProvider {} diff --git a/datafusion/ffi/src/tests/catalog.rs b/datafusion/ffi/src/tests/catalog.rs new file mode 100644 index 000000000000..f4293adb41b9 --- /dev/null +++ b/datafusion/ffi/src/tests/catalog.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This is an example of an async table provider that will call functions on +//! the tokio runtime of the library providing the function. Since we cannot +//! share a tokio runtime across the ffi boundary and the producer and consumer +//! may have different runtimes, we need to store a reference to the runtime +//! and enter it during streaming calls. The entering of the runtime will +//! occur by the datafusion_ffi crate during the streaming calls. This code +//! serves as an integration test of this feature. If we do not correctly +//! access the runtime, then you will get a panic when trying to do operations +//! such as spawning a tokio task. + +use std::{any::Any, fmt::Debug, sync::Arc}; + +use crate::catalog_provider::FFI_CatalogProvider; +use arrow::datatypes::Schema; +use async_trait::async_trait; +use datafusion::{ + catalog::{ + CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, + TableProvider, + }, + common::exec_err, + datasource::MemTable, + error::{DataFusionError, Result}, +}; + +/// This schema provider is intended only for unit tests. It prepopulates with one +/// table and only allows for tables named sales and purchases. +#[derive(Debug)] +pub struct FixedSchemaProvider { + inner: MemorySchemaProvider, +} + +pub fn fruit_table() -> Arc { + use arrow::datatypes::{DataType, Field}; + use datafusion::common::record_batch; + + let schema = Arc::new(Schema::new(vec![ + Field::new("units", DataType::Int32, true), + Field::new("price", DataType::Float64, true), + ])); + + let partitions = vec![ + record_batch!( + ("units", Int32, vec![10, 20, 30]), + ("price", Float64, vec![1.0, 2.0, 5.0]) + ) + .unwrap(), + record_batch!( + ("units", Int32, vec![5, 7]), + ("price", Float64, vec![1.5, 2.5]) + ) + .unwrap(), + ]; + + Arc::new(MemTable::try_new(schema, vec![partitions]).unwrap()) +} + +impl Default for FixedSchemaProvider { + fn default() -> Self { + let inner = MemorySchemaProvider::new(); + + let table = fruit_table(); + + let _ = inner + .register_table("purchases".to_string(), table) + .unwrap(); + + Self { inner } + } +} + +#[async_trait] +impl SchemaProvider for FixedSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.inner.table_names() + } + + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError> { + self.inner.table(name).await + } + + fn table_exist(&self, name: &str) -> bool { + self.inner.table_exist(name) + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> Result>> { + if name.as_str() != "sales" && name.as_str() != "purchases" { + return exec_err!( + "FixedSchemaProvider only provides two tables: sales and purchases" + ); + } + + self.inner.register_table(name, table) + } + + fn deregister_table(&self, name: &str) -> Result>> { + self.inner.deregister_table(name) + } +} + +/// This catalog provider is intended only for unit tests. It prepopulates with one +/// schema and only allows for schemas named after four types of fruit. 
+#[derive(Debug)] +pub struct FixedCatalogProvider { + inner: MemoryCatalogProvider, +} + +impl Default for FixedCatalogProvider { + fn default() -> Self { + let inner = MemoryCatalogProvider::new(); + + let _ = inner.register_schema("apple", Arc::new(FixedSchemaProvider::default())); + + Self { inner } + } +} + +impl CatalogProvider for FixedCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.inner.schema_names() + } + + fn schema(&self, name: &str) -> Option> { + self.inner.schema(name) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> Result>> { + if !["apple", "banana", "cherry", "date"].contains(&name) { + return exec_err!("FixedCatalogProvider only provides four schemas: apple, banana, cherry, date"); + } + + self.inner.register_schema(name, schema) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> Result>> { + self.inner.deregister_schema(name, cascade) + } +} + +pub(crate) extern "C" fn create_catalog_provider() -> FFI_CatalogProvider { + let catalog_provider = Arc::new(FixedCatalogProvider::default()); + FFI_CatalogProvider::new(catalog_provider, None) +} diff --git a/datafusion/ffi/src/tests/mod.rs b/datafusion/ffi/src/tests/mod.rs index 5a471cb8fe43..4b4a29276d9a 100644 --- a/datafusion/ffi/src/tests/mod.rs +++ b/datafusion/ffi/src/tests/mod.rs @@ -25,6 +25,9 @@ use abi_stable::{ sabi_types::VersionStrings, StableAbi, }; +use catalog::create_catalog_provider; + +use crate::catalog_provider::FFI_CatalogProvider; use super::{table_provider::FFI_TableProvider, udf::FFI_ScalarUDF}; use arrow::array::RecordBatch; @@ -37,6 +40,7 @@ use sync_provider::create_sync_table_provider; use udf_udaf_udwf::create_ffi_abs_func; mod async_provider; +pub mod catalog; mod sync_provider; mod udf_udaf_udwf; @@ -47,6 +51,9 @@ mod udf_udaf_udwf; /// both the module loading program and library that implements the /// module. pub struct ForeignLibraryModule { + /// Construct an opinionated catalog provider + pub create_catalog: extern "C" fn() -> FFI_CatalogProvider, + /// Constructs the table provider pub create_table: extern "C" fn(synchronous: bool) -> FFI_TableProvider, @@ -95,6 +102,7 @@ extern "C" fn construct_table_provider(synchronous: bool) -> FFI_TableProvider { /// This defines the entry point for using the module. 
pub fn get_foreign_library_module() -> ForeignLibraryModuleRef { ForeignLibraryModule { + create_catalog: create_catalog_provider, create_table: construct_table_provider, create_scalar_udf: create_ffi_abs_func, version: super::version, diff --git a/datafusion/ffi/tests/ffi_integration.rs b/datafusion/ffi/tests/ffi_integration.rs index 84e120df4299..f610f12c8244 100644 --- a/datafusion/ffi/tests/ffi_integration.rs +++ b/datafusion/ffi/tests/ffi_integration.rs @@ -25,6 +25,7 @@ mod tests { use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::ScalarUDF; use datafusion::prelude::{col, SessionContext}; + use datafusion_ffi::catalog_provider::ForeignCatalogProvider; use datafusion_ffi::table_provider::ForeignTableProvider; use datafusion_ffi::tests::{create_record_batch, ForeignLibraryModuleRef}; use datafusion_ffi::udf::ForeignScalarUDF; @@ -179,4 +180,30 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_catalog() -> Result<()> { + let module = get_module()?; + + let ffi_catalog = + module + .create_catalog() + .ok_or(DataFusionError::NotImplemented( + "External catalog provider failed to implement create_catalog" + .to_string(), + ))?(); + let foreign_catalog: ForeignCatalogProvider = (&ffi_catalog).into(); + + let ctx = SessionContext::default(); + let _ = ctx.register_catalog("fruit", Arc::new(foreign_catalog)); + + let df = ctx.table("fruit.apple.purchases").await?; + + let results = df.collect().await?; + + assert!(!results.is_empty()); + assert!(results[0].num_rows() != 0); + + Ok(()) + } } diff --git a/datafusion/functions-aggregate-common/src/lib.rs b/datafusion/functions-aggregate-common/src/lib.rs index 6f9dfca30c19..da718e7ceefe 100644 --- a/datafusion/functions-aggregate-common/src/lib.rs +++ b/datafusion/functions-aggregate-common/src/lib.rs @@ -27,7 +27,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] pub mod accumulator; diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index 1d378fff176f..c97dba1925ca 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -18,7 +18,7 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution use crate::hyperloglog::HyperLogLog; -use arrow::array::BinaryArray; +use arrow::array::{BinaryArray, StringViewArray}; use arrow::array::{ GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; @@ -126,6 +126,27 @@ where } } +#[derive(Debug)] +struct StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + hll: HyperLogLog, + phantom_data: PhantomData, +} + +impl StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + pub fn new() -> Self { + Self { + hll: HyperLogLog::new(), + phantom_data: PhantomData, + } + } +} + #[derive(Debug)] struct BinaryHLLAccumulator where @@ -197,6 +218,21 @@ where default_accumulator_impl!(); } +impl Accumulator for StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + let array: &StringViewArray = downcast_value!(values[0], StringViewArray); + // flatten because we would skip nulls + self.hll + .extend(array.iter().flatten().map(|s| s.to_string())); + Ok(()) + } + + default_accumulator_impl!(); +} + impl Accumulator for StringHLLAccumulator where T: OffsetSizeTrait, @@ -311,6 +347,7 @@ impl AggregateUDFImpl for ApproxDistinct { DataType::Int64 => Box::new(NumericHLLAccumulator::::new()), DataType::Utf8 => Box::new(StringHLLAccumulator::::new()), DataType::LargeUtf8 => Box::new(StringHLLAccumulator::::new()), + DataType::Utf8View => Box::new(StringViewHLLAccumulator::::new()), DataType::Binary => Box::new(BinaryHLLAccumulator::::new()), DataType::LargeBinary => Box::new(BinaryHLLAccumulator::::new()), other => { diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs index a5c84298e9d5..7944280291eb 100644 --- a/datafusion/functions-aggregate/src/lib.rs +++ b/datafusion/functions-aggregate/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! Aggregate Function packages for [DataFusion]. 
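The `StringViewHLLAccumulator` added to `approx_distinct` above pushes `Utf8View` values into the HyperLogLog sketch after dropping nulls with `iter().flatten()`. A minimal sketch of that null-skipping update, assuming arrow's `StringViewArray` and using a `HashSet` as a stand-in for the crate-private `HyperLogLog`:

```rust
use std::collections::HashSet;

use arrow::array::StringViewArray;

// Mirrors StringViewHLLAccumulator::update_batch from the patch above:
// iter() yields Option<&str>, and flatten() skips the nulls before they
// reach the distinct-count sketch.
fn distinct_count(values: &StringViewArray) -> usize {
    values
        .iter()
        .flatten()
        .map(|s| s.to_string())
        .collect::<HashSet<_>>()
        .len()
}

fn main() {
    let array = StringViewArray::from(vec![Some("a"), None, Some("b"), Some("a")]);
    assert_eq!(distinct_count(&array), 2);
}
```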
diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index 83356e2f9fb4..ea4cad548803 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -22,8 +22,9 @@ mod min_max_bytes; use arrow::array::{ ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, Decimal256Array, Float16Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, + Decimal128Array, Decimal256Array, DurationMicrosecondArray, DurationMillisecondArray, + DurationNanosecondArray, DurationSecondArray, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray, LargeStringArray, StringArray, StringViewArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, @@ -32,9 +33,10 @@ use arrow::array::{ }; use arrow::compute; use arrow::datatypes::{ - DataType, Decimal128Type, Decimal256Type, Float16Type, Float32Type, Float64Type, - Int16Type, Int32Type, Int64Type, Int8Type, IntervalUnit, UInt16Type, UInt32Type, - UInt64Type, UInt8Type, + DataType, Decimal128Type, Decimal256Type, DurationMicrosecondType, + DurationMillisecondType, DurationNanosecondType, DurationSecondType, Float16Type, + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalUnit, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use datafusion_common::stats::Precision; use datafusion_common::{ @@ -263,6 +265,7 @@ impl AggregateUDFImpl for Max { | Binary | LargeBinary | BinaryView + | Duration(_) ) } @@ -317,6 +320,18 @@ impl AggregateUDFImpl for Max { Timestamp(Nanosecond, _) => { primitive_max_accumulator!(data_type, i64, TimestampNanosecondType) } + Duration(Second) => { + primitive_max_accumulator!(data_type, i64, DurationSecondType) + } + Duration(Millisecond) => { + primitive_max_accumulator!(data_type, i64, DurationMillisecondType) + } + Duration(Microsecond) => { + primitive_max_accumulator!(data_type, i64, DurationMicrosecondType) + } + Duration(Nanosecond) => { + primitive_max_accumulator!(data_type, i64, DurationNanosecondType) + } Decimal128(_, _) => { primitive_max_accumulator!(data_type, i128, Decimal128Type) } @@ -518,6 +533,33 @@ macro_rules! 
min_max_batch { $OP ) } + DataType::Duration(TimeUnit::Second) => { + typed_min_max_batch!($VALUES, DurationSecondArray, DurationSecond, $OP) + } + DataType::Duration(TimeUnit::Millisecond) => { + typed_min_max_batch!( + $VALUES, + DurationMillisecondArray, + DurationMillisecond, + $OP + ) + } + DataType::Duration(TimeUnit::Microsecond) => { + typed_min_max_batch!( + $VALUES, + DurationMicrosecondArray, + DurationMicrosecond, + $OP + ) + } + DataType::Duration(TimeUnit::Nanosecond) => { + typed_min_max_batch!( + $VALUES, + DurationNanosecondArray, + DurationNanosecond, + $OP + ) + } other => { // This should have been handled before return internal_err!( @@ -1090,6 +1132,7 @@ impl AggregateUDFImpl for Min { | Binary | LargeBinary | BinaryView + | Duration(_) ) } @@ -1144,6 +1187,18 @@ impl AggregateUDFImpl for Min { Timestamp(Nanosecond, _) => { primitive_min_accumulator!(data_type, i64, TimestampNanosecondType) } + Duration(Second) => { + primitive_min_accumulator!(data_type, i64, DurationSecondType) + } + Duration(Millisecond) => { + primitive_min_accumulator!(data_type, i64, DurationMillisecondType) + } + Duration(Microsecond) => { + primitive_min_accumulator!(data_type, i64, DurationMicrosecondType) + } + Duration(Nanosecond) => { + primitive_min_accumulator!(data_type, i64, DurationNanosecondType) + } Decimal128(_, _) => { primitive_min_accumulator!(data_type, i128, Decimal128Type) } @@ -1597,7 +1652,7 @@ mod tests { assert_eq!( min_res, ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( - -2, 4 + -2, 4, ))) ); @@ -1609,7 +1664,7 @@ mod tests { assert_eq!( max_res, ScalarValue::IntervalYearMonth(Some(IntervalYearMonthType::make_value( - 5, 34 + 5, 34, ))) ); diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 5a29cf962817..1857ead8c547 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -439,6 +439,13 @@ fn array_has_all_and_any_dispatch( ) -> Result { let haystack = as_generic_list_array::(haystack)?; let needle = as_generic_list_array::(needle)?; + if needle.values().len() == 0 { + let buffer = match comparison_type { + ComparisonType::All => BooleanBuffer::new_set(haystack.len()), + ComparisonType::Any => BooleanBuffer::new_unset(haystack.len()), + }; + return Ok(Arc::new(BooleanArray::from(buffer))); + } match needle.data_type() { DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { array_has_all_and_any_string_internal::(haystack, needle, comparison_type) diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index 0f50f62dd8d2..321dda55ce09 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -1001,9 +1001,9 @@ where mod tests { use super::array_element_udf; use arrow::datatypes::{DataType, Field}; - use datafusion_common::{Column, DFSchema, ScalarValue}; + use datafusion_common::{Column, DFSchema}; use datafusion_expr::expr::ScalarFunction; - use datafusion_expr::{cast, Expr, ExprSchemable}; + use datafusion_expr::{Expr, ExprSchemable}; use std::collections::HashMap; // Regression test for https://github.com/apache/datafusion/issues/13755 @@ -1037,34 +1037,6 @@ mod tests { fixed_size_list_type ); - // ScalarUDFImpl::return_type_from_exprs with typed exprs - assert_eq!( - udf.return_type_from_exprs( - &[ - cast(Expr::Literal(ScalarValue::Null), array_type.clone()), - cast(Expr::Literal(ScalarValue::Null), index_type.clone()), - ], - 
&schema, - &[array_type.clone(), index_type.clone()] - ) - .unwrap(), - fixed_size_list_type - ); - - // ScalarUDFImpl::return_type_from_exprs with exprs not carrying type - assert_eq!( - udf.return_type_from_exprs( - &[ - Expr::Column(Column::new_unqualified("my_array")), - Expr::Column(Column::new_unqualified("my_index")), - ], - &schema, - &[array_type.clone(), index_type.clone()] - ) - .unwrap(), - fixed_size_list_type - ); - // Via ExprSchemable::get_type (e.g. SimplifyInfo) let udf_expr = Expr::ScalarFunction(ScalarFunction { func: array_element_udf(), diff --git a/datafusion/functions-nested/src/lib.rs b/datafusion/functions-nested/src/lib.rs index 446cd58865c3..c9a61d98cd44 100644 --- a/datafusion/functions-nested/src/lib.rs +++ b/datafusion/functions-nested/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! Nested type Functions for [DataFusion]. diff --git a/datafusion/functions-nested/src/max.rs b/datafusion/functions-nested/src/max.rs index 22bd14740b5e..32957edc62b5 100644 --- a/datafusion/functions-nested/src/max.rs +++ b/datafusion/functions-nested/src/max.rs @@ -24,7 +24,9 @@ use datafusion_common::cast::as_list_array; use datafusion_common::utils::take_function_args; use datafusion_common::{exec_err, ScalarValue}; use datafusion_doc::Documentation; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; use datafusion_functions_aggregate::min_max; use datafusion_macros::user_doc; use itertools::Itertools; @@ -96,12 +98,11 @@ impl ScalarUDFImpl for ArrayMax { } } - fn invoke_batch( + fn invoke_with_args( &self, - args: &[ColumnarValue], - _number_rows: usize, + args: ScalarFunctionArgs, ) -> datafusion_common::Result { - make_scalar_function(array_max_inner)(args) + make_scalar_function(array_max_inner)(&args.args) } fn aliases(&self) -> &[String] { diff --git a/datafusion/functions-table/src/generate_series.rs b/datafusion/functions-table/src/generate_series.rs index df8357ee1974..5bb56f28bc8d 100644 --- a/datafusion/functions-table/src/generate_series.rs +++ b/datafusion/functions-table/src/generate_series.rs @@ -114,7 +114,8 @@ impl LazyBatchGenerator for GenerateSeriesState { return Ok(None); } - let batch = RecordBatch::try_new(self.schema.clone(), vec![Arc::new(array)])?; + let batch = + RecordBatch::try_new(Arc::clone(&self.schema), vec![Arc::new(array)])?; Ok(Some(batch)) } @@ -127,7 +128,7 @@ impl TableProvider for GenerateSeriesTable { } fn schema(&self) -> SchemaRef { - self.schema.clone() + Arc::clone(&self.schema) } fn table_type(&self) -> TableType { @@ -146,7 +147,7 @@ impl TableProvider for GenerateSeriesTable { let state = match self.args { // if args have null, then return 0 row GenSeriesArgs::ContainsNull { include_end, name } => GenerateSeriesState { - schema: self.schema.clone(), + schema: self.schema(), start: 0, end: 0, step: 1, @@ -162,7 +163,7 @@ impl TableProvider for GenerateSeriesTable { include_end, name, } => GenerateSeriesState { - schema: self.schema.clone(), + schema: self.schema(), start, end, step, @@ -174,7 +175,7 
@@ impl TableProvider for GenerateSeriesTable { }; Ok(Arc::new(LazyMemoryExec::try_new( - self.schema.clone(), + self.schema(), vec![Arc::new(RwLock::new(state))], )?)) } diff --git a/datafusion/functions-table/src/lib.rs b/datafusion/functions-table/src/lib.rs index 311b9d310f39..36fcdc7ede56 100644 --- a/datafusion/functions-table/src/lib.rs +++ b/datafusion/functions-table/src/lib.rs @@ -20,6 +20,9 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] pub mod generate_series; diff --git a/datafusion/functions-window-common/src/lib.rs b/datafusion/functions-window-common/src/lib.rs index 6f2a1ac0f33f..7f668a20a76a 100644 --- a/datafusion/functions-window-common/src/lib.rs +++ b/datafusion/functions-window-common/src/lib.rs @@ -20,6 +20,9 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] //! Common user-defined window functionality for [DataFusion] //! diff --git a/datafusion/functions-window/src/lib.rs b/datafusion/functions-window/src/lib.rs index 718b0bf1587b..10e09542d7c5 100644 --- a/datafusion/functions-window/src/lib.rs +++ b/datafusion/functions-window/src/lib.rs @@ -20,6 +20,9 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] //! Window Function packages for [DataFusion]. //! 
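Several crates above now deny `clippy::clone_on_ref_ptr`, and `generate_series.rs` is updated to match by spelling out `Arc::clone`. A minimal illustration of what the lint enforces (not DataFusion code):

```rust
use std::sync::Arc;

struct Schema;

fn main() {
    let schema = Arc::new(Schema);

    // Under #![deny(clippy::clone_on_ref_ptr)] this is rejected, because it
    // reads as if Schema itself were being deep-copied:
    // let copy = schema.clone();

    // The explicit form makes the cheap reference-count bump obvious:
    let copy = Arc::clone(&schema);

    assert_eq!(Arc::strong_count(&copy), 2);
}
```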
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 0c921941a6dd..07e4973675a1 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -69,7 +69,7 @@ arrow = { workspace = true } arrow-buffer = { workspace = true } base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } -blake3 = { version = "1.6", optional = true } +blake3 = { version = "1.7", optional = true } chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-doc = { workspace = true } @@ -85,7 +85,7 @@ rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } -uuid = { version = "1.15", features = ["v4"], optional = true } +uuid = { version = "1.16", features = ["v4"], optional = true } [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } diff --git a/datafusion/functions/benches/date_trunc.rs b/datafusion/functions/benches/date_trunc.rs index b7efe7cc8d0a..e7e96fb7a9fa 100644 --- a/datafusion/functions/benches/date_trunc.rs +++ b/datafusion/functions/benches/date_trunc.rs @@ -46,11 +46,14 @@ fn criterion_benchmark(c: &mut Criterion) { ColumnarValue::Scalar(ScalarValue::Utf8(Some("minute".to_string()))); let timestamps = ColumnarValue::Array(timestamps_array); let udf = date_trunc(); - let return_type = &udf.return_type(&[timestamps.data_type()]).unwrap(); + let args = vec![precision, timestamps]; + let return_type = &udf + .return_type(&args.iter().map(|arg| arg.data_type()).collect::>()) + .unwrap(); b.iter(|| { black_box( udf.invoke_with_args(ScalarFunctionArgs { - args: vec![precision.clone(), timestamps.clone()], + args: args.clone(), number_rows: batch_len, return_type, }) diff --git a/datafusion/functions/src/core/union_extract.rs b/datafusion/functions/src/core/union_extract.rs index 95814197d8df..420eeed42cc3 100644 --- a/datafusion/functions/src/core/union_extract.rs +++ b/datafusion/functions/src/core/union_extract.rs @@ -82,8 +82,8 @@ impl ScalarUDFImpl for UnionExtractFun { } fn return_type(&self, _: &[DataType]) -> Result { - // should be using return_type_from_exprs and not calling the default implementation - internal_err!("union_extract should return type from exprs") + // should be using return_type_from_args and not calling the default implementation + internal_err!("union_extract should return type from args") } fn return_type_from_args(&self, args: ReturnTypeArgs) -> Result { diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index de2571779d42..7753b9a6dc8c 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! Function packages for [DataFusion]. 
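For context on the benchmark change above, here is a minimal sketch of the call pattern it now relies on: derive the return type from the argument types once, then pass the arguments and the return type through `ScalarFunctionArgs` to `invoke_with_args`. The concrete argument values and the `date_trunc` import path are assumptions for illustration, not taken from this patch.

```
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::datetime::date_trunc; // assumed path to the date_trunc UDF

fn truncate_to_minute() -> Result<ColumnarValue> {
    let udf = date_trunc();
    let args = vec![
        ColumnarValue::Scalar(ScalarValue::Utf8(Some("minute".to_string()))),
        ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(0), None)),
    ];
    // Compute the return type from the argument types, as the updated benchmark does.
    let arg_types: Vec<_> = args.iter().map(|arg| arg.data_type()).collect();
    let return_type = udf.return_type(&arg_types)?;
    udf.invoke_with_args(ScalarFunctionArgs {
        args,
        number_rows: 1,
        return_type: &return_type,
    })
}
```
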
diff --git a/datafusion/macros/Cargo.toml b/datafusion/macros/Cargo.toml index 737d2ed72874..c6532aa04681 100644 --- a/datafusion/macros/Cargo.toml +++ b/datafusion/macros/Cargo.toml @@ -41,5 +41,5 @@ proc-macro = true [dependencies] datafusion-expr = { workspace = true } -quote = "1.0.37" -syn = { version = "2.0.79", features = ["full"] } +quote = "1.0.40" +syn = { version = "2.0.100", features = ["full"] } diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs deleted file mode 100644 index 8015ebfc7534..000000000000 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ /dev/null @@ -1,333 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use crate::AnalyzerRule; -use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TransformedResult}; -use datafusion_common::{Column, Result}; -use datafusion_expr::builder::validate_unique_names; -use datafusion_expr::expr::PlannedReplaceSelectItem; -use datafusion_expr::utils::{ - expand_qualified_wildcard, expand_wildcard, find_base_plan, -}; -use datafusion_expr::{ - Distinct, DistinctOn, Expr, LogicalPlan, Projection, SubqueryAlias, -}; - -#[derive(Default, Debug)] -pub struct ExpandWildcardRule {} - -impl ExpandWildcardRule { - pub fn new() -> Self { - Self {} - } -} - -impl AnalyzerRule for ExpandWildcardRule { - fn analyze(&self, plan: LogicalPlan, _: &ConfigOptions) -> Result { - // Because the wildcard expansion is based on the schema of the input plan, - // using `transform_up_with_subqueries` here. - plan.transform_up_with_subqueries(expand_internal).data() - } - - fn name(&self) -> &str { - "expand_wildcard_rule" - } -} - -fn expand_internal(plan: LogicalPlan) -> Result> { - match plan { - LogicalPlan::Projection(Projection { expr, input, .. }) => { - let projected_expr = expand_exprlist(&input, expr)?; - validate_unique_names("Projections", projected_expr.iter())?; - Ok(Transformed::yes( - Projection::try_new(projected_expr, Arc::clone(&input)) - .map(LogicalPlan::Projection)?, - )) - } - // The schema of the plan should also be updated if the child plan is transformed. - LogicalPlan::SubqueryAlias(SubqueryAlias { input, alias, .. 
}) => { - Ok(Transformed::yes( - SubqueryAlias::try_new(input, alias).map(LogicalPlan::SubqueryAlias)?, - )) - } - LogicalPlan::Distinct(Distinct::On(distinct_on)) => { - let projected_expr = - expand_exprlist(&distinct_on.input, distinct_on.select_expr)?; - validate_unique_names("Distinct", projected_expr.iter())?; - Ok(Transformed::yes(LogicalPlan::Distinct(Distinct::On( - DistinctOn::try_new( - distinct_on.on_expr, - projected_expr, - distinct_on.sort_expr, - distinct_on.input, - )?, - )))) - } - _ => Ok(Transformed::no(plan)), - } -} - -fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { - let mut projected_expr = vec![]; - let input = find_base_plan(input); - for e in expr { - match e { - #[expect(deprecated)] - Expr::Wildcard { qualifier, options } => { - if let Some(qualifier) = qualifier { - let expanded = expand_qualified_wildcard( - &qualifier, - input.schema(), - Some(&options), - )?; - // If there is a REPLACE statement, replace that column with the given - // replace expression. Column name remains the same. - let replaced = if let Some(replace) = options.replace { - replace_columns(expanded, &replace)? - } else { - expanded - }; - projected_expr.extend(replaced); - } else { - let expanded = - expand_wildcard(input.schema(), input, Some(&options))?; - // If there is a REPLACE statement, replace that column with the given - // replace expression. Column name remains the same. - let replaced = if let Some(replace) = options.replace { - replace_columns(expanded, &replace)? - } else { - expanded - }; - projected_expr.extend(replaced); - } - } - // A workaround to handle the case when the column name is "*". - // We transform the expression to a Expr::Column through [Column::from_name] in many places. - // It would also convert the wildcard expression to a column expression with name "*". - Expr::Column(Column { - ref relation, - ref name, - // TODO Should we use these spans? - spans: _, - }) => { - if name.eq("*") { - if let Some(qualifier) = relation { - projected_expr.extend(expand_qualified_wildcard( - qualifier, - input.schema(), - None, - )?); - } else { - projected_expr.extend(expand_wildcard( - input.schema(), - input, - None, - )?); - } - } else { - projected_expr.push(e.clone()); - } - } - _ => projected_expr.push(e), - } - } - Ok(projected_expr) -} - -/// If there is a REPLACE statement in the projected expression in the form of -/// "REPLACE (some_column_within_an_expr AS some_column)", this function replaces -/// that column with the given replace expression. Column name remains the same. -/// Multiple REPLACEs are also possible with comma separations. -fn replace_columns( - mut exprs: Vec, - replace: &PlannedReplaceSelectItem, -) -> Result> { - for expr in exprs.iter_mut() { - if let Expr::Column(Column { name, .. 
}) = expr { - if let Some((_, new_expr)) = replace - .items() - .iter() - .zip(replace.expressions().iter()) - .find(|(item, _)| item.column_name.value == *name) - { - *expr = new_expr.clone().alias(name.clone()) - } - } - } - Ok(exprs) -} - -#[cfg(test)] -mod tests { - use arrow::datatypes::{DataType, Field, Schema}; - - use crate::test::{assert_analyzed_plan_eq_display_indent, test_table_scan}; - use crate::Analyzer; - use datafusion_common::{JoinType, TableReference}; - use datafusion_expr::{ - col, in_subquery, qualified_wildcard, table_scan, wildcard, LogicalPlanBuilder, - }; - - use super::*; - - fn assert_plan_eq(plan: LogicalPlan, expected: &str) -> Result<()> { - assert_analyzed_plan_eq_display_indent( - Arc::new(ExpandWildcardRule::new()), - plan, - expected, - ) - } - - #[test] - fn test_expand_wildcard() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![wildcard()])? - .build()?; - let expected = - "Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_expand_qualified_wildcard() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![qualified_wildcard(TableReference::bare("test"))])? - .build()?; - let expected = - "Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_expand_qualified_wildcard_in_subquery() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![qualified_wildcard(TableReference::bare("test"))])? - .build()?; - let plan = LogicalPlanBuilder::from(plan) - .project(vec![wildcard()])? - .build()?; - let expected = - "Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ - \n Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_expand_wildcard_in_subquery() -> Result<()> { - let projection_a = LogicalPlanBuilder::from(test_table_scan()?) - .project(vec![col("a")])? - .build()?; - let subquery = LogicalPlanBuilder::from(projection_a) - .project(vec![wildcard()])? - .build()?; - let plan = LogicalPlanBuilder::from(test_table_scan()?) - .filter(in_subquery(col("a"), Arc::new(subquery)))? - .project(vec![wildcard()])? - .build()?; - let expected = "\ - Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]\ - \n Filter: test.a IN () [a:UInt32, b:UInt32, c:UInt32]\ - \n Subquery: [a:UInt32]\ - \n Projection: test.a [a:UInt32]\ - \n Projection: test.a [a:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_expand_wildcard_in_distinct_on() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .distinct_on(vec![col("a")], vec![wildcard()], None)? 
- .build()?; - let expected = "\ - DistinctOn: on_expr=[[test.a]], select_expr=[[test.a, test.b, test.c]], sort_expr=[[]] [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - assert_plan_eq(plan, expected) - } - - #[test] - fn test_subquery_schema() -> Result<()> { - let analyzer = Analyzer::with_rules(vec![Arc::new(ExpandWildcardRule::new())]); - let options = ConfigOptions::default(); - let subquery = LogicalPlanBuilder::from(test_table_scan()?) - .project(vec![wildcard()])? - .build()?; - let plan = LogicalPlanBuilder::from(subquery) - .alias("sub")? - .project(vec![wildcard()])? - .build()?; - let analyzed_plan = analyzer.execute_and_check(plan, &options, |_, _| {})?; - for x in analyzed_plan.inputs() { - for field in x.schema().fields() { - assert_ne!(field.name(), "*"); - } - } - Ok(()) - } - - fn employee_schema() -> Schema { - Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), - Field::new("state", DataType::Utf8, false), - Field::new("salary", DataType::Int32, false), - ]) - } - - #[test] - fn plan_using_join_wildcard_projection() -> Result<()> { - let t2 = table_scan(Some("t2"), &employee_schema(), None)?.build()?; - - let plan = table_scan(Some("t1"), &employee_schema(), None)? - .join_using(t2, JoinType::Inner, vec!["id"])? - .project(vec![wildcard()])? - .build()?; - - let expected = "Projection: *\ - \n Inner Join: Using t1.id = t2.id\ - \n TableScan: t1\ - \n TableScan: t2"; - - assert_eq!(expected, format!("{plan}")); - - let analyzer = Analyzer::with_rules(vec![Arc::new(ExpandWildcardRule::new())]); - let options = ConfigOptions::default(); - - let analyzed_plan = analyzer.execute_and_check(plan, &options, |_, _| {})?; - - // id column should only show up once in projection - let expected = "Projection: t1.id, t1.first_name, t1.last_name, t1.state, t1.salary, t2.first_name, t2.last_name, t2.state, t2.salary\ - \n Inner Join: Using t1.id = t2.id\ - \n TableScan: t1\ - \n TableScan: t2"; - assert_eq!(expected, format!("{analyzed_plan}")); - - Ok(()) - } -} diff --git a/datafusion/optimizer/src/analyzer/inline_table_scan.rs b/datafusion/optimizer/src/analyzer/inline_table_scan.rs deleted file mode 100644 index 95781b395f3c..000000000000 --- a/datafusion/optimizer/src/analyzer/inline_table_scan.rs +++ /dev/null @@ -1,205 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Analyzed rule to replace TableScan references -//! such as DataFrames and Views and inlines the LogicalPlan. 
- -use crate::analyzer::AnalyzerRule; - -use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_common::{Column, Result}; -use datafusion_expr::{logical_plan::LogicalPlan, wildcard, Expr, LogicalPlanBuilder}; - -/// Analyzed rule that inlines TableScan that provide a [`LogicalPlan`] -/// (DataFrame / ViewTable) -#[derive(Default, Debug)] -pub struct InlineTableScan; - -impl InlineTableScan { - pub fn new() -> Self { - Self {} - } -} - -impl AnalyzerRule for InlineTableScan { - fn analyze(&self, plan: LogicalPlan, _: &ConfigOptions) -> Result { - plan.transform_up(analyze_internal).data() - } - - fn name(&self) -> &str { - "inline_table_scan" - } -} - -fn analyze_internal(plan: LogicalPlan) -> Result> { - // rewrite any subqueries in the plan first - let transformed_plan = - plan.map_subqueries(|plan| plan.transform_up(analyze_internal))?; - - let transformed_plan = transformed_plan.transform_data(|plan| { - match plan { - // Match only on scans without filter / projection / fetch - // Views and DataFrames won't have those added - // during the early stage of planning. - LogicalPlan::TableScan(table_scan) if table_scan.filters.is_empty() => { - if let Some(sub_plan) = table_scan.source.get_logical_plan() { - let sub_plan = sub_plan.into_owned(); - let projection_exprs = - generate_projection_expr(&table_scan.projection, &sub_plan)?; - LogicalPlanBuilder::from(sub_plan) - .project(projection_exprs)? - // Ensures that the reference to the inlined table remains the - // same, meaning we don't have to change any of the parent nodes - // that reference this table. - .alias(table_scan.table_name)? - .build() - .map(Transformed::yes) - } else { - Ok(Transformed::no(LogicalPlan::TableScan(table_scan))) - } - } - _ => Ok(Transformed::no(plan)), - } - })?; - - Ok(transformed_plan) -} - -fn generate_projection_expr( - projection: &Option>, - sub_plan: &LogicalPlan, -) -> Result> { - let mut exprs = vec![]; - if let Some(projection) = projection { - for i in projection { - exprs.push(Expr::Column(Column::from( - sub_plan.schema().qualified_field(*i), - ))); - } - } else { - exprs.push(wildcard()); - } - Ok(exprs) -} - -#[cfg(test)] -mod tests { - use std::{borrow::Cow, sync::Arc, vec}; - - use crate::analyzer::inline_table_scan::InlineTableScan; - use crate::test::assert_analyzed_plan_eq; - - use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_expr::{col, lit, Expr, LogicalPlan, LogicalPlanBuilder, TableSource}; - - pub struct RawTableSource {} - - impl TableSource for RawTableSource { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn schema(&self) -> arrow::datatypes::SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int64, false), - Field::new("b", DataType::Int64, false), - ])) - } - - fn supports_filters_pushdown( - &self, - filters: &[&Expr], - ) -> datafusion_common::Result> - { - Ok((0..filters.len()) - .map(|_| datafusion_expr::TableProviderFilterPushDown::Inexact) - .collect()) - } - } - - pub struct CustomSource { - plan: LogicalPlan, - } - - impl CustomSource { - fn new() -> Self { - Self { - plan: LogicalPlanBuilder::scan("y", Arc::new(RawTableSource {}), None) - .unwrap() - .build() - .unwrap(), - } - } - } - - impl TableSource for CustomSource { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn supports_filters_pushdown( - &self, - filters: &[&Expr], - ) -> datafusion_common::Result> - { - Ok((0..filters.len()) - .map(|_| 
datafusion_expr::TableProviderFilterPushDown::Exact) - .collect()) - } - - fn schema(&self) -> arrow::datatypes::SchemaRef { - Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)])) - } - - fn get_logical_plan(&self) -> Option> { - Some(Cow::Borrowed(&self.plan)) - } - } - - #[test] - fn inline_table_scan() -> datafusion_common::Result<()> { - let scan = LogicalPlanBuilder::scan( - "x".to_string(), - Arc::new(CustomSource::new()), - None, - )?; - let plan = scan.filter(col("x.a").eq(lit(1)))?.build()?; - let expected = "Filter: x.a = Int32(1)\ - \n SubqueryAlias: x\ - \n Projection: *\ - \n TableScan: y"; - - assert_analyzed_plan_eq(Arc::new(InlineTableScan::new()), plan, expected) - } - - #[test] - fn inline_table_scan_with_projection() -> datafusion_common::Result<()> { - let scan = LogicalPlanBuilder::scan( - "x".to_string(), - Arc::new(CustomSource::new()), - Some(vec![0]), - )?; - - let plan = scan.build()?; - let expected = "SubqueryAlias: x\ - \n Projection: y.a\ - \n TableScan: y"; - - assert_analyzed_plan_eq(Arc::new(InlineTableScan::new()), plan, expected) - } -} diff --git a/datafusion/optimizer/src/analyzer/mod.rs b/datafusion/optimizer/src/analyzer/mod.rs index c506616d142e..2517e3c3a400 100644 --- a/datafusion/optimizer/src/analyzer/mod.rs +++ b/datafusion/optimizer/src/analyzer/mod.rs @@ -28,17 +28,13 @@ use datafusion_common::Result; use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_expr::{InvariantLevel, LogicalPlan}; -use crate::analyzer::expand_wildcard_rule::ExpandWildcardRule; -use crate::analyzer::inline_table_scan::InlineTableScan; use crate::analyzer::resolve_grouping_function::ResolveGroupingFunction; use crate::analyzer::type_coercion::TypeCoercion; use crate::utils::log_plan; use self::function_rewrite::ApplyFunctionRewrites; -pub mod expand_wildcard_rule; pub mod function_rewrite; -pub mod inline_table_scan; pub mod resolve_grouping_function; pub mod type_coercion; @@ -98,10 +94,6 @@ impl Analyzer { /// Create a new analyzer using the recommended list of rules pub fn new() -> Self { let rules: Vec> = vec![ - Arc::new(InlineTableScan::new()), - // Every rule that will generate [Expr::Wildcard] should be placed in front of [ExpandWildcardRule]. - Arc::new(ExpandWildcardRule::new()), - // [Expr::Wildcard] should be expanded before [TypeCoercion] Arc::new(ResolveGroupingFunction::new()), Arc::new(TypeCoercion::new()), ]; diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 538ef98ac7be..07eb795462c1 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -46,7 +46,7 @@ use datafusion_expr::type_coercion::functions::{ use datafusion_expr::type_coercion::other::{ get_coerce_type_for_case_expression, get_coerce_type_for_list, }; -use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8}; +use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_utf8view_or_large_utf8}; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not, @@ -214,7 +214,10 @@ impl<'a> TypeCoercionRewriter<'a> { /// Coerce the union’s inputs to a common schema compatible with all inputs. /// This occurs after wildcard expansion and the coercion of the input expressions. 
pub fn coerce_union(union_plan: Union) -> Result { - let union_schema = Arc::new(coerce_union_schema(&union_plan.inputs)?); + let union_schema = Arc::new(coerce_union_schema_with_schema( + &union_plan.inputs, + &union_plan.schema, + )?); let new_inputs = union_plan .inputs .into_iter() @@ -311,12 +314,14 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { Expr::ScalarSubquery(Subquery { subquery, outer_ref_columns, + spans, }) => { let new_plan = analyze_internal(self.schema, Arc::unwrap_or_clone(subquery))?.data; Ok(Transformed::yes(Expr::ScalarSubquery(Subquery { subquery: Arc::new(new_plan), outer_ref_columns, + spans, }))) } Expr::Exists(Exists { subquery, negated }) => { @@ -329,6 +334,7 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { subquery: Subquery { subquery: Arc::new(new_plan), outer_ref_columns: subquery.outer_ref_columns, + spans: subquery.spans, }, negated, }))) @@ -352,6 +358,7 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { let new_subquery = Subquery { subquery: Arc::new(new_plan), outer_ref_columns: subquery.outer_ref_columns, + spans: subquery.spans, }; Ok(Transformed::yes(Expr::InSubquery(InSubquery::new( Box::new(expr.cast_to(&common_type, self.schema)?), @@ -709,7 +716,7 @@ fn coerce_frame_bound( fn extract_window_frame_target_type(col_type: &DataType) -> Result { if col_type.is_numeric() - || is_utf8_or_large_utf8(col_type) + || is_utf8_or_utf8view_or_large_utf8(col_type) || matches!(col_type, DataType::Null) || matches!(col_type, DataType::Boolean) { @@ -930,7 +937,12 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { /// This method presumes that the wildcard expansion is unneeded, or has already /// been applied. pub fn coerce_union_schema(inputs: &[Arc]) -> Result { - let base_schema = inputs[0].schema(); + coerce_union_schema_with_schema(&inputs[1..], inputs[0].schema()) +} +fn coerce_union_schema_with_schema( + inputs: &[Arc], + base_schema: &DFSchemaRef, +) -> Result { let mut union_datatypes = base_schema .fields() .iter() @@ -949,7 +961,7 @@ pub fn coerce_union_schema(inputs: &[Arc]) -> Result { let mut metadata = base_schema.metadata().clone(); - for (i, plan) in inputs.iter().enumerate().skip(1) { + for (i, plan) in inputs.iter().enumerate() { let plan_schema = plan.schema(); metadata.extend(plan_schema.metadata().clone()); @@ -989,15 +1001,15 @@ pub fn coerce_union_schema(inputs: &[Arc]) -> Result { } } let union_qualified_fields = izip!( - base_schema.iter(), + base_schema.fields(), union_datatypes.into_iter(), union_nullabilities, union_field_meta.into_iter() ) - .map(|((qualifier, field), datatype, nullable, metadata)| { + .map(|(field, datatype, nullable, metadata)| { let mut field = Field::new(field.name().clone(), datatype, nullable); field.set_metadata(metadata); - (qualifier.cloned(), field.into()) + (None, field.into()) }) .collect::>(); @@ -1041,15 +1053,16 @@ mod test { use std::sync::Arc; use arrow::datatypes::DataType::Utf8; - use arrow::datatypes::{DataType, Field, TimeUnit}; + use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use crate::analyzer::type_coercion::{ coerce_case_expression, TypeCoercion, TypeCoercionRewriter, }; + use crate::analyzer::Analyzer; use crate::test::{assert_analyzed_plan_eq, assert_analyzed_plan_with_config_eq}; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{TransformedResult, TreeNode}; - use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; + use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue, 
Spans}; use datafusion_expr::expr::{self, InSubquery, Like, ScalarFunction}; use datafusion_expr::logical_plan::{EmptyRelation, Projection, Sort}; use datafusion_expr::test::function_stub::avg_udaf; @@ -1057,9 +1070,10 @@ mod test { cast, col, create_udaf, is_true, lit, AccumulatorFactoryFunction, AggregateUDF, BinaryExpr, Case, ColumnarValue, Expr, ExprSchemable, Filter, LogicalPlan, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, - SimpleAggregateUDF, Subquery, Volatility, + SimpleAggregateUDF, Subquery, Union, Volatility, }; use datafusion_functions_aggregate::average::AvgAccumulator; + use datafusion_sql::TableReference; fn empty() -> Arc { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { @@ -1090,6 +1104,42 @@ mod test { assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected) } + #[test] + fn test_coerce_union() -> Result<()> { + let left_plan = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new( + DFSchema::try_from_qualified_schema( + TableReference::full("datafusion", "test", "foo"), + &Schema::new(vec![Field::new("a", DataType::Int32, false)]), + ) + .unwrap(), + ), + })); + let right_plan = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: Arc::new( + DFSchema::try_from_qualified_schema( + TableReference::full("datafusion", "test", "foo"), + &Schema::new(vec![Field::new("a", DataType::Int64, false)]), + ) + .unwrap(), + ), + })); + let union = LogicalPlan::Union(Union::try_new_with_loose_types(vec![ + left_plan, right_plan, + ])?); + let analyzed_union = Analyzer::with_rules(vec![Arc::new(TypeCoercion::new())]) + .execute_and_check(union, &ConfigOptions::default(), |_, _| {})?; + let top_level_plan = LogicalPlan::Projection(Projection::try_new( + vec![col("a")], + Arc::new(analyzed_union), + )?); + + let expected = "Projection: a\n Union\n Projection: CAST(datafusion.test.foo.a AS Int64) AS a\n EmptyRelation\n EmptyRelation"; + assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), top_level_plan, expected) + } + fn coerce_on_output_if_viewtype(plan: LogicalPlan, expected: &str) -> Result<()> { let mut options = ConfigOptions::default(); options.optimizer.expand_views_at_output = true; @@ -2089,6 +2139,7 @@ mod test { Subquery { subquery: empty_int32, outer_ref_columns: vec![], + spans: Spans::new(), }, false, )); @@ -2114,6 +2165,7 @@ mod test { Subquery { subquery: empty_int64, outer_ref_columns: vec![], + spans: Spans::new(), }, false, )); @@ -2138,6 +2190,7 @@ mod test { Subquery { subquery: empty_inside, outer_ref_columns: vec![], + spans: Spans::new(), }, false, )); diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 5dc1a7e5ac5b..69b5fbb9f8c0 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -797,8 +797,8 @@ mod test { use datafusion_expr::logical_plan::{table_scan, JoinType}; use datafusion_expr::{ grouping_set, is_null, not, AccumulatorFactoryFunction, AggregateUDF, - ColumnarValue, ScalarUDF, ScalarUDFImpl, Signature, SimpleAggregateUDF, - Volatility, + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, + SimpleAggregateUDF, Volatility, }; use datafusion_expr::{lit, logical_plan::builder::LogicalPlanBuilder}; @@ -1598,7 +1598,7 @@ mod test { Ok(DataType::Int32) } - fn invoke(&self, _: &[ColumnarValue]) -> Result { + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { 
panic!("not implemented") } } @@ -1705,5 +1705,9 @@ mod test { fn return_type(&self, _arg_types: &[DataType]) -> Result { Ok(DataType::Float64) } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + panic!("dummy - not implemented") + } } } diff --git a/datafusion/optimizer/src/eliminate_limit.rs b/datafusion/optimizer/src/eliminate_limit.rs index 267615c3e0d9..5d3a1b223b7a 100644 --- a/datafusion/optimizer/src/eliminate_limit.rs +++ b/datafusion/optimizer/src/eliminate_limit.rs @@ -77,6 +77,7 @@ impl OptimizerRule for EliminateLimit { } else if matches!(limit.get_skip_type()?, SkipType::Literal(0)) { // If fetch is `None` and skip is 0, then Limit takes no effect and // we can remove it. Its input also can be Limit, so we should apply again. + #[allow(clippy::used_underscore_binding)] return self.rewrite(Arc::unwrap_or_clone(limit.input), _config); } Ok(Transformed::no(LogicalPlan::Limit(limit))) diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs index ce198560805a..893cb249a2a8 100644 --- a/datafusion/optimizer/src/lib.rs +++ b/datafusion/optimizer/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! # DataFusion Optimizer diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index b7dd391586a1..b3a09e2dcbcc 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -492,8 +492,11 @@ fn merge_consecutive_projections(proj: Projection) -> Result rewrite_expr(*expr, &prev_projection).map(|result| { - result.update_data(|expr| Expr::Alias(Alias::new(expr, relation, name))) + result.update_data(|expr| { + Expr::Alias(Alias::new(expr, relation, name).with_metadata(metadata)) + }) }), e => rewrite_expr(e, &prev_projection), } diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 3a69bd91e749..04d73fe3ab87 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -447,7 +447,7 @@ fn assert_valid_optimization( plan: &LogicalPlan, prev_schema: &Arc, ) -> Result<()> { - // verify invariant: optimizer passes should not change the schema + // verify invariant: optimizer passes should not change the schema if the schema can't be cast from the previous schema. 
// Refer to assert_expected_schema(prev_schema, plan)?; @@ -459,7 +459,9 @@ mod tests { use std::sync::{Arc, Mutex}; use datafusion_common::tree_node::Transformed; - use datafusion_common::{plan_err, DFSchema, DFSchemaRef, DataFusionError, Result}; + use datafusion_common::{ + assert_contains, plan_err, DFSchema, DFSchemaRef, DataFusionError, Result, + }; use datafusion_expr::logical_plan::EmptyRelation; use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, Projection}; @@ -505,28 +507,9 @@ mod tests { schema: Arc::new(DFSchema::empty()), }); let err = opt.optimize(plan, &config, &observe).unwrap_err(); - assert!(err.strip_backtrace().starts_with( - "Optimizer rule 'get table_scan rule' failed\n\ - caused by\n\ - Check optimizer-specific invariants after optimizer rule: get table_scan rule\n\ - caused by\n\ - Internal error: Failed due to a difference in schemas, \ - original schema: DFSchema { inner: Schema { \ - fields: [], \ - metadata: {} }, \ - field_qualifiers: [], \ - functional_dependencies: FunctionalDependencies { deps: [] } \ - }, \ - new schema: DFSchema { inner: Schema { \ - fields: [\ - Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }\ - ], \ - metadata: {} }, \ - field_qualifiers: [Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" })], \ - functional_dependencies: FunctionalDependencies { deps: [] } }", - )); + + // Simplify assert to check the error message contains the expected message, which is only the schema length mismatch + assert_contains!(err.strip_backtrace(), "Schema mismatch: the schema length are not same Expected schema length: 3, got: 0"); } #[test] diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index d26df073dc6f..344707ae8dbe 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -316,7 +316,7 @@ mod tests { let plan = LogicalPlanBuilder::from(left).union(right)?.build()?; - let expected = "TableScan: test"; + let expected = "Projection: a, b, c\n TableScan: test"; assert_together_optimized_plan(plan, expected, true) } @@ -406,7 +406,7 @@ mod tests { let plan = LogicalPlanBuilder::from(left).union(right)?.build()?; - let expected = "TableScan: test"; + let expected = "Projection: a, b, c\n TableScan: test"; assert_together_optimized_plan(plan, expected, true) } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 6b408521c5cf..c9617514e453 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -799,6 +799,7 @@ impl OptimizerRule for PushDownFilter { new_predicate, child_filter.input, )?); + #[allow(clippy::used_underscore_binding)] self.rewrite(new_filter, _config) } LogicalPlan::Repartition(repartition) => { @@ -1140,6 +1141,12 @@ impl OptimizerRule for PushDownFilter { }) } LogicalPlan::Extension(extension_plan) => { + // This check prevents the Filter from being removed when the extension node has no children, + // so we return the original Filter unchanged. 
+ if extension_plan.node.inputs().is_empty() { + filter.input = Arc::new(LogicalPlan::Extension(extension_plan)); + return Ok(Transformed::no(LogicalPlan::Filter(filter))); + } let prevent_cols = extension_plan.node.prevent_predicate_push_down_columns(); @@ -3786,4 +3793,83 @@ Projection: a, b \n TableScan: test"; assert_optimized_plan_eq(plan, expected_after) } + + #[test] + fn test_push_down_filter_to_user_defined_node() -> Result<()> { + // Define a custom user-defined logical node + #[derive(Debug, Hash, Eq, PartialEq)] + struct TestUserNode { + schema: DFSchemaRef, + } + + impl PartialOrd for TestUserNode { + fn partial_cmp(&self, _other: &Self) -> Option { + None + } + } + + impl TestUserNode { + fn new() -> Self { + let schema = Arc::new( + DFSchema::new_with_metadata( + vec![(None, Field::new("a", DataType::Int64, false).into())], + Default::default(), + ) + .unwrap(), + ); + + Self { schema } + } + } + + impl UserDefinedLogicalNodeCore for TestUserNode { + fn name(&self) -> &str { + "test_node" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "TestUserNode") + } + + fn with_exprs_and_inputs( + &self, + exprs: Vec, + inputs: Vec, + ) -> Result { + assert!(exprs.is_empty()); + assert!(inputs.is_empty()); + Ok(Self { + schema: Arc::clone(&self.schema), + }) + } + } + + // Create a node and build a plan with a filter + let node = LogicalPlan::Extension(Extension { + node: Arc::new(TestUserNode::new()), + }); + + let plan = LogicalPlanBuilder::from(node).filter(lit(false))?.build()?; + + // Check the original plan format (not part of the test assertions) + let expected_before = "Filter: Boolean(false)\ + \n TestUserNode"; + assert_eq!(format!("{plan}"), expected_before); + + // Check that the filter is pushed down to the user-defined node + let expected_after = "Filter: Boolean(false)\n TestUserNode"; + assert_optimized_plan_eq(plan, expected_after) + } } diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 4da112d5153a..04ff94347247 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -82,6 +82,7 @@ impl OptimizerRule for PushDownLimit { }); // recursively reapply the rule on the new plan + #[allow(clippy::used_underscore_binding)] return self.rewrite(plan, _config); } diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 3a8aef267be5..33f10400d341 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -731,9 +731,7 @@ mod tests { .project(vec![col("customer.c_custkey")])? .build()?; - let expected = "Invalid (non-executable) plan after Analyzer\ - \ncaused by\ - \nError during planning: Scalar subquery should only return one column"; + let expected = "Error during planning: Scalar subquery should only return one column, but found 4: orders.o_orderkey, orders.o_custkey, orders.o_orderstatus, orders.o_totalprice"; assert_analyzer_check_err(vec![], plan, expected); Ok(()) } @@ -793,9 +791,7 @@ mod tests { .project(vec![col("customer.c_custkey")])? 
.build()?; - let expected = "Invalid (non-executable) plan after Analyzer\ - \ncaused by\ - \nError during planning: Scalar subquery should only return one column"; + let expected = "Error during planning: Scalar subquery should only return one column, but found 2: orders.o_custkey, orders.o_orderkey"; assert_analyzer_check_err(vec![], plan, expected); Ok(()) } diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index d5a1b84e6aff..ce10c7e5c631 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -4307,6 +4307,10 @@ mod tests { fn return_type(&self, _arg_types: &[DataType]) -> Result { Ok(DataType::Int16) } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + panic!("dummy - not implemented") + } } #[test] diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 6c99f18ab0f6..0b47cdee212f 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -22,6 +22,8 @@ use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; +const ANY_CHAR_REGEX_PATTERN: &str = ".*"; + /// Tries to convert a regexp expression to a `LIKE` or `Eq`/`NotEq` expression. /// /// This function also validates the regex pattern. And will return error if the @@ -33,6 +35,8 @@ const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; /// - full anchored regex patterns (e.g. `^foo$`) to `= 'foo'` /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'` /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND` +/// - `EQ .*` to NotNull +/// - `NE .*` means IS EMPTY /// /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`. 
pub fn simplify_regex_expr( @@ -43,6 +47,23 @@ pub fn simplify_regex_expr( let mode = OperatorMode::new(&op); if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = right.as_ref() { + // Handle the special case for ".*" pattern + if pattern == ANY_CHAR_REGEX_PATTERN { + let new_expr = if mode.not { + // not empty + let empty_lit = Box::new(lit("")); + Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right: empty_lit, + }) + } else { + // not null + left.is_not_null() + }; + return Ok(new_expr); + } + match regex_syntax::Parser::new().parse(pattern) { Ok(hir) => { let kind = hir.kind(); diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index 709d8f79c3d9..e33869ca2b63 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -765,4 +765,51 @@ mod tests { assert_optimized_plan_eq(plan, expected) } + + #[test] + fn test_simplify_regex_special_cases() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, false), + ]); + let table_scan = table_scan(Some("test"), &schema, None)?.build()?; + + // Test `= ".*"` transforms to true (except for empty strings) + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("a"), Operator::RegexMatch, lit(".*")))? + .build()?; + let expected = "Filter: test.a IS NOT NULL\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected)?; + + // Test `!= ".*"` transforms to checking if the column is empty + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))? + .build()?; + let expected = "Filter: test.a = Utf8(\"\")\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected)?; + + // Test case-insensitive versions + + // Test `=~ ".*"` (case-insensitive) transforms to true (except for empty strings) + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("b"), Operator::RegexIMatch, lit(".*")))? + .build()?; + let expected = "Filter: Boolean(true)\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected)?; + + // Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))? 
+ .build()?; + let expected = "Filter: test.a = Utf8(\"\")\ + \n TableScan: test"; + + assert_optimized_plan_eq(plan, expected) + } } diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index 66bd6b75123e..5e66c7ec0313 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -22,16 +22,14 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use datafusion_common::config::ConfigOptions; -use datafusion_common::{assert_contains, plan_err, Result, TableReference}; +use datafusion_common::{plan_err, Result, TableReference}; use datafusion_expr::planner::ExprPlanner; -use datafusion_expr::sqlparser::dialect::PostgreSqlDialect; use datafusion_expr::test::function_stub::sum_udaf; use datafusion_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource, WindowUDF}; use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::planner::AggregateFunctionPlanner; use datafusion_functions_window::planner::WindowFunctionPlanner; -use datafusion_optimizer::analyzer::type_coercion::TypeCoercionRewriter; use datafusion_optimizer::analyzer::Analyzer; use datafusion_optimizer::optimizer::Optimizer; use datafusion_optimizer::{OptimizerConfig, OptimizerContext, OptimizerRule}; @@ -344,16 +342,6 @@ fn test_propagate_empty_relation_inner_join_and_unions() { assert_eq!(expected, format!("{plan}")); } -#[test] -fn select_wildcard_with_repeated_column() { - let sql = "SELECT *, col_int32 FROM test"; - let err = test_sql(sql).expect_err("query should have failed"); - assert_eq!( - "Schema error: Schema contains duplicate qualified field name test.col_int32", - err.strip_backtrace() - ); -} - #[test] fn select_wildcard_with_repeated_column_but_is_aliased() { let sql = "SELECT *, col_int32 as col_32 FROM test"; @@ -390,32 +378,6 @@ fn select_correlated_predicate_subquery_with_uppercase_ident() { assert_eq!(expected, format!("{plan}")); } -// The test should return an error -// because the wildcard didn't be expanded before type coercion -#[test] -fn test_union_coercion_with_wildcard() -> Result<()> { - let dialect = PostgreSqlDialect {}; - let context_provider = MyContextProvider::default(); - let sql = "select * from (SELECT col_int32, col_uint32 FROM test) union all select * from(SELECT col_uint32, col_int32 FROM test)"; - let statements = Parser::parse_sql(&dialect, sql)?; - let sql_to_rel = SqlToRel::new(&context_provider); - let logical_plan = sql_to_rel.sql_statement_to_plan(statements[0].clone())?; - - if let LogicalPlan::Union(union) = logical_plan { - let err = TypeCoercionRewriter::coerce_union(union) - .err() - .unwrap() - .to_string(); - assert_contains!( - err, - "Error during planning: Wildcard should be expanded before type coercion" - ); - } else { - panic!("Expected Union plan"); - } - Ok(()) -} - fn test_sql(sql: &str) -> Result { // parse the SQL let dialect = GenericDialect {}; // or AnsiDialect, or your own dialect ... 
diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs index 440f044d88eb..86d4487f4c12 100644 --- a/datafusion/physical-expr-common/src/lib.rs +++ b/datafusion/physical-expr-common/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! Physical Expr Common packages for [DataFusion] diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index cc2ff2f24790..43f214607f9f 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -16,6 +16,7 @@ // under the License. use std::any::Any; +use std::fmt; use std::fmt::{Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; use std::sync::Arc; @@ -53,6 +54,12 @@ pub type PhysicalExprRef = Arc; /// * [`SessionContext::create_physical_expr`]: A high level API /// * [`create_physical_expr`]: A low level API /// +/// # Formatting `PhysicalExpr` as strings +/// There are three ways to format `PhysicalExpr` as a string: +/// * [`Debug`]: Standard Rust debugging format (e.g. `Constant { value: ... }`) +/// * [`Display`]: Detailed SQL-like format that shows expression structure (e.g. (`Utf8 ("foobar")`). This is often used for debugging and tests +/// * [`Self::fmt_sql`]: SQL-like human readable format (e.g. ('foobar')`), See also [`sql_fmt`] +/// /// [`SessionContext::create_physical_expr`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.create_physical_expr /// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html /// [`Expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html @@ -266,6 +273,16 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash { fn get_properties(&self, _children: &[ExprProperties]) -> Result { Ok(ExprProperties::new_unknown()) } + + /// Format this `PhysicalExpr` in nice human readable "SQL" format + /// + /// Specifically, this format is designed to be readable by humans, at the + /// expense of details. Use `Display` or `Debug` for more detailed + /// representation. + /// + /// See the [`fmt_sql`] function for an example of printing `PhysicalExpr`s as SQL. 
+ /// + fn fmt_sql(&self, f: &mut Formatter<'_>) -> fmt::Result; } /// [`PhysicalExpr`] can't be constrained by [`Eq`] directly because it must remain object @@ -363,7 +380,7 @@ where I: Iterator + Clone, I::Item: Display, { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { let mut iter = self.0.clone(); write!(f, "[")?; if let Some(expr) = iter.next() { @@ -379,3 +396,53 @@ where DisplayWrapper(exprs.into_iter()) } + +/// Prints a [`PhysicalExpr`] in a SQL-like format +/// +/// # Example +/// ``` +/// # // The boiler plate needed to create a `PhysicalExpr` for the example +/// # use std::any::Any; +/// # use std::fmt::Formatter; +/// # use std::sync::Arc; +/// # use arrow::array::RecordBatch; +/// # use arrow::datatypes::{DataType, Schema}; +/// # use datafusion_common::Result; +/// # use datafusion_expr_common::columnar_value::ColumnarValue; +/// # use datafusion_physical_expr_common::physical_expr::{fmt_sql, DynEq, PhysicalExpr}; +/// # #[derive(Debug, Hash, PartialOrd, PartialEq)] +/// # struct MyExpr {}; +/// # impl PhysicalExpr for MyExpr {fn as_any(&self) -> &dyn Any { unimplemented!() } +/// # fn data_type(&self, input_schema: &Schema) -> Result { unimplemented!() } +/// # fn nullable(&self, input_schema: &Schema) -> Result { unimplemented!() } +/// # fn evaluate(&self, batch: &RecordBatch) -> Result { unimplemented!() } +/// # fn children(&self) -> Vec<&Arc>{ unimplemented!() } +/// # fn with_new_children(self: Arc, children: Vec>) -> Result> { unimplemented!() } +/// # fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "CASE a > b THEN 1 ELSE 0 END") } +/// # } +/// # impl std::fmt::Display for MyExpr {fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { unimplemented!() } } +/// # impl DynEq for MyExpr {fn dyn_eq(&self, other: &dyn Any) -> bool { unimplemented!() } } +/// # fn make_physical_expr() -> Arc { Arc::new(MyExpr{}) } +/// let expr: Arc = make_physical_expr(); +/// // wrap the expression in `sql_fmt` which can be used with +/// // `format!`, `to_string()`, etc +/// let expr_as_sql = fmt_sql(expr.as_ref()); +/// assert_eq!( +/// "The SQL: CASE a > b THEN 1 ELSE 0 END", +/// format!("The SQL: {expr_as_sql}") +/// ); +/// ``` +pub fn fmt_sql(expr: &dyn PhysicalExpr) -> impl Display + '_ { + struct Wrapper<'a> { + expr: &'a dyn PhysicalExpr, + } + + impl Display for Wrapper<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + self.expr.fmt_sql(f)?; + Ok(()) + } + } + + Wrapper { expr } +} diff --git a/datafusion/physical-expr-common/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs index 38b820edc544..3a54b5b40399 100644 --- a/datafusion/physical-expr-common/src/sort_expr.rs +++ b/datafusion/physical-expr-common/src/sort_expr.rs @@ -17,7 +17,7 @@ //! 
Sort expressions -use crate::physical_expr::PhysicalExpr; +use crate::physical_expr::{fmt_sql, PhysicalExpr}; use std::fmt; use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; @@ -37,7 +37,7 @@ use itertools::Itertools; /// Example: /// ``` /// # use std::any::Any; -/// # use std::fmt::Display; +/// # use std::fmt::{Display, Formatter}; /// # use std::hash::Hasher; /// # use std::sync::Arc; /// # use arrow::array::RecordBatch; @@ -58,6 +58,7 @@ use itertools::Itertools; /// # fn evaluate(&self, batch: &RecordBatch) -> Result {todo!() } /// # fn children(&self) -> Vec<&Arc> {todo!()} /// # fn with_new_children(self: Arc, children: Vec>) -> Result> {todo!()} +/// # fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result { todo!() } /// # } /// # impl Display for MyPhysicalExpr { /// # fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "a") } @@ -116,6 +117,16 @@ impl PhysicalSortExpr { self.options.nulls_first = false; self } + + /// Like [`PhysicalExpr::fmt_sql`] prints a [`PhysicalSortExpr`] in a SQL-like format. + pub fn fmt_sql(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "{} {}", + fmt_sql(self.expr.as_ref()), + to_str(&self.options) + ) + } } /// Access the PhysicalSortExpr as a PhysicalExpr diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs index 34c4e52d517e..ae3d9050fa62 100644 --- a/datafusion/physical-expr/src/aggregate.rs +++ b/datafusion/physical-expr/src/aggregate.rs @@ -65,6 +65,8 @@ pub struct AggregateExprBuilder { /// Physical expressions of the aggregate function args: Vec>, alias: Option, + /// A human readable name + human_display: String, /// Arrow Schema for the aggregate function schema: SchemaRef, /// The physical order by expressions @@ -83,6 +85,7 @@ impl AggregateExprBuilder { fun, args, alias: None, + human_display: String::default(), schema: Arc::new(Schema::empty()), ordering_req: LexOrdering::default(), ignore_nulls: false, @@ -99,6 +102,7 @@ impl AggregateExprBuilder { fun, args, alias, + human_display, schema, ordering_req, ignore_nulls, @@ -148,6 +152,7 @@ impl AggregateExprBuilder { args, data_type, name, + human_display, schema: Arc::unwrap_or_clone(schema), ordering_req, ignore_nulls, @@ -164,6 +169,11 @@ impl AggregateExprBuilder { self } + pub fn human_display(mut self, name: String) -> Self { + self.human_display = name; + self + } + pub fn schema(mut self, schema: SchemaRef) -> Self { self.schema = schema; self @@ -214,7 +224,10 @@ pub struct AggregateFunctionExpr { args: Vec>, /// Output / return type of this aggregate data_type: DataType, + /// Output column name that this expression creates name: String, + /// Simplified name for `tree` explain. + human_display: String, schema: Schema, // The physical order by expressions ordering_req: LexOrdering, @@ -245,6 +258,11 @@ impl AggregateFunctionExpr { &self.name } + /// Simplified name for `tree` explain. 
+ pub fn human_display(&self) -> &str { + &self.human_display + } + /// Return if the aggregation is distinct pub fn is_distinct(&self) -> bool { self.is_distinct diff --git a/datafusion/physical-expr/src/equivalence/properties/mod.rs b/datafusion/physical-expr/src/equivalence/properties/mod.rs index 080587c0e231..c7c33ba5b2ba 100644 --- a/datafusion/physical-expr/src/equivalence/properties/mod.rs +++ b/datafusion/physical-expr/src/equivalence/properties/mod.rs @@ -52,7 +52,7 @@ use datafusion_physical_expr_common::utils::ExprPropertiesNode; use indexmap::IndexSet; use itertools::Itertools; -/// A `EquivalenceProperties` object stores information known about the output +/// `EquivalenceProperties` stores information about the output /// of a plan node, that can be used to optimize the plan. /// /// Currently, it keeps track of: @@ -61,6 +61,10 @@ use itertools::Itertools; /// - Constants expressions: expressions that are known to contain a single /// constant value. /// +/// Please see the [Using Ordering for Better Plans] blog for more details. +/// +/// [Using Ordering for Better Plans]: https://datafusion.apache.org/blog/2025/03/11/ordering-analysis/ +/// /// # Example equivalent sort expressions /// /// Consider table below: diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 1f16c5471ed7..a00d135ef3c1 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -168,9 +168,12 @@ fn boolean_op( macro_rules! binary_string_array_flag_op { ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ match $LEFT.data_type() { - DataType::Utf8View | DataType::Utf8 => { + DataType::Utf8 => { compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG) }, + DataType::Utf8View => { + compute_utf8view_flag_op!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG) + } DataType::LargeUtf8 => { compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) }, @@ -207,14 +210,42 @@ macro_rules! compute_utf8_flag_op { }}; } +/// Invoke a compute kernel on a pair of binary data arrays with flags +macro_rules! compute_utf8view_flag_op { + ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ + let ll = $LEFT + .as_any() + .downcast_ref::<$ARRAYTYPE>() + .expect("compute_utf8view_flag_op failed to downcast array"); + let rr = $RIGHT + .as_any() + .downcast_ref::<$ARRAYTYPE>() + .expect("compute_utf8view_flag_op failed to downcast array"); + + let flag = if $FLAG { + Some($ARRAYTYPE::from(vec!["i"; ll.len()])) + } else { + None + }; + let mut array = $OP(ll, rr, flag.as_ref())?; + if $NOT { + array = not(&array).unwrap(); + } + Ok(Arc::new(array)) + }}; +} + macro_rules! binary_string_array_flag_op_scalar { ($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ // This macro is slightly different from binary_string_array_flag_op because, when comparing with a scalar value, // the query can be optimized in such a way that operands will be dicts, so we need to support it here let result: Result> = match $LEFT.data_type() { - DataType::Utf8View | DataType::Utf8 => { + DataType::Utf8 => { compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG) }, + DataType::Utf8View => { + compute_utf8view_flag_op_scalar!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG) + } DataType::LargeUtf8 => { compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) }, @@ -222,7 +253,8 @@ macro_rules! 
binary_string_array_flag_op_scalar { let values = $LEFT.as_any_dictionary().values(); match values.data_type() { - DataType::Utf8View | DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG), + DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG), + DataType::Utf8View => compute_utf8view_flag_op_scalar!(values, $RIGHT, $OP, StringViewArray, $NOT, $FLAG), DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG), other => internal_err!( "Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array", @@ -276,6 +308,34 @@ macro_rules! compute_utf8_flag_op_scalar { }}; } +/// Invoke a compute kernel on a data array and a scalar value with flag +macro_rules! compute_utf8view_flag_op_scalar { + ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ + let ll = $LEFT + .as_any() + .downcast_ref::<$ARRAYTYPE>() + .expect("compute_utf8view_flag_op_scalar failed to downcast array"); + + let string_value = match $RIGHT.try_as_str() { + Some(Some(string_value)) => string_value, + // null literal or non string + _ => return internal_err!( + "compute_utf8view_flag_op_scalar failed to cast literal value {} for operation '{}'", + $RIGHT, stringify!($OP) + ) + }; + + let flag = $FLAG.then_some("i"); + let mut array = + paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?; + if $NOT { + array = not(&array).unwrap(); + } + + Ok(Arc::new(array)) + }}; +} + impl PhysicalExpr for BinaryExpr { /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { @@ -571,6 +631,32 @@ impl PhysicalExpr for BinaryExpr { _ => Ok(ExprProperties::new_unknown()), } } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn write_child( + f: &mut std::fmt::Formatter, + expr: &dyn PhysicalExpr, + precedence: u8, + ) -> std::fmt::Result { + if let Some(child) = expr.as_any().downcast_ref::() { + let p = child.op.precedence(); + if p == 0 || p < precedence { + write!(f, "(")?; + child.fmt_sql(f)?; + write!(f, ")") + } else { + child.fmt_sql(f) + } + } else { + expr.fmt_sql(f) + } + } + + let precedence = self.op.precedence(); + write_child(f, self.left.as_ref(), precedence)?; + write!(f, " {} ", self.op)?; + write_child(f, self.right.as_ref(), precedence) + } } /// Casts dictionary array to result type for binary numerical operators. 
Such operators @@ -770,6 +856,7 @@ mod tests { use crate::expressions::{col, lit, try_cast, Column, Literal}; use datafusion_common::plan_datafusion_err; + use datafusion_physical_expr_common::physical_expr::fmt_sql; /// Performs a binary operation, applying any type coercion necessary fn binary_op( @@ -4672,4 +4759,72 @@ mod tests { Ok(()) } + + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ]); + + // Test basic binary expressions + let simple_expr = binary_expr( + col("a", &schema)?, + Operator::Plus, + col("b", &schema)?, + &schema, + )?; + let display_string = simple_expr.to_string(); + assert_eq!(display_string, "a@0 + b@1"); + let sql_string = fmt_sql(&simple_expr).to_string(); + assert_eq!(sql_string, "a + b"); + + // Test nested expressions with different operator precedence + let nested_expr = binary_expr( + Arc::new(binary_expr( + col("a", &schema)?, + Operator::Plus, + col("b", &schema)?, + &schema, + )?), + Operator::Multiply, + col("b", &schema)?, + &schema, + )?; + let display_string = nested_expr.to_string(); + assert_eq!(display_string, "(a@0 + b@1) * b@1"); + let sql_string = fmt_sql(&nested_expr).to_string(); + assert_eq!(sql_string, "(a + b) * b"); + + // Test nested expressions with same operator precedence + let nested_same_prec = binary_expr( + Arc::new(binary_expr( + col("a", &schema)?, + Operator::Plus, + col("b", &schema)?, + &schema, + )?), + Operator::Plus, + col("b", &schema)?, + &schema, + )?; + let display_string = nested_same_prec.to_string(); + assert_eq!(display_string, "a@0 + b@1 + b@1"); + let sql_string = fmt_sql(&nested_same_prec).to_string(); + assert_eq!(sql_string, "a + b + b"); + + // Test with literals + let lit_expr = binary_expr( + col("a", &schema)?, + Operator::Eq, + lit(ScalarValue::Int32(Some(42))), + &schema, + )?; + let display_string = lit_expr.to_string(); + assert_eq!(display_string, "a@0 = 42"); + let sql_string = fmt_sql(&lit_expr).to_string(); + assert_eq!(sql_string, "a = 42"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 78606f05ae81..67fab3912c6a 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -559,6 +559,29 @@ impl PhysicalExpr for CaseExpr { )?)) } } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "CASE ")?; + if let Some(e) = &self.expr { + e.fmt_sql(f)?; + write!(f, " ")?; + } + + for (w, t) in &self.when_then_expr { + write!(f, "WHEN ")?; + w.fmt_sql(f)?; + write!(f, " THEN ")?; + t.fmt_sql(f)?; + write!(f, " ")?; + } + + if let Some(e) = &self.else_expr { + write!(f, "ELSE ")?; + e.fmt_sql(f)?; + write!(f, " ")?; + } + write!(f, "END") + } } /// Create a CASE expression @@ -583,6 +606,7 @@ mod tests { use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_expr::type_coercion::binary::comparison_coercion; use datafusion_expr::Operator; + use datafusion_physical_expr_common::physical_expr::fmt_sql; #[test] fn case_with_expr() -> Result<()> { @@ -1378,4 +1402,35 @@ mod tests { comparison_coercion(&left_type, right_type) }) } + + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); + + // CASE WHEN a = 'foo' THEN 123.3 ELSE 999 END + let when = binary(col("a", &schema)?, Operator::Eq, lit("foo"), &schema)?; + let 
then = lit(123.3f64); + let else_value = lit(999i32); + + let expr = generate_case_when_with_type_coercion( + None, + vec![(when, then)], + Some(else_value), + &schema, + )?; + + let display_string = expr.to_string(); + assert_eq!( + display_string, + "CASE WHEN a@0 = foo THEN 123.3 ELSE TRY_CAST(999 AS Float64) END" + ); + + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!( + sql_string, + "CASE WHEN a = foo THEN 123.3 ELSE TRY_CAST(999 AS Float64) END" + ); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 8a093e0ae92e..a6766687a881 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -194,6 +194,14 @@ impl PhysicalExpr for CastExpr { Ok(ExprProperties::new_unknown().with_range(unbounded)) } } + + fn fmt_sql(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "CAST(")?; + self.expr.fmt_sql(f)?; + write!(f, " AS {:?}", self.cast_type)?; + + write!(f, ")") + } } /// Return a PhysicalExpression representing `expr` casted to @@ -243,6 +251,7 @@ mod tests { datatypes::*, }; use datafusion_common::assert_contains; + use datafusion_physical_expr_common::physical_expr::fmt_sql; // runs an end-to-end test of physical type cast // 1. construct a record batch with a column "a" of type A @@ -766,4 +775,26 @@ mod tests { expression.evaluate(&batch)?; Ok(()) } + + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", Int32, true)]); + + // Test numeric casting + let expr = cast(col("a", &schema)?, &schema, Int64)?; + let display_string = expr.to_string(); + assert_eq!(display_string, "CAST(a@0 AS Int64)"); + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "CAST(a AS Int64)"); + + // Test string casting + let schema = Schema::new(vec![Field::new("b", Utf8, true)]); + let expr = cast(col("b", &schema)?, &schema, Int32)?; + let display_string = expr.to_string(); + assert_eq!(display_string, "CAST(b@0 AS Int32)"); + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "CAST(b AS Int32)"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/column.rs b/datafusion/physical-expr/src/expressions/column.rs index 0ec985887c3f..ab5b35984753 100644 --- a/datafusion/physical-expr/src/expressions/column.rs +++ b/datafusion/physical-expr/src/expressions/column.rs @@ -137,6 +137,10 @@ impl PhysicalExpr for Column { ) -> Result> { Ok(self) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name) + } } impl Column { diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index dfe9a905dfea..469f7bbee317 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -398,6 +398,22 @@ impl PhysicalExpr for InListExpr { self.static_filter.clone(), ))) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.expr.fmt_sql(f)?; + if self.negated { + write!(f, " NOT")?; + } + + write!(f, " IN (")?; + for (i, expr) in self.list.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + expr.fmt_sql(f)?; + } + write!(f, ")") + } } impl PartialEq for InListExpr { @@ -453,6 +469,7 @@ mod tests { use crate::expressions::{col, lit, try_cast}; use datafusion_common::plan_err; use datafusion_expr::type_coercion::binary::comparison_coercion; + 
use datafusion_physical_expr_common::physical_expr::fmt_sql; type InListCastResult = (Arc, Vec>); @@ -1422,4 +1439,44 @@ mod tests { Ok(()) } + + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); + let col_a = col("a", &schema)?; + + // Test: a IN ('a', 'b') + let list = vec![lit("a"), lit("b")]; + let expr = in_list(Arc::clone(&col_a), list, &false, &schema)?; + let sql_string = fmt_sql(expr.as_ref()).to_string(); + let display_string = expr.to_string(); + assert_eq!(sql_string, "a IN (a, b)"); + assert_eq!(display_string, "Use a@0 IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }])"); + + // Test: a NOT IN ('a', 'b') + let list = vec![lit("a"), lit("b")]; + let expr = in_list(Arc::clone(&col_a), list, &true, &schema)?; + let sql_string = fmt_sql(expr.as_ref()).to_string(); + let display_string = expr.to_string(); + assert_eq!(sql_string, "a NOT IN (a, b)"); + assert_eq!(display_string, "a@0 NOT IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }])"); + + // Test: a IN ('a', 'b', NULL) + let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))]; + let expr = in_list(Arc::clone(&col_a), list, &false, &schema)?; + let sql_string = fmt_sql(expr.as_ref()).to_string(); + let display_string = expr.to_string(); + assert_eq!(sql_string, "a IN (a, b, NULL)"); + assert_eq!(display_string, "Use a@0 IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }, Literal { value: Utf8(NULL) }])"); + + // Test: a NOT IN ('a', 'b', NULL) + let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))]; + let expr = in_list(Arc::clone(&col_a), list, &true, &schema)?; + let sql_string = fmt_sql(expr.as_ref()).to_string(); + let display_string = expr.to_string(); + assert_eq!(sql_string, "a NOT IN (a, b, NULL)"); + assert_eq!(display_string, "a@0 NOT IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }, Literal { value: Utf8(NULL) }])"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/is_not_null.rs b/datafusion/physical-expr/src/expressions/is_not_null.rs index 47dc53d12555..0619e7248858 100644 --- a/datafusion/physical-expr/src/expressions/is_not_null.rs +++ b/datafusion/physical-expr/src/expressions/is_not_null.rs @@ -104,6 +104,11 @@ impl PhysicalExpr for IsNotNullExpr { ) -> Result> { Ok(Arc::new(IsNotNullExpr::new(Arc::clone(&children[0])))) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.arg.fmt_sql(f)?; + write!(f, " IS NOT NULL") + } } /// Create an IS NOT NULL expression @@ -121,6 +126,7 @@ mod tests { use arrow::buffer::ScalarBuffer; use arrow::datatypes::*; use datafusion_common::cast::as_boolean_array; + use datafusion_physical_expr_common::physical_expr::fmt_sql; #[test] fn is_not_null_op() -> Result<()> { @@ -187,4 +193,29 @@ mod tests { assert_eq!(expected, actual); } + + #[test] + fn test_fmt_sql() -> Result<()> { + let union_fields: UnionFields = [ + (0, Arc::new(Field::new("A", DataType::Int32, true))), + (1, Arc::new(Field::new("B", DataType::Float64, true))), + ] + .into_iter() + .collect(); + + let field = Field::new( + "my_union", + DataType::Union(union_fields, UnionMode::Sparse), + true, + ); + + let schema = Schema::new(vec![field]); + let expr = is_not_null(col("my_union", &schema).unwrap()).unwrap(); + let display_string = expr.to_string(); + assert_eq!(display_string, "my_union@0 IS NOT NULL"); + let sql_string = fmt_sql(expr.as_ref()).to_string(); + 
assert_eq!(sql_string, "my_union IS NOT NULL"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/is_null.rs b/datafusion/physical-expr/src/expressions/is_null.rs index 5e883dff997a..4c6081f35cad 100644 --- a/datafusion/physical-expr/src/expressions/is_null.rs +++ b/datafusion/physical-expr/src/expressions/is_null.rs @@ -103,6 +103,11 @@ impl PhysicalExpr for IsNullExpr { ) -> Result> { Ok(Arc::new(IsNullExpr::new(Arc::clone(&children[0])))) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.arg.fmt_sql(f)?; + write!(f, " IS NULL") + } } /// Create an IS NULL expression @@ -120,6 +125,7 @@ mod tests { use arrow::buffer::ScalarBuffer; use arrow::datatypes::*; use datafusion_common::cast::as_boolean_array; + use datafusion_physical_expr_common::physical_expr::fmt_sql; #[test] fn is_null_op() -> Result<()> { @@ -209,4 +215,18 @@ mod tests { let expected = &BooleanArray::from(vec![false, true, false, true, false, true]); assert_eq!(expected, &result); } + + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); + + // expression: "a is null" + let expr = is_null(col("a", &schema)?).unwrap(); + let display_string = expr.to_string(); + assert_eq!(display_string, "a@0 IS NULL"); + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "a IS NULL"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr/src/expressions/like.rs index b26927b77f1f..ebf9882665ba 100644 --- a/datafusion/physical-expr/src/expressions/like.rs +++ b/datafusion/physical-expr/src/expressions/like.rs @@ -145,6 +145,12 @@ impl PhysicalExpr for LikeExpr { Arc::clone(&children[1]), ))) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.expr.fmt_sql(f)?; + write!(f, " {} ", self.op_name())?; + self.pattern.fmt_sql(f) + } } /// used for optimize Dictionary like @@ -185,6 +191,7 @@ mod test { use arrow::array::*; use arrow::datatypes::Field; use datafusion_common::cast::as_boolean_array; + use datafusion_physical_expr_common::physical_expr::fmt_sql; macro_rules! 
test_like { ($A_VEC:expr, $B_VEC:expr, $VEC:expr, $NULLABLE: expr, $NEGATED:expr, $CASE_INSENSITIVE:expr,) => {{ @@ -256,4 +263,30 @@ mod test { Ok(()) } + + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ]); + + let expr = like( + false, + false, + col("a", &schema)?, + col("b", &schema)?, + &schema, + )?; + + // Display format + let display_string = expr.to_string(); + assert_eq!(display_string, "a@0 LIKE b@1"); + + // fmt_sql format + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "a LIKE b"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/literal.rs b/datafusion/physical-expr/src/expressions/literal.rs index 232f9769b056..0d0c0ecc62c7 100644 --- a/datafusion/physical-expr/src/expressions/literal.rs +++ b/datafusion/physical-expr/src/expressions/literal.rs @@ -93,6 +93,10 @@ impl PhysicalExpr for Literal { preserves_lex_ordering: true, }) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self, f) + } } /// Create a literal expression @@ -110,6 +114,7 @@ mod tests { use arrow::array::Int32Array; use arrow::datatypes::*; use datafusion_common::cast::as_int32_array; + use datafusion_physical_expr_common::physical_expr::fmt_sql; #[test] fn literal_i32() -> Result<()> { @@ -136,4 +141,16 @@ mod tests { Ok(()) } + + #[test] + fn test_fmt_sql() -> Result<()> { + // create and evaluate a literal expression + let expr = lit(42i32); + let display_string = expr.to_string(); + assert_eq!(display_string, "42"); + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "42"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs index 8795545274a2..33a1bae14d42 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr/src/expressions/negative.rs @@ -167,6 +167,12 @@ impl PhysicalExpr for NegativeExpr { preserves_lex_ordering: false, }) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "(- ")?; + self.arg.fmt_sql(f)?; + write!(f, ")") + } } /// Creates a unary expression NEGATIVE @@ -202,6 +208,7 @@ mod tests { use datafusion_common::cast::as_primitive_array; use datafusion_common::{DataFusionError, ScalarValue}; + use datafusion_physical_expr_common::physical_expr::fmt_sql; use paste::paste; macro_rules! 
test_array_negative_op { @@ -379,4 +386,15 @@ mod tests { matches!(expr, DataFusionError::Plan(_)); Ok(()) } + + #[test] + fn test_fmt_sql() -> Result<()> { + let expr = NegativeExpr::new(Arc::new(Column::new("a", 0))); + let display_string = expr.to_string(); + assert_eq!(display_string, "(- a@0)"); + let sql_string = fmt_sql(&expr).to_string(); + assert_eq!(sql_string, "(- a)"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/no_op.rs b/datafusion/physical-expr/src/expressions/no_op.rs index c17b52f5cdff..24d2f4d9e074 100644 --- a/datafusion/physical-expr/src/expressions/no_op.rs +++ b/datafusion/physical-expr/src/expressions/no_op.rs @@ -77,4 +77,8 @@ impl PhysicalExpr for NoOp { ) -> Result> { Ok(self) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self, f) + } } diff --git a/datafusion/physical-expr/src/expressions/not.rs b/datafusion/physical-expr/src/expressions/not.rs index ddf7c739b692..8a3348b43d20 100644 --- a/datafusion/physical-expr/src/expressions/not.rs +++ b/datafusion/physical-expr/src/expressions/not.rs @@ -175,6 +175,11 @@ impl PhysicalExpr for NotExpr { _ => internal_err!("NotExpr can only operate on Boolean datatypes"), } } + + fn fmt_sql(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "NOT ")?; + self.arg.fmt_sql(f) + } } /// Creates a unary expression NOT @@ -190,6 +195,7 @@ mod tests { use crate::expressions::{col, Column}; use arrow::{array::BooleanArray, datatypes::*}; + use datafusion_physical_expr_common::physical_expr::fmt_sql; #[test] fn neg_op() -> Result<()> { @@ -322,6 +328,21 @@ mod tests { Ok(()) } + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = schema(); + + let expr = not(col("a", &schema)?)?; + + let display_string = expr.to_string(); + assert_eq!(display_string, "NOT a@0"); + + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "NOT a"); + + Ok(()) + } + fn schema() -> SchemaRef { static SCHEMA: LazyLock = LazyLock::new(|| { Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, true)])) diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr/src/expressions/try_cast.rs index 06f4e929992e..e49815cd8b64 100644 --- a/datafusion/physical-expr/src/expressions/try_cast.rs +++ b/datafusion/physical-expr/src/expressions/try_cast.rs @@ -123,6 +123,12 @@ impl PhysicalExpr for TryCastExpr { self.cast_type.clone(), ))) } + + fn fmt_sql(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "TRY_CAST(")?; + self.expr.fmt_sql(f)?; + write!(f, " AS {:?})", self.cast_type) + } } /// Return a PhysicalExpression representing `expr` casted to @@ -158,6 +164,7 @@ mod tests { }, datatypes::*, }; + use datafusion_physical_expr_common::physical_expr::fmt_sql; // runs an end-to-end test of physical type cast // 1. 
construct a record batch with a column "a" of type A @@ -573,4 +580,26 @@ mod tests { .with_precision_and_scale(precision, scale) .unwrap() } + + #[test] + fn test_fmt_sql() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + + // Test numeric casting + let expr = try_cast(col("a", &schema)?, &schema, DataType::Int64)?; + let display_string = expr.to_string(); + assert_eq!(display_string, "TRY_CAST(a@0 AS Int64)"); + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "TRY_CAST(a AS Int64)"); + + // Test string casting + let schema = Schema::new(vec![Field::new("b", DataType::Utf8, true)]); + let expr = try_cast(col("b", &schema)?, &schema, DataType::Int32)?; + let display_string = expr.to_string(); + assert_eq!(display_string, "TRY_CAST(b@0 AS Int32)"); + let sql_string = fmt_sql(expr.as_ref()).to_string(); + assert_eq!(sql_string, "TRY_CAST(b AS Int32)"); + + Ok(()) + } } diff --git a/datafusion/physical-expr/src/expressions/unknown_column.rs b/datafusion/physical-expr/src/expressions/unknown_column.rs index a63caf7e1305..2face4eb6bdb 100644 --- a/datafusion/physical-expr/src/expressions/unknown_column.rs +++ b/datafusion/physical-expr/src/expressions/unknown_column.rs @@ -86,6 +86,10 @@ impl PhysicalExpr for UnKnownColumn { ) -> Result> { Ok(self) } + + fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self, f) + } } impl Hash for UnKnownColumn { diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 0a448fa6a2e9..93ced2eb628d 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] // Backward compatibility @@ -36,10 +37,6 @@ mod partitioning; mod physical_expr; pub mod planner; mod scalar_function; -pub mod udf { - #[allow(deprecated)] - pub use crate::scalar_function::create_physical_expr; -} pub mod statistics; pub mod utils; pub mod window; @@ -57,6 +54,7 @@ pub use equivalence::{ }; pub use partitioning::{Distribution, Partitioning}; pub use physical_expr::{ + create_ordering, create_physical_sort_expr, create_physical_sort_exprs, physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, PhysicalExprRef, }; diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index a4184845a0de..63c4ccbb4b38 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -17,7 +17,9 @@ use std::sync::Arc; -use datafusion_common::HashMap; +use crate::create_physical_expr; +use datafusion_common::{DFSchema, HashMap}; +use datafusion_expr::execution_props::ExecutionProps; pub(crate) use datafusion_physical_expr_common::physical_expr::PhysicalExpr; pub use datafusion_physical_expr_common::physical_expr::PhysicalExprRef; use itertools::izip; @@ -58,6 +60,126 @@ pub fn physical_exprs_bag_equal( multi_set_lhs == multi_set_rhs } +use crate::{expressions, LexOrdering, PhysicalSortExpr}; +use arrow::compute::SortOptions; +use arrow::datatypes::Schema; +use 
datafusion_common::plan_err; +use datafusion_common::Result; +use datafusion_expr::{Expr, SortExpr}; + +/// Converts logical sort expressions to physical sort expressions +/// +/// This function transforms a collection of logical sort expressions into their physical +/// representation that can be used during query execution. +/// +/// # Arguments +/// +/// * `schema` - The schema containing column definitions +/// * `sort_order` - A collection of logical sort expressions grouped into lexicographic orderings +/// +/// # Returns +/// +/// A vector of lexicographic orderings for physical execution, or an error if the transformation fails +/// +/// # Examples +/// +/// ``` +/// // Create orderings from columns "id" and "name" +/// # use arrow::datatypes::{Schema, Field, DataType}; +/// # use datafusion_physical_expr::create_ordering; +/// # use datafusion_common::Column; +/// # use datafusion_expr::{Expr, SortExpr}; +/// # +/// // Create a schema with two fields +/// let schema = Schema::new(vec![ +/// Field::new("id", DataType::Int32, false), +/// Field::new("name", DataType::Utf8, false), +/// ]); +/// +/// let sort_exprs = vec![ +/// vec![ +/// SortExpr { expr: Expr::Column(Column::new(Some("t"), "id")), asc: true, nulls_first: false } +/// ], +/// vec![ +/// SortExpr { expr: Expr::Column(Column::new(Some("t"), "name")), asc: false, nulls_first: true } +/// ] +/// ]; +/// let result = create_ordering(&schema, &sort_exprs).unwrap(); +/// ``` +pub fn create_ordering( + schema: &Schema, + sort_order: &[Vec], +) -> Result> { + let mut all_sort_orders = vec![]; + + for (group_idx, exprs) in sort_order.iter().enumerate() { + // Construct PhysicalSortExpr objects from Expr objects: + let mut sort_exprs = LexOrdering::default(); + for (expr_idx, sort) in exprs.iter().enumerate() { + match &sort.expr { + Expr::Column(col) => match expressions::col(&col.name, schema) { + Ok(expr) => { + sort_exprs.push(PhysicalSortExpr { + expr, + options: SortOptions { + descending: !sort.asc, + nulls_first: sort.nulls_first, + }, + }); + } + // Cannot find expression in the projected_schema, stop iterating + // since rest of the orderings are violated + Err(_) => break, + }, + expr => { + return plan_err!( + "Expected single column reference in sort_order[{}][{}], got {}", + group_idx, + expr_idx, + expr + ); + } + } + } + if !sort_exprs.is_empty() { + all_sort_orders.push(sort_exprs); + } + } + Ok(all_sort_orders) +} + +/// Create a physical sort expression from a logical expression +pub fn create_physical_sort_expr( + e: &SortExpr, + input_dfschema: &DFSchema, + execution_props: &ExecutionProps, +) -> Result { + let SortExpr { + expr, + asc, + nulls_first, + } = e; + Ok(PhysicalSortExpr { + expr: create_physical_expr(expr, input_dfschema, execution_props)?, + options: SortOptions { + descending: !asc, + nulls_first: *nulls_first, + }, + }) +} + +/// Create vector of physical sort expression from a vector of logical expression +pub fn create_physical_sort_exprs( + exprs: &[SortExpr], + input_dfschema: &DFSchema, + execution_props: &ExecutionProps, +) -> Result { + exprs + .iter() + .map(|expr| create_physical_sort_expr(expr, input_dfschema, execution_props)) + .collect::>() +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs index bd38fb22ccbc..44bbcc4928c6 100644 --- a/datafusion/physical-expr/src/scalar_function.rs +++ b/datafusion/physical-expr/src/scalar_function.rs @@ -39,12 +39,12 @@ use 
crate::PhysicalExpr; use arrow::array::{Array, RecordBatch}; use arrow::datatypes::{DataType, Schema}; -use datafusion_common::{internal_err, DFSchema, Result, ScalarValue}; +use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::ExprProperties; use datafusion_expr::type_coercion::functions::data_types_with_scalar_udf; use datafusion_expr::{ - expr_vec_fmt, ColumnarValue, Expr, ReturnTypeArgs, ScalarFunctionArgs, ScalarUDF, + expr_vec_fmt, ColumnarValue, ReturnTypeArgs, ScalarFunctionArgs, ScalarUDF, }; /// Physical expression of a scalar function @@ -260,36 +260,15 @@ impl PhysicalExpr for ScalarFunctionExpr { preserves_lex_ordering, }) } -} -/// Create a physical expression for the UDF. -#[deprecated(since = "45.0.0", note = "use ScalarFunctionExpr::new() instead")] -pub fn create_physical_expr( - fun: &ScalarUDF, - input_phy_exprs: &[Arc], - input_schema: &Schema, - args: &[Expr], - input_dfschema: &DFSchema, -) -> Result> { - let input_expr_types = input_phy_exprs - .iter() - .map(|e| e.data_type(input_schema)) - .collect::>>()?; - - // verify that input data types is consistent with function's `TypeSignature` - data_types_with_scalar_udf(&input_expr_types, fun)?; - - // Since we have arg_types, we don't need args and schema. - let return_type = - fun.return_type_from_exprs(args, input_dfschema, &input_expr_types)?; - - Ok(Arc::new( - ScalarFunctionExpr::new( - fun.name(), - Arc::new(fun.clone()), - input_phy_exprs.to_vec(), - return_type, - ) - .with_nullable(fun.is_nullable(args, input_dfschema)), - )) + fn fmt_sql(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}(", self.name)?; + for (i, expr) in self.args.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + expr.fmt_sql(f)?; + } + write!(f, ")") + } } diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index e8473e6556d1..aaadb09bcc98 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -53,3 +53,4 @@ recursive = { workspace = true, optional = true } [dev-dependencies] datafusion-expr = { workspace = true } datafusion-functions-nested = { workspace = true } +insta = { workspace = true } diff --git a/datafusion/physical-optimizer/src/aggregate_statistics.rs b/datafusion/physical-optimizer/src/aggregate_statistics.rs index a9b02188a7a2..0d3d83c58373 100644 --- a/datafusion/physical-optimizer/src/aggregate_statistics.rs +++ b/datafusion/physical-optimizer/src/aggregate_statistics.rs @@ -45,7 +45,7 @@ impl PhysicalOptimizerRule for AggregateStatistics { fn optimize( &self, plan: Arc, - _config: &ConfigOptions, + config: &ConfigOptions, ) -> Result> { if let Some(partial_agg_exec) = take_optimizable(&*plan) { let partial_agg_exec = partial_agg_exec @@ -83,12 +83,12 @@ impl PhysicalOptimizerRule for AggregateStatistics { )?)) } else { plan.map_children(|child| { - self.optimize(child, _config).map(Transformed::yes) + self.optimize(child, config).map(Transformed::yes) }) .data() } } else { - plan.map_children(|child| self.optimize(child, _config).map(Transformed::yes)) + plan.map_children(|child| self.optimize(child, config).map(Transformed::yes)) .data() } } diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index 2613b95bbdc0..35503f3b0b5f 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = 
"https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] pub mod aggregate_statistics; diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 2004aeafb893..b5287f3d33f3 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1884,8 +1884,9 @@ mod tests { use std::ops::{Not, Rem}; use super::*; - use datafusion_common::assert_batches_eq; + use datafusion_common::test_util::batches_to_string; use datafusion_expr::{col, lit}; + use insta::assert_snapshot; use arrow::array::Decimal128Array; use arrow::{ @@ -2466,18 +2467,16 @@ mod tests { let batch = build_statistics_record_batch(&statistics, &required_columns).unwrap(); - let expected = [ - "+--------+--------+--------+--------+", - "| s1_min | s2_max | s3_max | s3_min |", - "+--------+--------+--------+--------+", - "| | 20 | q | a |", - "| | | | |", - "| 9 | | r | |", - "| | | | |", - "+--------+--------+--------+--------+", - ]; - - assert_batches_eq!(expected, &[batch]); + assert_snapshot!(batches_to_string(&[batch]), @r" + +--------+--------+--------+--------+ + | s1_min | s2_max | s3_max | s3_min | + +--------+--------+--------+--------+ + | | 20 | q | a | + | | | | | + | 9 | | r | | + | | | | | + +--------+--------+--------+--------+ + "); } #[test] @@ -2505,15 +2504,14 @@ mod tests { let batch = build_statistics_record_batch(&statistics, &required_columns).unwrap(); - let expected = [ - "+-------------------------------+", - "| s1_min |", - "+-------------------------------+", - "| 1970-01-01T00:00:00.000000010 |", - "+-------------------------------+", - ]; - assert_batches_eq!(expected, &[batch]); + assert_snapshot!(batches_to_string(&[batch]), @r" + +-------------------------------+ + | s1_min | + +-------------------------------+ + | 1970-01-01T00:00:00.000000010 | + +-------------------------------+ + "); } #[test] @@ -2551,15 +2549,13 @@ mod tests { let batch = build_statistics_record_batch(&statistics, &required_columns).unwrap(); - let expected = [ - "+--------+", - "| s1_min |", - "+--------+", - "| |", - "+--------+", - ]; - - assert_batches_eq!(expected, &[batch]); + assert_snapshot!(batches_to_string(&[batch]), @r" + +--------+ + | s1_min | + +--------+ + | | + +--------+ + "); } #[test] diff --git a/datafusion/physical-optimizer/src/topk_aggregation.rs b/datafusion/physical-optimizer/src/topk_aggregation.rs index 0e5fb82d9e93..faedea55ca15 100644 --- a/datafusion/physical-optimizer/src/topk_aggregation.rs +++ b/datafusion/physical-optimizer/src/topk_aggregation.rs @@ -56,7 +56,11 @@ impl TopKAggregation { } let group_key = aggr.group_expr().expr().iter().exactly_one().ok()?; let kt = group_key.0.data_type(&aggr.input().schema()).ok()?; - if !kt.is_primitive() && kt != DataType::Utf8 { + if !kt.is_primitive() + && kt != DataType::Utf8 + && kt != DataType::Utf8View + && kt != DataType::LargeUtf8 + { return None; } if aggr.filter_expr().iter().any(|e| e.is_some()) { diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 4a10398e5a9e..1f38e2ed3126 100644 --- a/datafusion/physical-plan/Cargo.toml +++ 
b/datafusion/physical-plan/Cargo.toml @@ -68,6 +68,7 @@ tokio = { workspace = true } criterion = { workspace = true, features = ["async_futures"] } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window = { workspace = true } +insta = { workspace = true } rand = { workspace = true } rstest = { workspace = true } rstest_reuse = "0.7.0" diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 7d4837d04774..8906468f68db 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -48,6 +48,7 @@ use datafusion_physical_expr::{ PhysicalSortRequirement, }; +use datafusion_physical_expr_common::physical_expr::fmt_sql; use itertools::Itertools; pub(crate) mod group_values; @@ -744,19 +745,22 @@ impl DisplayAs for AggregateExec { ) -> std::fmt::Result { match t { DisplayFormatType::Default | DisplayFormatType::Verbose => { + let format_expr_with_alias = + |(e, alias): &(Arc, String)| -> String { + let e = e.to_string(); + if &e != alias { + format!("{e} as {alias}") + } else { + e + } + }; + write!(f, "AggregateExec: mode={:?}", self.mode)?; let g: Vec = if self.group_by.is_single() { self.group_by .expr .iter() - .map(|(e, alias)| { - let e = e.to_string(); - if &e != alias { - format!("{e} as {alias}") - } else { - e - } - }) + .map(format_expr_with_alias) .collect() } else { self.group_by @@ -768,21 +772,11 @@ impl DisplayAs for AggregateExec { .enumerate() .map(|(idx, is_null)| { if *is_null { - let (e, alias) = &self.group_by.null_expr[idx]; - let e = e.to_string(); - if &e != alias { - format!("{e} as {alias}") - } else { - e - } + format_expr_with_alias( + &self.group_by.null_expr[idx], + ) } else { - let (e, alias) = &self.group_by.expr[idx]; - let e = e.to_string(); - if &e != alias { - format!("{e} as {alias}") - } else { - e - } + format_expr_with_alias(&self.group_by.expr[idx]) } }) .collect::>() @@ -809,8 +803,57 @@ impl DisplayAs for AggregateExec { } } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "")?; + let format_expr_with_alias = + |(e, alias): &(Arc, String)| -> String { + let expr_sql = fmt_sql(e.as_ref()).to_string(); + if &expr_sql != alias { + format!("{expr_sql} as {alias}") + } else { + expr_sql + } + }; + + let g: Vec = if self.group_by.is_single() { + self.group_by + .expr + .iter() + .map(format_expr_with_alias) + .collect() + } else { + self.group_by + .groups + .iter() + .map(|group| { + let terms = group + .iter() + .enumerate() + .map(|(idx, is_null)| { + if *is_null { + format_expr_with_alias( + &self.group_by.null_expr[idx], + ) + } else { + format_expr_with_alias(&self.group_by.expr[idx]) + } + }) + .collect::>() + .join(", "); + format!("({terms})") + }) + .collect() + }; + let a: Vec = self + .aggr_expr + .iter() + .map(|agg| agg.human_display().to_string()) + .collect(); + writeln!(f, "mode={:?}", self.mode)?; + if !g.is_empty() { + writeln!(f, "group_by={}", g.join(", "))?; + } + if !a.is_empty() { + writeln!(f, "aggr={}", a.join(", "))?; + } } } Ok(()) @@ -1382,10 +1425,8 @@ mod tests { }; use arrow::compute::{concat_batches, SortOptions}; use arrow::datatypes::{DataType, Int32Type}; - use datafusion_common::{ - assert_batches_eq, assert_batches_sorted_eq, internal_err, DataFusionError, - ScalarValue, - }; + use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; + use datafusion_common::{internal_err, DataFusionError, ScalarValue}; use 
datafusion_execution::config::SessionConfig; use datafusion_execution::memory_pool::FairSpillPool; use datafusion_execution::runtime_env::RuntimeEnvBuilder; @@ -1402,6 +1443,7 @@ mod tests { use datafusion_physical_expr::PhysicalSortExpr; use futures::{FutureExt, Stream}; + use insta::{allow_duplicates, assert_snapshot}; // Generate a schema which consists of 5 columns (a, b, c, d, e) fn create_test_schema() -> Result { @@ -1558,56 +1600,63 @@ mod tests { let result = collect(partial_aggregate.execute(0, Arc::clone(&task_ctx))?).await?; - let expected = if spill { + if spill { // In spill mode, we test with the limited memory, if the mem usage exceeds, // we trigger the early emit rule, which turns out the partial aggregate result. - vec![ - "+---+-----+---------------+-----------------+", - "| a | b | __grouping_id | COUNT(1)[count] |", - "+---+-----+---------------+-----------------+", - "| | 1.0 | 2 | 1 |", - "| | 1.0 | 2 | 1 |", - "| | 2.0 | 2 | 1 |", - "| | 2.0 | 2 | 1 |", - "| | 3.0 | 2 | 1 |", - "| | 3.0 | 2 | 1 |", - "| | 4.0 | 2 | 1 |", - "| | 4.0 | 2 | 1 |", - "| 2 | | 1 | 1 |", - "| 2 | | 1 | 1 |", - "| 2 | 1.0 | 0 | 1 |", - "| 2 | 1.0 | 0 | 1 |", - "| 3 | | 1 | 1 |", - "| 3 | | 1 | 2 |", - "| 3 | 2.0 | 0 | 2 |", - "| 3 | 3.0 | 0 | 1 |", - "| 4 | | 1 | 1 |", - "| 4 | | 1 | 2 |", - "| 4 | 3.0 | 0 | 1 |", - "| 4 | 4.0 | 0 | 2 |", - "+---+-----+---------------+-----------------+", - ] + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&result), + @r" ++---+-----+---------------+-----------------+ +| a | b | __grouping_id | COUNT(1)[count] | ++---+-----+---------------+-----------------+ +| | 1.0 | 2 | 1 | +| | 1.0 | 2 | 1 | +| | 2.0 | 2 | 1 | +| | 2.0 | 2 | 1 | +| | 3.0 | 2 | 1 | +| | 3.0 | 2 | 1 | +| | 4.0 | 2 | 1 | +| | 4.0 | 2 | 1 | +| 2 | | 1 | 1 | +| 2 | | 1 | 1 | +| 2 | 1.0 | 0 | 1 | +| 2 | 1.0 | 0 | 1 | +| 3 | | 1 | 1 | +| 3 | | 1 | 2 | +| 3 | 2.0 | 0 | 2 | +| 3 | 3.0 | 0 | 1 | +| 4 | | 1 | 1 | +| 4 | | 1 | 2 | +| 4 | 3.0 | 0 | 1 | +| 4 | 4.0 | 0 | 2 | ++---+-----+---------------+-----------------+ + " + ); + } } else { - vec![ - "+---+-----+---------------+-----------------+", - "| a | b | __grouping_id | COUNT(1)[count] |", - "+---+-----+---------------+-----------------+", - "| | 1.0 | 2 | 2 |", - "| | 2.0 | 2 | 2 |", - "| | 3.0 | 2 | 2 |", - "| | 4.0 | 2 | 2 |", - "| 2 | | 1 | 2 |", - "| 2 | 1.0 | 0 | 2 |", - "| 3 | | 1 | 3 |", - "| 3 | 2.0 | 0 | 2 |", - "| 3 | 3.0 | 0 | 1 |", - "| 4 | | 1 | 3 |", - "| 4 | 3.0 | 0 | 1 |", - "| 4 | 4.0 | 0 | 2 |", - "+---+-----+---------------+-----------------+", - ] + allow_duplicates! 
{ + assert_snapshot!(batches_to_sort_string(&result), + @r" ++---+-----+---------------+-----------------+ +| a | b | __grouping_id | COUNT(1)[count] | ++---+-----+---------------+-----------------+ +| | 1.0 | 2 | 2 | +| | 2.0 | 2 | 2 | +| | 3.0 | 2 | 2 | +| | 4.0 | 2 | 2 | +| 2 | | 1 | 2 | +| 2 | 1.0 | 0 | 2 | +| 3 | | 1 | 3 | +| 3 | 2.0 | 0 | 2 | +| 3 | 3.0 | 0 | 1 | +| 4 | | 1 | 3 | +| 4 | 3.0 | 0 | 1 | +| 4 | 4.0 | 0 | 2 | ++---+-----+---------------+-----------------+ + " + ); + } }; - assert_batches_sorted_eq!(expected, &result); let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); @@ -1633,26 +1682,29 @@ mod tests { assert_eq!(batch.num_columns(), 4); assert_eq!(batch.num_rows(), 12); - let expected = vec![ - "+---+-----+---------------+----------+", - "| a | b | __grouping_id | COUNT(1) |", - "+---+-----+---------------+----------+", - "| | 1.0 | 2 | 2 |", - "| | 2.0 | 2 | 2 |", - "| | 3.0 | 2 | 2 |", - "| | 4.0 | 2 | 2 |", - "| 2 | | 1 | 2 |", - "| 2 | 1.0 | 0 | 2 |", - "| 3 | | 1 | 3 |", - "| 3 | 2.0 | 0 | 2 |", - "| 3 | 3.0 | 0 | 1 |", - "| 4 | | 1 | 3 |", - "| 4 | 3.0 | 0 | 1 |", - "| 4 | 4.0 | 0 | 2 |", - "+---+-----+---------------+----------+", - ]; - - assert_batches_sorted_eq!(&expected, &result); + allow_duplicates! { + assert_snapshot!( + batches_to_sort_string(&result), + @r" + +---+-----+---------------+----------+ + | a | b | __grouping_id | COUNT(1) | + +---+-----+---------------+----------+ + | | 1.0 | 2 | 2 | + | | 2.0 | 2 | 2 | + | | 3.0 | 2 | 2 | + | | 4.0 | 2 | 2 | + | 2 | | 1 | 2 | + | 2 | 1.0 | 0 | 2 | + | 3 | | 1 | 3 | + | 3 | 2.0 | 0 | 2 | + | 3 | 3.0 | 0 | 1 | + | 4 | | 1 | 3 | + | 4 | 3.0 | 0 | 1 | + | 4 | 4.0 | 0 | 2 | + +---+-----+---------------+----------+ + " + ); + } let metrics = merged_aggregate.metrics().unwrap(); let output_rows = metrics.output_rows().unwrap(); @@ -1697,30 +1749,33 @@ mod tests { let result = collect(partial_aggregate.execute(0, Arc::clone(&task_ctx))?).await?; - let expected = if spill { - vec![ - "+---+---------------+-------------+", - "| a | AVG(b)[count] | AVG(b)[sum] |", - "+---+---------------+-------------+", - "| 2 | 1 | 1.0 |", - "| 2 | 1 | 1.0 |", - "| 3 | 1 | 2.0 |", - "| 3 | 2 | 5.0 |", - "| 4 | 3 | 11.0 |", - "+---+---------------+-------------+", - ] + if spill { + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&result), @r" + +---+---------------+-------------+ + | a | AVG(b)[count] | AVG(b)[sum] | + +---+---------------+-------------+ + | 2 | 1 | 1.0 | + | 2 | 1 | 1.0 | + | 3 | 1 | 2.0 | + | 3 | 2 | 5.0 | + | 4 | 3 | 11.0 | + +---+---------------+-------------+ + "); + } } else { - vec![ - "+---+---------------+-------------+", - "| a | AVG(b)[count] | AVG(b)[sum] |", - "+---+---------------+-------------+", - "| 2 | 2 | 2.0 |", - "| 3 | 3 | 7.0 |", - "| 4 | 3 | 11.0 |", - "+---+---------------+-------------+", - ] + allow_duplicates! 
{ + assert_snapshot!(batches_to_sort_string(&result), @r" + +---+---------------+-------------+ + | a | AVG(b)[count] | AVG(b)[sum] | + +---+---------------+-------------+ + | 2 | 2 | 2.0 | + | 3 | 3 | 7.0 | + | 4 | 3 | 11.0 | + +---+---------------+-------------+ + "); + } }; - assert_batches_sorted_eq!(expected, &result); let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); @@ -1746,17 +1801,19 @@ mod tests { assert_eq!(batch.num_columns(), 2); assert_eq!(batch.num_rows(), 3); - let expected = vec![ - "+---+--------------------+", - "| a | AVG(b) |", - "+---+--------------------+", - "| 2 | 1.0 |", - "| 3 | 2.3333333333333335 |", // 3, (2 + 3 + 2) / 3 - "| 4 | 3.6666666666666665 |", // 4, (3 + 4 + 4) / 3 - "+---+--------------------+", - ]; - - assert_batches_sorted_eq!(&expected, &result); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&result), @r" + +---+--------------------+ + | a | AVG(b) | + +---+--------------------+ + | 2 | 1.0 | + | 3 | 2.3333333333333335 | + | 4 | 3.6666666666666665 | + +---+--------------------+ + "); + // For row 2: 3, (2 + 3 + 2) / 3 + // For row 3: 4, (3 + 4 + 4) / 3 + } let metrics = merged_aggregate.metrics().unwrap(); let output_rows = metrics.output_rows().unwrap(); @@ -2270,27 +2327,29 @@ mod tests { let result = crate::collect(aggregate_final, task_ctx).await?; if is_first_acc { - let expected = [ - "+---+--------------------------------------------+", - "| a | first_value(b) ORDER BY [b ASC NULLS LAST] |", - "+---+--------------------------------------------+", - "| 2 | 0.0 |", - "| 3 | 1.0 |", - "| 4 | 3.0 |", - "+---+--------------------------------------------+", - ]; - assert_batches_eq!(expected, &result); + allow_duplicates! { + assert_snapshot!(batches_to_string(&result), @r" + +---+--------------------------------------------+ + | a | first_value(b) ORDER BY [b ASC NULLS LAST] | + +---+--------------------------------------------+ + | 2 | 0.0 | + | 3 | 1.0 | + | 4 | 3.0 | + +---+--------------------------------------------+ + "); + } } else { - let expected = [ - "+---+-------------------------------------------+", - "| a | last_value(b) ORDER BY [b ASC NULLS LAST] |", - "+---+-------------------------------------------+", - "| 2 | 3.0 |", - "| 3 | 5.0 |", - "| 4 | 6.0 |", - "+---+-------------------------------------------+", - ]; - assert_batches_eq!(expected, &result); + allow_duplicates! { + assert_snapshot!(batches_to_string(&result), @r" + +---+-------------------------------------------+ + | a | last_value(b) ORDER BY [b ASC NULLS LAST] | + +---+-------------------------------------------+ + | 2 | 3.0 | + | 3 | 5.0 | + | 4 | 6.0 | + +---+-------------------------------------------+ + "); + } }; Ok(()) } @@ -2484,16 +2543,17 @@ mod tests { let output = collect(aggregate_exec.execute(0, Arc::new(TaskContext::default()))?).await?; - let expected = [ - "+-----+-----+-------+---------------+-------+", - "| a | b | const | __grouping_id | 1 |", - "+-----+-----+-------+---------------+-------+", - "| | | 1 | 6 | 32768 |", - "| | 0.0 | | 5 | 32768 |", - "| 0.0 | | | 3 | 32768 |", - "+-----+-----+-------+---------------+-------+", - ]; - assert_batches_sorted_eq!(expected, &output); + allow_duplicates! 
{ + assert_snapshot!(batches_to_sort_string(&output), @r" + +-----+-----+-------+---------------+-------+ + | a | b | const | __grouping_id | 1 | + +-----+-----+-------+---------------+-------+ + | | | 1 | 6 | 32768 | + | | 0.0 | | 5 | 32768 | + | 0.0 | | | 3 | 32768 | + +-----+-----+-------+---------------+-------+ + "); + } Ok(()) } @@ -2599,15 +2659,16 @@ mod tests { let ctx = TaskContext::default().with_session_config(session_config); let output = collect(aggregate_exec.execute(0, Arc::new(ctx))?).await?; - let expected = [ - "+--------------+------------+", - "| labels | SUM(value) |", - "+--------------+------------+", - "| {a: a, b: b} | 2 |", - "| {a: , b: c} | 1 |", - "+--------------+------------+", - ]; - assert_batches_eq!(expected, &output); + allow_duplicates! { + assert_snapshot!(batches_to_string(&output), @r" + +--------------+------------+ + | labels | SUM(value) | + +--------------+------------+ + | {a: a, b: b} | 2 | + | {a: , b: c} | 1 | + +--------------+------------+ + "); + } Ok(()) } @@ -2674,19 +2735,20 @@ mod tests { let ctx = TaskContext::default().with_session_config(session_config); let output = collect(aggregate_exec.execute(0, Arc::new(ctx))?).await?; - let expected = [ - "+-----+-------------------+", - "| key | COUNT(val)[count] |", - "+-----+-------------------+", - "| 1 | 1 |", - "| 2 | 1 |", - "| 3 | 1 |", - "| 2 | 1 |", - "| 3 | 1 |", - "| 4 | 1 |", - "+-----+-------------------+", - ]; - assert_batches_eq!(expected, &output); + allow_duplicates! { + assert_snapshot!(batches_to_string(&output), @r" + +-----+-------------------+ + | key | COUNT(val)[count] | + +-----+-------------------+ + | 1 | 1 | + | 2 | 1 | + | 3 | 1 | + | 2 | 1 | + | 3 | 1 | + | 4 | 1 | + +-----+-------------------+ + "); + } Ok(()) } @@ -2761,20 +2823,21 @@ mod tests { let ctx = TaskContext::default().with_session_config(session_config); let output = collect(aggregate_exec.execute(0, Arc::new(ctx))?).await?; - let expected = [ - "+-----+-------------------+", - "| key | COUNT(val)[count] |", - "+-----+-------------------+", - "| 1 | 1 |", - "| 2 | 2 |", - "| 3 | 2 |", - "| 4 | 1 |", - "| 2 | 1 |", - "| 3 | 1 |", - "| 4 | 1 |", - "+-----+-------------------+", - ]; - assert_batches_eq!(expected, &output); + allow_duplicates! { + assert_snapshot!(batches_to_string(&output), @r" + +-----+-------------------+ + | key | COUNT(val)[count] | + +-----+-------------------+ + | 1 | 1 | + | 2 | 2 | + | 3 | 2 | + | 4 | 1 | + | 2 | 1 | + | 3 | 1 | + | 4 | 1 | + +-----+-------------------+ + "); + } Ok(()) } @@ -2905,19 +2968,17 @@ mod tests { assert_spill_count_metric(expect_spill, single_aggregate); - #[rustfmt::skip] - assert_batches_sorted_eq!( - [ - "+---+--------+--------+", - "| a | MIN(b) | AVG(b) |", - "+---+--------+--------+", - "| 2 | 1.0 | 1.0 |", - "| 3 | 2.0 | 2.0 |", - "| 4 | 3.0 | 3.5 |", - "+---+--------+--------+", - ], - &result - ); + allow_duplicates! 
{ + assert_snapshot!(batches_to_string(&result), @r" + +---+--------+--------+ + | a | MIN(b) | AVG(b) | + +---+--------+--------+ + | 2 | 1.0 | 1.0 | + | 3 | 2.0 | 2.0 | + | 4 | 3.0 | 3.5 | + +---+--------+--------+ + "); + } Ok(()) } diff --git a/datafusion/physical-plan/src/aggregates/topk/hash_table.rs b/datafusion/physical-plan/src/aggregates/topk/hash_table.rs index c818b4608de7..ae44eb35e6d0 100644 --- a/datafusion/physical-plan/src/aggregates/topk/hash_table.rs +++ b/datafusion/physical-plan/src/aggregates/topk/hash_table.rs @@ -23,7 +23,7 @@ use ahash::RandomState; use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano}; use arrow::array::{ builder::PrimitiveBuilder, cast::AsArray, downcast_primitive, Array, ArrayRef, - ArrowPrimitiveType, PrimitiveArray, StringArray, + ArrowPrimitiveType, LargeStringArray, PrimitiveArray, StringArray, StringViewArray, }; use arrow::datatypes::{i256, DataType}; use datafusion_common::DataFusionError; @@ -88,6 +88,7 @@ pub struct StringHashTable { owned: ArrayRef, map: TopKHashTable>, rnd: RandomState, + data_type: DataType, } // An implementation of ArrowHashTable for any `ArrowPrimitiveType` key @@ -101,13 +102,20 @@ where } impl StringHashTable { - pub fn new(limit: usize) -> Self { + pub fn new(limit: usize, data_type: DataType) -> Self { let vals: Vec<&str> = Vec::new(); - let owned = Arc::new(StringArray::from(vals)); + let owned: ArrayRef = match data_type { + DataType::Utf8 => Arc::new(StringArray::from(vals)), + DataType::Utf8View => Arc::new(StringViewArray::from(vals)), + DataType::LargeUtf8 => Arc::new(LargeStringArray::from(vals)), + _ => panic!("Unsupported data type"), + }; + Self { owned, map: TopKHashTable::new(limit, limit * 10), rnd: RandomState::default(), + data_type, } } } @@ -131,7 +139,12 @@ impl ArrowHashTable for StringHashTable { unsafe fn take_all(&mut self, indexes: Vec) -> ArrayRef { let ids = self.map.take_all(indexes); - Arc::new(StringArray::from(ids)) + match self.data_type { + DataType::Utf8 => Arc::new(StringArray::from(ids)), + DataType::LargeUtf8 => Arc::new(LargeStringArray::from(ids)), + DataType::Utf8View => Arc::new(StringViewArray::from(ids)), + _ => unreachable!(), + } } unsafe fn find_or_insert( @@ -140,15 +153,44 @@ impl ArrowHashTable for StringHashTable { replace_idx: usize, mapper: &mut Vec<(usize, usize)>, ) -> (usize, bool) { - let ids = self - .owned - .as_any() - .downcast_ref::() - .expect("StringArray required"); - let id = if ids.is_null(row_idx) { - None - } else { - Some(ids.value(row_idx)) + let id = match self.data_type { + DataType::Utf8 => { + let ids = self + .owned + .as_any() + .downcast_ref::() + .expect("Expected StringArray for DataType::Utf8"); + if ids.is_null(row_idx) { + None + } else { + Some(ids.value(row_idx)) + } + } + DataType::LargeUtf8 => { + let ids = self + .owned + .as_any() + .downcast_ref::() + .expect("Expected LargeStringArray for DataType::LargeUtf8"); + if ids.is_null(row_idx) { + None + } else { + Some(ids.value(row_idx)) + } + } + DataType::Utf8View => { + let ids = self + .owned + .as_any() + .downcast_ref::() + .expect("Expected StringViewArray for DataType::Utf8View"); + if ids.is_null(row_idx) { + None + } else { + Some(ids.value(row_idx)) + } + } + _ => panic!("Unsupported data type"), }; let hash = self.rnd.hash_one(id); @@ -377,7 +419,9 @@ pub fn new_hash_table( downcast_primitive! 
{ kt => (downcast_helper, kt), - DataType::Utf8 => return Ok(Box::new(StringHashTable::new(limit))), + DataType::Utf8 => return Ok(Box::new(StringHashTable::new(limit, DataType::Utf8))), + DataType::LargeUtf8 => return Ok(Box::new(StringHashTable::new(limit, DataType::LargeUtf8))), + DataType::Utf8View => return Ok(Box::new(StringHashTable::new(limit, DataType::Utf8View))), _ => {} } diff --git a/datafusion/physical-plan/src/aggregates/topk/priority_map.rs b/datafusion/physical-plan/src/aggregates/topk/priority_map.rs index 3b954c4c72d3..a09d70f7471f 100644 --- a/datafusion/physical-plan/src/aggregates/topk/priority_map.rs +++ b/datafusion/physical-plan/src/aggregates/topk/priority_map.rs @@ -108,11 +108,68 @@ impl PriorityMap { #[cfg(test)] mod tests { use super::*; - use arrow::array::{Int64Array, RecordBatch, StringArray}; + use arrow::array::{ + Int64Array, LargeStringArray, RecordBatch, StringArray, StringViewArray, + }; use arrow::datatypes::{Field, Schema, SchemaRef}; use arrow::util::pretty::pretty_format_batches; + use insta::assert_snapshot; use std::sync::Arc; + #[test] + fn should_append_with_utf8view() -> Result<()> { + let ids: ArrayRef = Arc::new(StringViewArray::from(vec!["1"])); + let vals: ArrayRef = Arc::new(Int64Array::from(vec![1])); + let mut agg = PriorityMap::new(DataType::Utf8View, DataType::Int64, 1, false)?; + agg.set_batch(ids, vals); + agg.insert(0)?; + + let cols = agg.emit()?; + let batch = RecordBatch::try_new(test_schema_utf8view(), cols)?; + let batch_schema = batch.schema(); + assert_eq!(batch_schema.fields[0].data_type(), &DataType::Utf8View); + + let actual = format!("{}", pretty_format_batches(&[batch])?); + let expected = r#" ++----------+--------------+ +| trace_id | timestamp_ms | ++----------+--------------+ +| 1 | 1 | ++----------+--------------+ + "# + .trim(); + assert_eq!(actual, expected); + + Ok(()) + } + + #[test] + fn should_append_with_large_utf8() -> Result<()> { + let ids: ArrayRef = Arc::new(LargeStringArray::from(vec!["1"])); + let vals: ArrayRef = Arc::new(Int64Array::from(vec![1])); + let mut agg = PriorityMap::new(DataType::LargeUtf8, DataType::Int64, 1, false)?; + agg.set_batch(ids, vals); + agg.insert(0)?; + + let cols = agg.emit()?; + let batch = RecordBatch::try_new(test_large_schema(), cols)?; + let batch_schema = batch.schema(); + assert_eq!(batch_schema.fields[0].data_type(), &DataType::LargeUtf8); + + let actual = format!("{}", pretty_format_batches(&[batch])?); + let expected = r#" ++----------+--------------+ +| trace_id | timestamp_ms | ++----------+--------------+ +| 1 | 1 | ++----------+--------------+ + "# + .trim(); + assert_eq!(actual, expected); + + Ok(()) + } + #[test] fn should_append() -> Result<()> { let ids: ArrayRef = Arc::new(StringArray::from(vec!["1"])); @@ -124,15 +181,15 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 1 | 1 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -149,15 +206,15 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 1 | 1 | +----------+--------------+ 
"# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -174,15 +231,14 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 2 | 2 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -199,15 +255,14 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 1 | 1 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -224,15 +279,14 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 1 | 2 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -249,15 +303,14 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 1 | 1 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -274,15 +327,14 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 2 | 2 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -299,15 +351,14 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 1 | 1 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -324,15 +375,14 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ | 1 | 2 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -350,7 +400,7 @@ mod tests { let cols = agg.emit()?; let batch = RecordBatch::try_new(test_schema(), cols)?; let actual = format!("{}", pretty_format_batches(&[batch])?); - let expected = r#" + assert_snapshot!(actual, @r#" +----------+--------------+ | trace_id | timestamp_ms | +----------+--------------+ @@ -358,8 +408,7 @@ mod tests { | 1 | 1 | +----------+--------------+ "# - .trim(); - assert_eq!(actual, expected); + ); Ok(()) } @@ -370,4 +419,18 @@ mod tests { Field::new("timestamp_ms", DataType::Int64, true), ])) } + + fn test_schema_utf8view() -> SchemaRef { + Arc::new(Schema::new(vec![ + 
Field::new("trace_id", DataType::Utf8View, true), + Field::new("timestamp_ms", DataType::Int64, true), + ])) + } + + fn test_large_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("trace_id", DataType::LargeUtf8, true), + Field::new("timestamp_ms", DataType::Int64, true), + ])) + } } diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 0eb95bb66598..5244038b9ae2 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -123,8 +123,11 @@ impl DisplayAs for CoalesceBatchesExec { Ok(()) } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + writeln!(f, "target_batch_size={}", self.target_batch_size)?; + if let Some(fetch) = self.fetch { + write!(f, "limit={fetch}")?; + }; + Ok(()) } } } diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index 8fb40640dcc0..95a0c8f6ce83 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -92,10 +92,12 @@ impl DisplayAs for CoalescePartitionsExec { } None => write!(f, "CoalescePartitionsExec"), }, - DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") - } + DisplayFormatType::TreeRender => match self.fetch { + Some(fetch) => { + write!(f, "limit: {fetch}") + } + None => write!(f, ""), + }, } } } diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index 564f7ac45928..f437295a3555 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -19,13 +19,12 @@ //! [`crate::displayable`] for examples of how to format use std::collections::{BTreeMap, HashMap}; +use std::fmt; use std::fmt::Formatter; -use std::{fmt, str::FromStr}; use arrow::datatypes::SchemaRef; use datafusion_common::display::{GraphvizBuilder, PlanType, StringifiedPlan}; -use datafusion_common::DataFusionError; use datafusion_expr::display_schema; use datafusion_physical_expr::LexOrdering; @@ -39,7 +38,7 @@ pub enum DisplayFormatType { /// Default, compact format. Example: `FilterExec: c12 < 10.0` /// /// This format is designed to provide a detailed textual description - /// of all rele + /// of all parts of the plan. Default, /// Verbose, showing all available details. /// @@ -79,21 +78,6 @@ pub enum DisplayFormatType { TreeRender, } -impl FromStr for DisplayFormatType { - type Err = DataFusionError; - - fn from_str(s: &str) -> Result { - match s.to_lowercase().as_str() { - "indent" => Ok(Self::Default), - "tree" => Ok(Self::TreeRender), - _ => Err(DataFusionError::Configuration(format!( - "Invalid explain format: {}", - s - ))), - } - } -} - /// Wraps an `ExecutionPlan` with various methods for formatting /// /// @@ -280,6 +264,9 @@ impl<'a> DisplayableExecutionPlan<'a> { } } + /// Formats the plan using a ASCII art like tree + /// + /// See [`DisplayFormatType::TreeRender`] for more details. 
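// A minimal sketch, not taken from the patch, of what a filled-in TreeRender
// arm looks like once the "TODO: collect info" placeholders are replaced:
// one short `key=value` line per detail, matching the CoalesceBatchesExec
// and CoalescePartitionsExec changes above. `MyScanExec` and its fields are
// hypothetical.
use std::fmt;

use datafusion_physical_plan::{DisplayAs, DisplayFormatType};

struct MyScanExec {
    path: String,
    limit: Option<usize>,
}

impl DisplayAs for MyScanExec {
    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
        match t {
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
                write!(f, "MyScanExec: path={}", self.path)
            }
            DisplayFormatType::TreeRender => {
                // TreeRender lines are drawn inside the ASCII tree box, so
                // keep them short and newline-separated.
                writeln!(f, "path={}", self.path)?;
                if let Some(limit) = self.limit {
                    writeln!(f, "limit={limit}")?;
                }
                Ok(())
            }
        }
    }
}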
pub fn tree_render(&self) -> impl fmt::Display + 'a { struct Wrapper<'a> { plan: &'a dyn ExecutionPlan, @@ -326,7 +313,7 @@ impl<'a> DisplayableExecutionPlan<'a> { } } - /// format as a `StringifiedPlan` + #[deprecated(since = "47.0.0", note = "indent() or tree_render() instead")] pub fn to_stringified( &self, verbose: bool, diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index 851e504b69af..2bc5706ee0e1 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -27,7 +27,7 @@ pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream}; pub use datafusion_expr::{Accumulator, ColumnarValue}; pub use datafusion_physical_expr::window::WindowExpr; pub use datafusion_physical_expr::{ - expressions, udf, Distribution, Partitioning, PhysicalExpr, + expressions, Distribution, Partitioning, PhysicalExpr, }; use std::any::Any; @@ -46,12 +46,12 @@ use arrow::array::{Array, RecordBatch}; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::{exec_err, Constraints, Result}; +use datafusion_common_runtime::JoinSet; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::stream::{StreamExt, TryStreamExt}; -use tokio::task::JoinSet; /// Represent nodes in the DataFusion Physical Plan. /// @@ -260,13 +260,32 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// used. /// Thus, [`spawn`] is disallowed, and instead use [`SpawnedTask`]. /// + /// To enable timely cancellation, the [`Stream`] that is returned must not + /// block the CPU indefinitely and must yield back to the tokio runtime regularly. + /// In a typical [`ExecutionPlan`], this automatically happens unless there are + /// special circumstances; e.g. when the computational complexity of processing a + /// batch is superlinear. See this [general guideline][async-guideline] for more context + /// on this point, which explains why one should avoid spending a long time without + /// reaching an `await`/yield point in asynchronous runtimes. + /// This can be achieved by manually returning [`Poll::Pending`] and setting up wakers + /// appropriately, or the use of [`tokio::task::yield_now()`] when appropriate. + /// In special cases that warrant manual yielding, determination for "regularly" may be + /// made using a timer (being careful with the overhead-heavy system call needed to + /// take the time), or by counting rows or batches. + /// + /// The [cancellation benchmark] tracks some cases of how quickly queries can + /// be cancelled. + /// /// For more details see [`SpawnedTask`], [`JoinSet`] and [`RecordBatchReceiverStreamBuilder`] /// for structures to help ensure all background tasks are cancelled. 
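// A minimal sketch of the "yield regularly" advice above, not part of
// DataFusion itself: a stream adapter that hands control back to the tokio
// runtime after a fixed budget of items by waking itself and returning
// `Poll::Pending`. The adapter and its field names are illustrative only.
use std::pin::Pin;
use std::task::{Context, Poll};

use futures::Stream;

struct YieldEvery<S> {
    inner: S,
    budget: usize,
    produced: usize,
}

impl<S: Stream + Unpin> Stream for YieldEvery<S> {
    type Item = S::Item;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<S::Item>> {
        if self.produced >= self.budget {
            // Give the runtime a chance to schedule other tasks (and to
            // observe cancellation) before producing more output.
            self.produced = 0;
            cx.waker().wake_by_ref();
            return Poll::Pending;
        }
        let this = self.get_mut();
        match Pin::new(&mut this.inner).poll_next(cx) {
            Poll::Ready(Some(item)) => {
                this.produced += 1;
                Poll::Ready(Some(item))
            }
            other => other,
        }
    }
}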
/// /// [`spawn`]: tokio::task::spawn - /// [`JoinSet`]: tokio::task::JoinSet + /// [cancellation benchmark]: https://github.com/apache/datafusion/blob/main/benchmarks/README.md#cancellation + /// [`JoinSet`]: datafusion_common_runtime::JoinSet /// [`SpawnedTask`]: datafusion_common_runtime::SpawnedTask /// [`RecordBatchReceiverStreamBuilder`]: crate::stream::RecordBatchReceiverStreamBuilder + /// [`Poll::Pending`]: std::task::Poll::Pending + /// [async-guideline]: https://ryhl.io/blog/async-what-is-blocking/ /// /// # Implementation Examples /// diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index ffcda1d888b0..a8a9973ea043 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -54,6 +54,7 @@ use datafusion_physical_expr::{ ExprBoundaries, PhysicalExpr, }; +use datafusion_physical_expr_common::physical_expr::fmt_sql; use futures::stream::{Stream, StreamExt}; use log::trace; @@ -330,7 +331,7 @@ impl DisplayAs for FilterExec { write!(f, "FilterExec: {}{}", self.predicate, display_projections) } DisplayFormatType::TreeRender => { - write!(f, "predicate={}", self.predicate) + write!(f, "predicate={}", fmt_sql(self.predicate.as_ref())) } } } diff --git a/datafusion/physical-plan/src/insert.rs b/datafusion/physical-plan/src/insert.rs index ff65f6154b4a..5272f0ab1867 100644 --- a/datafusion/physical-plan/src/insert.rs +++ b/datafusion/physical-plan/src/insert.rs @@ -153,10 +153,7 @@ impl DisplayAs for DataSinkExec { write!(f, "DataSinkExec: sink=")?; self.sink.fmt_as(t, f) } - DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") - } + DisplayFormatType::TreeRender => self.sink().fmt_as(t, f), } } } diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 35c8961065a5..639fae7615af 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -645,8 +645,9 @@ mod tests { use crate::common; use crate::test::build_table_scan_i32; - use datafusion_common::{assert_batches_sorted_eq, assert_contains}; + use datafusion_common::{assert_contains, test_util::batches_to_sort_string}; use datafusion_execution::runtime_env::RuntimeEnvBuilder; + use insta::assert_snapshot; async fn join_collect( left: Arc, @@ -829,20 +830,19 @@ mod tests { let (columns, batches) = join_collect(left, right, task_ctx).await?; assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 12 | 14 |", - "| 1 | 4 | 7 | 11 | 13 | 15 |", - "| 2 | 5 | 8 | 10 | 12 | 14 |", - "| 2 | 5 | 8 | 11 | 13 | 15 |", - "| 3 | 6 | 9 | 10 | 12 | 14 |", - "| 3 | 6 | 9 | 11 | 13 | 15 |", - "+----+----+----+----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &batches); + + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b2 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 12 | 14 | + | 1 | 4 | 7 | 11 | 13 | 15 | + | 2 | 5 | 8 | 10 | 12 | 14 | + | 2 | 5 | 8 | 11 | 13 | 15 | + | 3 | 6 | 9 | 10 | 12 | 14 | + | 3 | 6 | 9 | 11 | 13 | 15 | + +----+----+----+----+----+----+ + "#); Ok(()) } diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 39a15037260d..376c3590b88f 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs 
+++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -82,6 +82,7 @@ use datafusion_physical_expr::PhysicalExprRef; use datafusion_physical_expr_common::datum::compare_op_for_nested; use ahash::RandomState; +use datafusion_physical_expr_common::physical_expr::fmt_sql; use futures::{ready, Stream, StreamExt, TryStreamExt}; use parking_lot::Mutex; @@ -672,7 +673,9 @@ impl DisplayAs for HashJoinExec { let on = self .on .iter() - .map(|(c1, c2)| format!("({} = {})", c1, c2)) + .map(|(c1, c2)| { + format!("({} = {})", fmt_sql(c1.as_ref()), fmt_sql(c2.as_ref())) + }) .collect::>() .join(", "); @@ -1660,6 +1663,7 @@ mod tests { use arrow::array::{Date32Array, Int32Array, StructArray}; use arrow::buffer::NullBuffer; use arrow::datatypes::{DataType, Field}; + use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; use datafusion_common::{ assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_err, ScalarValue, @@ -1670,6 +1674,7 @@ mod tests { use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; use datafusion_physical_expr::PhysicalExpr; use hashbrown::HashTable; + use insta::{allow_duplicates, assert_snapshot}; use rstest::*; use rstest_reuse::*; @@ -1880,18 +1885,18 @@ mod tests { assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 5 | 9 | 20 | 5 | 80 |", - "+----+----+----+----+----+----+", - ]; - - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + // Inner join output is expected to preserve both inputs order + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 5 | 9 | 20 | 5 | 80 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -1927,16 +1932,17 @@ mod tests { assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 5 | 9 | 20 | 5 | 80 |", - "+----+----+----+----+----+----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 5 | 9 | 20 | 5 | 80 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -1964,18 +1970,18 @@ mod tests { assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 5 | 9 | 20 | 5 | 80 |", - "+----+----+----+----+----+----+", - ]; - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! 
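// A small, self-contained illustration (not from the patch) of why several
// display arms above switch from `{}` on a `PhysicalExpr` to `fmt_sql`: the
// former prints the internal expression form, the latter a SQL-like form
// that reads better in the rendered plan. Column name and literal value are
// made up for the example.
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::ScalarValue;
use datafusion_expr::Operator;
use datafusion_physical_expr::expressions::{col, BinaryExpr, Literal};
use datafusion_physical_expr::PhysicalExpr;
use datafusion_physical_expr_common::physical_expr::fmt_sql;

fn main() -> datafusion_common::Result<()> {
    let schema = Schema::new(vec![Field::new("c12", DataType::Float64, false)]);
    let predicate: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
        col("c12", &schema)?,
        Operator::Lt,
        Arc::new(Literal::new(ScalarValue::from(10.0_f64))),
    ));
    println!("display: {predicate}");
    println!("sql:     {}", fmt_sql(predicate.as_ref()));
    Ok(())
}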
{ + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b2 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 5 | 9 | 20 | 5 | 80 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2003,19 +2009,19 @@ mod tests { assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| 3 | 5 | 9 | 20 | 5 | 80 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 0 | 4 | 6 | 10 | 4 | 70 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "+----+----+----+----+----+----+", - ]; - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b2 | c2 | + +----+----+----+----+----+----+ + | 3 | 5 | 9 | 20 | 5 | 80 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 0 | 4 | 6 | 10 | 4 | 70 | + | 1 | 4 | 7 | 10 | 4 | 70 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2066,18 +2072,18 @@ mod tests { assert_eq!(batches.len(), expected_batch_count); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b2 | c1 | a1 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 1 | 7 | 1 | 1 | 70 |", - "| 2 | 2 | 8 | 2 | 2 | 80 |", - "| 2 | 2 | 9 | 2 | 2 | 80 |", - "+----+----+----+----+----+----+", - ]; - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b2 | c1 | a1 | b2 | c2 | + +----+----+----+----+----+----+ + | 1 | 1 | 7 | 1 | 1 | 70 | + | 2 | 2 | 8 | 2 | 2 | 80 | + | 2 | 2 | 9 | 2 | 2 | 80 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2136,18 +2142,18 @@ mod tests { assert_eq!(batches.len(), expected_batch_count); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b2 | c1 | a1 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 1 | 7 | 1 | 1 | 70 |", - "| 2 | 2 | 8 | 2 | 2 | 80 |", - "| 2 | 2 | 9 | 2 | 2 | 80 |", - "+----+----+----+----+----+----+", - ]; - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b2 | c1 | a1 | b2 | c2 | + +----+----+----+----+----+----+ + | 1 | 1 | 7 | 1 | 1 | 70 | + | 2 | 2 | 8 | 2 | 2 | 80 | + | 2 | 2 | 9 | 2 | 2 | 80 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2185,19 +2191,19 @@ mod tests { assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| 3 | 5 | 9 | 20 | 5 | 80 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 0 | 4 | 6 | 10 | 4 | 70 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "+----+----+----+----+----+----+", - ]; - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! 
{ + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b2 | c2 | + +----+----+----+----+----+----+ + | 3 | 5 | 9 | 20 | 5 | 80 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 0 | 4 | 6 | 10 | 4 | 70 | + | 1 | 4 | 7 | 10 | 4 | 70 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2254,16 +2260,16 @@ mod tests { }; assert_eq!(batches.len(), expected_batch_count); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "+----+----+----+----+----+----+", - ]; - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 4 | 70 | + +----+----+----+----+----+----+ + "#); + } // second part let stream = join.execute(1, Arc::clone(&task_ctx))?; @@ -2279,17 +2285,17 @@ mod tests { }; assert_eq!(batches.len(), expected_batch_count); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 2 | 5 | 8 | 30 | 5 | 90 |", - "| 3 | 5 | 9 | 30 | 5 | 90 |", - "+----+----+----+----+----+----+", - ]; - // Inner join output is expected to preserve both inputs order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 2 | 5 | 8 | 30 | 5 | 90 | + | 3 | 5 | 9 | 30 | 5 | 90 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2331,19 +2337,19 @@ mod tests { let stream = join.execute(0, task_ctx).unwrap(); let batches = common::collect(stream).await.unwrap(); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 7 | 9 | | | |", - "+----+----+----+----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 4 | 70 | + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 7 | 9 | | | | + +----+----+----+----+----+----+ + "#); + } } #[apply(batch_sizes)] @@ -2374,21 +2380,21 @@ mod tests { let stream = join.execute(0, task_ctx).unwrap(); let batches = common::collect(stream).await.unwrap(); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| | | | 30 | 6 | 90 |", - "| | | | 30 | 6 | 90 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 7 | 9 | | | |", - "+----+----+----+----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! 
{ + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b2 | c2 | + +----+----+----+----+----+----+ + | | | | 30 | 6 | 90 | + | | | | 30 | 6 | 90 | + | 1 | 4 | 7 | 10 | 4 | 70 | + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 7 | 9 | | | | + +----+----+----+----+----+----+ + "#); + } } #[apply(batch_sizes)] @@ -2415,17 +2421,17 @@ mod tests { let stream = join.execute(0, task_ctx).unwrap(); let batches = common::collect(stream).await.unwrap(); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | | | |", - "| 2 | 5 | 8 | | | |", - "| 3 | 7 | 9 | | | |", - "+----+----+----+----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | | | | + | 2 | 5 | 8 | | | | + | 3 | 7 | 9 | | | | + +----+----+----+----+----+----+ + "#); + } } #[apply(batch_sizes)] @@ -2452,17 +2458,17 @@ mod tests { let stream = join.execute(0, task_ctx).unwrap(); let batches = common::collect(stream).await.unwrap(); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | | | |", - "| 2 | 5 | 8 | | | |", - "| 3 | 7 | 9 | | | |", - "+----+----+----+----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b2 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | | | | + | 2 | 5 | 8 | | | | + | 3 | 7 | 9 | | | | + +----+----+----+----+----+----+ + "#); + } } #[apply(batch_sizes)] @@ -2495,16 +2501,17 @@ mod tests { .await?; assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 7 | 9 | | | |", - "+----+----+----+----+----+----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 7 | 9 | | | | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2539,16 +2546,17 @@ mod tests { .await?; assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 7 | 9 | | | |", - "+----+----+----+----+----+----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! 
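// The snapshot pattern used throughout these migrated tests, restated as a
// stand-alone sketch (the table contents here are invented): pretty-print
// the batches with the helpers from `datafusion_common::test_util`, then
// compare against an inline insta snapshot. `allow_duplicates!` is needed
// because `#[apply(batch_sizes)]` re-runs the same assertion for several
// batch sizes, and an inline snapshot may otherwise only be asserted once
// per run.
use arrow::record_batch::RecordBatch;
use datafusion_common::test_util::batches_to_sort_string;
use insta::{allow_duplicates, assert_snapshot};

fn assert_join_output(batches: &[RecordBatch]) {
    allow_duplicates! {
        assert_snapshot!(batches_to_sort_string(batches), @r#"
        +----+----+
        | a1 | b1 |
        +----+----+
        | 1  | 4  |
        +----+----+
        "#);
    }
}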
{ + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 7 | 9 | | | | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -2594,16 +2602,17 @@ mod tests { let batches = common::collect(stream).await?; // ignore the order - let expected = [ - "+----+----+-----+", - "| a1 | b1 | c1 |", - "+----+----+-----+", - "| 11 | 8 | 110 |", - "| 13 | 10 | 130 |", - "| 9 | 8 | 90 |", - "+----+----+-----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+-----+ + | a1 | b1 | c1 | + +----+----+-----+ + | 11 | 8 | 110 | + | 13 | 10 | 130 | + | 9 | 8 | 90 | + +----+----+-----+ + "#); + } Ok(()) } @@ -2655,16 +2664,17 @@ mod tests { let stream = join.execute(0, Arc::clone(&task_ctx))?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a1 | b1 | c1 |", - "+----+----+-----+", - "| 11 | 8 | 110 |", - "| 13 | 10 | 130 |", - "| 9 | 8 | 90 |", - "+----+----+-----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+----+-----+ + | a1 | b1 | c1 | + +----+----+-----+ + | 11 | 8 | 110 | + | 13 | 10 | 130 | + | 9 | 8 | 90 | + +----+----+-----+ + "); + } // left_table left semi join right_table on left_table.b1 = right_table.b2 and right_table.a2 > 10 let filter_expression = Arc::new(BinaryExpr::new( @@ -2686,14 +2696,15 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a1 | b1 | c1 |", - "+----+----+-----+", - "| 13 | 10 | 130 |", - "+----+----+-----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+-----+ + | a1 | b1 | c1 | + +----+----+-----+ + | 13 | 10 | 130 | + +----+----+-----+ + "#); + } Ok(()) } @@ -2719,18 +2730,18 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a2 | b2 | c2 |", - "+----+----+-----+", - "| 8 | 8 | 20 |", - "| 12 | 10 | 40 |", - "| 10 | 10 | 100 |", - "+----+----+-----+", - ]; - // RightSemi join output is expected to preserve right input order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+-----+ + | a2 | b2 | c2 | + +----+----+-----+ + | 8 | 8 | 20 | + | 12 | 10 | 40 | + | 10 | 10 | 100 | + +----+----+-----+ + "#); + } Ok(()) } @@ -2782,18 +2793,18 @@ mod tests { let stream = join.execute(0, Arc::clone(&task_ctx))?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a2 | b2 | c2 |", - "+----+----+-----+", - "| 8 | 8 | 20 |", - "| 12 | 10 | 40 |", - "| 10 | 10 | 100 |", - "+----+----+-----+", - ]; - // RightSemi join output is expected to preserve right input order - assert_batches_eq!(expected, &batches); + allow_duplicates! 
{ + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+-----+ + | a2 | b2 | c2 | + +----+----+-----+ + | 8 | 8 | 20 | + | 12 | 10 | 40 | + | 10 | 10 | 100 | + +----+----+-----+ + "#); + } // left_table right semi join right_table on left_table.b1 = right_table.b2 on left_table.a1!=9 let filter_expression = Arc::new(BinaryExpr::new( @@ -2813,17 +2824,17 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a2 | b2 | c2 |", - "+----+----+-----+", - "| 12 | 10 | 40 |", - "| 10 | 10 | 100 |", - "+----+----+-----+", - ]; - // RightSemi join output is expected to preserve right input order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+-----+ + | a2 | b2 | c2 | + +----+----+-----+ + | 12 | 10 | 40 | + | 10 | 10 | 100 | + +----+----+-----+ + "#); + } Ok(()) } @@ -2848,17 +2859,18 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+----+", - "| a1 | b1 | c1 |", - "+----+----+----+", - "| 1 | 1 | 10 |", - "| 3 | 3 | 30 |", - "| 5 | 5 | 50 |", - "| 7 | 7 | 70 |", - "+----+----+----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+ + | a1 | b1 | c1 | + +----+----+----+ + | 1 | 1 | 10 | + | 3 | 3 | 30 | + | 5 | 5 | 50 | + | 7 | 7 | 70 | + +----+----+----+ + "#); + } Ok(()) } @@ -2907,19 +2919,20 @@ mod tests { let stream = join.execute(0, Arc::clone(&task_ctx))?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a1 | b1 | c1 |", - "+----+----+-----+", - "| 1 | 1 | 10 |", - "| 11 | 8 | 110 |", - "| 3 | 3 | 30 |", - "| 5 | 5 | 50 |", - "| 7 | 7 | 70 |", - "| 9 | 8 | 90 |", - "+----+----+-----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+-----+ + | a1 | b1 | c1 | + +----+----+-----+ + | 1 | 1 | 10 | + | 11 | 8 | 110 | + | 3 | 3 | 30 | + | 5 | 5 | 50 | + | 7 | 7 | 70 | + | 9 | 8 | 90 | + +----+----+-----+ + "#); + } // left_table left anti join right_table on left_table.b1 = right_table.b2 and right_table.a2 != 13 let filter_expression = Arc::new(BinaryExpr::new( @@ -2942,19 +2955,20 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a1 | b1 | c1 |", - "+----+----+-----+", - "| 1 | 1 | 10 |", - "| 11 | 8 | 110 |", - "| 3 | 3 | 30 |", - "| 5 | 5 | 50 |", - "| 7 | 7 | 70 |", - "| 9 | 8 | 90 |", - "+----+----+-----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+-----+ + | a1 | b1 | c1 | + +----+----+-----+ + | 1 | 1 | 10 | + | 11 | 8 | 110 | + | 3 | 3 | 30 | + | 5 | 5 | 50 | + | 7 | 7 | 70 | + | 9 | 8 | 90 | + +----+----+-----+ + "#); + } Ok(()) } @@ -2978,18 +2992,18 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a2 | b2 | c2 |", - "+----+----+-----+", - "| 6 | 6 | 60 |", - "| 2 | 2 | 80 |", - "| 4 | 4 | 120 |", - "+----+----+-----+", - ]; - // RightAnti join output is expected to preserve right input order - assert_batches_eq!(expected, &batches); + allow_duplicates! 
{ + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+-----+ + | a2 | b2 | c2 | + +----+----+-----+ + | 6 | 6 | 60 | + | 2 | 2 | 80 | + | 4 | 4 | 120 | + +----+----+-----+ + "#); + } Ok(()) } @@ -3039,20 +3053,20 @@ mod tests { let stream = join.execute(0, Arc::clone(&task_ctx))?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a2 | b2 | c2 |", - "+----+----+-----+", - "| 12 | 10 | 40 |", - "| 6 | 6 | 60 |", - "| 2 | 2 | 80 |", - "| 10 | 10 | 100 |", - "| 4 | 4 | 120 |", - "+----+----+-----+", - ]; - // RightAnti join output is expected to preserve right input order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+-----+ + | a2 | b2 | c2 | + +----+----+-----+ + | 12 | 10 | 40 | + | 6 | 6 | 60 | + | 2 | 2 | 80 | + | 10 | 10 | 100 | + | 4 | 4 | 120 | + +----+----+-----+ + "#); + } // left_table right anti join right_table on left_table.b1 = right_table.b2 and right_table.b2!=8 let column_indices = vec![ColumnIndex { @@ -3080,19 +3094,19 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+-----+", - "| a2 | b2 | c2 |", - "+----+----+-----+", - "| 8 | 8 | 20 |", - "| 6 | 6 | 60 |", - "| 2 | 2 | 80 |", - "| 4 | 4 | 120 |", - "+----+----+-----+", - ]; - // RightAnti join output is expected to preserve right input order - assert_batches_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_string(&batches), @r#" + +----+----+-----+ + | a2 | b2 | c2 | + +----+----+-----+ + | 8 | 8 | 20 | + | 6 | 6 | 60 | + | 2 | 2 | 80 | + | 4 | 4 | 120 | + +----+----+-----+ + "#); + } Ok(()) } @@ -3121,17 +3135,17 @@ mod tests { assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| | | | 30 | 6 | 90 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "+----+----+----+----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | | | | 30 | 6 | 90 | + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -3161,17 +3175,17 @@ mod tests { assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+----+", - "| | | | 30 | 6 | 90 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "+----+----+----+----+----+----+", - ]; - - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! 
{ + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b1 | c2 | + +----+----+----+----+----+----+ + | | | | 30 | 6 | 90 | + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -3203,17 +3217,18 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+----+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | b2 | c2 |", - "+----+----+----+----+----+----+", - "| | | | 30 | 6 | 90 |", - "| 1 | 4 | 7 | 10 | 4 | 70 |", - "| 2 | 5 | 8 | 20 | 5 | 80 |", - "| 3 | 7 | 9 | | | |", - "+----+----+----+----+----+----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+----+----+----+ + | a1 | b1 | c1 | a2 | b2 | c2 | + +----+----+----+----+----+----+ + | | | | 30 | 6 | 90 | + | 1 | 4 | 7 | 10 | 4 | 70 | + | 2 | 5 | 8 | 20 | 5 | 80 | + | 3 | 7 | 9 | | | | + +----+----+----+----+----+----+ + "#); + } Ok(()) } @@ -3248,16 +3263,17 @@ mod tests { .await?; assert_eq!(columns, vec!["a1", "b1", "c1", "mark"]); - let expected = [ - "+----+----+----+-------+", - "| a1 | b1 | c1 | mark |", - "+----+----+----+-------+", - "| 1 | 4 | 7 | true |", - "| 2 | 5 | 8 | true |", - "| 3 | 7 | 9 | false |", - "+----+----+----+-------+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+-------+ + | a1 | b1 | c1 | mark | + +----+----+----+-------+ + | 1 | 4 | 7 | true | + | 2 | 5 | 8 | true | + | 3 | 7 | 9 | false | + +----+----+----+-------+ + "#); + } Ok(()) } @@ -3292,16 +3308,17 @@ mod tests { .await?; assert_eq!(columns, vec!["a1", "b1", "c1", "mark"]); - let expected = [ - "+----+----+----+-------+", - "| a1 | b1 | c1 | mark |", - "+----+----+----+-------+", - "| 1 | 4 | 7 | true |", - "| 2 | 5 | 8 | true |", - "| 3 | 7 | 9 | false |", - "+----+----+----+-------+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +----+----+----+-------+ + | a1 | b1 | c1 | mark | + +----+----+----+-------+ + | 1 | 4 | 7 | true | + | 2 | 5 | 8 | true | + | 3 | 7 | 9 | false | + +----+----+----+-------+ + "#); + } Ok(()) } @@ -3398,15 +3415,16 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+---+---+---+----+---+----+", - "| a | b | c | a | b | c |", - "+---+---+---+----+---+----+", - "| 1 | 4 | 7 | 10 | 1 | 70 |", - "| 2 | 5 | 8 | 20 | 2 | 80 |", - "+---+---+---+----+---+----+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +---+---+---+----+---+----+ + | a | b | c | a | b | c | + +---+---+---+----+---+----+ + | 1 | 4 | 7 | 10 | 1 | 70 | + | 2 | 5 | 8 | 20 | 2 | 80 | + +---+---+---+----+---+----+ + "#); + } Ok(()) } @@ -3467,15 +3485,16 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+---+---+---+----+---+---+", - "| a | b | c | a | b | c |", - "+---+---+---+----+---+---+", - "| 2 | 7 | 9 | 10 | 2 | 7 |", - "| 2 | 7 | 9 | 20 | 2 | 5 |", - "+---+---+---+----+---+---+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! 
{ + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +---+---+---+----+---+---+ + | a | b | c | a | b | c | + +---+---+---+----+---+---+ + | 2 | 7 | 9 | 10 | 2 | 7 | + | 2 | 7 | 9 | 20 | 2 | 5 | + +---+---+---+----+---+---+ + "#); + } Ok(()) } @@ -3508,18 +3527,19 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+---+---+---+----+---+---+", - "| a | b | c | a | b | c |", - "+---+---+---+----+---+---+", - "| 0 | 4 | 7 | | | |", - "| 1 | 5 | 8 | | | |", - "| 2 | 7 | 9 | 10 | 2 | 7 |", - "| 2 | 7 | 9 | 20 | 2 | 5 |", - "| 2 | 8 | 1 | | | |", - "+---+---+---+----+---+---+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +---+---+---+----+---+---+ + | a | b | c | a | b | c | + +---+---+---+----+---+---+ + | 0 | 4 | 7 | | | | + | 1 | 5 | 8 | | | | + | 2 | 7 | 9 | 10 | 2 | 7 | + | 2 | 7 | 9 | 20 | 2 | 5 | + | 2 | 8 | 1 | | | | + +---+---+---+----+---+---+ + "#); + } Ok(()) } @@ -3552,17 +3572,18 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+---+---+---+----+---+---+", - "| a | b | c | a | b | c |", - "+---+---+---+----+---+---+", - "| | | | 30 | 3 | 6 |", - "| | | | 40 | 4 | 4 |", - "| 2 | 7 | 9 | 10 | 2 | 7 |", - "| 2 | 7 | 9 | 20 | 2 | 5 |", - "+---+---+---+----+---+---+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +---+---+---+----+---+---+ + | a | b | c | a | b | c | + +---+---+---+----+---+---+ + | | | | 30 | 3 | 6 | + | | | | 40 | 4 | 4 | + | 2 | 7 | 9 | 10 | 2 | 7 | + | 2 | 7 | 9 | 20 | 2 | 5 | + +---+---+---+----+---+---+ + "#); + } Ok(()) } @@ -3610,6 +3631,23 @@ mod tests { ]; assert_batches_sorted_eq!(expected, &batches); + // THIS MIGRATION HAULTED DUE TO ISSUE #15312 + //allow_duplicates! { + // assert_snapshot!(batches_to_sort_string(&batches), @r#" + // +---+---+---+----+---+---+ + // | a | b | c | a | b | c | + // +---+---+---+----+---+---+ + // | | | | 30 | 3 | 6 | + // | | | | 40 | 4 | 4 | + // | 2 | 7 | 9 | 10 | 2 | 7 | + // | 2 | 7 | 9 | 20 | 2 | 5 | + // | 0 | 4 | 7 | | | | + // | 1 | 5 | 8 | | | | + // | 2 | 8 | 1 | | | | + // +---+---+---+----+---+---+ + // "#) + //} + Ok(()) } @@ -3765,16 +3803,17 @@ mod tests { let stream = join.execute(0, task_ctx)?; let batches = common::collect(stream).await?; - let expected = [ - "+------------+---+------------+---+", - "| date | n | date | n |", - "+------------+---+------------+---+", - "| 2022-04-26 | 2 | 2022-04-26 | 4 |", - "| 2022-04-26 | 2 | 2022-04-26 | 5 |", - "| 2022-04-27 | 3 | 2022-04-27 | 6 |", - "+------------+---+------------+---+", - ]; - assert_batches_sorted_eq!(expected, &batches); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r#" + +------------+---+------------+---+ + | date | n | date | n | + +------------+---+------------+---+ + | 2022-04-26 | 2 | 2022-04-26 | 4 | + | 2022-04-26 | 2 | 2022-04-26 | 5 | + | 2022-04-27 | 3 | 2022-04-27 | 6 | + +------------+---+------------+---+ + "#); + } Ok(()) } @@ -4159,16 +4198,17 @@ mod tests { assert_eq!(columns, vec!["n1", "n2"]); - let expected = [ - "+--------+--------+", - "| n1 | n2 |", - "+--------+--------+", - "| {a: } | {a: } |", - "| {a: 1} | {a: 1} |", - "| {a: 2} | {a: 2} |", - "+--------+--------+", - ]; - assert_batches_eq!(expected, &batches); + allow_duplicates! 
{ + assert_snapshot!(batches_to_string(&batches), @r#" + +--------+--------+ + | n1 | n2 | + +--------+--------+ + | {a: } | {a: } | + | {a: 1} | {a: 1} | + | {a: 2} | {a: 2} | + +--------+--------+ + "#); + } Ok(()) } @@ -4195,14 +4235,15 @@ mod tests { ) .await?; - let expected_null_eq = [ - "+----+----+", - "| n1 | n2 |", - "+----+----+", - "| | |", - "+----+----+", - ]; - assert_batches_eq!(expected_null_eq, &batches_null_eq); + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches_null_eq), @r#" + +----+----+ + | n1 | n2 | + +----+----+ + | | | + +----+----+ + "#); + } let (_, batches_null_neq) = join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?; diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index f6fa8878e033..88d3ea9e7e1e 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -75,7 +75,9 @@ struct JoinLeftData { probe_threads_counter: AtomicUsize, /// Memory reservation for tracking batch and bitmap /// Cleared on `JoinLeftData` drop - _reservation: MemoryReservation, + /// reservation is cleared on Drop + #[expect(dead_code)] + reservation: MemoryReservation, } impl JoinLeftData { @@ -83,13 +85,13 @@ impl JoinLeftData { batch: RecordBatch, bitmap: SharedBitmapBuilder, probe_threads_counter: AtomicUsize, - _reservation: MemoryReservation, + reservation: MemoryReservation, ) -> Self { Self { batch, bitmap, probe_threads_counter, - _reservation, + reservation, } } diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index d8446fb332b1..7fb8a2d73600 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -70,6 +70,7 @@ use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_physical_expr::equivalence::join_equivalence_properties; use datafusion_physical_expr::PhysicalExprRef; +use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; use futures::{Stream, StreamExt}; @@ -373,7 +374,9 @@ impl DisplayAs for SortMergeJoinExec { let on = self .on .iter() - .map(|(c1, c2)| format!("({} = {})", c1, c2)) + .map(|(c1, c2)| { + format!("({} = {})", fmt_sql(c1.as_ref()), fmt_sql(c2.as_ref())) + }) .collect::>() .join(", "); diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 63e95c7a3018..0dcb42169e00 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -74,6 +74,7 @@ use datafusion_expr::interval_arithmetic::Interval; use datafusion_physical_expr::equivalence::join_equivalence_properties; use datafusion_physical_expr::intervals::cp_solver::ExprIntervalGraph; use datafusion_physical_expr::PhysicalExprRef; +use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; use ahash::RandomState; @@ -384,7 +385,9 @@ impl DisplayAs for SymmetricHashJoinExec { let on = self .on .iter() - .map(|(c1, c2)| format!("({} = {})", c1, c2)) + .map(|(c1, c2)| { + format!("({} = {})", fmt_sql(c1.as_ref()), fmt_sql(c2.as_ref())) + }) .collect::>() .join(", "); diff --git a/datafusion/physical-plan/src/lib.rs 
b/datafusion/physical-plan/src/lib.rs index 2cd9e8b52ab8..bc2017fb20d3 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! Traits for physical query plan, supporting parallel execution for partitioned relations. @@ -35,7 +36,7 @@ pub use datafusion_expr::{Accumulator, ColumnarValue}; pub use datafusion_physical_expr::window::WindowExpr; use datafusion_physical_expr::PhysicalSortExpr; pub use datafusion_physical_expr::{ - expressions, udf, Distribution, Partitioning, PhysicalExpr, + expressions, Distribution, Partitioning, PhysicalExpr, }; pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index b9464e3a88fb..89cf47a6d650 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -268,8 +268,7 @@ impl DisplayAs for LocalLimitExec { write!(f, "LocalLimitExec: fetch={}", self.fetch) } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + write!(f, "limit={}", self.fetch) } } } diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index ae878fdab37b..1bc872a56e76 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -193,8 +193,17 @@ impl DisplayAs for LazyMemoryExec { ) } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + //TODO: remove batch_size, add one line per generator + writeln!( + f, + "batch_generators={}", + self.batch_generators + .iter() + .map(|g| g.read().to_string()) + .collect::>() + .join(", ") + )?; + Ok(()) } } } diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs index 72f2c13d2040..eecd980d09f8 100644 --- a/datafusion/physical-plan/src/placeholder_row.rs +++ b/datafusion/physical-plan/src/placeholder_row.rs @@ -113,10 +113,7 @@ impl DisplayAs for PlaceholderRowExec { write!(f, "PlaceholderRowExec") } - DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") - } + DisplayFormatType::TreeRender => Ok(()), } } } diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 3f901311a053..1d3e23ea9097 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -48,6 +48,7 @@ use datafusion_physical_expr::equivalence::ProjectionMapping; use datafusion_physical_expr::utils::collect_columns; use datafusion_physical_expr::PhysicalExprRef; +use datafusion_physical_expr_common::physical_expr::fmt_sql; use futures::stream::{Stream, StreamExt}; use itertools::Itertools; use log::trace; @@ -169,13 +170,14 @@ impl DisplayAs for ProjectionExec { } DisplayFormatType::TreeRender => { for (i, (e, alias)) in self.expr().iter().enumerate() { - let e = e.to_string(); - if &e == alias { - writeln!(f, "expr{i}={e}")?; + let expr_sql = fmt_sql(e.as_ref()); + if &e.to_string() == alias { + writeln!(f, "expr{i}={expr_sql}")?; } else { - writeln!(f, 
"{alias}={e}")?; + writeln!(f, "{alias}={expr_sql}")?; } } + Ok(()) } } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index e9a360c2ece3..c27de77401eb 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -507,8 +507,16 @@ impl DisplayAs for RepartitionExec { Ok(()) } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + writeln!(f, "partitioning_scheme={}", self.partitioning(),)?; + writeln!( + f, + "output_partition_count={}", + self.input.output_partitioning().partition_count() + )?; + if self.preserve_order { + writeln!(f, "preserve_order={}", self.preserve_order)?; + } + Ok(()) } } } @@ -1071,10 +1079,9 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::cast::as_string_array; use datafusion_common::{arrow_datafusion_err, assert_batches_sorted_eq, exec_err}; + use datafusion_common_runtime::JoinSet; use datafusion_execution::runtime_env::RuntimeEnvBuilder; - use tokio::task::JoinSet; - #[tokio::test] async fn one_to_many_round_robin() -> Result<()> { // define input partitions diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 00fa78ce5229..ca06a029e8db 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -186,8 +186,17 @@ impl DisplayAs for SortPreservingMergeExec { Ok(()) } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + for (i, e) in self.expr().iter().enumerate() { + e.fmt_sql(f)?; + if i != self.expr().len() - 1 { + write!(f, ", ")?; + } + } + if let Some(fetch) = self.fetch { + writeln!(f, "limit={fetch}")?; + }; + + Ok(()) } } } diff --git a/datafusion/physical-plan/src/stream.rs b/datafusion/physical-plan/src/stream.rs index f2324dfd9bac..338ac7d048a3 100644 --- a/datafusion/physical-plan/src/stream.rs +++ b/datafusion/physical-plan/src/stream.rs @@ -27,7 +27,8 @@ use super::{ExecutionPlan, RecordBatchStream, SendableRecordBatchStream}; use crate::displayable; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; -use datafusion_common::{internal_err, Result}; +use datafusion_common::{exec_err, Result}; +use datafusion_common_runtime::JoinSet; use datafusion_execution::TaskContext; use futures::stream::BoxStream; @@ -35,7 +36,6 @@ use futures::{Future, Stream, StreamExt}; use log::debug; use pin_project_lite::pin_project; use tokio::sync::mpsc::{Receiver, Sender}; -use tokio::task::JoinSet; /// Creates a stream from a collection of producing tasks, routing panics to the stream. 
/// @@ -128,7 +128,7 @@ impl ReceiverStreamBuilder { // the JoinSet were aborted, which in turn // would imply that the receiver has been // dropped and this code is not running - return Some(internal_err!("Non Panic Task error: {e}")); + return Some(exec_err!("Non Panic Task error: {e}")); } } } diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 791370917523..2b666093f29e 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -157,10 +157,7 @@ impl DisplayAs for UnionExec { DisplayFormatType::Default | DisplayFormatType::Verbose => { write!(f, "UnionExec") } - DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") - } + DisplayFormatType::TreeRender => Ok(()), } } } @@ -391,10 +388,7 @@ impl DisplayAs for InterleaveExec { DisplayFormatType::Default | DisplayFormatType::Verbose => { write!(f, "InterleaveExec") } - DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") - } + DisplayFormatType::TreeRender => Ok(()), } } } diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 4e70d2dc4ee5..c06b09f2fecd 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -958,7 +958,8 @@ mod tests { }; use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::datatypes::{Field, Int32Type}; - use datafusion_common::assert_batches_eq; + use datafusion_common::test_util::batches_to_string; + use insta::assert_snapshot; // Create a GenericListArray with the following list values: // [A, B, C], [], NULL, [D], NULL, [NULL, F] @@ -1145,33 +1146,33 @@ mod tests { )? .unwrap(); - let expected = &[ -"+---------------------------------+---------------------------------+---------------------------------+", -"| col1_unnest_placeholder_depth_1 | col1_unnest_placeholder_depth_2 | col2_unnest_placeholder_depth_1 |", -"+---------------------------------+---------------------------------+---------------------------------+", -"| [1, 2, 3] | 1 | a |", -"| | 2 | b |", -"| [4, 5] | 3 | |", -"| [1, 2, 3] | | a |", -"| | | b |", -"| [4, 5] | | |", -"| [1, 2, 3] | 4 | a |", -"| | 5 | b |", -"| [4, 5] | | |", -"| [7, 8, 9, 10] | 7 | c |", -"| | 8 | d |", -"| [11, 12, 13] | 9 | |", -"| | 10 | |", -"| [7, 8, 9, 10] | | c |", -"| | | d |", -"| [11, 12, 13] | | |", -"| [7, 8, 9, 10] | 11 | c |", -"| | 12 | d |", -"| [11, 12, 13] | 13 | |", -"| | | e |", -"+---------------------------------+---------------------------------+---------------------------------+", - ]; - assert_batches_eq!(expected, &[ret]); + assert_snapshot!(batches_to_string(&[ret]), + @r###" ++---------------------------------+---------------------------------+---------------------------------+ +| col1_unnest_placeholder_depth_1 | col1_unnest_placeholder_depth_2 | col2_unnest_placeholder_depth_1 | ++---------------------------------+---------------------------------+---------------------------------+ +| [1, 2, 3] | 1 | a | +| | 2 | b | +| [4, 5] | 3 | | +| [1, 2, 3] | | a | +| | | b | +| [4, 5] | | | +| [1, 2, 3] | 4 | a | +| | 5 | b | +| [4, 5] | | | +| [7, 8, 9, 10] | 7 | c | +| | 8 | d | +| [11, 12, 13] | 9 | | +| | 10 | | +| [7, 8, 9, 10] | | c | +| | | d | +| [11, 12, 13] | | | +| [7, 8, 9, 10] | 11 | c | +| | 12 | d | +| [11, 12, 13] | 13 | | +| | | e | ++---------------------------------+---------------------------------+---------------------------------+ + "###); Ok(()) } diff --git a/datafusion/physical-plan/src/work_table.rs 
b/datafusion/physical-plan/src/work_table.rs index f082b05410dd..126a7d0bba29 100644 --- a/datafusion/physical-plan/src/work_table.rs +++ b/datafusion/physical-plan/src/work_table.rs @@ -163,8 +163,7 @@ impl DisplayAs for WorkTableExec { write!(f, "WorkTableExec: name={}", self.name) } DisplayFormatType::TreeRender => { - // TODO: collect info - write!(f, "") + write!(f, "name={}", self.name) } } } diff --git a/datafusion/proto-common/src/lib.rs b/datafusion/proto-common/src/lib.rs index 56cd42ee5067..6400e4bdc66d 100644 --- a/datafusion/proto-common/src/lib.rs +++ b/datafusion/proto-common/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! Serialize / Deserialize DataFusion Primitive Types to bytes diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 39897cfcf2de..553fccf7d428 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -39,7 +39,7 @@ name = "datafusion_proto" [features] default = ["parquet"] -json = ["pbjson", "serde", "serde_json"] +json = ["pbjson", "serde", "serde_json", "datafusion-proto-common/json"] parquet = ["datafusion/parquet", "datafusion-common/parquet"] avro = ["datafusion/avro", "datafusion-common/avro"] diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 1cdfe6d216e3..2e028eb29118 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -273,7 +273,6 @@ message DmlNode{ INSERT_APPEND = 3; INSERT_OVERWRITE = 4; INSERT_REPLACE = 5; - } Type dml_type = 1; LogicalPlanNode input = 2; @@ -484,6 +483,7 @@ message AliasNode { LogicalExprNode expr = 1; string alias = 2; repeated TableReference relation = 3; + map metadata = 4; } message BinaryExprNode { @@ -725,6 +725,7 @@ message PhysicalPlanNode { CsvSinkExecNode csv_sink = 28; ParquetSinkExecNode parquet_sink = 29; UnnestExecNode unnest = 30; + JsonScanExecNode json_scan = 31; } } @@ -996,6 +997,7 @@ message FileScanExecConf { reserved 10; datafusion_common.Constraints constraints = 11; + optional uint64 batch_size = 12; } message ParquetScanExecNode { @@ -1023,6 +1025,10 @@ message CsvScanExecNode { bool newlines_in_values = 7; } +message JsonScanExecNode { + FileScanExecConf base_conf = 1; +} + message AvroScanExecNode { FileScanExecConf base_conf = 1; } diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 6e09e9a797ea..6166b6ec4796 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -770,6 +770,9 @@ impl serde::Serialize for AliasNode { if !self.relation.is_empty() { len += 1; } + if !self.metadata.is_empty() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.AliasNode", len)?; if let Some(v) = self.expr.as_ref() { struct_ser.serialize_field("expr", v)?; @@ -780,6 +783,9 @@ impl serde::Serialize for AliasNode { if !self.relation.is_empty() { struct_ser.serialize_field("relation", &self.relation)?; } + if !self.metadata.is_empty() { + struct_ser.serialize_field("metadata", &self.metadata)?; + } struct_ser.end() } } @@ -793,6 +799,7 @@ impl<'de> 
serde::Deserialize<'de> for AliasNode { "expr", "alias", "relation", + "metadata", ]; #[allow(clippy::enum_variant_names)] @@ -800,6 +807,7 @@ impl<'de> serde::Deserialize<'de> for AliasNode { Expr, Alias, Relation, + Metadata, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -824,6 +832,7 @@ impl<'de> serde::Deserialize<'de> for AliasNode { "expr" => Ok(GeneratedField::Expr), "alias" => Ok(GeneratedField::Alias), "relation" => Ok(GeneratedField::Relation), + "metadata" => Ok(GeneratedField::Metadata), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -846,6 +855,7 @@ impl<'de> serde::Deserialize<'de> for AliasNode { let mut expr__ = None; let mut alias__ = None; let mut relation__ = None; + let mut metadata__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::Expr => { @@ -866,12 +876,21 @@ impl<'de> serde::Deserialize<'de> for AliasNode { } relation__ = Some(map_.next_value()?); } + GeneratedField::Metadata => { + if metadata__.is_some() { + return Err(serde::de::Error::duplicate_field("metadata")); + } + metadata__ = Some( + map_.next_value::>()? + ); + } } } Ok(AliasNode { expr: expr__, alias: alias__.unwrap_or_default(), relation: relation__.unwrap_or_default(), + metadata: metadata__.unwrap_or_default(), }) } } @@ -5761,6 +5780,9 @@ impl serde::Serialize for FileScanExecConf { if self.constraints.is_some() { len += 1; } + if self.batch_size.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.FileScanExecConf", len)?; if !self.file_groups.is_empty() { struct_ser.serialize_field("fileGroups", &self.file_groups)?; @@ -5789,6 +5811,11 @@ impl serde::Serialize for FileScanExecConf { if let Some(v) = self.constraints.as_ref() { struct_ser.serialize_field("constraints", v)?; } + if let Some(v) = self.batch_size.as_ref() { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("batchSize", ToString::to_string(&v).as_str())?; + } struct_ser.end() } } @@ -5812,6 +5839,8 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { "output_ordering", "outputOrdering", "constraints", + "batch_size", + "batchSize", ]; #[allow(clippy::enum_variant_names)] @@ -5825,6 +5854,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { ObjectStoreUrl, OutputOrdering, Constraints, + BatchSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -5855,6 +5885,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { "objectStoreUrl" | "object_store_url" => Ok(GeneratedField::ObjectStoreUrl), "outputOrdering" | "output_ordering" => Ok(GeneratedField::OutputOrdering), "constraints" => Ok(GeneratedField::Constraints), + "batchSize" | "batch_size" => Ok(GeneratedField::BatchSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -5883,6 +5914,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { let mut object_store_url__ = None; let mut output_ordering__ = None; let mut constraints__ = None; + let mut batch_size__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::FileGroups => { @@ -5942,6 +5974,14 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { } constraints__ = map_.next_value()?; } + GeneratedField::BatchSize => { + if batch_size__.is_some() { + return Err(serde::de::Error::duplicate_field("batchSize")); + } + batch_size__ = + map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0) + ; + } } } Ok(FileScanExecConf { @@ -5954,6 +5994,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { object_store_url: object_store_url__.unwrap_or_default(), output_ordering: output_ordering__.unwrap_or_default(), constraints: constraints__, + batch_size: batch_size__, }) } } @@ -8716,6 +8757,98 @@ impl<'de> serde::Deserialize<'de> for JoinOn { deserializer.deserialize_struct("datafusion.JoinOn", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for JsonScanExecNode { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.base_conf.is_some() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.JsonScanExecNode", len)?; + if let Some(v) = self.base_conf.as_ref() { + struct_ser.serialize_field("baseConf", v)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for JsonScanExecNode { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "base_conf", + "baseConf", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + BaseConf, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "baseConf" | "base_conf" => Ok(GeneratedField::BaseConf), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = JsonScanExecNode; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.JsonScanExecNode") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut base_conf__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::BaseConf => { + if base_conf__.is_some() { + return Err(serde::de::Error::duplicate_field("baseConf")); + } + base_conf__ = map_.next_value()?; + } + } + } + Ok(JsonScanExecNode { + base_conf: base_conf__, + }) + } + } + deserializer.deserialize_struct("datafusion.JsonScanExecNode", FIELDS, GeneratedVisitor) + } +} impl serde::Serialize for JsonSink { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result @@ -15641,6 +15774,9 @@ impl serde::Serialize for PhysicalPlanNode { physical_plan_node::PhysicalPlanType::Unnest(v) => { struct_ser.serialize_field("unnest", v)?; } + physical_plan_node::PhysicalPlanType::JsonScan(v) => { + struct_ser.serialize_field("jsonScan", v)?; + } } } struct_ser.end() @@ -15697,6 +15833,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode { "parquet_sink", "parquetSink", "unnest", + "json_scan", + "jsonScan", ]; #[allow(clippy::enum_variant_names)] @@ -15730,6 +15868,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode { CsvSink, ParquetSink, Unnest, + JsonScan, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -15780,6 +15919,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode { "csvSink" | "csv_sink" => Ok(GeneratedField::CsvSink), "parquetSink" | "parquet_sink" => Ok(GeneratedField::ParquetSink), "unnest" => Ok(GeneratedField::Unnest), + "jsonScan" | "json_scan" => Ok(GeneratedField::JsonScan), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -16003,6 +16143,13 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode { return Err(serde::de::Error::duplicate_field("unnest")); } physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::Unnest) +; + } + GeneratedField::JsonScan => { + if physical_plan_type__.is_some() { + return Err(serde::de::Error::duplicate_field("jsonScan")); + } + physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::JsonScan) ; } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index f5ec45da48f2..d2165dad4850 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -739,6 +739,11 @@ pub struct AliasNode { pub alias: ::prost::alloc::string::String, #[prost(message, repeated, tag = "3")] pub relation: ::prost::alloc::vec::Vec, + #[prost(map = "string, string", tag = "4")] + pub metadata: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct BinaryExprNode { @@ -1043,7 +1048,7 @@ pub mod table_reference { pub struct PhysicalPlanNode { #[prost( oneof = "physical_plan_node::PhysicalPlanType", - tags = "1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30" + tags = "1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31" )] pub physical_plan_type: ::core::option::Option, } @@ -1111,6 +1116,8 @@ pub mod physical_plan_node { ParquetSink(::prost::alloc::boxed::Box), #[prost(message, tag = "30")] Unnest(::prost::alloc::boxed::Box), + #[prost(message, tag = "31")] + JsonScan(super::JsonScanExecNode), } } #[derive(Clone, PartialEq, ::prost::Message)] @@ -1510,6 +1517,8 @@ pub struct FileScanExecConf { pub output_ordering: ::prost::alloc::vec::Vec, #[prost(message, optional, tag = 
"11")] pub constraints: ::core::option::Option, + #[prost(uint64, optional, tag = "12")] + pub batch_size: ::core::option::Option, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetScanExecNode { @@ -1553,6 +1562,11 @@ pub mod csv_scan_exec_node { } } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct JsonScanExecNode { + #[prost(message, optional, tag = "1")] + pub base_conf: ::core::option::Option, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct AvroScanExecNode { #[prost(message, optional, tag = "1")] pub base_conf: ::core::option::Option, diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index 5d84be1cff55..2df162f21e3a 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! Serialize / Deserialize DataFusion Plans to bytes diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 148856cd103c..c65569ef1cfb 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -867,7 +867,7 @@ impl AsLogicalPlan for LogicalPlanNode { None }; - let provider = ViewTable::try_new(input, definition)?; + let provider = ViewTable::new(input, definition); let table_name = from_table_reference(scan.table_name.as_ref(), "ViewScan")?; diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 5bb0cdb20c9c..841c31fa035f 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -19,6 +19,8 @@ //! DataFusion logical plans to be serialized and transmitted between //! processes. 
+use std::collections::HashMap; + use datafusion_common::{TableReference, UnnestOptions}; use datafusion_expr::dml::InsertOp; use datafusion_expr::expr::{ @@ -200,6 +202,7 @@ pub fn serialize_expr( expr, relation, name, + metadata, }) => { let alias = Box::new(protobuf::AliasNode { expr: Some(Box::new(serialize_expr(expr.as_ref(), codec)?)), @@ -208,6 +211,7 @@ pub fn serialize_expr( .map(|r| vec![r.into()]) .unwrap_or(vec![]), alias: name.to_owned(), + metadata: metadata.to_owned().unwrap_or(HashMap::new()), }); protobuf::LogicalExprNode { expr_type: Some(ExprType::Alias(alias)), diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 6331b7fb3114..a417eccee1cd 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -544,7 +544,8 @@ pub fn parse_protobuf_file_scan_config( .with_projection(projection) .with_limit(proto.limit.as_ref().map(|sl| sl.limit as usize)) .with_table_partition_cols(table_partition_cols) - .with_output_ordering(output_ordering); + .with_output_ordering(output_ordering) + .with_batch_size(proto.batch_size.map(|s| s as usize)); Ok(config) } @@ -657,6 +658,7 @@ impl TryFrom<&protobuf::FileSinkConfig> for FileSinkConfig { protobuf::InsertOp::Replace => InsertOp::Replace, }; Ok(Self { + original_url: String::default(), object_store_url: ObjectStoreUrl::parse(&conf.object_store_url)?, file_groups, table_paths, diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 60972ac54ba7..6562a9be458f 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -33,7 +33,7 @@ use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::physical_plan::AvroSource; #[cfg(feature = "parquet")] use datafusion::datasource::physical_plan::ParquetSource; -use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig}; +use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, JsonSource}; use datafusion::datasource::source::DataSourceExec; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; @@ -247,6 +247,15 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { .with_file_compression_type(FileCompressionType::UNCOMPRESSED); Ok(conf.build()) } + PhysicalPlanType::JsonScan(scan) => { + let scan_conf = parse_protobuf_file_scan_config( + scan.base_conf.as_ref().unwrap(), + registry, + extension_codec, + Arc::new(JsonSource::new()), + )?; + Ok(scan_conf.build()) + } #[cfg_attr(not(feature = "parquet"), allow(unused_variables))] PhysicalPlanType::ParquetScan(scan) => { #[cfg(feature = "parquet")] @@ -1684,6 +1693,26 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { } } + if let Some(data_source_exec) = plan.downcast_ref::() { + let data_source = data_source_exec.data_source(); + if let Some(scan_conf) = data_source.as_any().downcast_ref::() + { + let source = scan_conf.file_source(); + if let Some(_json_source) = source.as_any().downcast_ref::() { + return Ok(protobuf::PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::JsonScan( + protobuf::JsonScanExecNode { + base_conf: Some(serialize_file_scan_config( + scan_conf, + extension_codec, + )?), + }, + )), + }); + } + } + } + #[cfg(feature = "parquet")] if let Some(exec) = plan.downcast_ref::() { let data_source_exec = exec.data_source(); diff --git a/datafusion/proto/src/physical_plan/to_proto.rs 
b/datafusion/proto/src/physical_plan/to_proto.rs index 3f67842fe625..c2cf506eb96d 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -530,6 +530,7 @@ pub fn serialize_file_scan_config( }) .collect::>(), constraints: Some(conf.constraints.clone().into()), + batch_size: conf.batch_size.map(|s| s as u64), }) } diff --git a/datafusion/proto/tests/cases/mod.rs b/datafusion/proto/tests/cases/mod.rs index 25efa2690268..92d961fc7556 100644 --- a/datafusion/proto/tests/cases/mod.rs +++ b/datafusion/proto/tests/cases/mod.rs @@ -16,17 +16,17 @@ // under the License. use arrow::datatypes::{DataType, Field}; -use std::any::Any; -use std::fmt::Debug; - +use datafusion::logical_expr::ColumnarValue; use datafusion_common::plan_err; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, PartitionEvaluator, ScalarUDFImpl, Signature, - Volatility, WindowUDFImpl, + Accumulator, AggregateUDFImpl, PartitionEvaluator, ScalarFunctionArgs, ScalarUDFImpl, + Signature, Volatility, WindowUDFImpl, }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; +use std::any::Any; +use std::fmt::Debug; mod roundtrip_logical_plan; mod roundtrip_physical_plan; @@ -69,6 +69,14 @@ impl ScalarUDFImpl for MyRegexUdf { plan_err!("regex_udf only accepts Utf8 arguments") } } + + fn invoke_with_args( + &self, + _args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + panic!("dummy - not implemented") + } + fn aliases(&self) -> &[String] { &self.aliases } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 9cc7514a0d33..9fa1f74ae188 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -67,7 +67,7 @@ use datafusion_common::{ use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ self, Between, BinaryExpr, Case, Cast, GroupingSet, InList, Like, ScalarFunction, - Unnest, + Unnest, WildcardOptions, }; use datafusion_expr::logical_plan::{Extension, UserDefinedLogicalNodeCore}; use datafusion_expr::{ @@ -2029,7 +2029,11 @@ fn roundtrip_unnest() { #[test] fn roundtrip_wildcard() { - let test_expr = wildcard(); + #[expect(deprecated)] + let test_expr = Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }; let ctx = SessionContext::new(); roundtrip_expr_test(test_expr, ctx); @@ -2037,7 +2041,11 @@ fn roundtrip_wildcard() { #[test] fn roundtrip_qualified_wildcard() { - let test_expr = qualified_wildcard("foo"); + #[expect(deprecated)] + let test_expr = Expr::Wildcard { + qualifier: Some("foo".into()), + options: Box::new(WildcardOptions::default()), + }; let ctx = SessionContext::new(); roundtrip_expr_test(test_expr, ctx); diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index b5bfef99a6f3..aeae39c4d039 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -16,7 +16,7 @@ // under the License. 
use std::any::Any; -use std::fmt::Display; +use std::fmt::{Display, Formatter}; use std::ops::Deref; use std::sync::Arc; use std::vec; @@ -860,6 +860,10 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { ) -> Result> { todo!() } + + fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self, f) + } } #[derive(Debug)] @@ -1273,6 +1277,15 @@ fn roundtrip_analyze() -> Result<()> { ))) } +#[tokio::test] +async fn roundtrip_json_source() -> Result<()> { + let ctx = SessionContext::new(); + ctx.register_json("t1", "../core/tests/data/1.json", Default::default()) + .await?; + let plan = ctx.table("t1").await?.create_physical_plan().await?; + roundtrip_test(plan) +} + #[test] fn roundtrip_json_sink() -> Result<()> { let field_a = Field::new("plan_type", DataType::Utf8, false); @@ -1281,6 +1294,7 @@ fn roundtrip_json_sink() -> Result<()> { let input = Arc::new(PlaceholderRowExec::new(schema.clone())); let file_sink_config = FileSinkConfig { + original_url: String::default(), object_store_url: ObjectStoreUrl::local_filesystem(), file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], table_paths: vec![ListingTableUrl::parse("file:///")?], @@ -1317,6 +1331,7 @@ fn roundtrip_csv_sink() -> Result<()> { let input = Arc::new(PlaceholderRowExec::new(schema.clone())); let file_sink_config = FileSinkConfig { + original_url: String::default(), object_store_url: ObjectStoreUrl::local_filesystem(), file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], table_paths: vec![ListingTableUrl::parse("file:///")?], @@ -1372,6 +1387,7 @@ fn roundtrip_parquet_sink() -> Result<()> { let input = Arc::new(PlaceholderRowExec::new(schema.clone())); let file_sink_config = FileSinkConfig { + original_url: String::default(), object_store_url: ObjectStoreUrl::local_filesystem(), file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], table_paths: vec![ListingTableUrl::parse("file:///")?], diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index cdf61183eb3d..436f4388d8a3 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -22,16 +22,15 @@ use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, Dependency, Diagnostic, Result, Span, }; -use datafusion_expr::expr::{ScalarFunction, Unnest}; +use datafusion_expr::expr::{ScalarFunction, Unnest, WildcardOptions}; use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr}; use datafusion_expr::{ - expr, qualified_wildcard, wildcard, Expr, ExprFunctionExt, ExprSchemable, - WindowFrame, WindowFunctionDefinition, + expr, Expr, ExprFunctionExt, ExprSchemable, WindowFrame, WindowFunctionDefinition, }; use sqlparser::ast::{ DuplicateTreatment, Expr as SQLExpr, Function as SQLFunction, FunctionArg, FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments, - NullTreatment, ObjectName, OrderByExpr, WindowType, + NullTreatment, ObjectName, OrderByExpr, Spanned, WindowType, }; /// Suggest a valid function based on an invalid input function name @@ -217,13 +216,21 @@ impl SqlToRel<'_, S> { // it shouldn't have ordering requirement as function argument // required ordering should be defined in OVER clause. let is_function_window = over.is_some(); - let sql_parser_span = name.0[0].span; + let sql_parser_span = name.0[0].span(); let name = if name.0.len() > 1 { // DF doesn't handle compound identifiers // (e.g. 
"foo.bar") for function names yet name.to_string() } else { - crate::utils::normalize_ident(name.0[0].clone()) + match name.0[0].as_ident() { + Some(ident) => crate::utils::normalize_ident(ident.clone()), + None => { + return plan_err!( + "Expected an identifier in function name, but found {:?}", + name.0[0] + ) + } + } }; if name.eq("make_map") { @@ -473,11 +480,27 @@ impl SqlToRel<'_, S> { name: _, arg: FunctionArgExpr::Wildcard, operator: _, - } => Ok(wildcard()), + } => { + #[expect(deprecated)] + let expr = Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }; + + Ok(expr) + } FunctionArg::Unnamed(FunctionArgExpr::Expr(arg)) => { self.sql_expr_to_logical_expr(arg, schema, planner_context) } - FunctionArg::Unnamed(FunctionArgExpr::Wildcard) => Ok(wildcard()), + FunctionArg::Unnamed(FunctionArgExpr::Wildcard) => { + #[expect(deprecated)] + let expr = Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }; + + Ok(expr) + } FunctionArg::Unnamed(FunctionArgExpr::QualifiedWildcard(object_name)) => { let qualifier = self.object_name_to_table_reference(object_name)?; // Sanity check on qualifier with schema @@ -485,7 +508,14 @@ impl SqlToRel<'_, S> { if qualified_indices.is_empty() { return plan_err!("Invalid qualifier {qualifier}"); } - Ok(qualified_wildcard(qualifier)) + + #[expect(deprecated)] + let expr = Expr::Wildcard { + qualifier: qualifier.into(), + options: Box::new(WildcardOptions::default()), + }; + + Ok(expr) } _ => not_impl_err!("Unsupported qualified wildcard argument: {sql:?}"), } diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 7d358d0b6624..7c276ce53e35 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -22,7 +22,7 @@ use datafusion_common::{ }; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{Case, Expr}; -use sqlparser::ast::{Expr as SQLExpr, Ident}; +use sqlparser::ast::{CaseWhen, Expr as SQLExpr, Ident}; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_expr::UNNAMED_TABLE; @@ -216,8 +216,7 @@ impl SqlToRel<'_, S> { pub(super) fn sql_case_identifier_to_expr( &self, operand: Option>, - conditions: Vec, - results: Vec, + conditions: Vec, else_result: Option>, schema: &DFSchema, planner_context: &mut PlannerContext, @@ -231,13 +230,22 @@ impl SqlToRel<'_, S> { } else { None }; - let when_expr = conditions + let when_then_expr = conditions .into_iter() - .map(|e| self.sql_expr_to_logical_expr(e, schema, planner_context)) - .collect::>>()?; - let then_expr = results - .into_iter() - .map(|e| self.sql_expr_to_logical_expr(e, schema, planner_context)) + .map(|e| { + Ok(( + Box::new(self.sql_expr_to_logical_expr( + e.condition, + schema, + planner_context, + )?), + Box::new(self.sql_expr_to_logical_expr( + e.result, + schema, + planner_context, + )?), + )) + }) .collect::>>()?; let else_expr = if let Some(e) = else_result { Some(Box::new(self.sql_expr_to_logical_expr( @@ -249,15 +257,7 @@ impl SqlToRel<'_, S> { None }; - Ok(Expr::Case(Case::new( - expr, - when_expr - .iter() - .zip(then_expr.iter()) - .map(|(w, t)| (Box::new(w.to_owned()), Box::new(t.to_owned()))) - .collect(), - else_expr, - ))) + Ok(Expr::Case(Case::new(expr, when_then_expr, else_expr))) } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index c5bcf5a2fae9..d29ccdc6a7e9 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -22,12 +22,12 @@ 
use datafusion_expr::planner::{ use sqlparser::ast::{ AccessExpr, BinaryOperator, CastFormat, CastKind, DataType as SQLDataType, DictionaryField, Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry, - StructField, Subscript, TrimWhereField, Value, + StructField, Subscript, TrimWhereField, Value, ValueWithSpan, }; use datafusion_common::{ - internal_datafusion_err, internal_err, not_impl_err, plan_err, Column, DFSchema, - Result, ScalarValue, + internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result, + ScalarValue, }; use datafusion_expr::expr::ScalarFunction; @@ -211,7 +211,7 @@ impl SqlToRel<'_, S> { // more context. match sql { SQLExpr::Value(value) => { - self.parse_value(value, planner_context.prepare_param_data_types()) + self.parse_value(value.into(), planner_context.prepare_param_data_types()) } SQLExpr::Extract { field, expr, .. } => { let mut extract_args = vec![ @@ -253,12 +253,10 @@ impl SqlToRel<'_, S> { SQLExpr::Case { operand, conditions, - results, else_result, } => self.sql_case_identifier_to_expr( operand, conditions, - results, else_result, schema, planner_context, @@ -292,7 +290,7 @@ impl SqlToRel<'_, S> { } SQLExpr::TypedString { data_type, value } => Ok(Expr::Cast(Cast::new( - Box::new(lit(value)), + Box::new(lit(value.into_string().unwrap())), self.convert_data_type(&data_type)?, ))), @@ -544,9 +542,10 @@ impl SqlToRel<'_, S> { planner_context, )?), match *time_zone { - SQLExpr::Value(Value::SingleQuotedString(s)) => { - DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())) - } + SQLExpr::Value(ValueWithSpan { + value: Value::SingleQuotedString(s), + span: _, + }) => DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())), _ => { return not_impl_err!( "Unsupported ast node in sqltorel: {time_zone:?}" @@ -983,6 +982,64 @@ impl SqlToRel<'_, S> { Ok(Expr::Cast(Cast::new(Box::new(expr), dt))) } + /// Extracts the root expression and access chain from a compound expression. + /// + /// This function attempts to identify if a compound expression (like `a.b.c`) should be treated + /// as a column reference with a qualifier (like `table.column`) or as a field access expression. 
+ /// + /// # Arguments + /// + /// * `root` - The root SQL expression (e.g., the first part of `a.b.c`) + /// * `access_chain` - Vector of access expressions (e.g., `.b` and `.c` parts) + /// * `schema` - The schema to resolve column references against + /// * `planner_context` - Context for planning expressions + /// + /// # Returns + /// + /// A tuple containing: + /// * The resolved root expression + /// * The remaining access chain that should be processed as field accesses + fn extract_root_and_access_chain( + &self, + root: SQLExpr, + mut access_chain: Vec, + schema: &DFSchema, + planner_context: &mut PlannerContext, + ) -> Result<(Expr, Vec)> { + let SQLExpr::Identifier(root_ident) = root else { + let root = self.sql_expr_to_logical_expr(root, schema, planner_context)?; + return Ok((root, access_chain)); + }; + + let mut compound_idents = vec![root_ident]; + let first_non_ident = access_chain + .iter() + .position(|access| !matches!(access, AccessExpr::Dot(SQLExpr::Identifier(_)))) + .unwrap_or(access_chain.len()); + for access in access_chain.drain(0..first_non_ident) { + if let AccessExpr::Dot(SQLExpr::Identifier(ident)) = access { + compound_idents.push(ident); + } else { + return internal_err!("Expected identifier in access chain"); + } + } + + let root = if compound_idents.len() == 1 { + self.sql_identifier_to_expr( + compound_idents.pop().unwrap(), + schema, + planner_context, + )? + } else { + self.sql_compound_identifier_to_expr( + compound_idents, + schema, + planner_context, + )? + }; + Ok((root, access_chain)) + } + fn sql_compound_field_access_to_expr( &self, root: SQLExpr, @@ -990,7 +1047,12 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { - let mut root = self.sql_expr_to_logical_expr(root, schema, planner_context)?; + let (root, access_chain) = self.extract_root_and_access_chain( + root, + access_chain, + schema, + planner_context, + )?; let fields = access_chain .into_iter() .map(|field| match field { @@ -999,10 +1061,12 @@ impl SqlToRel<'_, S> { Subscript::Index { index } => { // index can be a name, in which case it is a named field access match index { - SQLExpr::Value( - Value::SingleQuotedString(s) - | Value::DoubleQuotedString(s), - ) => Ok(Some(GetFieldAccess::NamedStructField { + SQLExpr::Value(ValueWithSpan { + value: + Value::SingleQuotedString(s) + | Value::DoubleQuotedString(s), + span: _, + }) => Ok(Some(GetFieldAccess::NamedStructField { name: ScalarValue::from(s), })), SQLExpr::JsonAccess { .. } => { @@ -1064,45 +1128,19 @@ impl SqlToRel<'_, S> { } } } - AccessExpr::Dot(expr) => { - let expr = - self.sql_expr_to_logical_expr(expr, schema, planner_context)?; - match expr { - Expr::Column(Column { - name, - relation, - spans, - }) => { - if let Some(relation) = &relation { - // If the first part of the dot access is a column reference, we should - // check if the column is from the same table as the root expression. - // If it is, we should replace the root expression with the column reference. - // Otherwise, we should treat the dot access as a named field access. 
- if relation.table() == root.schema_name().to_string() { - root = Expr::Column(Column { - name, - relation: Some(relation.clone()), - spans, - }); - Ok(None) - } else { - plan_err!( - "table name mismatch: {} != {}", - relation.table(), - root.schema_name() - ) - } - } else { - Ok(Some(GetFieldAccess::NamedStructField { - name: ScalarValue::from(name), - })) - } - } - _ => not_impl_err!( - "Dot access not supported for non-column expr: {expr:?}" - ), + AccessExpr::Dot(expr) => match expr { + SQLExpr::Value(ValueWithSpan { + value: Value::SingleQuotedString(s) | Value::DoubleQuotedString(s), + span : _ + }) => Ok(Some(GetFieldAccess::NamedStructField { + name: ScalarValue::from(s), + })), + _ => { + not_impl_err!( + "Dot access not supported for non-string expr: {expr:?}" + ) } - } + }, }) .collect::>>()?; diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index b7ed04326f40..cce3f3004809 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ b/datafusion/sql/src/expr/order_by.rs @@ -21,7 +21,9 @@ use datafusion_common::{ }; use datafusion_expr::expr::Sort; use datafusion_expr::{Expr, SortExpr}; -use sqlparser::ast::{Expr as SQLExpr, OrderByExpr, Value}; +use sqlparser::ast::{ + Expr as SQLExpr, OrderByExpr, OrderByOptions, Value, ValueWithSpan, +}; impl SqlToRel<'_, S> { /// Convert sql [OrderByExpr] to `Vec`. @@ -62,9 +64,8 @@ impl SqlToRel<'_, S> { let mut expr_vec = vec![]; for e in exprs { let OrderByExpr { - asc, expr, - nulls_first, + options: OrderByOptions { asc, nulls_first }, with_fill, } = e; @@ -73,7 +74,10 @@ impl SqlToRel<'_, S> { } let expr = match expr { - SQLExpr::Value(Value::Number(v, _)) if literal_to_column => { + SQLExpr::Value(ValueWithSpan { + value: Value::Number(v, _), + span: _, + }) if literal_to_column => { let field_index = v .parse::() .map_err(|err| plan_datafusion_err!("{}", err))?; diff --git a/datafusion/sql/src/expr/subquery.rs b/datafusion/sql/src/expr/subquery.rs index 481f024787fe..225c5d74c2ab 100644 --- a/datafusion/sql/src/expr/subquery.rs +++ b/datafusion/sql/src/expr/subquery.rs @@ -16,12 +16,11 @@ // under the License. 
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{DFSchema, Result}; -use datafusion_expr::expr::Exists; -use datafusion_expr::expr::InSubquery; -use datafusion_expr::{Expr, Subquery}; +use datafusion_common::{plan_err, DFSchema, Diagnostic, Result, Span, Spans}; +use datafusion_expr::expr::{Exists, InSubquery}; +use datafusion_expr::{Expr, LogicalPlan, Subquery}; use sqlparser::ast::Expr as SQLExpr; -use sqlparser::ast::Query; +use sqlparser::ast::{Query, SelectItem, SetExpr}; use std::sync::Arc; impl SqlToRel<'_, S> { @@ -41,6 +40,7 @@ impl SqlToRel<'_, S> { subquery: Subquery { subquery: Arc::new(sub_plan), outer_ref_columns, + spans: Spans::new(), }, negated, })) @@ -56,15 +56,37 @@ impl SqlToRel<'_, S> { ) -> Result { let old_outer_query_schema = planner_context.set_outer_query_schema(Some(input_schema.clone().into())); + + let mut spans = Spans::new(); + if let SetExpr::Select(select) = subquery.body.as_ref() { + for item in &select.projection { + if let SelectItem::UnnamedExpr(SQLExpr::Identifier(ident)) = item { + if let Some(span) = Span::try_from_sqlparser_span(ident.span) { + spans.add_span(span); + } + } + } + } + let sub_plan = self.query_to_plan(subquery, planner_context)?; let outer_ref_columns = sub_plan.all_out_ref_exprs(); planner_context.set_outer_query_schema(old_outer_query_schema); - let expr = Box::new(self.sql_to_expr(expr, input_schema, planner_context)?); + + self.validate_single_column( + &sub_plan, + spans.clone(), + "Too many columns! The subquery should only return one column", + "Select only one column in the subquery", + )?; + + let expr_obj = self.sql_to_expr(expr, input_schema, planner_context)?; + Ok(Expr::InSubquery(InSubquery::new( - expr, + Box::new(expr_obj), Subquery { subquery: Arc::new(sub_plan), outer_ref_columns, + spans, }, negated, ))) @@ -78,12 +100,72 @@ impl SqlToRel<'_, S> { ) -> Result { let old_outer_query_schema = planner_context.set_outer_query_schema(Some(input_schema.clone().into())); + let mut spans = Spans::new(); + if let SetExpr::Select(select) = subquery.body.as_ref() { + for item in &select.projection { + if let SelectItem::ExprWithAlias { alias, .. } = item { + if let Some(span) = Span::try_from_sqlparser_span(alias.span) { + spans.add_span(span); + } + } + } + } let sub_plan = self.query_to_plan(subquery, planner_context)?; let outer_ref_columns = sub_plan.all_out_ref_exprs(); planner_context.set_outer_query_schema(old_outer_query_schema); + + self.validate_single_column( + &sub_plan, + spans.clone(), + "Too many columns! 
The subquery should only return one column", + "Select only one column in the subquery", + )?; + Ok(Expr::ScalarSubquery(Subquery { subquery: Arc::new(sub_plan), outer_ref_columns, + spans, })) } + + fn validate_single_column( + &self, + sub_plan: &LogicalPlan, + spans: Spans, + error_message: &str, + help_message: &str, + ) -> Result<()> { + if sub_plan.schema().fields().len() > 1 { + let sub_schema = sub_plan.schema(); + let field_names = sub_schema.field_names(); + + plan_err!("{}: {}", error_message, field_names.join(", ")).map_err(|err| { + let diagnostic = self.build_multi_column_diagnostic( + spans, + error_message, + help_message, + ); + err.with_diagnostic(diagnostic) + }) + } else { + Ok(()) + } + } + + fn build_multi_column_diagnostic( + &self, + spans: Spans, + error_message: &str, + help_message: &str, + ) -> Diagnostic { + let full_span = Span::union_iter(spans.0.iter().cloned()); + let mut diagnostic = Diagnostic::new_error(error_message, full_span); + + for (i, span) in spans.iter().skip(1).enumerate() { + diagnostic.add_note(format!("Extra column {}", i + 1), Some(*span)); + } + + diagnostic.add_help(help_message, None); + diagnostic + } } diff --git a/datafusion/sql/src/expr/unary_op.rs b/datafusion/sql/src/expr/unary_op.rs index a4096ec2355b..626b79d6c3b6 100644 --- a/datafusion/sql/src/expr/unary_op.rs +++ b/datafusion/sql/src/expr/unary_op.rs @@ -16,12 +16,12 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, plan_err, DFSchema, Result}; +use datafusion_common::{not_impl_err, plan_err, DFSchema, Diagnostic, Result}; use datafusion_expr::{ type_coercion::{is_interval, is_timestamp}, Expr, ExprSchemable, }; -use sqlparser::ast::{Expr as SQLExpr, UnaryOperator, Value}; +use sqlparser::ast::{Expr as SQLExpr, UnaryOperator, Value, ValueWithSpan}; impl SqlToRel<'_, S> { pub(crate) fn parse_sql_unary_op( @@ -45,16 +45,26 @@ impl SqlToRel<'_, S> { { Ok(operand) } else { - plan_err!("Unary operator '+' only supports numeric, interval and timestamp types") + plan_err!("Unary operator '+' only supports numeric, interval and timestamp types").map_err(|e| { + let span = operand.spans().and_then(|s| s.first()); + let mut diagnostic = Diagnostic::new_error( + format!("+ cannot be used with {data_type}"), + span + ); + diagnostic.add_note("+ can only be used with numbers, intervals, and timestamps", None); + diagnostic.add_help(format!("perhaps you need to cast {operand}"), None); + e.with_diagnostic(diagnostic) + }) } } UnaryOperator::Minus => { match expr { // Optimization: if it's a number literal, we apply the negative operator // here directly to calculate the new literal. 
- SQLExpr::Value(Value::Number(n, _)) => { - self.parse_sql_number(&n, true) - } + SQLExpr::Value(ValueWithSpan { + value: Value::Number(n, _), + span: _, + }) => self.parse_sql_number(&n, true), SQLExpr::Interval(interval) => { self.sql_interval_to_expr(true, interval) } diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs index 168348aee222..d53691ef05d1 100644 --- a/datafusion/sql/src/expr/value.rs +++ b/datafusion/sql/src/expr/value.rs @@ -32,7 +32,9 @@ use datafusion_expr::expr::{BinaryExpr, Placeholder}; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{lit, Expr, Operator}; use log::debug; -use sqlparser::ast::{BinaryOperator, Expr as SQLExpr, Interval, UnaryOperator, Value}; +use sqlparser::ast::{ + BinaryOperator, Expr as SQLExpr, Interval, UnaryOperator, Value, ValueWithSpan, +}; use sqlparser::parser::ParserError::ParserError; use std::borrow::Cow; use std::cmp::Ordering; @@ -254,8 +256,14 @@ impl SqlToRel<'_, S> { fn interval_literal(interval_value: SQLExpr, negative: bool) -> Result { let s = match interval_value { - SQLExpr::Value(Value::SingleQuotedString(s) | Value::DoubleQuotedString(s)) => s, - SQLExpr::Value(Value::Number(ref v, long)) => { + SQLExpr::Value(ValueWithSpan { + value: Value::SingleQuotedString(s) | Value::DoubleQuotedString(s), + span: _, + }) => s, + SQLExpr::Value(ValueWithSpan { + value: Value::Number(ref v, long), + span: _, + }) => { if long { return not_impl_err!( "Unsupported interval argument. Long number not supported: {interval_value:?}" diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index d552efa8254c..7e11f160a397 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -20,7 +20,8 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] //! This crate provides: diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 92bfd8f9f07f..822b651eae86 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -16,11 +16,14 @@ // under the License. //! [`DFParser`]: DataFusion SQL Parser based on [`sqlparser`] +//! +//! This parser implements DataFusion specific statements such as +//! `CREATE EXTERNAL TABLE` use std::collections::VecDeque; use std::fmt; -use sqlparser::ast::ExprWithAlias; +use sqlparser::ast::{ExprWithAlias, OrderByOptions}; use sqlparser::tokenizer::TokenWithSpan; use sqlparser::{ ast::{ @@ -43,12 +46,23 @@ fn parse_file_type(s: &str) -> Result { Ok(s.to_uppercase()) } -/// DataFusion specific EXPLAIN (needed so we can EXPLAIN datafusion -/// specific COPY and other statements) +/// DataFusion specific `EXPLAIN` +/// +/// Syntax: +/// ```sql +/// EXPLAIN [FORMAT format] statement +///``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct ExplainStatement { + /// `EXPLAIN ANALYZE ..` pub analyze: bool, + /// `EXPLAIN .. VERBOSE ..` pub verbose: bool, + /// `EXPLAIN .. FORMAT ` + pub format: Option, + /// The statement to analyze. 
Note this is a DataFusion [`Statement`] (not a
+    /// [`sqlparser::ast::Statement`]) so that we can use `EXPLAIN`, `COPY`, and other
+    /// DataFusion specific statements.
     pub statement: Box<Statement>,
 }
 
@@ -57,6 +71,7 @@ impl fmt::Display for ExplainStatement {
         let Self {
             analyze,
             verbose,
+            format,
             statement,
         } = self;
 
@@ -67,6 +82,9 @@ impl fmt::Display for ExplainStatement {
         if *verbose {
             write!(f, "VERBOSE ")?;
         }
+        if let Some(format) = format.as_ref() {
+            write!(f, "FORMAT {format} ")?;
+        }
 
         write!(f, "{statement}")
     }
@@ -446,7 +464,6 @@ impl<'a> DFParser<'a> {
                 self.parse_copy()
             }
             Keyword::EXPLAIN => {
-                // (TODO parse all supported statements)
                 self.parser.next_token(); // EXPLAIN
                 self.parse_explain()
             }
@@ -620,15 +637,35 @@ impl<'a> DFParser<'a> {
     pub fn parse_explain(&mut self) -> Result<Statement, ParserError> {
         let analyze = self.parser.parse_keyword(Keyword::ANALYZE);
         let verbose = self.parser.parse_keyword(Keyword::VERBOSE);
+        let format = self.parse_explain_format()?;
+
         let statement = self.parse_statement()?;
 
         Ok(Statement::Explain(ExplainStatement {
             statement: Box::new(statement),
             analyze,
             verbose,
+            format,
         }))
     }
 
+    pub fn parse_explain_format(&mut self) -> Result<Option<String>, ParserError> {
+        if !self.parser.parse_keyword(Keyword::FORMAT) {
+            return Ok(None);
+        }
+
+        let next_token = self.parser.next_token();
+        let format = match next_token.token {
+            Token::Word(w) => Ok(w.value),
+            Token::SingleQuotedString(w) => Ok(w),
+            Token::DoubleQuotedString(w) => Ok(w),
+            _ => self
+                .parser
+                .expected("an explain format such as TREE", next_token),
+        }?;
+        Ok(Some(format))
+    }
+
     /// Parse a SQL `CREATE` statement handling `CREATE EXTERNAL TABLE`
     pub fn parse_create(&mut self) -> Result<Statement, ParserError> {
         if self.parser.parse_keyword(Keyword::EXTERNAL) {
@@ -708,8 +745,7 @@ impl<'a> DFParser<'a> {
 
         Ok(OrderByExpr {
             expr,
-            asc,
-            nulls_first,
+            options: OrderByOptions { asc, nulls_first },
             with_fill: None,
         })
     }
@@ -756,11 +792,6 @@ impl<'a> DFParser<'a> {
     fn parse_column_def(&mut self) -> Result<ColumnDef, ParserError> {
         let name = self.parser.parse_identifier()?;
         let data_type = self.parser.parse_data_type()?;
-        let collation = if self.parser.parse_keyword(Keyword::COLLATE) {
-            Some(self.parser.parse_object_name(false)?)
- } else { - None - }; let mut options = vec![]; loop { if self.parser.parse_keyword(Keyword::CONSTRAINT) { @@ -782,7 +813,6 @@ impl<'a> DFParser<'a> { Ok(ColumnDef { name, data_type, - collation, options, }) } @@ -1006,7 +1036,6 @@ mod tests { span: Span::empty(), }, data_type, - collation: None, options: vec![], } } @@ -1016,7 +1045,7 @@ mod tests { // positive case let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv'"; let display = None; - let name = ObjectName(vec![Ident::from("t")]); + let name = ObjectName::from(vec![Ident::from("t")]); let expected = Statement::CreateExternalTable(CreateExternalTable { name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(display))], @@ -1314,8 +1343,7 @@ mod tests { quote_style: None, span: Span::empty(), }), - asc, - nulls_first, + options: OrderByOptions { asc, nulls_first }, with_fill: None, }]], if_not_exists: false, @@ -1346,8 +1374,10 @@ mod tests { quote_style: None, span: Span::empty(), }), - asc: Some(true), - nulls_first: None, + options: OrderByOptions { + asc: Some(true), + nulls_first: None, + }, with_fill: None, }, OrderByExpr { @@ -1356,8 +1386,10 @@ mod tests { quote_style: None, span: Span::empty(), }), - asc: Some(false), - nulls_first: Some(true), + options: OrderByOptions { + asc: Some(false), + nulls_first: Some(true), + }, with_fill: None, }, ]], @@ -1395,8 +1427,10 @@ mod tests { span: Span::empty(), })), }, - asc: Some(true), - nulls_first: None, + options: OrderByOptions { + asc: Some(true), + nulls_first: None, + }, with_fill: None, }]], if_not_exists: false, @@ -1442,8 +1476,10 @@ mod tests { span: Span::empty(), })), }, - asc: Some(true), - nulls_first: None, + options: OrderByOptions { + asc: Some(true), + nulls_first: None, + }, with_fill: None, }]], if_not_exists: true, @@ -1543,6 +1579,7 @@ mod tests { let expected = Statement::Explain(ExplainStatement { analyze, verbose, + format: None, statement: Box::new(expected_copy), }); assert_eq!(verified_stmt(sql), expected); @@ -1656,7 +1693,7 @@ mod tests { // For error cases, see: `copy.slt` fn object_name(name: &str) -> CopyToSource { - CopyToSource::Relation(ObjectName(vec![Ident::new(name)])) + CopyToSource::Relation(ObjectName::from(vec![Ident::new(name)])) } // Based on sqlparser-rs diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index bc7c2b7f4377..180017ee9c19 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -23,20 +23,18 @@ use std::vec; use arrow::datatypes::*; use datafusion_common::config::SqlParserOptions; use datafusion_common::error::add_possible_columns_to_diag; +use datafusion_common::TableReference; use datafusion_common::{ field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, Diagnostic, SchemaError, }; -use sqlparser::ast::TimezoneInfo; -use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; -use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; -use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; - -use datafusion_common::TableReference; use datafusion_common::{not_impl_err, plan_err, DFSchema, DataFusionError, Result}; use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::utils::find_column_exprs; use datafusion_expr::{col, Expr}; +use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo, TimezoneInfo}; +use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; +use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; use 
crate::utils::make_decimal_type; pub use datafusion_expr::planner::ContextProvider; @@ -54,6 +52,8 @@ pub struct ParserOptions { pub enable_options_value_normalization: bool, /// Whether to collect spans pub collect_spans: bool, + /// Whether `VARCHAR` is mapped to `Utf8View` during SQL planning. + pub map_varchar_to_utf8view: bool, } impl ParserOptions { @@ -72,6 +72,7 @@ impl ParserOptions { parse_float_as_decimal: false, enable_ident_normalization: true, support_varchar_with_length: true, + map_varchar_to_utf8view: false, enable_options_value_normalization: false, collect_spans: false, } @@ -111,6 +112,12 @@ impl ParserOptions { self } + /// Sets the `map_varchar_to_utf8view` option. + pub fn with_map_varchar_to_utf8view(mut self, value: bool) -> Self { + self.map_varchar_to_utf8view = value; + self + } + /// Sets the `enable_options_value_normalization` option. pub fn with_enable_options_value_normalization(mut self, value: bool) -> Self { self.enable_options_value_normalization = value; @@ -136,6 +143,7 @@ impl From<&SqlParserOptions> for ParserOptions { parse_float_as_decimal: options.parse_float_as_decimal, enable_ident_normalization: options.enable_ident_normalization, support_varchar_with_length: options.support_varchar_with_length, + map_varchar_to_utf8view: options.map_varchar_to_utf8view, enable_options_value_normalization: options .enable_options_value_normalization, collect_spans: options.collect_spans, @@ -169,16 +177,17 @@ impl IdentNormalizer { } } -/// Struct to store the states used by the Planner. The Planner will leverage the states to resolve -/// CTEs, Views, subqueries and PREPARE statements. The states include +/// Struct to store the states used by the Planner. The Planner will leverage the states +/// to resolve CTEs, Views, subqueries and PREPARE statements. The states include /// Common Table Expression (CTE) provided with WITH clause and /// Parameter Data Types provided with PREPARE statement and the query schema of the /// outer query plan. /// /// # Cloning /// -/// Only the `ctes` are truly cloned when the `PlannerContext` is cloned. This helps resolve -/// scoping issues of CTEs. By using cloning, a subquery can inherit CTEs from the outer query +/// Only the `ctes` are truly cloned when the `PlannerContext` is cloned. +/// This helps resolve scoping issues of CTEs. +/// By using cloning, a subquery can inherit CTEs from the outer query /// and can also define its own private CTEs without affecting the outer query. /// #[derive(Debug, Clone)] @@ -321,7 +330,8 @@ impl PlannerContext { /// by subsequent passes. /// /// Key interfaces are: -/// * [`Self::sql_statement_to_plan`]: Convert a statement (e.g. `SELECT ...`) into a [`LogicalPlan`] +/// * [`Self::sql_statement_to_plan`]: Convert a statement +/// (e.g. `SELECT ...`) into a [`LogicalPlan`] /// * [`Self::sql_to_expr`]: Convert an expression (e.g. 
`1 + 2`) into an [`Expr`] pub struct SqlToRel<'a, S: ContextProvider> { pub(crate) context_provider: &'a S, @@ -434,7 +444,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(plan) } else if idents.len() != plan.schema().fields().len() { plan_err!( - "Source table contains {} columns but only {} names given as column alias", + "Source table contains {} columns but only {} \ + names given as column alias", plan.schema().fields().len(), idents.len() ) @@ -548,31 +559,53 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { SQLDataType::Boolean | SQLDataType::Bool => Ok(DataType::Boolean), SQLDataType::TinyInt(_) => Ok(DataType::Int8), SQLDataType::SmallInt(_) | SQLDataType::Int2(_) => Ok(DataType::Int16), - SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => Ok(DataType::Int32), + SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => { + Ok(DataType::Int32) + } SQLDataType::BigInt(_) | SQLDataType::Int8(_) => Ok(DataType::Int64), - SQLDataType::UnsignedTinyInt(_) => Ok(DataType::UInt8), - SQLDataType::UnsignedSmallInt(_) | SQLDataType::UnsignedInt2(_) => Ok(DataType::UInt16), - SQLDataType::UnsignedInt(_) | SQLDataType::UnsignedInteger(_) | SQLDataType::UnsignedInt4(_) => { - Ok(DataType::UInt32) + SQLDataType::TinyIntUnsigned(_) => Ok(DataType::UInt8), + SQLDataType::SmallIntUnsigned(_) | SQLDataType::Int2Unsigned(_) => { + Ok(DataType::UInt16) } + SQLDataType::IntUnsigned(_) + | SQLDataType::IntegerUnsigned(_) + | SQLDataType::Int4Unsigned(_) => Ok(DataType::UInt32), SQLDataType::Varchar(length) => { match (length, self.options.support_varchar_with_length) { - (Some(_), false) => plan_err!("does not support Varchar with length, please set `support_varchar_with_length` to be true"), - _ => Ok(DataType::Utf8), + (Some(_), false) => plan_err!( + "does not support Varchar with length, \ + please set `support_varchar_with_length` to be true" + ), + _ => { + if self.options.map_varchar_to_utf8view { + Ok(DataType::Utf8View) + } else { + Ok(DataType::Utf8) + } + } } } - SQLDataType::UnsignedBigInt(_) | SQLDataType::UnsignedInt8(_) => Ok(DataType::UInt64), + SQLDataType::BigIntUnsigned(_) | SQLDataType::Int8Unsigned(_) => { + Ok(DataType::UInt64) + } SQLDataType::Float(_) => Ok(DataType::Float32), SQLDataType::Real | SQLDataType::Float4 => Ok(DataType::Float32), - SQLDataType::Double(ExactNumberInfo::None) | SQLDataType::DoublePrecision | SQLDataType::Float8 => Ok(DataType::Float64), - SQLDataType::Double(ExactNumberInfo::Precision(_)|ExactNumberInfo::PrecisionAndScale(_, _)) => { - not_impl_err!("Unsupported SQL type (precision/scale not supported) {sql_type}") + SQLDataType::Double(ExactNumberInfo::None) + | SQLDataType::DoublePrecision + | SQLDataType::Float8 => Ok(DataType::Float64), + SQLDataType::Double( + ExactNumberInfo::Precision(_) | ExactNumberInfo::PrecisionAndScale(_, _), + ) => { + not_impl_err!( + "Unsupported SQL type (precision/scale not supported) {sql_type}" + ) + } + SQLDataType::Char(_) | SQLDataType::Text | SQLDataType::String(_) => { + Ok(DataType::Utf8) } - SQLDataType::Char(_) - | SQLDataType::Text - | SQLDataType::String(_) => Ok(DataType::Utf8), SQLDataType::Timestamp(precision, tz_info) - if precision.is_none() || [0, 3, 6, 9].contains(&precision.unwrap()) => { + if precision.is_none() || [0, 3, 6, 9].contains(&precision.unwrap()) => + { let tz = if matches!(tz_info, TimezoneInfo::Tz) || matches!(tz_info, TimezoneInfo::WithTimeZone) { @@ -601,9 +634,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { 
Ok(DataType::Time64(TimeUnit::Nanosecond)) } else { // We don't support TIMETZ and TIME WITH TIME ZONE for now - not_impl_err!( - "Unsupported SQL type {sql_type:?}" - ) + not_impl_err!("Unsupported SQL type {sql_type:?}") } } SQLDataType::Numeric(exact_number_info) @@ -625,9 +656,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .enumerate() .map(|(idx, field)| { let data_type = self.convert_data_type(&field.field_type)?; - let field_name = match &field.field_name{ + let field_name = match &field.field_name { Some(ident) => ident.clone(), - None => Ident::new(format!("c{idx}")) + None => Ident::new(format!("c{idx}")), }; Ok(Arc::new(Field::new( self.ident_normalizer.normalize(field_name), @@ -638,9 +669,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .collect::>>()?; Ok(DataType::Struct(Fields::from(fields))) } - // Explicitly list all other types so that if sqlparser - // adds/changes the `SQLDataType` the compiler will tell us on upgrade - // and avoid bugs like https://github.com/apache/datafusion/issues/3059 SQLDataType::Nvarchar(_) | SQLDataType::JSON | SQLDataType::Uuid @@ -654,15 +682,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::Enum(_, _) | SQLDataType::Set(_) | SQLDataType::MediumInt(_) - | SQLDataType::UnsignedMediumInt(_) + | SQLDataType::MediumIntUnsigned(_) | SQLDataType::Character(_) | SQLDataType::CharacterVarying(_) | SQLDataType::CharVarying(_) | SQLDataType::CharacterLargeObject(_) | SQLDataType::CharLargeObject(_) - // Unsupported precision | SQLDataType::Timestamp(_, _) - // Precision is not supported | SQLDataType::Time(Some(_), _) | SQLDataType::Dec(_) | SQLDataType::BigNumeric(_) @@ -673,7 +699,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::Float64 | SQLDataType::JSONB | SQLDataType::Unspecified - // Clickhouse datatypes | SQLDataType::Int16 | SQLDataType::Int32 | SQLDataType::Int128 @@ -695,7 +720,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::Nullable(_) | SQLDataType::LowCardinality(_) | SQLDataType::Trigger - // MySQL datatypes | SQLDataType::TinyBlob | SQLDataType::MediumBlob | SQLDataType::LongBlob @@ -704,11 +728,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::LongText | SQLDataType::Bit(_) | SQLDataType::BitVarying(_) - // BigQuery UDFs + | SQLDataType::Signed + | SQLDataType::SignedInteger + | SQLDataType::Unsigned + | SQLDataType::UnsignedInteger | SQLDataType::AnyType - => not_impl_err!( - "Unsupported SQL type {sql_type:?}" - ), + | SQLDataType::Table(_) + | SQLDataType::VarBit(_) + | SQLDataType::GeometricType(_) => { + not_impl_err!("Unsupported SQL type {sql_type:?}") + } } } @@ -738,7 +767,18 @@ pub fn object_name_to_table_reference( enable_normalization: bool, ) -> Result { // Use destructure to make it clear no fields on ObjectName are ignored - let ObjectName(idents) = object_name; + let ObjectName(object_name_parts) = object_name; + let idents = object_name_parts + .into_iter() + .map(|object_name_part| { + object_name_part.as_ident().cloned().ok_or_else(|| { + plan_datafusion_err!( + "Expected identifier, but found: {:?}", + object_name_part + ) + }) + }) + .collect::>>()?; idents_to_table_reference(idents, enable_normalization) } @@ -820,7 +860,7 @@ pub(crate) fn idents_to_table_reference( pub fn object_name_to_qualifier( sql_table_name: &ObjectName, enable_normalization: bool, -) -> String { +) -> Result { let columns = vec!["table_name", "table_schema", "table_catalog"].into_iter(); let normalizer = IdentNormalizer::new(enable_normalization); 
sql_table_name @@ -828,13 +868,23 @@ pub fn object_name_to_qualifier( .iter() .rev() .zip(columns) - .map(|(ident, column_name)| { - format!( - r#"{} = '{}'"#, - column_name, - normalizer.normalize(ident.clone()) - ) + .map(|(object_name_part, column_name)| { + object_name_part + .as_ident() + .map(|ident| { + format!( + r#"{} = '{}'"#, + column_name, + normalizer.normalize(ident.clone()) + ) + }) + .ok_or_else(|| { + plan_datafusion_err!( + "Expected identifier, but found: {:?}", + object_name_part + ) + }) }) - .collect::>() - .join(" AND ") + .collect::>>() + .map(|parts| parts.join(" AND ")) } diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs index 9d5a54d90b2c..ea641320c01b 100644 --- a/datafusion/sql/src/query.rs +++ b/datafusion/sql/src/query.rs @@ -22,12 +22,13 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::stack::StackGuard; use datafusion_common::{not_impl_err, Constraints, DFSchema, Result}; use datafusion_expr::expr::Sort; +use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{ CreateMemoryTable, DdlStatement, Distinct, LogicalPlan, LogicalPlanBuilder, }; use sqlparser::ast::{ - Expr as SQLExpr, Offset as SQLOffset, OrderBy, OrderByExpr, Query, SelectInto, - SetExpr, + Expr as SQLExpr, Offset as SQLOffset, OrderBy, OrderByExpr, OrderByKind, Query, + SelectInto, SetExpr, }; impl SqlToRel<'_, S> { @@ -50,10 +51,8 @@ impl SqlToRel<'_, S> { match set_expr { SetExpr::Select(mut select) => { let select_into = select.into.take(); - // Order-by expressions may refer to columns in the `FROM` clause, - // so we need to process `SELECT` and `ORDER BY` together. - let oby_exprs = to_order_by_exprs(query.order_by)?; - let plan = self.select_to_plan(*select, oby_exprs, planner_context)?; + let plan = + self.select_to_plan(*select, query.order_by, planner_context)?; let plan = self.limit(plan, query.offset, query.limit, planner_context)?; // Process the `SELECT INTO` after `LIMIT`. @@ -153,12 +152,23 @@ impl SqlToRel<'_, S> { /// Returns the order by expressions from the query. fn to_order_by_exprs(order_by: Option) -> Result> { - let Some(OrderBy { exprs, interpolate }) = order_by else { + to_order_by_exprs_with_select(order_by, None) +} + +/// Returns the order by expressions from the query with the select expressions. +pub(crate) fn to_order_by_exprs_with_select( + order_by: Option, + _select_exprs: Option<&Vec>, // TODO: ORDER BY ALL +) -> Result> { + let Some(OrderBy { kind, interpolate }) = order_by else { // If no order by, return an empty array. return Ok(vec![]); }; if let Some(_interpolate) = interpolate { return not_impl_err!("ORDER BY INTERPOLATE is not supported"); } - Ok(exprs) + match kind { + OrderByKind::All(_) => not_impl_err!("ORDER BY ALL is not supported"), + OrderByKind::Expressions(order_by_exprs) => Ok(order_by_exprs), + } } diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs index 88665401dc31..8a3c20e3971b 100644 --- a/datafusion/sql/src/relation/join.rs +++ b/datafusion/sql/src/relation/join.rs @@ -16,7 +16,7 @@ // under the License. 
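// Illustrative sketch (not part of this patch): the control flow of the
// `to_order_by_exprs_with_select` change above, using simplified stand-in
// types rather than the real `sqlparser::ast::{OrderBy, OrderByKind,
// OrderByExpr}` so it is self-contained. It shows the two cases that matter:
// `ORDER BY ALL` is rejected for now, while an explicit expression list is
// handed through to the SELECT planner unchanged.
#[derive(Debug)]
enum OrderByKindSketch {
    All,                      // ORDER BY ALL
    Expressions(Vec<String>), // stand-in for Vec<OrderByExpr>
}

fn to_order_by_exprs_sketch(
    order_by: Option<OrderByKindSketch>,
) -> Result<Vec<String>, String> {
    match order_by {
        // No ORDER BY clause at all: nothing to sort by.
        None => Ok(vec![]),
        Some(OrderByKindSketch::All) => {
            Err("ORDER BY ALL is not supported".to_string())
        }
        Some(OrderByKindSketch::Expressions(exprs)) => Ok(exprs),
    }
}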
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, Column, Result}; +use datafusion_common::{not_impl_err, plan_datafusion_err, Column, Result}; use datafusion_expr::{JoinType, LogicalPlan, LogicalPlanBuilder}; use sqlparser::ast::{ Join, JoinConstraint, JoinOperator, ObjectName, TableFactor, TableWithJoins, @@ -55,13 +55,13 @@ impl SqlToRel<'_, S> { self.create_relation(join.relation, planner_context)? }; match join.join_operator { - JoinOperator::LeftOuter(constraint) => { + JoinOperator::LeftOuter(constraint) | JoinOperator::Left(constraint) => { self.parse_join(left, right, constraint, JoinType::Left, planner_context) } - JoinOperator::RightOuter(constraint) => { + JoinOperator::RightOuter(constraint) | JoinOperator::Right(constraint) => { self.parse_join(left, right, constraint, JoinType::Right, planner_context) } - JoinOperator::Inner(constraint) => { + JoinOperator::Inner(constraint) | JoinOperator::Join(constraint) => { self.parse_join(left, right, constraint, JoinType::Inner, planner_context) } JoinOperator::LeftSemi(constraint) => self.parse_join( @@ -136,7 +136,13 @@ impl SqlToRel<'_, S> { ) } else { let id = object_names.swap_remove(0); - Ok(self.ident_normalizer.normalize(id)) + id.as_ident() + .ok_or_else(|| { + plan_datafusion_err!( + "Expected identifier in USING clause" + ) + }) + .map(|ident| self.ident_normalizer.normalize(ident.clone())) } }) .collect::>>()?; @@ -186,6 +192,7 @@ pub(crate) fn is_lateral_join(join: &Join) -> Result { let is_lateral_syntax = is_lateral(&join.relation); let is_apply_syntax = match join.join_operator { JoinOperator::FullOuter(..) + | JoinOperator::Right(..) | JoinOperator::RightOuter(..) | JoinOperator::RightAnti(..) | JoinOperator::RightSemi(..) diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 800dd151a124..dee855f8c000 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -21,7 +21,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{ - not_impl_err, plan_err, DFSchema, Diagnostic, Result, Span, TableReference, + not_impl_err, plan_err, DFSchema, Diagnostic, Result, Span, Spans, TableReference, }; use datafusion_expr::builder::subquery_alias; use datafusion_expr::{expr::Unnest, Expr, LogicalPlan, LogicalPlanBuilder}; @@ -43,7 +43,8 @@ impl SqlToRel<'_, S> { name, alias, args, .. 
} => { if let Some(func_args) = args { - let tbl_func_name = name.0.first().unwrap().value.to_string(); + let tbl_func_name = + name.0.first().unwrap().as_ident().unwrap().to_string(); let args = func_args .args .into_iter() @@ -211,6 +212,7 @@ impl SqlToRel<'_, S> { LogicalPlan::Subquery(Subquery { subquery: input, outer_ref_columns, + spans: Spans::new(), }), alias, ) @@ -218,6 +220,7 @@ impl SqlToRel<'_, S> { plan => Ok(LogicalPlan::Subquery(Subquery { subquery: Arc::new(plan), outer_ref_columns, + spans: Spans::new(), })), } } diff --git a/datafusion/sql/src/resolve.rs b/datafusion/sql/src/resolve.rs index 88416dfe0324..96012a92c09a 100644 --- a/datafusion/sql/src/resolve.rs +++ b/datafusion/sql/src/resolve.rs @@ -81,7 +81,7 @@ impl Visitor for RelationVisitor { cte.visit(self); } self.ctes_in_scope - .push(ObjectName(vec![cte.alias.name.clone()])); + .push(ObjectName::from(vec![cte.alias.name.clone()])); } } ControlFlow::Continue(()) @@ -120,7 +120,7 @@ impl Visitor for RelationVisitor { ); if requires_information_schema { for s in INFORMATION_SCHEMA_TABLES { - self.relations.insert(ObjectName(vec![ + self.relations.insert(ObjectName::from(vec![ Ident::new(INFORMATION_SCHEMA), Ident::new(*s), ])); diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index e21def4c3941..2a2d0b3b3eb8 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -19,6 +19,7 @@ use std::collections::HashSet; use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; +use crate::query::to_order_by_exprs_with_select; use crate::utils::{ check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs, resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, @@ -33,19 +34,19 @@ use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; use datafusion_expr::expr_rewriter::{ normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_sorts, }; +use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::utils::{ expr_as_column_expr, expr_to_columns, find_aggregate_exprs, find_window_exprs, }; use datafusion_expr::{ - qualified_wildcard_with_options, wildcard_with_options, Aggregate, Expr, Filter, - GroupingSet, LogicalPlan, LogicalPlanBuilder, LogicalPlanBuilderOptions, - Partitioning, + Aggregate, Expr, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder, + LogicalPlanBuilderOptions, Partitioning, }; use indexmap::IndexMap; use sqlparser::ast::{ - Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, - WildcardAdditionalOptions, WindowType, + Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderBy, + SelectItemQualifiedWildcardKind, WildcardAdditionalOptions, WindowType, }; use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins}; @@ -54,7 +55,7 @@ impl SqlToRel<'_, S> { pub(super) fn select_to_plan( &self, mut select: Select, - order_by: Vec, + query_order_by: Option, planner_context: &mut PlannerContext, ) -> Result { // Check for unsupported syntax first @@ -92,8 +93,12 @@ impl SqlToRel<'_, S> { planner_context, )?; + let order_by = + to_order_by_exprs_with_select(query_order_by, Some(&select_exprs))?; + // Having and group by clause may reference aliases defined in select projection - let projected_plan = self.project(base_plan.clone(), select_exprs.clone())?; + let projected_plan = self.project(base_plan.clone(), select_exprs)?; + let select_exprs = projected_plan.expressions(); // Place the fields of the base 
plan at the front so that when there are references // with the same name, the fields of the base plan will be searched first. @@ -578,7 +583,7 @@ impl SqlToRel<'_, S> { projection: Vec, empty_from: bool, planner_context: &mut PlannerContext, - ) -> Result> { + ) -> Result> { let mut prepared_select_exprs = vec![]; let mut error_builder = DataFusionErrorBuilder::new(); for expr in projection { @@ -597,7 +602,7 @@ impl SqlToRel<'_, S> { plan: &LogicalPlan, empty_from: bool, planner_context: &mut PlannerContext, - ) -> Result { + ) -> Result { match sql { SelectItem::UnnamedExpr(expr) => { let expr = self.sql_to_expr(expr, plan.schema(), planner_context)?; @@ -606,7 +611,8 @@ impl SqlToRel<'_, S> { &[&[plan.schema()]], &plan.using_columns()?, )?; - Ok(col) + + Ok(SelectExpr::Expression(col)) } SelectItem::ExprWithAlias { expr, alias } => { let select_expr = @@ -622,7 +628,8 @@ impl SqlToRel<'_, S> { Expr::Column(column) if column.name.eq(&name) => col, _ => col.alias(name), }; - Ok(expr) + + Ok(SelectExpr::Expression(expr)) } SelectItem::Wildcard(options) => { Self::check_wildcard_options(&options)?; @@ -635,10 +642,21 @@ impl SqlToRel<'_, S> { planner_context, options, )?; - Ok(wildcard_with_options(planned_options)) + + Ok(SelectExpr::Wildcard(planned_options)) } SelectItem::QualifiedWildcard(object_name, options) => { Self::check_wildcard_options(&options)?; + let object_name = match object_name { + SelectItemQualifiedWildcardKind::ObjectName(object_name) => { + object_name + } + SelectItemQualifiedWildcardKind::Expr(_) => { + return plan_err!( + "Qualified wildcard with expression not supported" + ) + } + }; let qualifier = self.object_name_to_table_reference(object_name)?; let planned_options = self.plan_wildcard_options( plan, @@ -646,7 +664,8 @@ impl SqlToRel<'_, S> { planner_context, options, )?; - Ok(qualified_wildcard_with_options(qualifier, planned_options)) + + Ok(SelectExpr::QualifiedWildcard(qualifier, planned_options)) } } } @@ -698,7 +717,14 @@ impl SqlToRel<'_, S> { planner_context, ) }) - .collect::>>()?; + .collect::>>()? 
+ .into_iter() + .filter_map(|expr| match expr { + SelectExpr::Expression(expr) => Some(expr), + _ => None, + }) + .collect::>(); + let planned_replace = PlannedReplaceSelectItem { items: replace.items.into_iter().map(|i| *i).collect(), planned_expressions: replace_expr, @@ -710,8 +736,17 @@ impl SqlToRel<'_, S> { } /// Wrap a plan in a projection - fn project(&self, input: LogicalPlan, expr: Vec) -> Result { - self.validate_schema_satisfies_exprs(input.schema(), &expr)?; + fn project(&self, input: LogicalPlan, expr: Vec) -> Result { + // convert to Expr for validate_schema_satisfies_exprs + let exprs = expr + .iter() + .filter_map(|e| match e { + SelectExpr::Expression(expr) => Some(expr.to_owned()), + _ => None, + }) + .collect::>(); + self.validate_schema_satisfies_exprs(input.schema(), &exprs)?; + LogicalPlanBuilder::from(input).project(expr)?.build() } diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index a55b3b039087..272d6f874b4d 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -31,7 +31,7 @@ impl SqlToRel<'_, S> { ) -> Result { let set_expr_span = Span::try_from_sqlparser_span(set_expr.span()); match set_expr { - SetExpr::Select(s) => self.select_to_plan(*s, vec![], planner_context), + SetExpr::Select(s) => self.select_to_plan(*s, None, planner_context), SetExpr::Values(v) => self.sql_values_to_plan(v, planner_context), SetExpr::SetOperation { op, diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index fbe6d6501c86..fc6cb0d32fef 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -48,15 +48,16 @@ use datafusion_expr::{ CreateExternalTable as PlanCreateExternalTable, CreateFunction, CreateFunctionBody, CreateIndex as PlanCreateIndex, CreateMemoryTable, CreateView, Deallocate, DescribeTable, DmlStatement, DropCatalogSchema, DropFunction, DropTable, DropView, - EmptyRelation, Execute, Explain, Expr, ExprSchemable, Filter, LogicalPlan, - LogicalPlanBuilder, OperateFunctionArg, PlanType, Prepare, SetVariable, SortExpr, - Statement as PlanStatement, ToStringifiedPlan, TransactionAccessMode, + EmptyRelation, Execute, Explain, ExplainFormat, Expr, ExprSchemable, Filter, + LogicalPlan, LogicalPlanBuilder, OperateFunctionArg, PlanType, Prepare, SetVariable, + SortExpr, Statement as PlanStatement, ToStringifiedPlan, TransactionAccessMode, TransactionConclusion, TransactionEnd, TransactionIsolationLevel, TransactionStart, Volatility, WriteOp, }; use sqlparser::ast::{ self, BeginTransactionKind, NullsDistinctOption, ShowStatementIn, ShowStatementOptions, SqliteOnConflict, TableObject, UpdateTableFromKind, + ValueWithSpan, }; use sqlparser::ast::{ Assignment, AssignmentTarget, ColumnDef, CreateIndex, CreateTable, @@ -75,7 +76,13 @@ fn object_name_to_string(object_name: &ObjectName) -> String { object_name .0 .iter() - .map(ident_to_string) + .map(|object_name_part| { + object_name_part + .as_ident() + // TODO: It might be better to return an error + // than to silently use a default value. 
+ .map_or_else(String::new, ident_to_string) + }) .collect::>() .join(".") } @@ -160,7 +167,8 @@ fn calc_inline_constraints_from_columns(columns: &[ColumnDef]) -> Vec {} + | ast::ColumnOption::Alias(_) + | ast::ColumnOption::Collation(_) => {} } } } @@ -177,8 +185,9 @@ impl SqlToRel<'_, S> { DFStatement::Explain(ExplainStatement { verbose, analyze, + format, statement, - }) => self.explain_to_plan(verbose, analyze, *statement), + }) => self.explain_to_plan(verbose, analyze, format, *statement), } } @@ -214,11 +223,13 @@ impl SqlToRel<'_, S> { verbose, statement, analyze, - format: _, + format, describe_alias: _, .. } => { - self.explain_to_plan(verbose, analyze, DFStatement::Statement(statement)) + let format = format.map(|format| format.to_string()); + let statement = DFStatement::Statement(statement); + self.explain_to_plan(verbose, analyze, format, statement) } Statement::Query(query) => self.query_to_plan(*query, planner_context), Statement::ShowVariable { variable } => self.show_variable_to_plan(&variable), @@ -273,6 +284,12 @@ impl SqlToRel<'_, S> { with_aggregation_policy, with_row_access_policy, with_tags, + iceberg, + external_volume, + base_location, + catalog, + catalog_sync, + storage_serialization_policy, }) if table_properties.is_empty() && with_options.is_empty() => { if temporary { return not_impl_err!("Temporary tables not supported")?; @@ -393,6 +410,24 @@ impl SqlToRel<'_, S> { if with_tags.is_some() { return not_impl_err!("With tags not supported")?; } + if iceberg { + return not_impl_err!("Iceberg not supported")?; + } + if external_volume.is_some() { + return not_impl_err!("External volume not supported")?; + } + if base_location.is_some() { + return not_impl_err!("Base location not supported")?; + } + if catalog.is_some() { + return not_impl_err!("Catalog not supported")?; + } + if catalog_sync.is_some() { + return not_impl_err!("Catalog sync not supported")?; + } + if storage_serialization_policy.is_some() { + return not_impl_err!("Storage serialization policy not supported")?; + } // Merge inline constraints and existing constraints let mut all_constraints = constraints; @@ -434,6 +469,7 @@ impl SqlToRel<'_, S> { .alias(field.name()) }) .collect::>(); + LogicalPlanBuilder::from(plan.clone()) .project(project_exprs)? .build()? @@ -687,6 +723,8 @@ impl SqlToRel<'_, S> { // has_parentheses specifies the syntax, but the plan is the // same no matter the syntax used, so ignore it has_parentheses: _, + immediate, + into, } => { // `USING` is a MySQL-specific syntax and currently not supported. 
if !using.is_empty() { @@ -694,7 +732,14 @@ impl SqlToRel<'_, S> { "Execute statement with USING is not supported" ); } - + if immediate { + return not_impl_err!( + "Execute statement with IMMEDIATE is not supported" + ); + } + if !into.is_empty() { + return not_impl_err!("Execute statement with INTO is not supported"); + } let empty_schema = DFSchema::empty(); let parameters = parameters .into_iter() @@ -702,7 +747,7 @@ impl SqlToRel<'_, S> { .collect::>>()?; Ok(LogicalPlan::Statement(PlanStatement::Execute(Execute { - name: object_name_to_string(&name), + name: object_name_to_string(&name.unwrap()), parameters, }))) } @@ -903,18 +948,23 @@ impl SqlToRel<'_, S> { returning, or, } => { - let from = + let froms = from.map(|update_table_from_kind| match update_table_from_kind { - UpdateTableFromKind::BeforeSet(from) => from, - UpdateTableFromKind::AfterSet(from) => from, + UpdateTableFromKind::BeforeSet(froms) => froms, + UpdateTableFromKind::AfterSet(froms) => froms, }); + // TODO: support multiple tables in UPDATE SET FROM + if froms.as_ref().is_some_and(|f| f.len() > 1) { + plan_err!("Multiple tables in UPDATE SET FROM not yet supported")?; + } + let update_from = froms.and_then(|mut f| f.pop()); if returning.is_some() { plan_err!("Update-returning clause not yet supported")?; } if or.is_some() { plan_err!("ON conflict not supported")?; } - self.update_to_plan(table, assignments, from, selection) + self.update_to_plan(table, assignments, update_from, selection) } Statement::Delete(Delete { @@ -955,12 +1005,28 @@ impl SqlToRel<'_, S> { begin: false, modifier, transaction, + statements, + exception_statements, + has_end_keyword, } => { if let Some(modifier) = modifier { return not_impl_err!( "Transaction modifier not supported: {modifier}" ); } + if !statements.is_empty() { + return not_impl_err!( + "Transaction with multiple statements not supported" + ); + } + if exception_statements.is_some() { + return not_impl_err!( + "Transaction with exception statements not supported" + ); + } + if has_end_keyword { + return not_impl_err!("Transaction with END keyword not supported"); + } self.validate_transaction_kind(transaction)?; let isolation_level: ast::TransactionIsolationLevel = modes .iter() @@ -1085,7 +1151,7 @@ impl SqlToRel<'_, S> { // At the moment functions can't be qualified `schema.name` let name = match &name.0[..] { [] => exec_err!("Function should have name")?, - [n] => n.value.clone(), + [n] => n.as_ident().unwrap().value.clone(), [..] => not_impl_err!("Qualified functions are not supported")?, }; // @@ -1143,7 +1209,7 @@ impl SqlToRel<'_, S> { // At the moment functions can't be qualified `schema.name` let name = match &desc.name.0[..] { [] => exec_err!("Function should have name")?, - [n] => n.value.clone(), + [n] => n.as_ident().unwrap().value.clone(), [..] 
=> not_impl_err!("Qualified functions are not supported")?, }; let statement = DdlStatement::DropFunction(DropFunction { @@ -1344,8 +1410,9 @@ impl SqlToRel<'_, S> { planner_context, ) .unwrap(); - let asc = order_by_expr.asc.unwrap_or(true); - let nulls_first = order_by_expr.nulls_first.unwrap_or(!asc); + let asc = order_by_expr.options.asc.unwrap_or(true); + let nulls_first = + order_by_expr.options.nulls_first.unwrap_or(!asc); SortExpr::new(ordered_expr, asc, nulls_first) }) @@ -1564,17 +1631,26 @@ impl SqlToRel<'_, S> { &self, verbose: bool, analyze: bool, + format: Option, statement: DFStatement, ) -> Result { let plan = self.statement_to_plan(statement)?; if matches!(plan, LogicalPlan::Explain(_)) { return plan_err!("Nested EXPLAINs are not supported"); } + let plan = Arc::new(plan); let schema = LogicalPlan::explain_schema(); let schema = schema.to_dfschema_ref()?; + if verbose && format.is_some() { + return plan_err!("EXPLAIN VERBOSE with FORMAT is not supported"); + } + if analyze { + if format.is_some() { + return plan_err!("EXPLAIN ANALYZE with FORMAT is not supported"); + } Ok(LogicalPlan::Analyze(Analyze { verbose, input: plan, @@ -1583,8 +1659,16 @@ impl SqlToRel<'_, S> { } else { let stringified_plans = vec![plan.to_stringified(PlanType::InitialLogicalPlan)]; + + // default to configuration value + let options = self.context_provider.options(); + let format = format.as_ref().unwrap_or(&options.explain.format); + + let format: ExplainFormat = format.parse()?; + Ok(LogicalPlan::Explain(Explain { verbose, + explain_format: format, plan, stringified_plans, schema, @@ -1612,7 +1696,7 @@ impl SqlToRel<'_, S> { variable_vec = variable_vec.split_at(variable_vec.len() - 1).0.to_vec(); } - let variable = object_name_to_string(&ObjectName(variable_vec)); + let variable = object_name_to_string(&ObjectName::from(variable_vec)); let base_query = format!("SELECT {columns} FROM information_schema.df_settings"); let query = if variable == "all" { // Add an ORDER BY so the output comes out in a consistent order @@ -1679,7 +1763,7 @@ impl SqlToRel<'_, S> { // Parse value string from Expr let value_string = match &value[0] { SQLExpr::Identifier(i) => ident_to_string(i), - SQLExpr::Value(v) => match crate::utils::value_to_string(v) { + SQLExpr::Value(v) => match crate::utils::value_to_string(&v.value) { None => { return plan_err!("Unsupported Value {}", value[0]); } @@ -1779,7 +1863,9 @@ impl SqlToRel<'_, S> { .0 .iter() .last() - .ok_or_else(|| plan_datafusion_err!("Empty column id"))?; + .ok_or_else(|| plan_datafusion_err!("Empty column id"))? + .as_ident() + .unwrap(); // Validate that the assignment target column exists table_schema.field_with_unqualified_name(&col_name.value)?; Ok((col_name.value.clone(), assign.value.clone())) @@ -1917,7 +2003,11 @@ impl SqlToRel<'_, S> { if let SetExpr::Values(ast::Values { rows, .. 
}) = (*source.body).clone() { for row in rows.iter() { for (idx, val) in row.iter().enumerate() { - if let SQLExpr::Value(Value::Placeholder(name)) = val { + if let SQLExpr::Value(ValueWithSpan { + value: Value::Placeholder(name), + span: _, + }) = val + { let name = name.replace('$', "").parse::().map_err(|_| { plan_datafusion_err!("Can't parse placeholder: {name}") @@ -1998,7 +2088,7 @@ impl SqlToRel<'_, S> { let where_clause = object_name_to_qualifier( &sql_table_name, self.options.enable_ident_normalization, - ); + )?; if !self.has_table("information_schema", "columns") { return plan_err!( @@ -2123,7 +2213,7 @@ ON p.function_name = r.routine_name let where_clause = object_name_to_qualifier( &sql_table_name, self.options.enable_ident_normalization, - ); + )?; // Do a table lookup to verify the table exists let table_ref = self.object_name_to_table_reference(sql_table_name)?; diff --git a/datafusion/sql/src/unparser/ast.rs b/datafusion/sql/src/unparser/ast.rs index 6d77c01ea888..6fcc203637cc 100644 --- a/datafusion/sql/src/unparser/ast.rs +++ b/datafusion/sql/src/unparser/ast.rs @@ -16,15 +16,16 @@ // under the License. use core::fmt; +use std::ops::ControlFlow; -use sqlparser::ast; use sqlparser::ast::helpers::attached_token::AttachedToken; +use sqlparser::ast::{self, visit_expressions_mut, OrderByKind, SelectFlavor}; #[derive(Clone)] pub struct QueryBuilder { with: Option, body: Option>, - order_by: Vec, + order_by_kind: Option, limit: Option, limit_by: Vec, offset: Option, @@ -46,8 +47,8 @@ impl QueryBuilder { pub fn take_body(&mut self) -> Option> { self.body.take() } - pub fn order_by(&mut self, value: Vec) -> &mut Self { - self.order_by = value; + pub fn order_by(&mut self, value: OrderByKind) -> &mut Self { + self.order_by_kind = Some(value); self } pub fn limit(&mut self, value: Option) -> &mut Self { @@ -75,14 +76,13 @@ impl QueryBuilder { self } pub fn build(&self) -> Result { - let order_by = if self.order_by.is_empty() { - None - } else { - Some(ast::OrderBy { - exprs: self.order_by.clone(), + let order_by = self + .order_by_kind + .as_ref() + .map(|order_by_kind| ast::OrderBy { + kind: order_by_kind.clone(), interpolate: None, - }) - }; + }); Ok(ast::Query { with: self.with.clone(), @@ -105,7 +105,7 @@ impl QueryBuilder { Self { with: Default::default(), body: Default::default(), - order_by: Default::default(), + order_by_kind: Default::default(), limit: Default::default(), limit_by: Default::default(), offset: Default::default(), @@ -138,6 +138,7 @@ pub struct SelectBuilder { named_window: Vec, qualify: Option, value_table_mode: Option, + flavor: Option, } #[allow(dead_code)] @@ -176,6 +177,37 @@ impl SelectBuilder { self.lateral_views = value; self } + + /// Replaces the selection with a new value. + /// + /// This function is used to replace a specific expression within the selection. + /// Unlike the `selection` method which combines existing and new selections with AND, + /// this method searches for and replaces occurrences of a specific expression. + /// + /// This method is primarily used to modify LEFT MARK JOIN expressions. + /// When processing a LEFT MARK JOIN, we need to replace the placeholder expression + /// with the actual join condition in the selection clause. 
+ /// + /// # Arguments + /// + /// * `existing_expr` - The expression to replace + /// * `value` - The new expression to set as the selection + pub fn replace_mark( + &mut self, + existing_expr: &ast::Expr, + value: &ast::Expr, + ) -> &mut Self { + if let Some(selection) = &mut self.selection { + visit_expressions_mut(selection, |expr| { + if expr == existing_expr { + *expr = value.clone(); + } + ControlFlow::<()>::Continue(()) + }); + } + self + } + pub fn selection(&mut self, value: Option) -> &mut Self { // With filter pushdown optimization, the LogicalPlan can have filters defined as part of `TableScan` and `Filter` nodes. // To avoid overwriting one of the filters, we combine the existing filter with the additional filter. @@ -264,6 +296,10 @@ impl SelectBuilder { window_before_qualify: false, prewhere: None, select_token: AttachedToken::empty(), + flavor: match self.flavor { + Some(ref value) => value.clone(), + None => return Err(Into::into(UninitializedFieldError::from("flavor"))), + }, }) } fn create_empty() -> Self { @@ -283,6 +319,7 @@ impl SelectBuilder { named_window: Default::default(), qualify: Default::default(), value_table_mode: Default::default(), + flavor: Some(SelectFlavor::Standard), } } } @@ -422,6 +459,7 @@ pub struct TableRelationBuilder { with_hints: Vec, version: Option, partitions: Vec, + index_hints: Vec, } #[allow(dead_code)] @@ -450,6 +488,10 @@ impl TableRelationBuilder { self.partitions = value; self } + pub fn index_hints(&mut self, value: Vec) -> &mut Self { + self.index_hints = value; + self + } pub fn build(&self) -> Result { Ok(ast::TableFactor::Table { name: match self.name { @@ -467,6 +509,7 @@ impl TableRelationBuilder { with_ordinality: false, json_path: None, sample: None, + index_hints: self.index_hints.clone(), }) } fn create_empty() -> Self { @@ -477,6 +520,7 @@ impl TableRelationBuilder { with_hints: Default::default(), version: Default::default(), partitions: Default::default(), + index_hints: Default::default(), } } } diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs index 399f0df0a699..05914b98f55f 100644 --- a/datafusion/sql/src/unparser/dialect.rs +++ b/datafusion/sql/src/unparser/dialect.rs @@ -313,7 +313,7 @@ impl PostgreSqlDialect { } Ok(ast::Expr::Function(Function { - name: ObjectName(vec![Ident { + name: ObjectName::from(vec![Ident { value: func_name.to_string(), quote_style: None, span: Span::empty(), @@ -421,11 +421,11 @@ impl Dialect for MySqlDialect { } fn int64_cast_dtype(&self) -> ast::DataType { - ast::DataType::Custom(ObjectName(vec![Ident::new("SIGNED")]), vec![]) + ast::DataType::Custom(ObjectName::from(vec![Ident::new("SIGNED")]), vec![]) } fn int32_cast_dtype(&self) -> ast::DataType { - ast::DataType::Custom(ObjectName(vec![Ident::new("SIGNED")]), vec![]) + ast::DataType::Custom(ObjectName::from(vec![Ident::new("SIGNED")]), vec![]) } fn timestamp_cast_dtype( @@ -898,8 +898,8 @@ impl CustomDialectBuilder { self } - pub fn with_unnest_as_table_factor(mut self, _unnest_as_table_factor: bool) -> Self { - self.unnest_as_table_factor = _unnest_as_table_factor; + pub fn with_unnest_as_table_factor(mut self, unnest_as_table_factor: bool) -> Self { + self.unnest_as_table_factor = unnest_as_table_factor; self } } diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index bf6361312727..48360377586e 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -18,8 +18,8 @@ use 
datafusion_expr::expr::{AggregateFunctionParams, Unnest, WindowFunctionParams}; use sqlparser::ast::Value::SingleQuotedString; use sqlparser::ast::{ - self, Array, BinaryOperator, Expr as AstExpr, Function, Ident, Interval, ObjectName, - Subscript, TimezoneInfo, UnaryOperator, + self, Array, BinaryOperator, CaseWhen, Expr as AstExpr, Function, Ident, Interval, + ObjectName, OrderByOptions, Subscript, TimezoneInfo, UnaryOperator, ValueWithSpan, }; use std::sync::Arc; use std::vec; @@ -94,6 +94,7 @@ impl Unparser<'_> { Ok(root_expr) } + #[cfg_attr(feature = "recursive_protection", recursive::recursive)] fn expr_to_sql_inner(&self, expr: &Expr) -> Result { match expr { Expr::InList(InList { @@ -154,12 +155,14 @@ impl Unparser<'_> { }) => { let conditions = when_then_expr .iter() - .map(|(w, _)| self.expr_to_sql_inner(w)) - .collect::>>()?; - let results = when_then_expr - .iter() - .map(|(_, t)| self.expr_to_sql_inner(t)) - .collect::>>()?; + .map(|(cond, result)| { + Ok(CaseWhen { + condition: self.expr_to_sql_inner(cond)?, + result: self.expr_to_sql_inner(result)?, + }) + }) + .collect::>>()?; + let operand = match expr.as_ref() { Some(e) => match self.expr_to_sql_inner(e) { Ok(sql_expr) => Some(Box::new(sql_expr)), @@ -178,7 +181,6 @@ impl Unparser<'_> { Ok(ast::Expr::Case { operand, conditions, - results, else_result, }) } @@ -247,7 +249,7 @@ impl Unparser<'_> { })); Ok(ast::Expr::Function(Function { - name: ObjectName(vec![Ident { + name: ObjectName::from(vec![Ident { value: func_name.to_string(), quote_style: None, span: Span::empty(), @@ -300,7 +302,7 @@ impl Unparser<'_> { None => None, }; Ok(ast::Expr::Function(Function { - name: ObjectName(vec![Ident { + name: ObjectName::from(vec![Ident { value: func_name.to_string(), quote_style: None, span: Span::empty(), @@ -436,7 +438,7 @@ impl Unparser<'_> { let idents: Vec = qualifier.to_vec().into_iter().map(Ident::new).collect(); Ok(ast::Expr::QualifiedWildcard( - ObjectName(idents), + ObjectName::from(idents), attached_token, )) } else { @@ -478,7 +480,7 @@ impl Unparser<'_> { } }, Expr::Placeholder(p) => { - Ok(ast::Expr::Value(ast::Value::Placeholder(p.id.to_string()))) + Ok(ast::Expr::value(ast::Value::Placeholder(p.id.to_string()))) } Expr::OuterReferenceColumn(_, col) => self.col_to_sql(col), Expr::Unnest(unnest) => self.unnest_to_sql(unnest), @@ -508,7 +510,7 @@ impl Unparser<'_> { ) -> Result { let args = self.function_args_to_sql(args)?; Ok(ast::Expr::Function(Function { - name: ObjectName(vec![Ident { + name: ObjectName::from(vec![Ident { value: func_name.to_string(), quote_style: None, span: Span::empty(), @@ -660,8 +662,10 @@ impl Unparser<'_> { Ok(ast::OrderByExpr { expr: sql_parser_expr, - asc: Some(*asc), - nulls_first, + options: OrderByOptions { + asc: Some(*asc), + nulls_first, + }, with_fill: None, }) } @@ -674,7 +678,7 @@ impl Unparser<'_> { } } - fn col_to_sql(&self, col: &Column) -> Result { + pub fn col_to_sql(&self, col: &Column) -> Result { if let Some(table_ref) = &col.relation { let mut id = if self.dialect.full_qualified_col() { table_ref.to_vec() @@ -701,7 +705,11 @@ impl Unparser<'_> { datafusion_expr::window_frame::WindowFrameBound::Preceding(val) => { Ok(ast::WindowFrameBound::Preceding({ let val = self.scalar_to_sql(val)?; - if let ast::Expr::Value(ast::Value::Null) = &val { + if let ast::Expr::Value(ValueWithSpan { + value: ast::Value::Null, + span: _, + }) = &val + { None } else { Some(Box::new(val)) @@ -711,7 +719,11 @@ impl Unparser<'_> { datafusion_expr::window_frame::WindowFrameBound::Following(val) => 
{ Ok(ast::WindowFrameBound::Following({ let val = self.scalar_to_sql(val)?; - if let ast::Expr::Value(ast::Value::Null) = &val { + if let ast::Expr::Value(ValueWithSpan { + value: ast::Value::Null, + span: _, + }) = &val + { None } else { Some(Box::new(val)) @@ -999,7 +1011,7 @@ impl Unparser<'_> { Ok(ast::Expr::Cast { kind: ast::CastKind::Cast, - expr: Box::new(ast::Expr::Value(SingleQuotedString(ts))), + expr: Box::new(ast::Expr::value(SingleQuotedString(ts))), data_type: self.dialect.timestamp_cast_dtype(&time_unit, &None), format: None, }) @@ -1021,7 +1033,7 @@ impl Unparser<'_> { .to_string(); Ok(ast::Expr::Cast { kind: ast::CastKind::Cast, - expr: Box::new(ast::Expr::Value(SingleQuotedString(time))), + expr: Box::new(ast::Expr::value(SingleQuotedString(time))), data_type: ast::DataType::Time(None, TimezoneInfo::None), format: None, }) @@ -1056,102 +1068,102 @@ impl Unparser<'_> { /// For example ScalarValue::Date32(d) corresponds to the ast::Expr CAST('datestr' as DATE) fn scalar_to_sql(&self, v: &ScalarValue) -> Result { match v { - ScalarValue::Null => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Null => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Boolean(Some(b)) => { - Ok(ast::Expr::Value(ast::Value::Boolean(b.to_owned()))) + Ok(ast::Expr::value(ast::Value::Boolean(b.to_owned()))) } - ScalarValue::Boolean(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Boolean(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Float16(Some(f)) => { - Ok(ast::Expr::Value(ast::Value::Number(f.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(f.to_string(), false))) } - ScalarValue::Float16(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Float16(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Float32(Some(f)) => { let f_val = match f.fract() { 0.0 => format!("{:.1}", f), _ => format!("{}", f), }; - Ok(ast::Expr::Value(ast::Value::Number(f_val, false))) + Ok(ast::Expr::value(ast::Value::Number(f_val, false))) } - ScalarValue::Float32(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Float32(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Float64(Some(f)) => { let f_val = match f.fract() { 0.0 => format!("{:.1}", f), _ => format!("{}", f), }; - Ok(ast::Expr::Value(ast::Value::Number(f_val, false))) + Ok(ast::Expr::value(ast::Value::Number(f_val, false))) } - ScalarValue::Float64(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Float64(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Decimal128(Some(value), precision, scale) => { - Ok(ast::Expr::Value(ast::Value::Number( + Ok(ast::Expr::value(ast::Value::Number( Decimal128Type::format_decimal(*value, *precision, *scale), false, ))) } - ScalarValue::Decimal128(None, ..) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Decimal128(None, ..) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Decimal256(Some(value), precision, scale) => { - Ok(ast::Expr::Value(ast::Value::Number( + Ok(ast::Expr::value(ast::Value::Number( Decimal256Type::format_decimal(*value, *precision, *scale), false, ))) } - ScalarValue::Decimal256(None, ..) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Decimal256(None, ..) 
=> Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Int8(Some(i)) => { - Ok(ast::Expr::Value(ast::Value::Number(i.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(i.to_string(), false))) } - ScalarValue::Int8(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Int8(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Int16(Some(i)) => { - Ok(ast::Expr::Value(ast::Value::Number(i.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(i.to_string(), false))) } - ScalarValue::Int16(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Int16(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Int32(Some(i)) => { - Ok(ast::Expr::Value(ast::Value::Number(i.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(i.to_string(), false))) } - ScalarValue::Int32(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Int32(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Int64(Some(i)) => { - Ok(ast::Expr::Value(ast::Value::Number(i.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(i.to_string(), false))) } - ScalarValue::Int64(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Int64(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::UInt8(Some(ui)) => { - Ok(ast::Expr::Value(ast::Value::Number(ui.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(ui.to_string(), false))) } - ScalarValue::UInt8(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::UInt8(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::UInt16(Some(ui)) => { - Ok(ast::Expr::Value(ast::Value::Number(ui.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(ui.to_string(), false))) } - ScalarValue::UInt16(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::UInt16(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::UInt32(Some(ui)) => { - Ok(ast::Expr::Value(ast::Value::Number(ui.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(ui.to_string(), false))) } - ScalarValue::UInt32(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::UInt32(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::UInt64(Some(ui)) => { - Ok(ast::Expr::Value(ast::Value::Number(ui.to_string(), false))) + Ok(ast::Expr::value(ast::Value::Number(ui.to_string(), false))) } - ScalarValue::UInt64(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::UInt64(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Utf8(Some(str)) => { - Ok(ast::Expr::Value(SingleQuotedString(str.to_string()))) + Ok(ast::Expr::value(SingleQuotedString(str.to_string()))) } - ScalarValue::Utf8(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Utf8(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Utf8View(Some(str)) => { - Ok(ast::Expr::Value(SingleQuotedString(str.to_string()))) + Ok(ast::Expr::value(SingleQuotedString(str.to_string()))) } - ScalarValue::Utf8View(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Utf8View(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::LargeUtf8(Some(str)) => { - Ok(ast::Expr::Value(SingleQuotedString(str.to_string()))) + Ok(ast::Expr::value(SingleQuotedString(str.to_string()))) } - ScalarValue::LargeUtf8(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::LargeUtf8(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Binary(Some(_)) => not_impl_err!("Unsupported scalar: {v:?}"), - ScalarValue::Binary(None) => 
Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Binary(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::BinaryView(Some(_)) => { not_impl_err!("Unsupported scalar: {v:?}") } - ScalarValue::BinaryView(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::BinaryView(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::FixedSizeBinary(..) => { not_impl_err!("Unsupported scalar: {v:?}") } ScalarValue::LargeBinary(Some(_)) => { not_impl_err!("Unsupported scalar: {v:?}") } - ScalarValue::LargeBinary(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::LargeBinary(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::FixedSizeList(a) => self.scalar_value_list_to_sql(a.values()), ScalarValue::List(a) => self.scalar_value_list_to_sql(a.values()), ScalarValue::LargeList(a) => self.scalar_value_list_to_sql(a.values()), @@ -1170,14 +1182,14 @@ impl Unparser<'_> { Ok(ast::Expr::Cast { kind: ast::CastKind::Cast, - expr: Box::new(ast::Expr::Value(SingleQuotedString( + expr: Box::new(ast::Expr::value(SingleQuotedString( date.to_string(), ))), data_type: ast::DataType::Date, format: None, }) } - ScalarValue::Date32(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Date32(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Date64(Some(_)) => { let datetime = v .to_array()? @@ -1193,57 +1205,57 @@ impl Unparser<'_> { Ok(ast::Expr::Cast { kind: ast::CastKind::Cast, - expr: Box::new(ast::Expr::Value(SingleQuotedString( + expr: Box::new(ast::Expr::value(SingleQuotedString( datetime.to_string(), ))), data_type: self.ast_type_for_date64_in_cast(), format: None, }) } - ScalarValue::Date64(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Date64(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Time32Second(Some(_t)) => { self.handle_time::(v) } - ScalarValue::Time32Second(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Time32Second(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Time32Millisecond(Some(_t)) => { self.handle_time::(v) } ScalarValue::Time32Millisecond(None) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::Time64Microsecond(Some(_t)) => { self.handle_time::(v) } ScalarValue::Time64Microsecond(None) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::Time64Nanosecond(Some(_t)) => { self.handle_time::(v) } - ScalarValue::Time64Nanosecond(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::Time64Nanosecond(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::TimestampSecond(Some(_ts), tz) => { self.handle_timestamp::(v, tz) } ScalarValue::TimestampSecond(None, _) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::TimestampMillisecond(Some(_ts), tz) => { self.handle_timestamp::(v, tz) } ScalarValue::TimestampMillisecond(None, _) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::TimestampMicrosecond(Some(_ts), tz) => { self.handle_timestamp::(v, tz) } ScalarValue::TimestampMicrosecond(None, _) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::TimestampNanosecond(Some(_ts), tz) => { self.handle_timestamp::(v, tz) } ScalarValue::TimestampNanosecond(None, _) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::IntervalYearMonth(Some(_)) | 
ScalarValue::IntervalDayTime(Some(_)) @@ -1251,33 +1263,33 @@ impl Unparser<'_> { self.interval_scalar_to_sql(v) } ScalarValue::IntervalYearMonth(None) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } - ScalarValue::IntervalDayTime(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::IntervalDayTime(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::IntervalMonthDayNano(None) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::DurationSecond(Some(_d)) => { not_impl_err!("Unsupported scalar: {v:?}") } - ScalarValue::DurationSecond(None) => Ok(ast::Expr::Value(ast::Value::Null)), + ScalarValue::DurationSecond(None) => Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::DurationMillisecond(Some(_d)) => { not_impl_err!("Unsupported scalar: {v:?}") } ScalarValue::DurationMillisecond(None) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::DurationMicrosecond(Some(_d)) => { not_impl_err!("Unsupported scalar: {v:?}") } ScalarValue::DurationMicrosecond(None) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::DurationNanosecond(Some(_d)) => { not_impl_err!("Unsupported scalar: {v:?}") } ScalarValue::DurationNanosecond(None) => { - Ok(ast::Expr::Value(ast::Value::Null)) + Ok(ast::Expr::value(ast::Value::Null)) } ScalarValue::Struct(_) => not_impl_err!("Unsupported scalar: {v:?}"), ScalarValue::Map(_) => not_impl_err!("Unsupported scalar: {v:?}"), @@ -1301,7 +1313,7 @@ impl Unparser<'_> { // MONTH only if months != 0 && days == 0 && microseconds == 0 { let interval = Interval { - value: Box::new(ast::Expr::Value(ast::Value::Number( + value: Box::new(ast::Expr::value(ast::Value::Number( months.to_string(), false, ))), @@ -1318,7 +1330,7 @@ impl Unparser<'_> { // DAY only if microseconds == 0 { let interval = Interval { - value: Box::new(ast::Expr::Value(ast::Value::Number( + value: Box::new(ast::Expr::value(ast::Value::Number( days.to_string(), false, ))), @@ -1336,7 +1348,7 @@ impl Unparser<'_> { if microseconds % 1_000_000 != 0 { let interval = Interval { - value: Box::new(ast::Expr::Value(ast::Value::Number( + value: Box::new(ast::Expr::value(ast::Value::Number( microseconds.to_string(), false, ))), @@ -1352,7 +1364,7 @@ impl Unparser<'_> { if secs % 60 != 0 { let interval = Interval { - value: Box::new(ast::Expr::Value(ast::Value::Number( + value: Box::new(ast::Expr::value(ast::Value::Number( secs.to_string(), false, ))), @@ -1368,7 +1380,7 @@ impl Unparser<'_> { if mins % 60 != 0 { let interval = Interval { - value: Box::new(ast::Expr::Value(ast::Value::Number( + value: Box::new(ast::Expr::value(ast::Value::Number( mins.to_string(), false, ))), @@ -1384,7 +1396,7 @@ impl Unparser<'_> { if hours % 24 != 0 { let interval = Interval { - value: Box::new(ast::Expr::Value(ast::Value::Number( + value: Box::new(ast::Expr::value(ast::Value::Number( hours.to_string(), false, ))), @@ -1399,7 +1411,7 @@ impl Unparser<'_> { let days = hours / 24; let interval = Interval { - value: Box::new(ast::Expr::Value(ast::Value::Number( + value: Box::new(ast::Expr::value(ast::Value::Number( days.to_string(), false, ))), @@ -1421,7 +1433,7 @@ impl Unparser<'_> { ); }; let interval = Interval { - value: Box::new(ast::Expr::Value(SingleQuotedString( + value: Box::new(ast::Expr::value(SingleQuotedString( result.to_uppercase(), ))), leading_field: None, @@ -1435,7 +1447,7 @@ impl Unparser<'_> { 
IntervalStyle::SQLStandard => match v { ScalarValue::IntervalYearMonth(Some(v)) => { let interval = Interval { - value: Box::new(ast::Expr::Value(SingleQuotedString( + value: Box::new(ast::Expr::value(SingleQuotedString( v.to_string(), ))), leading_field: Some(ast::DateTimeField::Month), @@ -1456,7 +1468,7 @@ impl Unparser<'_> { let millis = v.milliseconds % 1_000; let interval = Interval { - value: Box::new(ast::Expr::Value(SingleQuotedString(format!( + value: Box::new(ast::Expr::value(SingleQuotedString(format!( "{days} {hours}:{mins}:{secs}.{millis:3}" )))), leading_field: Some(ast::DateTimeField::Day), @@ -1469,7 +1481,7 @@ impl Unparser<'_> { ScalarValue::IntervalMonthDayNano(Some(v)) => { if v.months >= 0 && v.days == 0 && v.nanoseconds == 0 { let interval = Interval { - value: Box::new(ast::Expr::Value(SingleQuotedString( + value: Box::new(ast::Expr::value(SingleQuotedString( v.months.to_string(), ))), leading_field: Some(ast::DateTimeField::Month), @@ -1490,7 +1502,7 @@ impl Unparser<'_> { let millis = (v.nanoseconds % 1_000_000_000) / 1_000_000; let interval = Interval { - value: Box::new(ast::Expr::Value(SingleQuotedString( + value: Box::new(ast::Expr::value(SingleQuotedString( format!("{days} {hours}:{mins}:{secs}.{millis:03}"), ))), leading_field: Some(ast::DateTimeField::Day), @@ -1535,7 +1547,7 @@ impl Unparser<'_> { let args = self.function_args_to_sql(std::slice::from_ref(&unnest.expr))?; Ok(ast::Expr::Function(Function { - name: ObjectName(vec![Ident { + name: ObjectName::from(vec![Ident { value: "UNNEST".to_string(), quote_style: None, span: Span::empty(), @@ -1564,10 +1576,10 @@ impl Unparser<'_> { DataType::Int16 => Ok(ast::DataType::SmallInt(None)), DataType::Int32 => Ok(self.dialect.int32_cast_dtype()), DataType::Int64 => Ok(self.dialect.int64_cast_dtype()), - DataType::UInt8 => Ok(ast::DataType::UnsignedTinyInt(None)), - DataType::UInt16 => Ok(ast::DataType::UnsignedSmallInt(None)), - DataType::UInt32 => Ok(ast::DataType::UnsignedInteger(None)), - DataType::UInt64 => Ok(ast::DataType::UnsignedBigInt(None)), + DataType::UInt8 => Ok(ast::DataType::TinyIntUnsigned(None)), + DataType::UInt16 => Ok(ast::DataType::SmallIntUnsigned(None)), + DataType::UInt32 => Ok(ast::DataType::IntegerUnsigned(None)), + DataType::UInt64 => Ok(ast::DataType::BigIntUnsigned(None)), DataType::Float16 => { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } @@ -1661,8 +1673,8 @@ mod tests { use datafusion_expr::{ case, cast, col, cube, exists, grouping_set, interval_datetime_lit, interval_year_month_lit, lit, not, not_exists, out_ref_col, placeholder, rollup, - table_scan, try_cast, when, wildcard, ScalarUDF, ScalarUDFImpl, Signature, - Volatility, WindowFrame, WindowFunctionDefinition, + table_scan, try_cast, when, ColumnarValue, ScalarFunctionArgs, ScalarUDF, + ScalarUDFImpl, Signature, Volatility, WindowFrame, WindowFunctionDefinition, }; use datafusion_expr::{interval_month_day_nano_lit, ExprFunctionExt}; use datafusion_functions::expr_fn::{get_field, named_struct}; @@ -1711,6 +1723,10 @@ mod tests { fn return_type(&self, _arg_types: &[DataType]) -> Result { Ok(DataType::Int32) } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + panic!("dummy - not implemented") + } } // See sql::tests for E2E tests. 
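// Illustrative sketch (not part of this patch): a simplified version of the
// day/time arithmetic that the SQL-standard interval formatting above relies
// on, written as plain integer math so it runs without sqlparser. The real
// code wraps the resulting string in an INTERVAL literal with a DAY leading
// field; the `{millis:03}` padding matches the format string used in the diff.
fn day_time_interval_string_sketch(nanoseconds: i64) -> String {
    let total_secs = nanoseconds / 1_000_000_000;
    let millis = (nanoseconds % 1_000_000_000) / 1_000_000;
    let days = total_secs / 86_400;
    let hours = (total_secs / 3_600) % 24;
    let mins = (total_secs / 60) % 60;
    let secs = total_secs % 60;
    // e.g. 93_784_005_000_000 ns -> "1 2:3:4.005"
    format!("{days} {hours}:{mins}:{secs}.{millis:03}")
}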
@@ -1913,16 +1929,24 @@ mod tests { ), (sum(col("a")), r#"sum(a)"#), ( + #[expect(deprecated)] count_udaf() - .call(vec![wildcard()]) + .call(vec![Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }]) .distinct() .build() .unwrap(), "count(DISTINCT *)", ), ( + #[expect(deprecated)] count_udaf() - .call(vec![wildcard()]) + .call(vec![Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }]) .filter(lit(true)) .build() .unwrap(), @@ -1942,10 +1966,14 @@ mod tests { r#"row_number(col) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)"#, ), ( + #[expect(deprecated)] Expr::WindowFunction(WindowFunction { fun: WindowFunctionDefinition::AggregateUDF(count_udaf()), params: WindowFunctionParams { - args: vec![wildcard()], + args: vec![Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }], partition_by: vec![], order_by: vec![Sort::new(col("a"), false, true)], window_frame: WindowFrame::new_bounds( @@ -2557,7 +2585,7 @@ mod tests { let default_dialect = CustomDialectBuilder::new().build(); let mysql_dialect = CustomDialectBuilder::new() .with_int64_cast_dtype(ast::DataType::Custom( - ObjectName(vec![Ident::new("SIGNED")]), + ObjectName::from(vec![Ident::new("SIGNED")]), vec![], )) .build(); @@ -2585,7 +2613,7 @@ mod tests { let default_dialect = CustomDialectBuilder::new().build(); let mysql_dialect = CustomDialectBuilder::new() .with_int32_cast_dtype(ast::DataType::Custom( - ObjectName(vec![Ident::new("SIGNED")]), + ObjectName::from(vec![Ident::new("SIGNED")]), vec![], )) .build(); diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index aa5d52212945..5df701981964 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -49,7 +49,7 @@ use datafusion_expr::{ LogicalPlanBuilder, Operator, Projection, SortExpr, TableScan, Unnest, UserDefinedLogicalNode, }; -use sqlparser::ast::{self, Ident, SetExpr, TableAliasColumnDef}; +use sqlparser::ast::{self, Ident, OrderByKind, SetExpr, TableAliasColumnDef}; use std::sync::Arc; /// Convert a DataFusion [`LogicalPlan`] to [`ast::Statement`] @@ -322,6 +322,7 @@ impl Unparser<'_> { } } + #[cfg_attr(feature = "recursive_protection", recursive::recursive)] fn select_to_sql_recursively( &self, plan: &LogicalPlan, @@ -356,7 +357,7 @@ impl Unparser<'_> { table_parts.push( self.new_ident_quoted_if_needs(scan.table_name.table().to_string()), ); - builder.name(ast::ObjectName(table_parts)); + builder.name(ast::ObjectName::from(table_parts)); relation.table(builder); Ok(()) @@ -375,17 +376,20 @@ impl Unparser<'_> { } else { None }; - if self.dialect.unnest_as_table_factor() { if let Some((_, unnest_alias)) = &unnest_params { if let LogicalPlan::Unnest(unnest) = &p.input.as_ref() { - return self.unnest_to_table_factor_sql( - unnest, - query, - select, - relation, - unnest_alias, - ); + if let Some(unnest_relation) = + self.try_unnest_to_table_factor_sql(unnest, unnest_alias)? 
+ { + relation.unnest(unnest_relation); + return self.select_to_sql_recursively( + p.input.as_ref(), + query, + select, + relation, + ); + } } } } @@ -481,7 +485,7 @@ impl Unparser<'_> { }; if let Some(fetch) = sort.fetch { - query_ref.limit(Some(ast::Expr::Value(ast::Value::Number( + query_ref.limit(Some(ast::Expr::value(ast::Value::Number( fetch.to_string(), false, )))); @@ -576,14 +580,20 @@ impl Unparser<'_> { } LogicalPlan::Join(join) => { let mut table_scan_filters = vec![]; + let (left_plan, right_plan) = match join.join_type { + JoinType::RightSemi | JoinType::RightAnti => { + (&join.right, &join.left) + } + _ => (&join.left, &join.right), + }; let left_plan = - match try_transform_to_simple_table_scan_with_filters(&join.left)? { + match try_transform_to_simple_table_scan_with_filters(left_plan)? { Some((plan, filters)) => { table_scan_filters.extend(filters); Arc::new(plan) } - None => Arc::clone(&join.left), + None => Arc::clone(left_plan), }; self.select_to_sql_recursively( @@ -594,12 +604,12 @@ impl Unparser<'_> { )?; let right_plan = - match try_transform_to_simple_table_scan_with_filters(&join.right)? { + match try_transform_to_simple_table_scan_with_filters(right_plan)? { Some((plan, filters)) => { table_scan_filters.extend(filters); Arc::new(plan) } - None => Arc::clone(&join.right), + None => Arc::clone(right_plan), }; let mut right_relation = RelationBuilder::default(); @@ -651,19 +661,70 @@ impl Unparser<'_> { &mut right_relation, )?; - let Ok(Some(relation)) = right_relation.build() else { - return internal_err!("Failed to build right relation"); - }; - - let ast_join = ast::Join { - relation, - global: false, - join_operator: self - .join_operator_to_sql(join.join_type, join_constraint)?, + match join.join_type { + JoinType::LeftSemi + | JoinType::LeftAnti + | JoinType::LeftMark + | JoinType::RightSemi + | JoinType::RightAnti => { + let mut query_builder = QueryBuilder::default(); + let mut from = TableWithJoinsBuilder::default(); + let mut exists_select: SelectBuilder = SelectBuilder::default(); + from.relation(right_relation); + exists_select.push_from(from); + if let Some(filter) = &join.filter { + exists_select.selection(Some(self.expr_to_sql(filter)?)); + } + for (left, right) in &join.on { + exists_select.selection(Some( + self.expr_to_sql(&left.clone().eq(right.clone()))?, + )); + } + exists_select.projection(vec![ast::SelectItem::UnnamedExpr( + ast::Expr::value(ast::Value::Number("1".to_string(), false)), + )]); + query_builder.body(Box::new(SetExpr::Select(Box::new( + exists_select.build()?, + )))); + + let negated = match join.join_type { + JoinType::LeftSemi + | JoinType::RightSemi + | JoinType::LeftMark => false, + JoinType::LeftAnti | JoinType::RightAnti => true, + _ => unreachable!(), + }; + let exists_expr = ast::Expr::Exists { + subquery: Box::new(query_builder.build()?), + negated, + }; + if join.join_type == JoinType::LeftMark { + let (table_ref, _) = right_plan.schema().qualified_field(0); + let column = self + .col_to_sql(&Column::new(table_ref.cloned(), "mark"))?; + select.replace_mark(&column, &exists_expr); + } else { + select.selection(Some(exists_expr)); + } + } + JoinType::Inner + | JoinType::Left + | JoinType::Right + | JoinType::Full => { + let Ok(Some(relation)) = right_relation.build() else { + return internal_err!("Failed to build right relation"); + }; + let ast_join = ast::Join { + relation, + global: false, + join_operator: self + .join_operator_to_sql(join.join_type, join_constraint)?, + }; + let mut from = select.pop_from().unwrap(); 
+ from.push_join(ast_join); + select.push_from(from); + } }; - let mut from = select.pop_from().unwrap(); - from.push_join(ast_join); - select.push_from(from); Ok(()) } @@ -871,29 +932,37 @@ impl Unparser<'_> { None } - fn unnest_to_table_factor_sql( + fn try_unnest_to_table_factor_sql( &self, unnest: &Unnest, - query: &mut Option, - select: &mut SelectBuilder, - relation: &mut RelationBuilder, alias: &str, - ) -> Result<()> { + ) -> Result> { let mut unnest_relation = UnnestRelationBuilder::default(); + let LogicalPlan::Projection(projection) = unnest.input.as_ref() else { + return Ok(None); + }; unnest_relation.alias(Some(self.new_table_alias(alias.to_string(), vec![]))); - let LogicalPlan::Projection(p) = unnest.input.as_ref() else { - return internal_err!("Unnest input is not a Projection: {unnest:?}"); + if !matches!(projection.input.as_ref(), LogicalPlan::EmptyRelation(_)) { + // It may be possible that UNNEST is used as a source for the query. + // However, at this point, we don't yet know if it is just a single expression + // from another source or if it's from UNNEST. + // + // Unnest(Projection(EmptyRelation)) denotes a case with `UNNEST([...])`, + // which is normally safe to unnest as a table factor. + // However, in the future, more comprehensive checks can be added here. + return Ok(None); }; - let exprs = p + + let exprs = projection .expr .iter() .map(|e| self.expr_to_sql(e)) .collect::>>()?; unnest_relation.array_exprs(exprs); - relation.unnest(unnest_relation); - self.select_to_sql_recursively(p.input.as_ref(), query, select, relation) + + Ok(Some(unnest_relation)) } fn is_scan_with_pushdown(scan: &TableScan) -> bool { @@ -1076,11 +1145,13 @@ impl Unparser<'_> { } } - fn sorts_to_sql(&self, sort_exprs: &[SortExpr]) -> Result> { - sort_exprs - .iter() - .map(|sort_expr| self.sort_to_sql(sort_expr)) - .collect::>>() + fn sorts_to_sql(&self, sort_exprs: &[SortExpr]) -> Result { + Ok(OrderByKind::Expressions( + sort_exprs + .iter() + .map(|sort_expr| self.sort_to_sql(sort_expr)) + .collect::>>()?, + )) } fn join_operator_to_sql( @@ -1136,7 +1207,7 @@ impl Unparser<'_> { // this is represented as two columns like `[t1.id, t2.id]` // This code forms `id` (without relation name) let ident = self.new_ident_quoted_if_needs(left_name.to_string()); - object_names.push(ast::ObjectName(vec![ident])); + object_names.push(ast::ObjectName::from(vec![ident])); } // USING is only valid with matching column names; arbitrary expressions // are not allowed diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs index 2e3c8e9e9484..aa480cf4fff9 100644 --- a/datafusion/sql/src/unparser/rewrite.rs +++ b/datafusion/sql/src/unparser/rewrite.rs @@ -363,6 +363,7 @@ pub(super) fn inject_column_aliases( expr: Box::new(expr.clone()), relation, name: col_alias.value, + metadata: None, }) }) .collect::>(); diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs index f21fb2fcb49f..75038ccc4314 100644 --- a/datafusion/sql/src/unparser/utils.rs +++ b/datafusion/sql/src/unparser/utils.rs @@ -448,7 +448,7 @@ pub(crate) fn date_part_to_sql( }; return Ok(Some(ast::Expr::Function(ast::Function { - name: ast::ObjectName(vec![ast::Ident { + name: ast::ObjectName::from(vec![ast::Ident { value: "strftime".to_string(), quote_style: None, span: Span::empty(), @@ -457,7 +457,7 @@ pub(crate) fn date_part_to_sql( duplicate_treatment: None, args: vec![ ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr( - 
ast::Expr::Value(ast::Value::SingleQuotedString( + ast::Expr::value(ast::Value::SingleQuotedString( field.to_string(), )), )), diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 4a248de101dc..bc2a94cd44ff 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -102,10 +102,10 @@ impl CheckColumnsSatisfyExprsPurpose { fn message_prefix(&self) -> &'static str { match self { CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate => { - "Projection references non-aggregate values" + "Column in SELECT must be in GROUP BY or an aggregate function" } CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate => { - "HAVING clause references non-aggregate values" + "Column in HAVING must be in GROUP BY or an aggregate function" } } } @@ -159,7 +159,7 @@ fn check_column_satisfies_expr( ) -> Result<()> { if !columns.contains(expr) { return plan_err!( - "{}: Expression {} could not be resolved from available columns: {}", + "{}: While expanding wildcard, column \"{}\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"{}\" appears in the SELECT clause satisfies this requirement", purpose.message_prefix(), expr, expr_vec_fmt!(columns) @@ -169,7 +169,7 @@ fn check_column_satisfies_expr( purpose.diagnostic_message(expr), expr.spans().and_then(|spans| spans.first()), ) - .with_help(format!("add '{expr}' to GROUP BY clause"), None); + .with_help(format!("Either add '{expr}' to GROUP BY clause, or use an aggregare function like ANY_VALUE({expr})"), None); err.with_diagnostic(diagnostic) }); } @@ -496,30 +496,30 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> { /// /// For example an expr of **unnest(unnest(column1)) + unnest(unnest(unnest(column2)))** /// ```text - /// ┌──────────────────┐ - /// │ binaryexpr │ - /// │ │ - /// └──────────────────┘ - /// f_down / / │ │ - /// / / f_up │ │ - /// / / f_down│ │f_up - /// unnest │ │ - /// │ │ - /// f_down / / f_up(rewriting) │ │ - /// / / - /// / / unnest - /// unnest - /// f_down / / f_up(rewriting) - /// f_down / /f_up / / - /// / / / / - /// / / unnest - /// column1 - /// f_down / /f_up - /// / / - /// / / - /// column2 + /// ┌──────────────────┐ + /// │ binaryexpr │ + /// │ │ + /// └──────────────────┘ + /// f_down / / │ │ + /// / / f_up │ │ + /// / / f_down│ │f_up + /// unnest │ │ + /// │ │ + /// f_down / / f_up(rewriting) │ │ + /// / / + /// / / unnest + /// unnest + /// f_down / / f_up(rewriting) + /// f_down / /f_up / / + /// / / / / + /// / / unnest + /// column1 + /// f_down / /f_up + /// / / + /// / / + /// column2 /// ``` - /// + /// fn f_up(&mut self, expr: Expr) -> Result> { if let Expr::Unnest(ref traversing_unnest) = expr { if traversing_unnest == self.top_most_unnest.as_ref().unwrap() { diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs index d70484f718c8..ebb21e9cdef5 100644 --- a/datafusion/sql/tests/cases/diagnostic.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -190,7 +190,7 @@ fn test_missing_non_aggregate_in_group_by() -> Result<()> { assert_eq!(diag.span, Some(spans["a"])); assert_eq!( diag.helps[0].message, - "add 'person.first_name' to GROUP BY clause" + "Either add 'person.first_name' to GROUP BY clause, or use an aggregare function like ANY_VALUE(person.first_name)" ); Ok(()) } @@ -286,3 +286,109 @@ fn test_invalid_function() -> Result<()> { assert_eq!(diag.span, Some(spans["whole"])); Ok(()) } +#[test] +fn test_scalar_subquery_multiple_columns() -> Result<(), Box> { + let 
query = "SELECT (SELECT 1 AS /*x*/x/*x*/, 2 AS /*y*/y/*y*/) AS col"; + let spans = get_spans(query); + let diag = do_query(query); + + assert_eq!( + diag.message, + "Too many columns! The subquery should only return one column" + ); + + let expected_span = Some(Span { + start: spans["x"].start, + end: spans["y"].end, + }); + assert_eq!(diag.span, expected_span); + assert_eq!( + diag.notes + .iter() + .map(|n| (n.message.as_str(), n.span)) + .collect::>(), + vec![("Extra column 1", Some(spans["y"]))] + ); + assert_eq!( + diag.helps + .iter() + .map(|h| h.message.as_str()) + .collect::>(), + vec!["Select only one column in the subquery"] + ); + + Ok(()) +} + +#[test] +fn test_in_subquery_multiple_columns() -> Result<(), Box> { + // This query uses an IN subquery with multiple columns - this should trigger an error + let query = "SELECT * FROM person WHERE id IN (SELECT /*id*/id/*id*/, /*first*/first_name/*first*/ FROM person)"; + let spans = get_spans(query); + let diag = do_query(query); + + assert_eq!( + diag.message, + "Too many columns! The subquery should only return one column" + ); + + let expected_span = Some(Span { + start: spans["id"].start, + end: spans["first"].end, + }); + assert_eq!(diag.span, expected_span); + assert_eq!( + diag.notes + .iter() + .map(|n| (n.message.as_str(), n.span)) + .collect::>(), + vec![("Extra column 1", Some(spans["first"]))] + ); + assert_eq!( + diag.helps + .iter() + .map(|h| h.message.as_str()) + .collect::>(), + vec!["Select only one column in the subquery"] + ); + Ok(()) +} + +#[test] +fn test_unary_op_plus_with_column() -> Result<()> { + // Test with a direct query that references a column with an incompatible type + let query = "SELECT +/*whole*/first_name/*whole*/ FROM person"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!(diag.message, "+ cannot be used with Utf8"); + assert_eq!(diag.span, Some(spans["whole"])); + assert_eq!( + diag.notes[0].message, + "+ can only be used with numbers, intervals, and timestamps" + ); + assert_eq!( + diag.helps[0].message, + "perhaps you need to cast person.first_name" + ); + Ok(()) +} + +#[test] +fn test_unary_op_plus_with_non_column() -> Result<()> { + // create a table with a column of type varchar + let query = "SELECT +'a'"; + let diag = do_query(query); + assert_eq!(diag.message, "+ cannot be used with Utf8"); + assert_eq!( + diag.notes[0].message, + "+ can only be used with numbers, intervals, and timestamps" + ); + assert_eq!(diag.notes[0].span, None); + assert_eq!( + diag.helps[0].message, + "perhaps you need to cast Utf8(\"a\")" + ); + assert_eq!(diag.helps[0].span, None); + assert_eq!(diag.span, None); + Ok(()) +} diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 08155381c7cd..498d3e4098df 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -16,7 +16,9 @@ // under the License. 
use arrow::datatypes::{DataType, Field, Schema}; -use datafusion_common::{assert_contains, DFSchema, DFSchemaRef, Result, TableReference}; +use datafusion_common::{ + assert_contains, Column, DFSchema, DFSchemaRef, Result, TableReference, +}; use datafusion_expr::test::function_stub::{ count_udaf, max_udaf, min_udaf, sum, sum_udaf, }; @@ -32,7 +34,8 @@ use datafusion_functions_window::rank::rank_udwf; use datafusion_sql::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_sql::unparser::dialect::{ CustomDialectBuilder, DefaultDialect as UnparserDefaultDialect, DefaultDialect, - Dialect as UnparserDialect, MySqlDialect as UnparserMySqlDialect, SqliteDialect, + Dialect as UnparserDialect, MySqlDialect as UnparserMySqlDialect, + PostgreSqlDialect as UnparserPostgreSqlDialect, SqliteDialect, }; use datafusion_sql::unparser::{expr_to_sql, plan_to_sql, Unparser}; use sqlparser::ast::Statement; @@ -43,7 +46,7 @@ use std::{fmt, vec}; use crate::common::{MockContextProvider, MockSessionState}; use datafusion_expr::builder::{ - project, table_scan_with_filter_and_fetch, table_scan_with_filters, + project, subquery_alias, table_scan_with_filter_and_fetch, table_scan_with_filters, }; use datafusion_functions::core::planner::CoreFunctionPlanner; use datafusion_functions_nested::extract::array_element_udf; @@ -325,9 +328,9 @@ fn roundtrip_statement_with_dialect() -> Result<()> { unparser_dialect: Box::new(UnparserMySqlDialect {}), }, TestStatementWithDialect { - sql: "select * from (select * from j1 limit 10);", + sql: "select j1_id from (select j1_id from j1 limit 10);", expected: - "SELECT * FROM (SELECT * FROM `j1` LIMIT 10) AS `derived_limit`", + "SELECT `j1`.`j1_id` FROM (SELECT `j1`.`j1_id` FROM `j1` LIMIT 10) AS `derived_limit`", parser_dialect: Box::new(MySqlDialect {}), unparser_dialect: Box::new(UnparserMySqlDialect {}), }, @@ -369,7 +372,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT j1_string from j1 join j2 on j1.j1_id = j2.j2_id order by j1_id", - expected: r#"SELECT j1.j1_string FROM j1 JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id ASC NULLS LAST"#, + expected: r#"SELECT j1.j1_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id ASC NULLS LAST"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, @@ -394,7 +397,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { ) abc ORDER BY abc.j2_string", - expected: r#"SELECT abc.j1_string, abc.j2_string FROM (SELECT DISTINCT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#, + expected: r#"SELECT abc.j1_string, abc.j2_string FROM (SELECT DISTINCT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, @@ -411,7 +414,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { j1_id ) AS agg (id, string_count) ", - expected: r#"SELECT agg.string_count FROM (SELECT j1.j1_id, min(j2.j2_string) FROM j1 LEFT JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id) AS agg (id, string_count)"#, + expected: r#"SELECT agg.string_count FROM (SELECT j1.j1_id, min(j2.j2_string) FROM j1 LEFT OUTER JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id) AS agg (id, string_count)"#, 
parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, @@ -440,7 +443,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { ) abc ORDER BY abc.j2_string", - expected: r#"SELECT abc.j1_string, abc.j2_string FROM (SELECT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id, j1.j1_string, j2.j2_string ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#, + expected: r#"SELECT abc.j1_string, abc.j2_string FROM (SELECT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id, j1.j1_string, j2.j2_string ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, @@ -465,7 +468,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { ) abc ORDER BY j2_string", - expected: r#"SELECT abc.j1_string FROM (SELECT j1.j1_string, j2.j2_string FROM j1 JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST, j2.j2_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#, + expected: r#"SELECT abc.j1_string FROM (SELECT j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST, j2.j2_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, @@ -527,85 +530,79 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT * FROM (SELECT j1_id + 1 FROM j1) AS temp_j(id2)", - expected: r#"SELECT * FROM (SELECT (`j1`.`j1_id` + 1) AS `id2` FROM `j1`) AS `temp_j`"#, + expected: r#"SELECT `temp_j`.`id2` FROM (SELECT (`j1`.`j1_id` + 1) AS `id2` FROM `j1`) AS `temp_j`"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(SqliteDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM (SELECT j1_id FROM j1 LIMIT 1) AS temp_j(id2)", - expected: r#"SELECT * FROM (SELECT `j1`.`j1_id` AS `id2` FROM `j1` LIMIT 1) AS `temp_j`"#, + expected: r#"SELECT `temp_j`.`id2` FROM (SELECT `j1`.`j1_id` AS `id2` FROM `j1` LIMIT 1) AS `temp_j`"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(SqliteDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3])", - expected: r#"SELECT * FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))")"#, + expected: r#"SELECT "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))" FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))")"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]) AS t1 (c1)", - expected: r#"SELECT * FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") AS t1 (c1)"#, - parser_dialect: Box::new(GenericDialect {}), - unparser_dialect: Box::new(UnparserDefaultDialect {}), - }, - TestStatementWithDialect { - sql: "SELECT * FROM UNNEST([1,2,3]) AS t1 (c1)", - expected: r#"SELECT * FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") AS t1 (c1)"#, + expected: r#"SELECT t1.c1 FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") AS t1 (c1)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: 
Box::new(UnparserDefaultDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]), j1", - expected: r#"SELECT * FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") CROSS JOIN j1"#, + expected: r#"SELECT "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))", j1.j1_id, j1.j1_string FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") CROSS JOIN j1"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]) u(c1) JOIN j1 ON u.c1 = j1.j1_id", - expected: r#"SELECT * FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") AS u (c1) JOIN j1 ON (u.c1 = j1.j1_id)"#, + expected: r#"SELECT u.c1, j1.j1_id, j1.j1_string FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") AS u (c1) INNER JOIN j1 ON (u.c1 = j1.j1_id)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]) u(c1) UNION ALL SELECT * FROM UNNEST([4,5,6]) u(c1)", - expected: r#"SELECT * FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") AS u (c1) UNION ALL SELECT * FROM (SELECT UNNEST([4, 5, 6]) AS "UNNEST(make_array(Int64(4),Int64(5),Int64(6)))") AS u (c1)"#, + expected: r#"SELECT u.c1 FROM (SELECT UNNEST([1, 2, 3]) AS "UNNEST(make_array(Int64(1),Int64(2),Int64(3)))") AS u (c1) UNION ALL SELECT u.c1 FROM (SELECT UNNEST([4, 5, 6]) AS "UNNEST(make_array(Int64(4),Int64(5),Int64(6)))") AS u (c1)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3])", - expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3)))"#, + expected: r#"SELECT UNNEST(make_array(Int64(1),Int64(2),Int64(3))) FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3)))"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]) AS t1 (c1)", - expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS t1 (c1)"#, + expected: r#"SELECT t1.c1 FROM UNNEST([1, 2, 3]) AS t1 (c1)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]) AS t1 (c1)", - expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS t1 (c1)"#, + expected: r#"SELECT t1.c1 FROM UNNEST([1, 2, 3]) AS t1 (c1)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]), j1", - expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3))) CROSS JOIN j1"#, + expected: r#"SELECT UNNEST(make_array(Int64(1),Int64(2),Int64(3))), j1.j1_id, j1.j1_string FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3))) CROSS JOIN j1"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]) u(c1) JOIN j1 ON u.c1 = j1.j1_id", - expected: 
r#"SELECT * FROM UNNEST([1, 2, 3]) AS u (c1) JOIN j1 ON (u.c1 = j1.j1_id)"#, + expected: r#"SELECT u.c1, j1.j1_id, j1.j1_string FROM UNNEST([1, 2, 3]) AS u (c1) INNER JOIN j1 ON (u.c1 = j1.j1_id)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT * FROM UNNEST([1,2,3]) u(c1) UNION ALL SELECT * FROM UNNEST([4,5,6]) u(c1)", - expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS u (c1) UNION ALL SELECT * FROM UNNEST([4, 5, 6]) AS u (c1)"#, + expected: r#"SELECT u.c1 FROM UNNEST([1, 2, 3]) AS u (c1) UNION ALL SELECT u.c1 FROM UNNEST([4, 5, 6]) AS u (c1)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, @@ -617,7 +614,7 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT UNNEST([1,2,3]) as c1", - expected: r#"SELECT * FROM UNNEST([1, 2, 3]) AS c1"#, + expected: r#"SELECT UNNEST([1, 2, 3]) AS c1"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, @@ -629,25 +626,31 @@ fn roundtrip_statement_with_dialect() -> Result<()> { }, TestStatementWithDialect { sql: "SELECT * FROM unnest_table u, UNNEST(u.array_col)", - expected: r#"SELECT * FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col) AS UNNEST(outer_ref(u.array_col))"#, + expected: r#"SELECT u.array_col, u.struct_col, UNNEST(outer_ref(u.array_col)) FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col) AS UNNEST(outer_ref(u.array_col))"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT * FROM unnest_table u, UNNEST(u.array_col) AS t1 (c1)", - expected: r#"SELECT * FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col) AS t1 (c1)"#, + expected: r#"SELECT u.array_col, u.struct_col, t1.c1 FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col) AS t1 (c1)"#, + parser_dialect: Box::new(GenericDialect {}), + unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), + }, + TestStatementWithDialect { + sql: "SELECT unnest([1, 2, 3, 4]) from unnest([1, 2, 3]);", + expected: r#"SELECT UNNEST([1, 2, 3, 4]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3),Int64(4))) FROM UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3)))"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(CustomDialectBuilder::default().with_unnest_as_table_factor(true).build()), }, TestStatementWithDialect { sql: "SELECT * FROM unnest_table u, UNNEST(u.array_col)", - expected: r#"SELECT * FROM unnest_table AS u CROSS JOIN LATERAL (SELECT UNNEST(u.array_col) AS "UNNEST(outer_ref(u.array_col))")"#, + expected: r#"SELECT u.array_col, u.struct_col, "UNNEST(outer_ref(u.array_col))" FROM unnest_table AS u CROSS JOIN LATERAL (SELECT UNNEST(u.array_col) AS "UNNEST(outer_ref(u.array_col))")"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, TestStatementWithDialect { sql: "SELECT * FROM unnest_table u, UNNEST(u.array_col) AS t1 (c1)", - expected: r#"SELECT * FROM unnest_table AS u CROSS JOIN LATERAL (SELECT UNNEST(u.array_col) AS "UNNEST(outer_ref(u.array_col))") AS t1 (c1)"#, + expected: r#"SELECT u.array_col, u.struct_col, t1.c1 FROM 
unnest_table AS u CROSS JOIN LATERAL (SELECT UNNEST(u.array_col) AS "UNNEST(outer_ref(u.array_col))") AS t1 (c1)"#, parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, @@ -1096,7 +1099,7 @@ fn test_table_scan_pushdown() -> Result<()> { plan_to_sql(&query_from_table_scan_with_two_projections)?; assert_eq!( query_from_table_scan_with_two_projections.to_string(), - "SELECT * FROM (SELECT t1.id, t1.age FROM t1)" + "SELECT t1.id, t1.age FROM (SELECT t1.id, t1.age FROM t1)" ); let table_scan_with_filter = table_scan_with_filters( @@ -1282,7 +1285,7 @@ fn test_join_with_table_scan_filters() -> Result<()> { let sql = plan_to_sql(&join_plan_with_filter)?; - let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table ON "left".id = right_table.id AND (("left".id > 5) AND ("left"."name" LIKE 'some_name' AND (age > 10)))"#; + let expected_sql = r#"SELECT * FROM left_table AS "left" INNER JOIN right_table ON "left".id = right_table.id AND (("left".id > 5) AND ("left"."name" LIKE 'some_name' AND (age > 10)))"#; assert_eq!(sql.to_string(), expected_sql); @@ -1297,7 +1300,7 @@ fn test_join_with_table_scan_filters() -> Result<()> { let sql = plan_to_sql(&join_plan_no_filter)?; - let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table ON "left".id = right_table.id AND ("left"."name" LIKE 'some_name' AND (age > 10))"#; + let expected_sql = r#"SELECT * FROM left_table AS "left" INNER JOIN right_table ON "left".id = right_table.id AND ("left"."name" LIKE 'some_name' AND (age > 10))"#; assert_eq!(sql.to_string(), expected_sql); @@ -1322,7 +1325,7 @@ fn test_join_with_table_scan_filters() -> Result<()> { let sql = plan_to_sql(&join_plan_multiple_filters)?; - let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table ON "left".id = right_table.id AND (("left".id > 5) AND (("left"."name" LIKE 'some_name' AND (right_table."name" = 'before_join_filter_val')) AND (age > 10))) WHERE ("left"."name" = 'after_join_filter_val')"#; + let expected_sql = r#"SELECT * FROM left_table AS "left" INNER JOIN right_table ON "left".id = right_table.id AND (("left".id > 5) AND (("left"."name" LIKE 'some_name' AND (right_table."name" = 'before_join_filter_val')) AND (age > 10))) WHERE ("left"."name" = 'after_join_filter_val')"#; assert_eq!(sql.to_string(), expected_sql); @@ -1352,7 +1355,7 @@ fn test_join_with_table_scan_filters() -> Result<()> { let sql = plan_to_sql(&join_plan_duplicated_filter)?; - let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table ON "left".id = right_table.id AND (("left".id > 5) AND (("left"."name" LIKE 'some_name' AND (right_table.age > 10)) AND (right_table.age < 11)))"#; + let expected_sql = r#"SELECT * FROM left_table AS "left" INNER JOIN right_table ON "left".id = right_table.id AND (("left".id > 5) AND (("left"."name" LIKE 'some_name' AND (right_table.age > 10)) AND (right_table.age < 11)))"#; assert_eq!(sql.to_string(), expected_sql); @@ -1457,13 +1460,13 @@ fn test_unnest_to_sql() { fn test_join_with_no_conditions() { sql_round_trip( GenericDialect {}, - "SELECT * FROM j1 JOIN j2", - "SELECT * FROM j1 CROSS JOIN j2", + "SELECT j1.j1_id, j1.j1_string FROM j1 JOIN j2", + "SELECT j1.j1_id, j1.j1_string FROM j1 CROSS JOIN j2", ); sql_round_trip( GenericDialect {}, - "SELECT * FROM j1 CROSS JOIN j2", - "SELECT * FROM j1 CROSS JOIN j2", + "SELECT j1.j1_id, j1.j1_string FROM j1 CROSS JOIN j2", + "SELECT j1.j1_id, j1.j1_string FROM j1 CROSS JOIN j2", ); } @@ -1564,7 +1567,7 @@ fn 
test_unparse_extension_to_statement() -> Result<()> { Arc::new(UnusedUnparser {}), ]); let sql = unparser.plan_to_sql(&extension)?; - let expected = "SELECT * FROM j1"; + let expected = "SELECT j1.j1_id, j1.j1_string FROM j1"; assert_eq!(sql.to_string(), expected); if let Some(err) = plan_to_sql(&extension).err() { @@ -1627,7 +1630,8 @@ fn test_unparse_extension_to_sql() -> Result<()> { Arc::new(UnusedUnparser {}), ]); let sql = unparser.plan_to_sql(&plan)?; - let expected = "SELECT j1.j1_id AS user_id FROM (SELECT * FROM j1)"; + let expected = + "SELECT j1.j1_id AS user_id FROM (SELECT j1.j1_id, j1.j1_string FROM j1)"; assert_eq!(sql.to_string(), expected); if let Some(err) = plan_to_sql(&plan).err() { @@ -1751,3 +1755,167 @@ fn test_unparse_subquery_alias_with_table_pushdown() -> Result<()> { assert_eq!(sql.to_string(), expected); Ok(()) } + +#[test] +fn test_unparse_left_anti_join() -> Result<()> { + // select t1.d from t1 where c not in (select c from t2) + let schema = Schema::new(vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, false), + ]); + + // LeftAnti Join: t1.c = __correlated_sq_1.c + // TableScan: t1 projection=[c] + // SubqueryAlias: __correlated_sq_1 + // TableScan: t2 projection=[c] + + let table_scan1 = table_scan(Some("t1"), &schema, Some(vec![0, 1]))?.build()?; + let table_scan2 = table_scan(Some("t2"), &schema, Some(vec![0]))?.build()?; + let subquery = subquery_alias(table_scan2, "__correlated_sq_1")?; + let plan = LogicalPlanBuilder::from(table_scan1) + .project(vec![col("t1.d")])? + .join_on( + subquery, + datafusion_expr::JoinType::LeftAnti, + vec![col("t1.c").eq(col("__correlated_sq_1.c"))], + )? + .build()?; + + let unparser = Unparser::new(&UnparserPostgreSqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_eq!("SELECT \"t1\".\"d\" FROM \"t1\" WHERE NOT EXISTS (SELECT 1 FROM \"t2\" AS \"__correlated_sq_1\" WHERE (\"t1\".\"c\" = \"__correlated_sq_1\".\"c\"))", sql.to_string()); + Ok(()) +} + +#[test] +fn test_unparse_left_semi_join() -> Result<()> { + // select t1.d from t1 where c in (select c from t2) + let schema = Schema::new(vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, false), + ]); + + // LeftSemi Join: t1.c = __correlated_sq_1.c + // TableScan: t1 projection=[c] + // SubqueryAlias: __correlated_sq_1 + // TableScan: t2 projection=[c] + + let table_scan1 = table_scan(Some("t1"), &schema, Some(vec![0, 1]))?.build()?; + let table_scan2 = table_scan(Some("t2"), &schema, Some(vec![0]))?.build()?; + let subquery = subquery_alias(table_scan2, "__correlated_sq_1")?; + let plan = LogicalPlanBuilder::from(table_scan1) + .project(vec![col("t1.d")])? + .join_on( + subquery, + datafusion_expr::JoinType::LeftSemi, + vec![col("t1.c").eq(col("__correlated_sq_1.c"))], + )? 
+ .build()?; + + let unparser = Unparser::new(&UnparserPostgreSqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_eq!("SELECT \"t1\".\"d\" FROM \"t1\" WHERE EXISTS (SELECT 1 FROM \"t2\" AS \"__correlated_sq_1\" WHERE (\"t1\".\"c\" = \"__correlated_sq_1\".\"c\"))", sql.to_string()); + Ok(()) +} + +#[test] +fn test_unparse_left_mark_join() -> Result<()> { + // select t1.d from t1 where t1.d < 0 OR exists (select 1 from t2 where t1.c = t2.c) + let schema = Schema::new(vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, false), + ]); + // Filter: __correlated_sq_1.mark OR t1.d < Int32(0) + // Projection: t1.d + // LeftMark Join: Filter: t1.c = __correlated_sq_1.c + // TableScan: t1 projection=[c, d] + // SubqueryAlias: __correlated_sq_1 + // TableScan: t2 projection=[c] + let table_scan1 = table_scan(Some("t1"), &schema, Some(vec![0, 1]))?.build()?; + let table_scan2 = table_scan(Some("t2"), &schema, Some(vec![0]))?.build()?; + let subquery = subquery_alias(table_scan2, "__correlated_sq_1")?; + let plan = LogicalPlanBuilder::from(table_scan1) + .join_on( + subquery, + datafusion_expr::JoinType::LeftMark, + vec![col("t1.c").eq(col("__correlated_sq_1.c"))], + )? + .project(vec![col("t1.d")])? + .filter(col("mark").or(col("t1.d").lt(lit(0))))? + .build()?; + + let unparser = Unparser::new(&UnparserPostgreSqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_eq!("SELECT \"t1\".\"d\" FROM \"t1\" WHERE (EXISTS (SELECT 1 FROM \"t2\" AS \"__correlated_sq_1\" WHERE (\"t1\".\"c\" = \"__correlated_sq_1\".\"c\")) OR (\"t1\".\"d\" < 0))", sql.to_string()); + Ok(()) +} + +#[test] +fn test_unparse_right_semi_join() -> Result<()> { + // select t2.c, t2.d from t1 right semi join t2 on t1.c = t2.c where t2.c <= 1 + let schema = Schema::new(vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, false), + ]); + // Filter: t2.c <= Int64(1) + // RightSemi Join: t1.c = t2.c + // TableScan: t1 projection=[c, d] + // Projection: t2.c, t2.d + // TableScan: t2 projection=[c, d] + let left = table_scan(Some("t1"), &schema, Some(vec![0, 1]))?.build()?; + let right_table_scan = table_scan(Some("t2"), &schema, Some(vec![0, 1]))?.build()?; + let right = LogicalPlanBuilder::from(right_table_scan) + .project(vec![col("c"), col("d")])? + .build()?; + let plan = LogicalPlanBuilder::from(left) + .join( + right, + datafusion_expr::JoinType::RightSemi, + ( + vec![Column::from_qualified_name("t1.c")], + vec![Column::from_qualified_name("t2.c")], + ), + None, + )? + .filter(col("t2.c").lt_eq(lit(1i64)))? 
+ .build()?; + let unparser = Unparser::new(&UnparserPostgreSqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_eq!("SELECT \"t2\".\"c\", \"t2\".\"d\" FROM \"t2\" WHERE (\"t2\".\"c\" <= 1) AND EXISTS (SELECT 1 FROM \"t1\" WHERE (\"t1\".\"c\" = \"t2\".\"c\"))", sql.to_string()); + Ok(()) +} + +#[test] +fn test_unparse_right_anti_join() -> Result<()> { + // select t2.c, t2.d from t1 right anti join t2 on t1.c = t2.c where t2.c <= 1 + let schema = Schema::new(vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, false), + ]); + // Filter: t2.c <= Int64(1) + // RightAnti Join: t1.c = t2.c + // TableScan: t1 projection=[c, d] + // Projection: t2.c, t2.d + // TableScan: t2 projection=[c, d] + let left = table_scan(Some("t1"), &schema, Some(vec![0, 1]))?.build()?; + let right_table_scan = table_scan(Some("t2"), &schema, Some(vec![0, 1]))?.build()?; + let right = LogicalPlanBuilder::from(right_table_scan) + .project(vec![col("c"), col("d")])? + .build()?; + let plan = LogicalPlanBuilder::from(left) + .join( + right, + datafusion_expr::JoinType::RightAnti, + ( + vec![Column::from_qualified_name("t1.c")], + vec![Column::from_qualified_name("t2.c")], + ), + None, + )? + .filter(col("t2.c").lt_eq(lit(1i64)))? + .build()?; + let unparser = Unparser::new(&UnparserPostgreSqlDialect {}); + let sql = unparser.plan_to_sql(&plan)?; + assert_eq!("SELECT \"t2\".\"c\", \"t2\".\"d\" FROM \"t2\" WHERE (\"t2\".\"c\" <= 1) AND NOT EXISTS (SELECT 1 FROM \"t1\" WHERE (\"t1\".\"c\" = \"t2\".\"c\"))", sql.to_string()); + Ok(()) +} diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 1df18302687e..2939e965cd6e 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -30,8 +30,8 @@ use datafusion_expr::{ col, logical_plan::{LogicalPlan, Prepare}, test::function_stub::sum_udaf, - CreateIndex, DdlStatement, ScalarUDF, ScalarUDFImpl, Signature, Statement, - Volatility, + ColumnarValue, CreateIndex, DdlStatement, ScalarFunctionArgs, ScalarUDF, + ScalarUDFImpl, Signature, Statement, Volatility, }; use datafusion_functions::{string, unicode}; use datafusion_sql::{ @@ -54,15 +54,6 @@ use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect}; mod cases; mod common; -#[test] -fn test_schema_support() { - quick_test( - "SELECT * FROM s1.test", - "Projection: *\ - \n TableScan: s1.test", - ); -} - #[test] fn parse_decimals() { let test_data = [ @@ -92,6 +83,7 @@ fn parse_decimals() { parse_float_as_decimal: true, enable_ident_normalization: false, support_varchar_with_length: false, + map_varchar_to_utf8view: false, enable_options_value_normalization: false, collect_spans: false, }, @@ -148,6 +140,7 @@ fn parse_ident_normalization() { parse_float_as_decimal: false, enable_ident_normalization, support_varchar_with_length: false, + map_varchar_to_utf8view: false, enable_options_value_normalization: false, collect_spans: false, }, @@ -449,19 +442,6 @@ Explain quick_test(sql, plan); } -#[test] -fn plan_copy_to_query() { - let sql = "COPY (select * from test_decimal limit 10) to 'output.csv'"; - let plan = r#" -CopyTo: format=csv output_url=output.csv options: () - Limit: skip=0, fetch=10 - Projection: * - TableScan: test_decimal - "# - .trim(); - quick_test(sql, plan); -} - #[test] fn plan_insert() { let sql = @@ -585,15 +565,6 @@ fn select_repeated_column() { ); } -#[test] -fn select_wildcard_with_repeated_column_but_is_aliased() { - quick_test( - "SELECT *, first_name AS fn from 
person", - "Projection: *, person.first_name AS fn\ - \n TableScan: person", - ); -} - #[test] fn select_scalar_func_with_literal_no_relation() { quick_test( @@ -793,30 +764,6 @@ fn join_with_ambiguous_column() { quick_test(sql, expected); } -#[test] -fn where_selection_with_ambiguous_column() { - let sql = "SELECT * FROM person a, person b WHERE id = id + 1"; - let err = logical_plan(sql) - .expect_err("query should have failed") - .strip_backtrace(); - assert_eq!( - "\"Schema error: Ambiguous reference to unqualified field id\"", - format!("{err:?}") - ); -} - -#[test] -fn natural_join() { - let sql = "SELECT * FROM lineitem a NATURAL JOIN lineitem b"; - let expected = "Projection: *\ - \n Inner Join: Using a.l_item_id = b.l_item_id, a.l_description = b.l_description, a.price = b.price\ - \n SubqueryAlias: a\ - \n TableScan: lineitem\ - \n SubqueryAlias: b\ - \n TableScan: lineitem"; - quick_test(sql, expected); -} - #[test] fn natural_left_join() { let sql = "SELECT l_item_id FROM lineitem a NATURAL LEFT JOIN lineitem b"; @@ -841,83 +788,6 @@ fn natural_right_join() { quick_test(sql, expected); } -#[test] -fn natural_join_no_common_becomes_cross_join() { - let sql = "SELECT * FROM person a NATURAL JOIN lineitem b"; - let expected = "Projection: *\ - \n Cross Join: \ - \n SubqueryAlias: a\ - \n TableScan: person\ - \n SubqueryAlias: b\ - \n TableScan: lineitem"; - quick_test(sql, expected); -} - -#[test] -fn using_join_multiple_keys() { - let sql = "SELECT * FROM person a join person b using (id, age)"; - let expected = "Projection: *\ - \n Inner Join: Using a.id = b.id, a.age = b.age\ - \n SubqueryAlias: a\ - \n TableScan: person\ - \n SubqueryAlias: b\ - \n TableScan: person"; - quick_test(sql, expected); -} - -#[test] -fn using_join_multiple_keys_subquery() { - let sql = - "SELECT age FROM (SELECT * FROM person a join person b using (id, age, state))"; - let expected = "Projection: a.age\ - \n Projection: *\ - \n Inner Join: Using a.id = b.id, a.age = b.age, a.state = b.state\ - \n SubqueryAlias: a\ - \n TableScan: person\ - \n SubqueryAlias: b\ - \n TableScan: person"; - quick_test(sql, expected); -} - -#[test] -fn using_join_multiple_keys_qualified_wildcard_select() { - let sql = "SELECT a.* FROM person a join person b using (id, age)"; - let expected = "Projection: a.*\ - \n Inner Join: Using a.id = b.id, a.age = b.age\ - \n SubqueryAlias: a\ - \n TableScan: person\ - \n SubqueryAlias: b\ - \n TableScan: person"; - quick_test(sql, expected); -} - -#[test] -fn using_join_multiple_keys_select_all_columns() { - let sql = "SELECT a.*, b.* FROM person a join person b using (id, age)"; - let expected = "Projection: a.*, b.*\ - \n Inner Join: Using a.id = b.id, a.age = b.age\ - \n SubqueryAlias: a\ - \n TableScan: person\ - \n SubqueryAlias: b\ - \n TableScan: person"; - quick_test(sql, expected); -} - -#[test] -fn using_join_multiple_keys_multiple_joins() { - let sql = "SELECT * FROM person a join person b using (id, age, state) join person c using (id, age, state)"; - let expected = "Projection: *\ - \n Inner Join: Using a.id = c.id, a.age = c.age, a.state = c.state\ - \n Inner Join: Using a.id = b.id, a.age = b.age, a.state = b.state\ - \n SubqueryAlias: a\ - \n TableScan: person\ - \n SubqueryAlias: b\ - \n TableScan: person\ - \n SubqueryAlias: c\ - \n TableScan: person"; - quick_test(sql, expected); -} - #[test] fn select_with_having() { let sql = "SELECT id, age @@ -950,7 +820,7 @@ fn select_with_having_refers_to_invalid_column() { HAVING first_name = 'M'"; let err = 
logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.first_name could not be resolved from available columns: person.id, max(person.age)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.id, max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -974,7 +844,7 @@ fn select_with_having_with_aggregate_not_in_select() { HAVING MAX(age) > 100"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.first_name could not be resolved from available columns: max(person.age)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1010,7 +880,7 @@ fn select_aggregate_with_having_referencing_column_not_in_select() { HAVING first_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.first_name could not be resolved from available columns: count(*)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.first_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"count(*)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1131,7 +1001,7 @@ fn select_aggregate_with_group_by_with_having_referencing_column_not_in_group_by HAVING MAX(age) > 10 AND last_name = 'M'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: HAVING clause references non-aggregate values: Expression person.last_name could not be resolved from available columns: person.first_name, max(person.age)", + "Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.last_name\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.first_name, max(person.age)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1233,24 +1103,6 @@ fn select_binary_expr_nested() { quick_test(sql, expected); } -#[test] -fn select_wildcard_with_groupby() { - quick_test( - r#"SELECT * FROM person GROUP BY id, first_name, last_name, age, state, salary, birth_date, "😀""#, - "Projection: *\ - \n Aggregate: groupBy=[[person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person.😀]], aggr=[[]]\ - \n TableScan: person", - ); - quick_test( - "SELECT * FROM (SELECT first_name, last_name FROM person) AS a GROUP BY first_name, last_name", - "Projection: *\ - \n Aggregate: groupBy=[[a.first_name, a.last_name]], aggr=[[]]\ - \n SubqueryAlias: a\ - \n Projection: person.first_name, person.last_name\ - \n TableScan: person", - ); -} - #[test] fn select_simple_aggregate() { quick_test( @@ 
-1397,56 +1249,6 @@ fn select_interval_out_of_range() { ); } -#[test] -fn recursive_ctes() { - let sql = " - WITH RECURSIVE numbers AS ( - select 1 as n - UNION ALL - select n + 1 FROM numbers WHERE N < 10 - ) - select * from numbers;"; - quick_test( - sql, - "Projection: *\ - \n SubqueryAlias: numbers\ - \n RecursiveQuery: is_distinct=false\ - \n Projection: Int64(1) AS n\ - \n EmptyRelation\ - \n Projection: numbers.n + Int64(1)\ - \n Filter: numbers.n < Int64(10)\ - \n TableScan: numbers", - ) -} - -#[test] -fn recursive_ctes_disabled() { - let sql = " - WITH RECURSIVE numbers AS ( - select 1 as n - UNION ALL - select n + 1 FROM numbers WHERE N < 10 - ) - select * from numbers;"; - - // manually setting up test here so that we can disable recursive ctes - let mut state = MockSessionState::default(); - state.config_options.execution.enable_recursive_ctes = false; - let context = MockContextProvider { state }; - - let planner = SqlToRel::new_with_options(&context, ParserOptions::default()); - let result = DFParser::parse_sql_with_dialect(sql, &GenericDialect {}); - let mut ast = result.unwrap(); - - let err = planner - .statement_to_plan(ast.pop_front().unwrap()) - .expect_err("query should have failed"); - assert_eq!( - "This feature is not implemented: Recursive CTEs are not enabled", - err.strip_backtrace() - ); -} - #[test] fn select_simple_aggregate_with_groupby_and_column_is_in_aggregate_and_groupby() { quick_test( @@ -1563,7 +1365,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_nested_and_not_res let sql = "SELECT ((age + 1) / 2) * (age + 9), MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.age\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.age + Int64(1), min(person.first_name)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1573,7 +1375,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_and_its_column_sel let sql = "SELECT age, MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression person.age could not be resolved from available columns: person.age + Int64(1), min(person.first_name)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"person.age\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"person.age + Int64(1), min(person.first_name)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -1618,15 +1420,6 @@ fn select_aggregate_with_non_column_inner_expression_with_groupby() { ); } -#[test] -fn test_wildcard() { - quick_test( - "SELECT * from person", - "Projection: *\ - \n TableScan: person", - ); -} - #[test] fn select_count_one() { let sql = "SELECT count(1) FROM person"; @@ -1843,7 +1636,7 @@ fn select_7480_2() { let sql = "SELECT c1, c13, MIN(c12) FROM aggregate_test_100 GROUP BY c1"; let err = logical_plan(sql).expect_err("query should have 
failed"); assert_eq!( - "Error during planning: Projection references non-aggregate values: Expression aggregate_test_100.c13 could not be resolved from available columns: aggregate_test_100.c1, min(aggregate_test_100.c12)", + "Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column \"aggregate_test_100.c13\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"aggregate_test_100.c1, min(aggregate_test_100.c12)\" appears in the SELECT clause satisfies this requirement", err.strip_backtrace() ); } @@ -2060,20 +1853,6 @@ fn join_with_using() { quick_test(sql, expected); } -#[test] -fn project_wildcard_on_join_with_using() { - let sql = "SELECT * \ - FROM lineitem \ - JOIN lineitem as lineitem2 \ - USING (l_item_id)"; - let expected = "Projection: *\ - \n Inner Join: Using lineitem.l_item_id = lineitem2.l_item_id\ - \n TableScan: lineitem\ - \n SubqueryAlias: lineitem2\ - \n TableScan: lineitem"; - quick_test(sql, expected); -} - #[test] fn equijoin_explicit_syntax_3_tables() { let sql = "SELECT id, order_id, l_description \ @@ -2698,6 +2477,10 @@ impl ScalarUDFImpl for DummyUDF { fn return_type(&self, _arg_types: &[DataType]) -> Result { Ok(self.return_type.clone()) } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + panic!("dummy - not implemented") + } } /// Create logical plan, write with formatter, compare to expected output @@ -2867,24 +2650,6 @@ fn exists_subquery_schema_outer_schema_overlap() { quick_test(sql, expected); } -#[test] -fn exists_subquery_wildcard() { - let sql = "SELECT id FROM person p WHERE EXISTS \ - (SELECT * FROM person \ - WHERE last_name = p.last_name \ - AND state = p.state)"; - - let expected = "Projection: p.id\ - \n Filter: EXISTS ()\ - \n Subquery:\ - \n Projection: *\ - \n Filter: person.last_name = outer_ref(p.last_name) AND person.state = outer_ref(p.state)\ - \n TableScan: person\ - \n SubqueryAlias: p\ - \n TableScan: person"; - quick_test(sql, expected); -} - #[test] fn in_subquery_uncorrelated() { let sql = "SELECT id FROM person p WHERE id IN \ @@ -2958,88 +2723,6 @@ fn scalar_subquery_reference_outer_field() { quick_test(sql, expected); } -#[test] -fn subquery_references_cte() { - let sql = "WITH \ - cte AS (SELECT * FROM person) \ - SELECT * FROM person WHERE EXISTS (SELECT * FROM cte WHERE id = person.id)"; - - let expected = "Projection: *\ - \n Filter: EXISTS ()\ - \n Subquery:\ - \n Projection: *\ - \n Filter: cte.id = outer_ref(person.id)\ - \n SubqueryAlias: cte\ - \n Projection: *\ - \n TableScan: person\ - \n TableScan: person"; - - quick_test(sql, expected) -} - -#[test] -fn cte_with_no_column_names() { - let sql = "WITH \ - numbers AS ( \ - SELECT 1 as a, 2 as b, 3 as c \ - ) \ - SELECT * FROM numbers;"; - - let expected = "Projection: *\ - \n SubqueryAlias: numbers\ - \n Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c\ - \n EmptyRelation"; - - quick_test(sql, expected) -} - -#[test] -fn cte_with_column_names() { - let sql = "WITH \ - numbers(a, b, c) AS ( \ - SELECT 1, 2, 3 \ - ) \ - SELECT * FROM numbers;"; - - let expected = "Projection: *\ - \n SubqueryAlias: numbers\ - \n Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c\ - \n Projection: Int64(1), Int64(2), Int64(3)\ - \n EmptyRelation"; - - quick_test(sql, expected) -} - -#[test] -fn cte_with_column_aliases_precedence() { - // The end result should always be what CTE specification says - let sql = "WITH \ - numbers(a, b, c) AS ( \ - SELECT 1 as 
x, 2 as y, 3 as z \ - ) \ - SELECT * FROM numbers;"; - - let expected = "Projection: *\ - \n SubqueryAlias: numbers\ - \n Projection: x AS a, y AS b, z AS c\ - \n Projection: Int64(1) AS x, Int64(2) AS y, Int64(3) AS z\ - \n EmptyRelation"; - quick_test(sql, expected) -} - -#[test] -fn cte_unbalanced_number_of_columns() { - let sql = "WITH \ - numbers(a) AS ( \ - SELECT 1, 2, 3 \ - ) \ - SELECT * FROM numbers;"; - - let expected = "Error during planning: Source table contains 3 columns but only 1 names given as column alias"; - let result = logical_plan(sql).err().unwrap(); - assert_eq!(result.strip_backtrace(), expected); -} - #[test] fn aggregate_with_rollup() { let sql = @@ -3133,128 +2816,6 @@ fn join_on_complex_condition() { quick_test(sql, expected); } -#[test] -fn lateral_constant() { - let sql = "SELECT * FROM j1, LATERAL (SELECT 1) AS j2"; - let expected = "Projection: *\ - \n Cross Join: \ - \n TableScan: j1\ - \n SubqueryAlias: j2\ - \n Projection: Int64(1)\ - \n EmptyRelation"; - quick_test(sql, expected); -} - -#[test] -fn lateral_comma_join() { - let sql = "SELECT j1_string, j2_string FROM - j1, \ - LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2"; - let expected = "Projection: j1.j1_string, j2.j2_string\ - \n Cross Join: \ - \n TableScan: j1\ - \n SubqueryAlias: j2\ - \n Subquery:\ - \n Projection: *\ - \n Filter: outer_ref(j1.j1_id) < j2.j2_id\ - \n TableScan: j2"; - quick_test(sql, expected); -} - -#[test] -fn lateral_comma_join_referencing_join_rhs() { - let sql = "SELECT * FROM\ - \n j1 JOIN (j2 JOIN j3 ON(j2_id = j3_id - 2)) ON(j1_id = j2_id),\ - \n LATERAL (SELECT * FROM j3 WHERE j3_string = j2_string) as j4;"; - let expected = "Projection: *\ - \n Cross Join: \ - \n Inner Join: Filter: j1.j1_id = j2.j2_id\ - \n TableScan: j1\ - \n Inner Join: Filter: j2.j2_id = j3.j3_id - Int64(2)\ - \n TableScan: j2\ - \n TableScan: j3\ - \n SubqueryAlias: j4\ - \n Subquery:\ - \n Projection: *\ - \n Filter: j3.j3_string = outer_ref(j2.j2_string)\ - \n TableScan: j3"; - quick_test(sql, expected); -} - -#[test] -fn lateral_comma_join_with_shadowing() { - // The j1_id on line 3 references the (closest) j1 definition from line 2. 
- let sql = "\ - SELECT * FROM j1, LATERAL (\ - SELECT * FROM j1, LATERAL (\ - SELECT * FROM j2 WHERE j1_id = j2_id\ - ) as j2\ - ) as j2;"; - let expected = "Projection: *\ - \n Cross Join: \ - \n TableScan: j1\ - \n SubqueryAlias: j2\ - \n Subquery:\ - \n Projection: *\ - \n Cross Join: \ - \n TableScan: j1\ - \n SubqueryAlias: j2\ - \n Subquery:\ - \n Projection: *\ - \n Filter: outer_ref(j1.j1_id) = j2.j2_id\ - \n TableScan: j2"; - quick_test(sql, expected); -} - -#[test] -fn lateral_left_join() { - let sql = "SELECT j1_string, j2_string FROM \ - j1 \ - LEFT JOIN LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2 ON(true);"; - let expected = "Projection: j1.j1_string, j2.j2_string\ - \n Left Join: Filter: Boolean(true)\ - \n TableScan: j1\ - \n SubqueryAlias: j2\ - \n Subquery:\ - \n Projection: *\ - \n Filter: outer_ref(j1.j1_id) < j2.j2_id\ - \n TableScan: j2"; - quick_test(sql, expected); -} - -#[test] -fn lateral_nested_left_join() { - let sql = "SELECT * FROM - j1, \ - (j2 LEFT JOIN LATERAL (SELECT * FROM j3 WHERE j1_id + j2_id = j3_id) AS j3 ON(true))"; - let expected = "Projection: *\ - \n Cross Join: \ - \n TableScan: j1\ - \n Left Join: Filter: Boolean(true)\ - \n TableScan: j2\ - \n SubqueryAlias: j3\ - \n Subquery:\ - \n Projection: *\ - \n Filter: outer_ref(j1.j1_id) + outer_ref(j2.j2_id) = j3.j3_id\ - \n TableScan: j3"; - quick_test(sql, expected); -} - -#[test] -fn lateral_unnest() { - let sql = "SELECT * from unnest_table u, unnest(u.array_col)"; - let expected = "Projection: *\ - \n Cross Join: \ - \n SubqueryAlias: u\ - \n TableScan: unnest_table\ - \n Subquery:\ - \n Projection: __unnest_placeholder(outer_ref(u.array_col),depth=1) AS UNNEST(outer_ref(u.array_col))\ - \n Unnest: lists[__unnest_placeholder(outer_ref(u.array_col))|depth=1] structs[]\ - \n Projection: outer_ref(u.array_col) AS __unnest_placeholder(outer_ref(u.array_col))\ - \n EmptyRelation"; - quick_test(sql, expected); -} - #[test] fn hive_aggregate_with_filter() -> Result<()> { let dialect = &HiveDialect {}; @@ -3515,20 +3076,6 @@ fn test_one_side_constant_full_join() { quick_test(sql, expected); } -#[test] -fn test_select_all_inner_join() { - let sql = "SELECT * - FROM person \ - INNER JOIN orders \ - ON orders.customer_id * 2 = person.id + 10"; - - let expected = "Projection: *\ - \n Inner Join: Filter: orders.customer_id * Int64(2) = person.id + Int64(10)\ - \n TableScan: person\ - \n TableScan: orders"; - quick_test(sql, expected); -} - #[test] fn test_select_join_key_inner_join() { let sql = "SELECT orders.customer_id * 2, person.id + 10 @@ -4258,34 +3805,6 @@ fn test_prepare_statement_to_plan_limit() { prepare_stmt_replace_params_quick_test(plan, param_values, expected_plan); } -#[test] -fn test_prepare_statement_to_plan_value_list() { - let sql = "PREPARE my_plan(STRING, STRING) AS SELECT * FROM (VALUES(1, $1), (2, $2)) AS t (num, letter);"; - - let expected_plan = "Prepare: \"my_plan\" [Utf8, Utf8] \ - \n Projection: *\ - \n SubqueryAlias: t\ - \n Projection: column1 AS num, column2 AS letter\ - \n Values: (Int64(1), $1), (Int64(2), $2)"; - - let expected_dt = "[Utf8, Utf8]"; - - let plan = prepare_stmt_quick_test(sql, expected_plan, expected_dt); - - /////////////////// - // replace params with values - let param_values = vec![ - ScalarValue::from("a".to_string()), - ScalarValue::from("b".to_string()), - ]; - let expected_plan = "Projection: *\ - \n SubqueryAlias: t\ - \n Projection: column1 AS num, column2 AS letter\ - \n Values: (Int64(1), Utf8(\"a\") AS $1), (Int64(2), Utf8(\"b\") AS 
$2)"; - - prepare_stmt_replace_params_quick_test(plan, param_values, expected_plan); -} - #[test] fn test_prepare_statement_unknown_list_param() { let sql = "SELECT id from person where id = $2"; @@ -4320,44 +3839,6 @@ fn test_prepare_statement_bad_list_idx() { assert_contains!(err.to_string(), "Error during planning: Failed to parse placeholder id: invalid digit found in string"); } -#[test] -fn test_table_alias() { - let sql = "select * from (\ - (select id from person) t1 \ - CROSS JOIN \ - (select age from person) t2 \ - ) as f"; - - let expected = "Projection: *\ - \n SubqueryAlias: f\ - \n Cross Join: \ - \n SubqueryAlias: t1\ - \n Projection: person.id\ - \n TableScan: person\ - \n SubqueryAlias: t2\ - \n Projection: person.age\ - \n TableScan: person"; - quick_test(sql, expected); - - let sql = "select * from (\ - (select id from person) t1 \ - CROSS JOIN \ - (select age from person) t2 \ - ) as f (c1, c2)"; - - let expected = "Projection: *\ - \n SubqueryAlias: f\ - \n Projection: t1.id AS c1, t2.age AS c2\ - \n Cross Join: \ - \n SubqueryAlias: t1\ - \n Projection: person.id\ - \n TableScan: person\ - \n SubqueryAlias: t2\ - \n Projection: person.age\ - \n TableScan: person"; - quick_test(sql, expected); -} - #[test] fn test_inner_join_with_cast_key() { let sql = "SELECT person.id, person.age diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 5742f583acf7..492457d80e97 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -42,7 +42,7 @@ async-trait = { workspace = true } bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } -clap = { version = "4.5.31", features = ["derive", "env"] } +clap = { version = "4.5.32", features = ["derive", "env"] } datafusion = { workspace = true, default-features = true, features = ["avro"] } futures = { workspace = true } half = { workspace = true, default-features = true } @@ -52,7 +52,7 @@ log = { workspace = true } object_store = { workspace = true } postgres-protocol = { version = "0.6.7", optional = true } postgres-types = { version = "0.2.8", features = ["derive", "with-chrono-0_4"], optional = true } -rust_decimal = { version = "1.36.0", features = ["tokio-pg"] } +rust_decimal = { version = "1.37.1", features = ["tokio-pg"] } # When updating the following dependency verify that sqlite test file regeneration works correctly # by running the regenerate_sqlite_files.sh script. sqllogictest = "0.28.0" diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index a18455476dab..77162f4001ae 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -28,8 +28,8 @@ This crate is a submodule of DataFusion that contains an implementation of [sqll ## Overview This crate uses [sqllogictest-rs](https://github.com/risinglightdb/sqllogictest-rs) to parse and run `.slt` files in the -[`test_files`](test_files) directory of this crate or the [`data/sqlite`](sqlite) -directory of the datafusion-testing crate. +[`test_files`](test_files) directory of this crate or the [`data/sqlite`](https://github.com/apache/datafusion-testing/tree/main/data/sqlite) +directory of the [datafusion-testing](https://github.com/apache/datafusion-testing) crate. 
## Testing setup diff --git a/datafusion/sqllogictest/src/lib.rs b/datafusion/sqllogictest/src/lib.rs index ee20e70d14f4..1a208aa3fac2 100644 --- a/datafusion/sqllogictest/src/lib.rs +++ b/datafusion/sqllogictest/src/lib.rs @@ -20,6 +20,9 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] //! DataFusion sqllogictest driver diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index bc43f6bc8e61..9d8620b100f3 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -3807,6 +3807,52 @@ SELECT MIN(value), MAX(value) FROM timestampmicrosecond statement ok DROP TABLE timestampmicrosecond; +# min_duration, max_duration +statement ok +create table d +as values + (arrow_cast(1, 'Duration(Second)'), arrow_cast(2, 'Duration(Millisecond)'), arrow_cast(3, 'Duration(Microsecond)'), arrow_cast(4, 'Duration(Nanosecond)'), 1), + (arrow_cast(11, 'Duration(Second)'),arrow_cast(22, 'Duration(Millisecond)'), arrow_cast(33, 'Duration(Microsecond)'), arrow_cast(44, 'Duration(Nanosecond)'), 1); + +query ???? +SELECT min(column1), min(column2), min(column3), min(column4) FROM d; +---- +0 days 0 hours 0 mins 1 secs 0 days 0 hours 0 mins 0.002 secs 0 days 0 hours 0 mins 0.000003 secs 0 days 0 hours 0 mins 0.000000004 secs + +query ???? +SELECT max(column1), max(column2), max(column3), max(column4) FROM d; +---- +0 days 0 hours 0 mins 11 secs 0 days 0 hours 0 mins 0.022 secs 0 days 0 hours 0 mins 0.000033 secs 0 days 0 hours 0 mins 0.000000044 secs + +# GROUP BY follows a different code path +query ????I +SELECT min(column1), min(column2), min(column3), min(column4), column5 FROM d GROUP BY column5; +---- +0 days 0 hours 0 mins 1 secs 0 days 0 hours 0 mins 0.002 secs 0 days 0 hours 0 mins 0.000003 secs 0 days 0 hours 0 mins 0.000000004 secs 1 + +query ????I +SELECT max(column1), max(column2), max(column3), max(column4), column5 FROM d GROUP BY column5; +---- +0 days 0 hours 0 mins 11 secs 0 days 0 hours 0 mins 0.022 secs 0 days 0 hours 0 mins 0.000033 secs 0 days 0 hours 0 mins 0.000000044 secs 1 + +statement ok +INSERT INTO d VALUES + (arrow_cast(3, 'Duration(Second)'), arrow_cast(1, 'Duration(Millisecond)'), arrow_cast(7, 'Duration(Microsecond)'), arrow_cast(2, 'Duration(Nanosecond)'), 1), + (arrow_cast(0, 'Duration(Second)'), arrow_cast(9, 'Duration(Millisecond)'), arrow_cast(5, 'Duration(Microsecond)'), arrow_cast(8, 'Duration(Nanosecond)'), 1); + +query ????I +SELECT max(column1), max(column2), max(column3), max(column4), column5 FROM d GROUP BY column5 ORDER BY column5; +---- +0 days 0 hours 0 mins 11 secs 0 days 0 hours 0 mins 0.022 secs 0 days 0 hours 0 mins 0.000033 secs 0 days 0 hours 0 mins 0.000000044 secs 1 + +query ????I +SELECT min(column1), min(column2), min(column3), min(column4), column5 FROM d GROUP BY column5 ORDER BY column5; +---- +0 days 0 hours 0 mins 0 secs 0 days 0 hours 0 mins 0.001 secs 0 days 0 hours 0 mins 0.000003 secs 0 days 0 hours 0 mins 0.000000002 secs 1 + +statement ok +drop table d; + # max_bool statement ok CREATE TABLE max_bool (value BOOLEAN); diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt 
b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index 3a4d641abf68..8755918cd16c 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -298,6 +298,27 @@ SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100 GROU 4 5 23 5 5 14 +# Test approx_distinct for varchar(with Utf8View) / int +statement ok +CREATE TABLE aggregate_test_100_utf8view AS SELECT + arrow_cast(c1, 'Utf8View') as c1, + c2, + c5 +FROM aggregate_test_100; + +# Test approx_distinct for varchar(with Utf8View) / int +query III +SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100_utf8view GROUP BY c2 ORDER BY c2; +---- +1 5 22 +2 5 22 +3 5 19 +4 5 23 +5 5 14 + +statement ok +DROP TABLE aggregate_test_100_utf8view; + # Test count with nullable fields query III SELECT c2, count(c3), count(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; diff --git a/datafusion/sqllogictest/test_files/aggregates_topk.slt b/datafusion/sqllogictest/test_files/aggregates_topk.slt index 5fa0845cd2d5..cc1693843848 100644 --- a/datafusion/sqllogictest/test_files/aggregates_topk.slt +++ b/datafusion/sqllogictest/test_files/aggregates_topk.slt @@ -18,7 +18,6 @@ ####### # Setup test data table ####### - # TopK aggregation statement ok CREATE TABLE traces(trace_id varchar, timestamp bigint, other bigint) AS VALUES @@ -214,5 +213,62 @@ a -1 -1 NULL 0 0 c 1 2 + +# Setting to map varchar to utf8view, to test PR https://github.com/apache/datafusion/pull/15152 +# Before the PR, the test case would not work because the Utf8View will not be supported by the TopK aggregation +statement ok +CREATE TABLE traces_utf8view +AS SELECT + arrow_cast(trace_id, 'Utf8View') as trace_id, + timestamp, + other +FROM traces; + +query TT +explain select trace_id, MAX(timestamp) from traces_utf8view group by trace_id order by MAX(timestamp) desc limit 4; +---- +logical_plan +01)Sort: max(traces_utf8view.timestamp) DESC NULLS FIRST, fetch=4 +02)--Aggregate: groupBy=[[traces_utf8view.trace_id]], aggr=[[max(traces_utf8view.timestamp)]] +03)----TableScan: traces_utf8view projection=[trace_id, timestamp] +physical_plan +01)SortPreservingMergeExec: [max(traces_utf8view.timestamp)@1 DESC], fetch=4 +02)--SortExec: TopK(fetch=4), expr=[max(traces_utf8view.timestamp)@1 DESC], preserve_partitioning=[true] +03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces_utf8view.timestamp)], lim=[4] +04)------CoalesceBatchesExec: target_batch_size=8192 +05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4 +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces_utf8view.timestamp)], lim=[4] +08)--------------DataSourceExec: partitions=1, partition_sizes=[1] + + +# Also add LargeUtf8 to test PR https://github.com/apache/datafusion/pull/15152 +# Before the PR, the test case would not work because the LargeUtf8 will not be supported by the TopK aggregation +statement ok +CREATE TABLE traces_largeutf8 +AS SELECT + arrow_cast(trace_id, 'LargeUtf8') as trace_id, + timestamp, + other +FROM traces; + +query TT +explain select trace_id, MAX(timestamp) from traces_largeutf8 group by trace_id order by MAX(timestamp) desc limit 4; +---- +logical_plan +01)Sort: max(traces_largeutf8.timestamp) DESC NULLS FIRST, fetch=4 +02)--Aggregate: groupBy=[[traces_largeutf8.trace_id]], 
aggr=[[max(traces_largeutf8.timestamp)]] +03)----TableScan: traces_largeutf8 projection=[trace_id, timestamp] +physical_plan +01)SortPreservingMergeExec: [max(traces_largeutf8.timestamp)@1 DESC], fetch=4 +02)--SortExec: TopK(fetch=4), expr=[max(traces_largeutf8.timestamp)@1 DESC], preserve_partitioning=[true] +03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces_largeutf8.timestamp)], lim=[4] +04)------CoalesceBatchesExec: target_batch_size=8192 +05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4 +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces_largeutf8.timestamp)], lim=[4] +08)--------------DataSourceExec: partitions=1, partition_sizes=[1] + + statement ok drop table traces; diff --git a/datafusion/sqllogictest/test_files/alias.slt b/datafusion/sqllogictest/test_files/alias.slt new file mode 100644 index 000000000000..5339179db4c4 --- /dev/null +++ b/datafusion/sqllogictest/test_files/alias.slt @@ -0,0 +1,59 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +# test table alias +statement count 0 +create table t1(id int); + +statement count 0 +create table t2(age int); + +query TT +explain select * from ((select id from t1) cross join (select age from t2)) as f; +---- +logical_plan +01)SubqueryAlias: f +02)--Cross Join: +03)----TableScan: t1 projection=[id] +04)----TableScan: t2 projection=[age] +physical_plan +01)CrossJoinExec +02)--DataSourceExec: partitions=1, partition_sizes=[0] +03)--DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain select * from ((select id from t1) cross join (select age from t2)) as f(c1, c2); +---- +logical_plan +01)SubqueryAlias: f +02)--Projection: t1.id AS c1, t2.age AS c2 +03)----Cross Join: +04)------TableScan: t1 projection=[id] +05)------TableScan: t2 projection=[age] +physical_plan +01)ProjectionExec: expr=[id@0 as c1, age@1 as c2] +02)--CrossJoinExec +03)----DataSourceExec: partitions=1, partition_sizes=[0] +04)----DataSourceExec: partitions=1, partition_sizes=[0] + +statement count 0 +drop table t1; + +statement count 0 +drop table t2; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 3b7f12960681..509c7c182a7f 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5905,6 +5905,15 @@ false false false false false false false false false false false false +query BBBB +select array_has_all(make_array(1,2,3), []), + array_has_any(make_array(1,2,3), []), + array_has_all(make_array('aa','bb','cc'), []), + array_has_any(make_array('aa','bb','cc'), []) +; +---- +true false true false + query BBBBBBBBBBBBB select array_has_all(make_array(1,2,3), make_array(1,3)), array_has_all(make_array(1,2,3), make_array(1,4)), diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index f39ff56ce449..e2bb23e35732 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -631,3 +631,20 @@ COPY source_table to '/tmp/table.parquet' (row_group_size 55 + 102); # Copy using execution.keep_partition_by_columns with an invalid value query error DataFusion error: Invalid or Unsupported Configuration: provided value for 'execution.keep_partition_by_columns' was not recognized: "invalid_value" COPY source_table to '/tmp/table.parquet' OPTIONS (execution.keep_partition_by_columns invalid_value); + +statement count 0 +create table t; + +query TT +explain COPY (select * from t limit 10) to 'output.csv'; +---- +logical_plan +01)CopyTo: format=csv output_url=output.csv options: () +02)--Limit: skip=0, fetch=10 +03)----TableScan: t projection=[], fetch=10 +physical_plan +01)DataSinkExec: sink=CsvSink(file_groups=[]) +02)--DataSourceExec: partitions=1, partition_sizes=[0], fetch=10 + +statement count 0 +drop table t; diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index 95b9b5a9252e..e019af9775a4 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -859,3 +859,149 @@ SELECT * FROM 400 500 1 400 500 2 400 500 3 + +query error DataFusion error: Error during planning: Source table contains 3 columns but only 1 names given as column alias +with numbers(a) as (select 1, 2, 3) select * from numbers; + +query TT +explain with numbers(a,b,c) as (select 1 as x, 2 as y, 3 as z) select * from numbers; +---- +logical_plan +01)SubqueryAlias: numbers +02)--Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c 
+03)----EmptyRelation +physical_plan +01)ProjectionExec: expr=[1 as a, 2 as b, 3 as c] +02)--PlaceholderRowExec + +query TT +explain with numbers(a,b,c) as (select 1,2,3) select * from numbers; +---- +logical_plan +01)SubqueryAlias: numbers +02)--Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c +03)----EmptyRelation +physical_plan +01)ProjectionExec: expr=[1 as a, 2 as b, 3 as c] +02)--PlaceholderRowExec + +query TT +explain with numbers as (select 1 as a, 2 as b, 3 as c) select * from numbers; +---- +logical_plan +01)SubqueryAlias: numbers +02)--Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c +03)----EmptyRelation +physical_plan +01)ProjectionExec: expr=[1 as a, 2 as b, 3 as c] +02)--PlaceholderRowExec + +statement count 0 +create table person (id int, name string, primary key(id)) + +query TT +explain with cte as (select * from person) SELECT * FROM person WHERE EXISTS (SELECT * FROM cte WHERE id = person.id); +---- +logical_plan +01)LeftSemi Join: person.id = __correlated_sq_1.id +02)--TableScan: person projection=[id, name] +03)--SubqueryAlias: __correlated_sq_1 +04)----SubqueryAlias: cte +05)------TableScan: person projection=[id] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8182 +02)--HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(id@0, id@0)] +03)----DataSourceExec: partitions=1, partition_sizes=[0] +04)----DataSourceExec: partitions=1, partition_sizes=[0] + +statement count 0 +drop table person; + +statement count 0 +create table j1(a int); + +statement count 0 +create table j2(b int); + +query TT +explain SELECT * FROM j1, LATERAL (SELECT 1) AS j2; +---- +logical_plan +01)Cross Join: +02)--TableScan: j1 projection=[a] +03)--SubqueryAlias: j2 +04)----Projection: Int64(1) +05)------EmptyRelation +physical_plan +01)CrossJoinExec +02)--DataSourceExec: partitions=1, partition_sizes=[0] +03)--ProjectionExec: expr=[1 as Int64(1)] +04)----PlaceholderRowExec + +statement count 0 +drop table j1; + +statement count 0 +drop table j2; + +query TT +explain WITH RECURSIVE numbers AS ( + select 1 as n + UNION ALL + select n + 1 FROM numbers WHERE N < 10 +) select * from numbers; +---- +logical_plan +01)SubqueryAlias: numbers +02)--RecursiveQuery: is_distinct=false +03)----Projection: Int64(1) AS n +04)------EmptyRelation +05)----Projection: numbers.n + Int64(1) +06)------Filter: numbers.n < Int64(10) +07)--------TableScan: numbers +physical_plan +01)RecursiveQueryExec: name=numbers, is_distinct=false +02)--ProjectionExec: expr=[1 as n] +03)----PlaceholderRowExec +04)--CoalescePartitionsExec +05)----ProjectionExec: expr=[n@0 + 1 as numbers.n + Int64(1)] +06)------CoalesceBatchesExec: target_batch_size=8182 +07)--------FilterExec: n@0 < 10 +08)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)------------WorkTableExec: name=numbers + +query TT +explain WITH RECURSIVE numbers AS ( + select 1 as n + UNION ALL + select n + 1 FROM numbers WHERE N < 10 +) select * from numbers; +---- +logical_plan +01)SubqueryAlias: numbers +02)--RecursiveQuery: is_distinct=false +03)----Projection: Int64(1) AS n +04)------EmptyRelation +05)----Projection: numbers.n + Int64(1) +06)------Filter: numbers.n < Int64(10) +07)--------TableScan: numbers +physical_plan +01)RecursiveQueryExec: name=numbers, is_distinct=false +02)--ProjectionExec: expr=[1 as n] +03)----PlaceholderRowExec +04)--CoalescePartitionsExec +05)----ProjectionExec: expr=[n@0 + 1 as numbers.n + Int64(1)] +06)------CoalesceBatchesExec: target_batch_size=8182 +07)--------FilterExec: n@0 < 
10 +08)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)------------WorkTableExec: name=numbers + +statement count 0 +set datafusion.execution.enable_recursive_ctes = false; + +query error DataFusion error: This feature is not implemented: Recursive CTEs are not enabled +explain WITH RECURSIVE numbers AS ( + select 1 as n + UNION ALL + select n + 1 FROM numbers WHERE N < 10 +) select * from numbers; diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 6f75a7d7f8fd..088d0155a66f 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -827,3 +827,57 @@ drop table table_with_pk; statement ok set datafusion.catalog.information_schema = false; + +# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_varchar_to_utf8view to true +statement ok +CREATE TABLE t1(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); + +query TTT +DESCRIBE t1; +---- +c1 Utf8 NO +c2 Utf8 YES + +statement ok +set datafusion.sql_parser.map_varchar_to_utf8view = true; + +statement ok +CREATE TABLE t2(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); + +query TTT +DESCRIBE t2; +---- +c1 Utf8View NO +c2 Utf8View YES + +statement ok +DROP TABLE t1; + +statement ok +DROP TABLE t2; + +statement count 0 +create table t(a int) as values (1), (2), (3); + +statement count 0 +create view v as select a, count(a) from t group by a; + +query II rowsort +select * from v; +---- +1 1 +2 1 +3 1 + +query II rowsort +select "count(t.a)", a from v; +---- +1 1 +1 2 +1 3 + +statement count 0 +drop view v; + +statement count 0 +drop table t; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index cab7308f6ff8..2e27ebe927d2 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -174,8 +174,6 @@ EXPLAIN VERBOSE SELECT a, b, c FROM simple_explain_test initial_logical_plan 01)Projection: simple_explain_test.a, simple_explain_test.b, simple_explain_test.c 02)--TableScan: simple_explain_test -logical_plan after inline_table_scan SAME TEXT AS ABOVE -logical_plan after expand_wildcard_rule SAME TEXT AS ABOVE logical_plan after resolve_grouping_function SAME TEXT AS ABOVE logical_plan after type_coercion SAME TEXT AS ABOVE analyzed_logical_plan SAME TEXT AS ABOVE @@ -435,3 +433,135 @@ drop table t1; statement ok drop table t2; + +## Tests for explain format + +statement ok +create table foo (x int, y int) as values (1,2), (1,3), (2,4); + +# defaults to indent mode +query TT +explain select * from values (1); +---- +logical_plan Values: (Int64(1)) +physical_plan DataSourceExec: partitions=1, partition_sizes=[1] + +# can explicitly request indent mode +query TT +explain format indent select * from values (1); +---- +logical_plan Values: (Int64(1)) +physical_plan DataSourceExec: partitions=1, partition_sizes=[1] + +# tree mode +query TT +explain format tree select * from values (1); +---- +physical_plan +01)┌───────────────────────────┐ +02)│ DataSourceExec │ +03)│ -------------------- │ +04)│ bytes: 128 │ +05)│ format: memory │ +06)│ rows: 1 │ +07)└───────────────────────────┘ + +# is not case sensitive +query TT +explain format TrEE select * from values (1); +---- +physical_plan +01)┌───────────────────────────┐ +02)│ DataSourceExec │ +03)│ -------------------- │ +04)│ bytes: 128 │ +05)│ format: memory │ +06)│ rows: 1 │ +07)└───────────────────────────┘ + +# wrapped in single quotes +query TT +explain 
format 'tree' select * from values (1); +---- +physical_plan +01)┌───────────────────────────┐ +02)│ DataSourceExec │ +03)│ -------------------- │ +04)│ bytes: 128 │ +05)│ format: memory │ +06)│ rows: 1 │ +07)└───────────────────────────┘ + +# wrapped in double quotes +query TT +explain format "tree" select * from values (1); +---- +physical_plan +01)┌───────────────────────────┐ +02)│ DataSourceExec │ +03)│ -------------------- │ +04)│ bytes: 128 │ +05)│ format: memory │ +06)│ rows: 1 │ +07)└───────────────────────────┘ + +# number is not a valid format +query error DataFusion error: SQL error: ParserError\("Expected: an explain format such as TREE, found: 123 at Line: 1, Column: 16"\) +explain format 123 select * from values (1); + +# verbose tree mode not supported +query error DataFusion error: Error during planning: EXPLAIN VERBOSE with FORMAT is not supported +explain verbose format tree select * from values (1); + +# no such thing as json mode +query error DataFusion error: Error during planning: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'json' +explain format json select * from values (1); + +query error DataFusion error: Error during planning: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'foo' +explain format foo select * from values (1); + +# pgjson mode +query TT +explain format pgjson select * from values (1); +---- +logical_plan +01)[ +02)--{ +03)----"Plan": { +04)------"Node Type": "Values", +05)------"Output": [ +06)--------"column1" +07)------], +08)------"Plans": [], +09)------"Values": "(Int64(1))" +10)----} +11)--} +12)] + +# graphviz mode +query TT +explain format graphviz select * from values (1); +---- +logical_plan +01) +02)// Begin DataFusion GraphViz Plan, +03)// display it online here: https://dreampuf.github.io/GraphvizOnline +04) +05)digraph { +06)--subgraph cluster_1 +07)--{ +08)----graph[label="LogicalPlan"] +09)----2[shape=box label="Values: (Int64(1))"] +10)--} +11)--subgraph cluster_3 +12)--{ +13)----graph[label="Detailed LogicalPlan"] +14)----4[shape=box label="Values: (Int64(1))\nSchema: [column1:Int64;N]"] +15)--} +16)} +17)// End DataFusion GraphViz Plan + +# unknown mode + +statement ok +drop table foo; diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index 4470cf9fae59..aaea05be76b5 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -133,6 +133,31 @@ AS SELECT FROM hashjoin_datatype_table_t2_source +statement ok +CREATE UNBOUNDED EXTERNAL TABLE sink_table ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT NOT NULL, + c5 INTEGER NOT NULL, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 INT UNSIGNED NOT NULL, + c10 BIGINT UNSIGNED NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL + ) +STORED AS CSV +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +statement ok +CREATE TABLE limit_table AS +SELECT * FROM table1 +UNION ALL SELECT * FROM table1 + ######## Begin Queries ######## # Filter @@ -142,22 +167,31 @@ explain SELECT int_col FROM table1 WHERE string_col != 'foo'; physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != foo │ 
-09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ RepartitionExec │ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != foo │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ DataSourceExec │ +14)│ RepartitionExec │ 15)│ -------------------- │ -16)│ files: 1 │ -17)│ format: csv │ -18)└───────────────────────────┘ +16)│ output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: csv │ +27)└───────────────────────────┘ # Aggregate query TT @@ -166,25 +200,55 @@ explain SELECT string_col, SUM(bigint_col) FROM table1 GROUP BY string_col; physical_plan 01)┌───────────────────────────┐ 02)│ AggregateExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ CoalesceBatchesExec │ -06)└─────────────┬─────────────┘ -07)┌─────────────┴─────────────┐ -08)│ RepartitionExec │ -09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ AggregateExec │ -12)└─────────────┬─────────────┘ -13)┌─────────────┴─────────────┐ -14)│ RepartitionExec │ -15)└─────────────┬─────────────┘ -16)┌─────────────┴─────────────┐ -17)│ DataSourceExec │ -18)│ -------------------- │ -19)│ files: 1 │ -20)│ format: csv │ -21)└───────────────────────────┘ +03)│ -------------------- │ +04)│ aggr: │ +05)│ sum(table1.bigint_col) │ +06)│ │ +07)│ group_by: string_col │ +08)│ │ +09)│ mode: │ +10)│ FinalPartitioned │ +11)└─────────────┬─────────────┘ +12)┌─────────────┴─────────────┐ +13)│ CoalesceBatchesExec │ +14)│ -------------------- │ +15)│ target_batch_size: │ +16)│ 8192 │ +17)└─────────────┬─────────────┘ +18)┌─────────────┴─────────────┐ +19)│ RepartitionExec │ +20)│ -------------------- │ +21)│ output_partition_count: │ +22)│ 4 │ +23)│ │ +24)│ partitioning_scheme: │ +25)│ Hash([string_col@0], 4) │ +26)└─────────────┬─────────────┘ +27)┌─────────────┴─────────────┐ +28)│ AggregateExec │ +29)│ -------------------- │ +30)│ aggr: │ +31)│ sum(table1.bigint_col) │ +32)│ │ +33)│ group_by: string_col │ +34)│ mode: Partial │ +35)└─────────────┬─────────────┘ +36)┌─────────────┴─────────────┐ +37)│ RepartitionExec │ +38)│ -------------------- │ +39)│ output_partition_count: │ +40)│ 1 │ +41)│ │ +42)│ partitioning_scheme: │ +43)│ RoundRobinBatch(4) │ +44)└─────────────┬─────────────┘ +45)┌─────────────┴─────────────┐ +46)│ DataSourceExec │ +47)│ -------------------- │ +48)│ files: 1 │ +49)│ format: csv │ +50)└───────────────────────────┘ + # Limit query TT @@ -204,6 +268,23 @@ physical_plan 11)│ format: csv │ 12)└───────────────────────────┘ +query TT +explain SELECT * FROM limit_table LIMIT 10; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ CoalescePartitionsExec │ +03)│ -------------------- │ +04)│ limit: 10 │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ DataSourceExec │ +08)│ -------------------- │ +09)│ bytes: 3120 │ +10)│ format: memory │ +11)│ rows: 2 │ +12)└───────────────────────────┘ + # 2 Joins query TT explain SELECT table1.string_col, table2.date_col FROM table1 JOIN table2 ON table1.int_col = table2.int_col; @@ -211,28 +292,46 @@ explain SELECT table1.string_col, table2.date_col FROM table1 JOIN table2 ON tab physical_plan 
01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ HashJoinExec │ -06)│ -------------------- │ -07)│ on: ├──────────────┐ -08)│ (int_col@0 = int_col@0) │ │ -09)└─────────────┬─────────────┘ │ -10)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -11)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -12)└─────────────┬─────────────┘└─────────────┬─────────────┘ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ HashJoinExec │ +09)│ -------------------- │ +10)│ on: ├──────────────┐ +11)│ (int_col = int_col) │ │ +12)└─────────────┬─────────────┘ │ 13)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -14)│ RepartitionExec ││ RepartitionExec │ -15)└─────────────┬─────────────┘└─────────────┬─────────────┘ -16)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -17)│ RepartitionExec ││ RepartitionExec │ +14)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +15)│ -------------------- ││ -------------------- │ +16)│ target_batch_size: ││ target_batch_size: │ +17)│ 8192 ││ 8192 │ 18)└─────────────┬─────────────┘└─────────────┬─────────────┘ 19)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -20)│ DataSourceExec ││ DataSourceExec │ +20)│ RepartitionExec ││ RepartitionExec │ 21)│ -------------------- ││ -------------------- │ -22)│ files: 1 ││ files: 1 │ -23)│ format: csv ││ format: parquet │ -24)└───────────────────────────┘└───────────────────────────┘ +22)│ output_partition_count: ││ output_partition_count: │ +23)│ 4 ││ 4 │ +24)│ ││ │ +25)│ partitioning_scheme: ││ partitioning_scheme: │ +26)│ Hash([int_col@0], 4) ││ Hash([int_col@0], 4) │ +27)└─────────────┬─────────────┘└─────────────┬─────────────┘ +28)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +29)│ RepartitionExec ││ RepartitionExec │ +30)│ -------------------- ││ -------------------- │ +31)│ output_partition_count: ││ output_partition_count: │ +32)│ 1 ││ 1 │ +33)│ ││ │ +34)│ partitioning_scheme: ││ partitioning_scheme: │ +35)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ +36)└─────────────┬─────────────┘└─────────────┬─────────────┘ +37)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +38)│ DataSourceExec ││ DataSourceExec │ +39)│ -------------------- ││ -------------------- │ +40)│ files: 1 ││ files: 1 │ +41)│ format: csv ││ format: parquet │ +42)└───────────────────────────┘└───────────────────────────┘ # 3 Joins query TT @@ -247,41 +346,62 @@ FROM physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ HashJoinExec │ -06)│ -------------------- │ -07)│ on: ├───────────────────────────────────────────┐ -08)│ (int_col@1 = int_col@0) │ │ -09)└─────────────┬─────────────┘ │ -10)┌─────────────┴─────────────┐ ┌─────────────┴─────────────┐ -11)│ CoalesceBatchesExec │ │ CoalesceBatchesExec │ -12)└─────────────┬─────────────┘ └─────────────┬─────────────┘ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ HashJoinExec │ +09)│ -------------------- │ +10)│ on: ├───────────────────────────────────────────┐ +11)│ (int_col = int_col) │ │ +12)└─────────────┬─────────────┘ │ 13)┌─────────────┴─────────────┐ ┌─────────────┴─────────────┐ -14)│ HashJoinExec │ │ RepartitionExec │ -15)│ -------------------- │ │ │ -16)│ on: ├──────────────┐ │ │ -17)│ 
(int_col@0 = int_col@0) │ │ │ │ -18)└─────────────┬─────────────┘ │ └─────────────┬─────────────┘ -19)┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -20)│ CoalesceBatchesExec ││ CoalesceBatchesExec ││ DataSourceExec │ -21)│ ││ ││ -------------------- │ -22)│ ││ ││ bytes: 1560 │ -23)│ ││ ││ format: memory │ -24)│ ││ ││ rows: 1 │ -25)└─────────────┬─────────────┘└─────────────┬─────────────┘└───────────────────────────┘ -26)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -27)│ RepartitionExec ││ RepartitionExec │ -28)└─────────────┬─────────────┘└─────────────┬─────────────┘ -29)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -30)│ RepartitionExec ││ RepartitionExec │ -31)└─────────────┬─────────────┘└─────────────┬─────────────┘ -32)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -33)│ DataSourceExec ││ DataSourceExec │ -34)│ -------------------- ││ -------------------- │ -35)│ files: 1 ││ files: 1 │ -36)│ format: csv ││ format: parquet │ -37)└───────────────────────────┘└───────────────────────────┘ +14)│ CoalesceBatchesExec │ │ CoalesceBatchesExec │ +15)│ -------------------- │ │ -------------------- │ +16)│ target_batch_size: │ │ target_batch_size: │ +17)│ 8192 │ │ 8192 │ +18)└─────────────┬─────────────┘ └─────────────┬─────────────┘ +19)┌─────────────┴─────────────┐ ┌─────────────┴─────────────┐ +20)│ HashJoinExec │ │ RepartitionExec │ +21)│ -------------------- │ │ -------------------- │ +22)│ on: │ │ output_partition_count: │ +23)│ (int_col = int_col) ├──────────────┐ │ 1 │ +24)│ │ │ │ │ +25)│ │ │ │ partitioning_scheme: │ +26)│ │ │ │ Hash([int_col@0], 4) │ +27)└─────────────┬─────────────┘ │ └─────────────┬─────────────┘ +28)┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +29)│ CoalesceBatchesExec ││ CoalesceBatchesExec ││ DataSourceExec │ +30)│ -------------------- ││ -------------------- ││ -------------------- │ +31)│ target_batch_size: ││ target_batch_size: ││ bytes: 1560 │ +32)│ 8192 ││ 8192 ││ format: memory │ +33)│ ││ ││ rows: 1 │ +34)└─────────────┬─────────────┘└─────────────┬─────────────┘└───────────────────────────┘ +35)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +36)│ RepartitionExec ││ RepartitionExec │ +37)│ -------------------- ││ -------------------- │ +38)│ output_partition_count: ││ output_partition_count: │ +39)│ 4 ││ 4 │ +40)│ ││ │ +41)│ partitioning_scheme: ││ partitioning_scheme: │ +42)│ Hash([int_col@0], 4) ││ Hash([int_col@0], 4) │ +43)└─────────────┬─────────────┘└─────────────┬─────────────┘ +44)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +45)│ RepartitionExec ││ RepartitionExec │ +46)│ -------------------- ││ -------------------- │ +47)│ output_partition_count: ││ output_partition_count: │ +48)│ 1 ││ 1 │ +49)│ ││ │ +50)│ partitioning_scheme: ││ partitioning_scheme: │ +51)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ +52)└─────────────┬─────────────┘└─────────────┬─────────────┘ +53)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +54)│ DataSourceExec ││ DataSourceExec │ +55)│ -------------------- ││ -------------------- │ +56)│ files: 1 ││ files: 1 │ +57)│ format: csv ││ format: parquet │ +58)└───────────────────────────┘└───────────────────────────┘ # Long Filter (demonstrate what happens with wrapping) query TT @@ -292,26 +412,35 @@ WHERE string_col != 'foo' AND string_col != 'bar' AND string_col != 'a really lo physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ 
-03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != foo AND │ -09)│ string_col@1 != bar │ -10)│ AND string_col@1 != a │ -11)│ really long string │ -12)│ constant │ -13)└─────────────┬─────────────┘ -14)┌─────────────┴─────────────┐ -15)│ RepartitionExec │ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != foo AND │ +12)│ string_col != bar │ +13)│ AND string_col != a │ +14)│ really long string │ +15)│ constant │ 16)└─────────────┬─────────────┘ 17)┌─────────────┴─────────────┐ -18)│ DataSourceExec │ +18)│ RepartitionExec │ 19)│ -------------------- │ -20)│ files: 1 │ -21)│ format: csv │ -22)└───────────────────────────┘ +20)│ output_partition_count: │ +21)│ 1 │ +22)│ │ +23)│ partitioning_scheme: │ +24)│ RoundRobinBatch(4) │ +25)└─────────────┬─────────────┘ +26)┌─────────────┴─────────────┐ +27)│ DataSourceExec │ +28)│ -------------------- │ +29)│ files: 1 │ +30)│ format: csv │ +31)└───────────────────────────┘ # Check maximum line limit. query TT @@ -321,16 +450,16 @@ WHERE string_col != 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != │ -09)│ aaaaaaaaaaaaaa │ -10)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ -11)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ -12)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != │ +12)│ aaaaaaaaaaaa │ 13)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ 14)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ 15)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ @@ -354,68 +483,96 @@ physical_plan 33)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ 34)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ 35)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ -36)│ ... │ -37)└─────────────┬─────────────┘ -38)┌─────────────┴─────────────┐ -39)│ RepartitionExec │ +36)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ +37)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ +38)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│ +39)│ ... │ 40)└─────────────┬─────────────┘ 41)┌─────────────┴─────────────┐ -42)│ DataSourceExec │ +42)│ RepartitionExec │ 43)│ -------------------- │ -44)│ files: 1 │ -45)│ format: csv │ -46)└───────────────────────────┘ +44)│ output_partition_count: │ +45)│ 1 │ +46)│ │ +47)│ partitioning_scheme: │ +48)│ RoundRobinBatch(4) │ +49)└─────────────┬─────────────┘ +50)┌─────────────┴─────────────┐ +51)│ DataSourceExec │ +52)│ -------------------- │ +53)│ files: 1 │ +54)│ format: csv │ +55)└───────────────────────────┘ # Check exactly the render width. 
query TT explain SELECT int_col FROM table1 -WHERE string_col != 'aaaaaaaaaaa'; +WHERE string_col != 'aaaaaaaaaaaaa'; ---- physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│string_col@1 != aaaaaaaaaaa│ -09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ RepartitionExec │ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│string_col != aaaaaaaaaaaaa│ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ DataSourceExec │ +14)│ RepartitionExec │ 15)│ -------------------- │ -16)│ files: 1 │ -17)│ format: csv │ -18)└───────────────────────────┘ +16)│ output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: csv │ +27)└───────────────────────────┘ # Check with the render witdth + 1. query TT explain SELECT int_col FROM table1 -WHERE string_col != 'aaaaaaaaaaaa'; +WHERE string_col != 'aaaaaaaaaaaaaaa'; ---- physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != │ -09)│ aaaaaaaaaaaa │ -10)└─────────────┬─────────────┘ -11)┌─────────────┴─────────────┐ -12)│ RepartitionExec │ -13)└─────────────┬─────────────┘ -14)┌─────────────┴─────────────┐ -15)│ DataSourceExec │ -16)│ -------------------- │ -17)│ files: 1 │ -18)│ format: csv │ -19)└───────────────────────────┘ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != │ +12)│ aaaaaaaaaaaa │ +13)│ aaa │ +14)└─────────────┬─────────────┘ +15)┌─────────────┴─────────────┐ +16)│ RepartitionExec │ +17)│ -------------------- │ +18)│ output_partition_count: │ +19)│ 1 │ +20)│ │ +21)│ partitioning_scheme: │ +22)│ RoundRobinBatch(4) │ +23)└─────────────┬─────────────┘ +24)┌─────────────┴─────────────┐ +25)│ DataSourceExec │ +26)│ -------------------- │ +27)│ files: 1 │ +28)│ format: csv │ +29)└───────────────────────────┘ # Query with filter on csv query TT @@ -424,22 +581,31 @@ explain SELECT int_col FROM table1 WHERE string_col != 'foo'; physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != foo │ -09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ RepartitionExec │ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != foo │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ DataSourceExec │ +14)│ RepartitionExec │ 15)│ -------------------- │ -16)│ files: 1 │ -17)│ format: csv │ -18)└───────────────────────────┘ +16)│ output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ 
+20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: csv │ +27)└───────────────────────────┘ # Query with filter on parquet @@ -449,25 +615,34 @@ explain SELECT int_col FROM table2 WHERE string_col != 'foo'; physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != foo │ -09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ RepartitionExec │ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != foo │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ DataSourceExec │ +14)│ RepartitionExec │ 15)│ -------------------- │ -16)│ files: 1 │ -17)│ format: parquet │ +16)│ output_partition_count: │ +17)│ 1 │ 18)│ │ -19)│ predicate: │ -20)│ string_col@1 != foo │ -21)└───────────────────────────┘ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: parquet │ +27)│ │ +28)│ predicate: │ +29)│ string_col != foo │ +30)└───────────────────────────┘ # Query with filter on memory query TT @@ -476,20 +651,23 @@ explain SELECT int_col FROM table3 WHERE string_col != 'foo'; physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != foo │ -09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ DataSourceExec │ -12)│ -------------------- │ -13)│ bytes: 1560 │ -14)│ format: memory │ -15)│ rows: 1 │ -16)└───────────────────────────┘ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != foo │ +12)└─────────────┬─────────────┘ +13)┌─────────────┴─────────────┐ +14)│ DataSourceExec │ +15)│ -------------------- │ +16)│ bytes: 1560 │ +17)│ format: memory │ +18)│ rows: 1 │ +19)└───────────────────────────┘ # Query with filter on json query TT @@ -498,22 +676,31 @@ explain SELECT int_col FROM table4 WHERE string_col != 'foo'; physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != foo │ -09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ RepartitionExec │ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != foo │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ DataSourceExec │ +14)│ RepartitionExec │ 15)│ -------------------- │ -16)│ files: 1 │ -17)│ format: json │ -18)└───────────────────────────┘ +16)│ output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ 
+21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: json │ +27)└───────────────────────────┘ # Query with filter on arrow query TT @@ -522,22 +709,31 @@ explain SELECT int_col FROM table5 WHERE string_col != 'foo'; physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ FilterExec │ -06)│ -------------------- │ -07)│ predicate: │ -08)│ string_col@1 != foo │ -09)└─────────────┬─────────────┘ -10)┌─────────────┴─────────────┐ -11)│ RepartitionExec │ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ FilterExec │ +09)│ -------------------- │ +10)│ predicate: │ +11)│ string_col != foo │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ DataSourceExec │ +14)│ RepartitionExec │ 15)│ -------------------- │ -16)│ files: 1 │ -17)│ format: arrow │ -18)└───────────────────────────┘ +16)│ output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: arrow │ +27)└───────────────────────────┘ # Query with window agg. @@ -554,7 +750,7 @@ physical_plan 07)│ count(Int64(1)) ROWS │ 08)│ BETWEEN UNBOUNDED │ 09)│ PRECEDING AND UNBOUNDED │ -10)│ FOLLOWING@0 │ +10)│ FOLLOWING │ 11)└─────────────┬─────────────┘ 12)┌─────────────┴─────────────┐ 13)│ WindowAggExec │ @@ -587,9 +783,9 @@ physical_plan 05)│ sum(t1.v1) ORDER BY [t1.v1│ 06)│ ASC NULLS LAST] ROWS │ 07)│ BETWEEN 1 PRECEDING │ -08)│ AND CURRENT ROW@1 │ +08)│ AND CURRENT ROW │ 09)│ │ -10)│ v1: v1@0 │ +10)│ v1: v1 │ 11)└─────────────┬─────────────┘ 12)┌─────────────┴─────────────┐ 13)│ BoundedWindowAggExec │ @@ -610,11 +806,16 @@ physical_plan 28)┌─────────────┴─────────────┐ 29)│ ProjectionExec │ 30)│ -------------------- │ -31)│ v1: value@0 │ +31)│ v1: value │ 32)└─────────────┬─────────────┘ 33)┌─────────────┴─────────────┐ 34)│ LazyMemoryExec │ -35)└───────────────────────────┘ +35)│ -------------------- │ +36)│ batch_generators: │ +37)│ generate_series: start=1, │ +38)│ end=1000, batch_size │ +39)│ =8192 │ +40)└───────────────────────────┘ query TT explain select @@ -632,14 +833,14 @@ physical_plan 07)│ count(Int64(1)) ROWS │ 08)│ BETWEEN UNBOUNDED │ 09)│ PRECEDING AND UNBOUNDED │ -10)│ FOLLOWING@0 │ +10)│ FOLLOWING │ 11)│ │ 12)│ row_number() ROWS BETWEEN │ 13)│ UNBOUNDED PRECEDING AND │ 14)│ UNBOUNDED FOLLOWING: │ 15)│ row_number() ROWS BETWEEN │ 16)│ UNBOUNDED PRECEDING AND │ -17)│ UNBOUNDED FOLLOWING@1 │ +17)│ UNBOUNDED FOLLOWING │ 18)└─────────────┬─────────────┘ 19)┌─────────────┴─────────────┐ 20)│ WindowAggExec │ @@ -705,23 +906,29 @@ physical_plan 02)│ ProjectionExec │ 03)│ -------------------- │ 04)│ bigint_col: │ -05)│ bigint_col@1 │ +05)│ bigint_col │ 06)│ │ -07)│ int_col: int_col@0 │ +07)│ int_col: int_col │ 08)│ │ 09)│ sum_col: │ -10)│ CAST(int_col@0 AS Int64) +│ -11)│ bigint_col@1 │ +10)│ CAST(int_col AS Int64) + │ +11)│ bigint_col │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ 14)│ RepartitionExec │ -15)└─────────────┬─────────────┘ -16)┌─────────────┴─────────────┐ -17)│ DataSourceExec │ -18)│ -------------------- │ -19)│ files: 1 │ -20)│ format: csv │ -21)└───────────────────────────┘ +15)│ -------------------- │ +16)│ 
output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: csv │ +27)└───────────────────────────┘ query TT explain select @@ -742,7 +949,7 @@ physical_plan 10)│ .int_col DESC NULLS │ 11)│ FIRST] RANGE BETWEEN │ 12)│ UNBOUNDED PRECEDING AND │ -13)│ CURRENT ROW@1 │ +13)│ CURRENT ROW │ 14)│ │ 15)│ row_number() ORDER BY │ 16)│ [table1.int_col ASC │ @@ -755,7 +962,7 @@ physical_plan 23)│ NULLS LAST] RANGE │ 24)│ BETWEEN UNBOUNDED │ 25)│ PRECEDING AND CURRENT │ -26)│ ROW@2 │ +26)│ ROW │ 27)└─────────────┬─────────────┘ 28)┌─────────────┴─────────────┐ 29)│ BoundedWindowAggExec │ @@ -808,23 +1015,29 @@ physical_plan 02)│ ProjectionExec │ 03)│ -------------------- │ 04)│ bigint_col: │ -05)│ bigint_col@1 │ +05)│ bigint_col │ 06)│ │ -07)│ int_col: int_col@0 │ +07)│ int_col: int_col │ 08)│ │ 09)│ sum_col: │ -10)│ CAST(int_col@0 AS Int64) +│ -11)│ bigint_col@1 │ +10)│ CAST(int_col AS Int64) + │ +11)│ bigint_col │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ 14)│ RepartitionExec │ -15)└─────────────┬─────────────┘ -16)┌─────────────┴─────────────┐ -17)│ DataSourceExec │ -18)│ -------------------- │ -19)│ files: 1 │ -20)│ format: parquet │ -21)└───────────────────────────┘ +15)│ -------------------- │ +16)│ output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: parquet │ +27)└───────────────────────────┘ # Query with projection on memory @@ -836,13 +1049,13 @@ physical_plan 02)│ ProjectionExec │ 03)│ -------------------- │ 04)│ bigint_col: │ -05)│ bigint_col@1 │ +05)│ bigint_col │ 06)│ │ -07)│ int_col: int_col@0 │ +07)│ int_col: int_col │ 08)│ │ 09)│ sum_col: │ -10)│ CAST(int_col@0 AS Int64) +│ -11)│ bigint_col@1 │ +10)│ CAST(int_col AS Int64) + │ +11)│ bigint_col │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ 14)│ DataSourceExec │ @@ -861,22 +1074,28 @@ physical_plan 02)│ ProjectionExec │ 03)│ -------------------- │ 04)│ bigint_col: │ -05)│ bigint_col@0 │ +05)│ bigint_col │ 06)│ │ -07)│ int_col: int_col@1 │ +07)│ int_col: int_col │ 08)│ │ 09)│ sum_col: │ -10)│ int_col@1 + bigint_col@0 │ +10)│ int_col + bigint_col │ 11)└─────────────┬─────────────┘ 12)┌─────────────┴─────────────┐ 13)│ RepartitionExec │ -14)└─────────────┬─────────────┘ -15)┌─────────────┴─────────────┐ -16)│ DataSourceExec │ -17)│ -------------------- │ -18)│ files: 1 │ -19)│ format: json │ -20)└───────────────────────────┘ +14)│ -------------------- │ +15)│ output_partition_count: │ +16)│ 1 │ +17)│ │ +18)│ partitioning_scheme: │ +19)│ RoundRobinBatch(4) │ +20)└─────────────┬─────────────┘ +21)┌─────────────┴─────────────┐ +22)│ DataSourceExec │ +23)│ -------------------- │ +24)│ files: 1 │ +25)│ format: json │ +26)└───────────────────────────┘ # Query with projection on arrow @@ -888,23 +1107,29 @@ physical_plan 02)│ ProjectionExec │ 03)│ -------------------- │ 04)│ bigint_col: │ -05)│ bigint_col@1 │ +05)│ bigint_col │ 06)│ │ -07)│ int_col: int_col@0 │ +07)│ int_col: int_col │ 08)│ │ 09)│ sum_col: │ -10)│ CAST(int_col@0 AS Int64) +│ -11)│ bigint_col@1 │ +10)│ CAST(int_col AS Int64) + │ +11)│ bigint_col │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ 14)│ RepartitionExec │ 
-15)└─────────────┬─────────────┘ -16)┌─────────────┴─────────────┐ -17)│ DataSourceExec │ -18)│ -------------------- │ -19)│ files: 1 │ -20)│ format: arrow │ -21)└───────────────────────────┘ +15)│ -------------------- │ +16)│ output_partition_count: │ +17)│ 1 │ +18)│ │ +19)│ partitioning_scheme: │ +20)│ RoundRobinBatch(4) │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ DataSourceExec │ +24)│ -------------------- │ +25)│ files: 1 │ +26)│ format: arrow │ +27)└───────────────────────────┘ # Query with PartialSortExec. query TT @@ -957,51 +1182,68 @@ explain select * from table1 inner join table2 on table1.int_col = table2.int_co physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ HashJoinExec │ -06)│ -------------------- │ -07)│ on: │ -08)│ (int_col@0 = int_col@0), ├──────────────┐ -09)│ (CAST(table1.string_col │ │ -10)│ AS Utf8View)@4 = │ │ -11)│ string_col@1) │ │ -12)└─────────────┬─────────────┘ │ -13)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -14)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -15)└─────────────┬─────────────┘└─────────────┬─────────────┘ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ HashJoinExec │ +09)│ -------------------- │ +10)│ on: │ +11)│ (int_col = int_col), (CAST├──────────────┐ +12)│ (table1.string_col AS │ │ +13)│ Utf8View) = │ │ +14)│ string_col) │ │ +15)└─────────────┬─────────────┘ │ 16)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -17)│ RepartitionExec ││ RepartitionExec │ -18)└─────────────┬─────────────┘└─────────────┬─────────────┘ -19)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -20)│ ProjectionExec ││ RepartitionExec │ -21)│ -------------------- ││ │ -22)│ CAST(table1.string_col AS ││ │ -23)│ Utf8View): ││ │ -24)│ CAST(string_col@1 AS ││ │ -25)│ Utf8View) ││ │ -26)│ ││ │ -27)│ bigint_col: ││ │ -28)│ bigint_col@2 ││ │ -29)│ ││ │ -30)│ date_col: date_col@3 ││ │ -31)│ int_col: int_col@0 ││ │ -32)│ ││ │ -33)│ string_col: ││ │ -34)│ string_col@1 ││ │ -35)└─────────────┬─────────────┘└─────────────┬─────────────┘ -36)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -37)│ RepartitionExec ││ DataSourceExec │ -38)│ ││ -------------------- │ -39)│ ││ files: 1 │ -40)│ ││ format: parquet │ -41)└─────────────┬─────────────┘└───────────────────────────┘ -42)┌─────────────┴─────────────┐ -43)│ DataSourceExec │ -44)│ -------------------- │ -45)│ files: 1 │ -46)│ format: csv │ -47)└───────────────────────────┘ +17)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +18)│ -------------------- ││ -------------------- │ +19)│ target_batch_size: ││ target_batch_size: │ +20)│ 8192 ││ 8192 │ +21)└─────────────┬─────────────┘└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +23)│ RepartitionExec ││ RepartitionExec │ +24)│ -------------------- ││ -------------------- │ +25)│ output_partition_count: ││ output_partition_count: │ +26)│ 4 ││ 4 │ +27)│ ││ │ +28)│ partitioning_scheme: ││ partitioning_scheme: │ +29)│ Hash([int_col@0, CAST ││ Hash([int_col@0, │ +30)│ (table1.string_col ││ string_col@1], │ +31)│ AS Utf8View)@4], 4) ││ 4) │ +32)└─────────────┬─────────────┘└─────────────┬─────────────┘ +33)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +34)│ ProjectionExec ││ RepartitionExec │ +35)│ -------------------- ││ -------------------- │ +36)│ 
CAST(table1.string_col AS ││ output_partition_count: │ +37)│ Utf8View): ││ 1 │ +38)│ CAST(string_col AS ││ │ +39)│ Utf8View) ││ partitioning_scheme: │ +40)│ ││ RoundRobinBatch(4) │ +41)│ bigint_col: ││ │ +42)│ bigint_col ││ │ +43)│ ││ │ +44)│ date_col: date_col ││ │ +45)│ int_col: int_col ││ │ +46)│ ││ │ +47)│ string_col: ││ │ +48)│ string_col ││ │ +49)└─────────────┬─────────────┘└─────────────┬─────────────┘ +50)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +51)│ RepartitionExec ││ DataSourceExec │ +52)│ -------------------- ││ -------------------- │ +53)│ output_partition_count: ││ files: 1 │ +54)│ 1 ││ format: parquet │ +55)│ ││ │ +56)│ partitioning_scheme: ││ │ +57)│ RoundRobinBatch(4) ││ │ +58)└─────────────┬─────────────┘└───────────────────────────┘ +59)┌─────────────┴─────────────┐ +60)│ DataSourceExec │ +61)│ -------------------- │ +62)│ files: 1 │ +63)│ format: csv │ +64)└───────────────────────────┘ # Query with outer hash join. query TT @@ -1010,53 +1252,70 @@ explain select * from table1 left outer join table2 on table1.int_col = table2.i physical_plan 01)┌───────────────────────────┐ 02)│ CoalesceBatchesExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ HashJoinExec │ -06)│ -------------------- │ -07)│ join_type: Left │ -08)│ │ -09)│ on: ├──────────────┐ -10)│ (int_col@0 = int_col@0), │ │ -11)│ (CAST(table1.string_col │ │ -12)│ AS Utf8View)@4 = │ │ -13)│ string_col@1) │ │ -14)└─────────────┬─────────────┘ │ -15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -16)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -17)└─────────────┬─────────────┘└─────────────┬─────────────┘ +03)│ -------------------- │ +04)│ target_batch_size: │ +05)│ 8192 │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ HashJoinExec │ +09)│ -------------------- │ +10)│ join_type: Left │ +11)│ │ +12)│ on: ├──────────────┐ +13)│ (int_col = int_col), (CAST│ │ +14)│ (table1.string_col AS │ │ +15)│ Utf8View) = │ │ +16)│ string_col) │ │ +17)└─────────────┬─────────────┘ │ 18)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -19)│ RepartitionExec ││ RepartitionExec │ -20)└─────────────┬─────────────┘└─────────────┬─────────────┘ -21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -22)│ ProjectionExec ││ RepartitionExec │ -23)│ -------------------- ││ │ -24)│ CAST(table1.string_col AS ││ │ -25)│ Utf8View): ││ │ -26)│ CAST(string_col@1 AS ││ │ -27)│ Utf8View) ││ │ -28)│ ││ │ -29)│ bigint_col: ││ │ -30)│ bigint_col@2 ││ │ -31)│ ││ │ -32)│ date_col: date_col@3 ││ │ -33)│ int_col: int_col@0 ││ │ -34)│ ││ │ -35)│ string_col: ││ │ -36)│ string_col@1 ││ │ -37)└─────────────┬─────────────┘└─────────────┬─────────────┘ -38)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -39)│ RepartitionExec ││ DataSourceExec │ -40)│ ││ -------------------- │ -41)│ ││ files: 1 │ -42)│ ││ format: parquet │ -43)└─────────────┬─────────────┘└───────────────────────────┘ -44)┌─────────────┴─────────────┐ -45)│ DataSourceExec │ -46)│ -------------------- │ -47)│ files: 1 │ -48)│ format: csv │ -49)└───────────────────────────┘ +19)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +20)│ -------------------- ││ -------------------- │ +21)│ target_batch_size: ││ target_batch_size: │ +22)│ 8192 ││ 8192 │ +23)└─────────────┬─────────────┘└─────────────┬─────────────┘ +24)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +25)│ RepartitionExec ││ RepartitionExec │ +26)│ -------------------- ││ -------------------- │ +27)│ 
output_partition_count: ││ output_partition_count: │ +28)│ 4 ││ 4 │ +29)│ ││ │ +30)│ partitioning_scheme: ││ partitioning_scheme: │ +31)│ Hash([int_col@0, CAST ││ Hash([int_col@0, │ +32)│ (table1.string_col ││ string_col@1], │ +33)│ AS Utf8View)@4], 4) ││ 4) │ +34)└─────────────┬─────────────┘└─────────────┬─────────────┘ +35)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +36)│ ProjectionExec ││ RepartitionExec │ +37)│ -------------------- ││ -------------------- │ +38)│ CAST(table1.string_col AS ││ output_partition_count: │ +39)│ Utf8View): ││ 1 │ +40)│ CAST(string_col AS ││ │ +41)│ Utf8View) ││ partitioning_scheme: │ +42)│ ││ RoundRobinBatch(4) │ +43)│ bigint_col: ││ │ +44)│ bigint_col ││ │ +45)│ ││ │ +46)│ date_col: date_col ││ │ +47)│ int_col: int_col ││ │ +48)│ ││ │ +49)│ string_col: ││ │ +50)│ string_col ││ │ +51)└─────────────┬─────────────┘└─────────────┬─────────────┘ +52)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +53)│ RepartitionExec ││ DataSourceExec │ +54)│ -------------------- ││ -------------------- │ +55)│ output_partition_count: ││ files: 1 │ +56)│ 1 ││ format: parquet │ +57)│ ││ │ +58)│ partitioning_scheme: ││ │ +59)│ RoundRobinBatch(4) ││ │ +60)└─────────────┬─────────────┘└───────────────────────────┘ +61)┌─────────────┴─────────────┐ +62)│ DataSourceExec │ +63)│ -------------------- │ +64)│ files: 1 │ +65)│ format: csv │ +66)└───────────────────────────┘ # Query with nested loop join. query TT @@ -1076,22 +1335,34 @@ physical_plan 11)└───────────────────────────┘└─────────────┬─────────────┘ 12)-----------------------------┌─────────────┴─────────────┐ 13)-----------------------------│ AggregateExec │ -14)-----------------------------└─────────────┬─────────────┘ -15)-----------------------------┌─────────────┴─────────────┐ -16)-----------------------------│ CoalescePartitionsExec │ +14)-----------------------------│ -------------------- │ +15)-----------------------------│ aggr: count(1) │ +16)-----------------------------│ mode: Final │ 17)-----------------------------└─────────────┬─────────────┘ 18)-----------------------------┌─────────────┴─────────────┐ -19)-----------------------------│ AggregateExec │ +19)-----------------------------│ CoalescePartitionsExec │ 20)-----------------------------└─────────────┬─────────────┘ 21)-----------------------------┌─────────────┴─────────────┐ -22)-----------------------------│ RepartitionExec │ -23)-----------------------------└─────────────┬─────────────┘ -24)-----------------------------┌─────────────┴─────────────┐ -25)-----------------------------│ DataSourceExec │ -26)-----------------------------│ -------------------- │ -27)-----------------------------│ files: 1 │ -28)-----------------------------│ format: parquet │ -29)-----------------------------└───────────────────────────┘ +22)-----------------------------│ AggregateExec │ +23)-----------------------------│ -------------------- │ +24)-----------------------------│ aggr: count(1) │ +25)-----------------------------│ mode: Partial │ +26)-----------------------------└─────────────┬─────────────┘ +27)-----------------------------┌─────────────┴─────────────┐ +28)-----------------------------│ RepartitionExec │ +29)-----------------------------│ -------------------- │ +30)-----------------------------│ output_partition_count: │ +31)-----------------------------│ 1 │ +32)-----------------------------│ │ +33)-----------------------------│ partitioning_scheme: │ +34)-----------------------------│ RoundRobinBatch(4) │ 
+35)-----------------------------└─────────────┬─────────────┘ +36)-----------------------------┌─────────────┴─────────────┐ +37)-----------------------------│ DataSourceExec │ +38)-----------------------------│ -------------------- │ +39)-----------------------------│ files: 1 │ +40)-----------------------------│ format: parquet │ +41)-----------------------------└───────────────────────────┘ # Query with cross join. query TT @@ -1103,16 +1374,19 @@ physical_plan 03)└─────────────┬─────────────┘ │ 04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 05)│ DataSourceExec ││ RepartitionExec │ -06)│ -------------------- ││ │ -07)│ files: 1 ││ │ -08)│ format: csv ││ │ -09)└───────────────────────────┘└─────────────┬─────────────┘ -10)-----------------------------┌─────────────┴─────────────┐ -11)-----------------------------│ DataSourceExec │ -12)-----------------------------│ -------------------- │ -13)-----------------------------│ files: 1 │ -14)-----------------------------│ format: parquet │ -15)-----------------------------└───────────────────────────┘ +06)│ -------------------- ││ -------------------- │ +07)│ files: 1 ││ output_partition_count: │ +08)│ format: csv ││ 1 │ +09)│ ││ │ +10)│ ││ partitioning_scheme: │ +11)│ ││ RoundRobinBatch(4) │ +12)└───────────────────────────┘└─────────────┬─────────────┘ +13)-----------------------------┌─────────────┴─────────────┐ +14)-----------------------------│ DataSourceExec │ +15)-----------------------------│ -------------------- │ +16)-----------------------------│ files: 1 │ +17)-----------------------------│ format: parquet │ +18)-----------------------------└───────────────────────────┘ # Query with sort merge join. @@ -1126,7 +1400,7 @@ physical_plan 01)┌───────────────────────────┐ 02)│ SortMergeJoinExec │ 03)│ -------------------- ├──────────────┐ -04)│ on: (c1@0 = c1@0) │ │ +04)│ on: (c1 = c1) │ │ 05)└─────────────┬─────────────┘ │ 06)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 07)│ SortExec ││ SortExec │ @@ -1160,6 +1434,134 @@ drop table table4; statement ok drop table table5; +# Create table for InterleaveExec +statement ok +CREATE TABLE t1( + id INT, + name TEXT +) as VALUES + (1, 'Alex'), + (2, 'Bob'), + (3, 'Alice') +; + +statement ok +CREATE TABLE t2( + id TINYINT, + name TEXT +) as VALUES + (1, 'Alex'), + (2, 'Bob'), + (3, 'John') +; + +# Test explain tree for InterleaveExec +query TT +EXPLAIN +SELECT count(*) FROM ( + SELECT distinct name FROM t1 + UNION ALL + SELECT distinct name FROM t2 +) GROUP BY name +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ count(*): │ +05)│ count(Int64(1)) │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ AggregateExec │ +09)│ -------------------- │ +10)│ aggr: count(1) │ +11)│ group_by: name │ +12)│ │ +13)│ mode: │ +14)│ SinglePartitioned │ +15)└─────────────┬─────────────┘ +16)┌─────────────┴─────────────┐ +17)│ InterleaveExec ├──────────────┐ +18)└─────────────┬─────────────┘ │ +19)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +20)│ AggregateExec ││ AggregateExec │ +21)│ -------------------- ││ -------------------- │ +22)│ group_by: name ││ group_by: name │ +23)│ ││ │ +24)│ mode: ││ mode: │ +25)│ FinalPartitioned ││ FinalPartitioned │ +26)└─────────────┬─────────────┘└─────────────┬─────────────┘ +27)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +28)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +29)│ -------------------- ││ -------------------- │ +30)│ 
target_batch_size: ││ target_batch_size: │ +31)│ 8192 ││ 8192 │ +32)└─────────────┬─────────────┘└─────────────┬─────────────┘ +33)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +34)│ RepartitionExec ││ RepartitionExec │ +35)│ -------------------- ││ -------------------- │ +36)│ output_partition_count: ││ output_partition_count: │ +37)│ 4 ││ 4 │ +38)│ ││ │ +39)│ partitioning_scheme: ││ partitioning_scheme: │ +40)│ Hash([name@0], 4) ││ Hash([name@0], 4) │ +41)└─────────────┬─────────────┘└─────────────┬─────────────┘ +42)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +43)│ RepartitionExec ││ RepartitionExec │ +44)│ -------------------- ││ -------------------- │ +45)│ output_partition_count: ││ output_partition_count: │ +46)│ 1 ││ 1 │ +47)│ ││ │ +48)│ partitioning_scheme: ││ partitioning_scheme: │ +49)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ +50)└─────────────┬─────────────┘└─────────────┬─────────────┘ +51)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +52)│ AggregateExec ││ AggregateExec │ +53)│ -------------------- ││ -------------------- │ +54)│ group_by: name ││ group_by: name │ +55)│ mode: Partial ││ mode: Partial │ +56)└─────────────┬─────────────┘└─────────────┬─────────────┘ +57)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +58)│ DataSourceExec ││ DataSourceExec │ +59)│ -------------------- ││ -------------------- │ +60)│ bytes: 1320 ││ bytes: 1312 │ +61)│ format: memory ││ format: memory │ +62)│ rows: 1 ││ rows: 1 │ +63)└───────────────────────────┘└───────────────────────────┘ + +# Test explain tree for UnionExec +query TT +EXPLAIN +SELECT id, name FROM t1 +UNION ALL +SELECT id, name FROM t2; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ UnionExec ├──────────────┐ +03)└─────────────┬─────────────┘ │ +04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +05)│ DataSourceExec ││ ProjectionExec │ +06)│ -------------------- ││ -------------------- │ +07)│ bytes: 1320 ││ id: CAST(id AS Int32) │ +08)│ format: memory ││ name: name │ +09)│ rows: 1 ││ │ +10)└───────────────────────────┘└─────────────┬─────────────┘ +11)-----------------------------┌─────────────┴─────────────┐ +12)-----------------------------│ DataSourceExec │ +13)-----------------------------│ -------------------- │ +14)-----------------------------│ bytes: 1312 │ +15)-----------------------------│ format: memory │ +16)-----------------------------│ rows: 1 │ +17)-----------------------------└───────────────────────────┘ + +# cleanup +statement ok +drop table t1; + +statement ok +drop table t2; + # Test on StreamingTableExec # prepare table statement ok @@ -1181,25 +1583,36 @@ ORDER BY "date", "time"; physical_plan 01)┌───────────────────────────┐ 02)│ SortPreservingMergeExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ CoalesceBatchesExec │ +03)│ -------------------- │ +04)│ date ASC NULLS LAST, time │ +05)│ ASC NULLS LAST │ 06)└─────────────┬─────────────┘ 07)┌─────────────┴─────────────┐ -08)│ FilterExec │ +08)│ CoalesceBatchesExec │ 09)│ -------------------- │ -10)│ predicate: │ -11)│ ticker@1 = A │ +10)│ target_batch_size: │ +11)│ 8192 │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ RepartitionExec │ -15)└─────────────┬─────────────┘ -16)┌─────────────┴─────────────┐ -17)│ StreamingTableExec │ -18)│ -------------------- │ -19)│ infinite: true │ -20)│ limit: None │ -21)└───────────────────────────┘ +14)│ FilterExec │ +15)│ -------------------- │ +16)│ predicate: ticker = A │ 
+17)└─────────────┬─────────────┘ +18)┌─────────────┴─────────────┐ +19)│ RepartitionExec │ +20)│ -------------------- │ +21)│ output_partition_count: │ +22)│ 1 │ +23)│ │ +24)│ partitioning_scheme: │ +25)│ RoundRobinBatch(4) │ +26)└─────────────┬─────────────┘ +27)┌─────────────┴─────────────┐ +28)│ StreamingTableExec │ +29)│ -------------------- │ +30)│ infinite: true │ +31)│ limit: None │ +32)└───────────────────────────┘ # constant ticker, CAST(time AS DATE) = time, order by time @@ -1211,26 +1624,37 @@ ORDER BY "time" physical_plan 01)┌───────────────────────────┐ 02)│ SortPreservingMergeExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ CoalesceBatchesExec │ -06)└─────────────┬─────────────┘ -07)┌─────────────┴─────────────┐ -08)│ FilterExec │ -09)│ -------------------- │ -10)│ predicate: │ -11)│ ticker@1 = A AND CAST(time│ -12)│ @2 AS Date32) = date@0 │ -13)└─────────────┬─────────────┘ -14)┌─────────────┴─────────────┐ -15)│ RepartitionExec │ -16)└─────────────┬─────────────┘ -17)┌─────────────┴─────────────┐ -18)│ StreamingTableExec │ -19)│ -------------------- │ -20)│ infinite: true │ -21)│ limit: None │ -22)└───────────────────────────┘ +03)│ -------------------- │ +04)│ time ASC NULLS LAST │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ CoalesceBatchesExec │ +08)│ -------------------- │ +09)│ target_batch_size: │ +10)│ 8192 │ +11)└─────────────┬─────────────┘ +12)┌─────────────┴─────────────┐ +13)│ FilterExec │ +14)│ -------------------- │ +15)│ predicate: │ +16)│ ticker = A AND CAST(time │ +17)│ AS Date32) = date │ +18)└─────────────┬─────────────┘ +19)┌─────────────┴─────────────┐ +20)│ RepartitionExec │ +21)│ -------------------- │ +22)│ output_partition_count: │ +23)│ 1 │ +24)│ │ +25)│ partitioning_scheme: │ +26)│ RoundRobinBatch(4) │ +27)└─────────────┬─────────────┘ +28)┌─────────────┴─────────────┐ +29)│ StreamingTableExec │ +30)│ -------------------- │ +31)│ infinite: true │ +32)│ limit: None │ +33)└───────────────────────────┘ # same thing but order by date query TT @@ -1241,26 +1665,37 @@ ORDER BY "date" physical_plan 01)┌───────────────────────────┐ 02)│ SortPreservingMergeExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ CoalesceBatchesExec │ -06)└─────────────┬─────────────┘ -07)┌─────────────┴─────────────┐ -08)│ FilterExec │ -09)│ -------------------- │ -10)│ predicate: │ -11)│ ticker@1 = A AND CAST(time│ -12)│ @2 AS Date32) = date@0 │ -13)└─────────────┬─────────────┘ -14)┌─────────────┴─────────────┐ -15)│ RepartitionExec │ -16)└─────────────┬─────────────┘ -17)┌─────────────┴─────────────┐ -18)│ StreamingTableExec │ -19)│ -------------------- │ -20)│ infinite: true │ -21)│ limit: None │ -22)└───────────────────────────┘ +03)│ -------------------- │ +04)│ date ASC NULLS LAST │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ CoalesceBatchesExec │ +08)│ -------------------- │ +09)│ target_batch_size: │ +10)│ 8192 │ +11)└─────────────┬─────────────┘ +12)┌─────────────┴─────────────┐ +13)│ FilterExec │ +14)│ -------------------- │ +15)│ predicate: │ +16)│ ticker = A AND CAST(time │ +17)│ AS Date32) = date │ +18)└─────────────┬─────────────┘ +19)┌─────────────┴─────────────┐ +20)│ RepartitionExec │ +21)│ -------------------- │ +22)│ output_partition_count: │ +23)│ 1 │ +24)│ │ +25)│ partitioning_scheme: │ +26)│ RoundRobinBatch(4) │ +27)└─────────────┬─────────────┘ +28)┌─────────────┴─────────────┐ +29)│ StreamingTableExec │ +30)│ -------------------- │ 
+31)│ infinite: true │ +32)│ limit: None │ +33)└───────────────────────────┘ # same thing but order by ticker query TT @@ -1274,23 +1709,32 @@ physical_plan 03)└─────────────┬─────────────┘ 04)┌─────────────┴─────────────┐ 05)│ CoalesceBatchesExec │ -06)└─────────────┬─────────────┘ -07)┌─────────────┴─────────────┐ -08)│ FilterExec │ -09)│ -------------------- │ -10)│ predicate: │ -11)│ ticker@1 = A AND CAST(time│ -12)│ @2 AS Date32) = date@0 │ -13)└─────────────┬─────────────┘ -14)┌─────────────┴─────────────┐ -15)│ RepartitionExec │ +06)│ -------------------- │ +07)│ target_batch_size: │ +08)│ 8192 │ +09)└─────────────┬─────────────┘ +10)┌─────────────┴─────────────┐ +11)│ FilterExec │ +12)│ -------------------- │ +13)│ predicate: │ +14)│ ticker = A AND CAST(time │ +15)│ AS Date32) = date │ 16)└─────────────┬─────────────┘ 17)┌─────────────┴─────────────┐ -18)│ StreamingTableExec │ +18)│ RepartitionExec │ 19)│ -------------------- │ -20)│ infinite: true │ -21)│ limit: None │ -22)└───────────────────────────┘ +20)│ output_partition_count: │ +21)│ 1 │ +22)│ │ +23)│ partitioning_scheme: │ +24)│ RoundRobinBatch(4) │ +25)└─────────────┬─────────────┘ +26)┌─────────────┴─────────────┐ +27)│ StreamingTableExec │ +28)│ -------------------- │ +29)│ infinite: true │ +30)│ limit: None │ +31)└───────────────────────────┘ # same thing but order by time, date @@ -1302,26 +1746,38 @@ ORDER BY "time", "date"; physical_plan 01)┌───────────────────────────┐ 02)│ SortPreservingMergeExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ CoalesceBatchesExec │ +03)│ -------------------- │ +04)│ time ASC NULLS LAST, date │ +05)│ ASC NULLS LAST │ 06)└─────────────┬─────────────┘ 07)┌─────────────┴─────────────┐ -08)│ FilterExec │ +08)│ CoalesceBatchesExec │ 09)│ -------------------- │ -10)│ predicate: │ -11)│ ticker@1 = A AND CAST(time│ -12)│ @2 AS Date32) = date@0 │ -13)└─────────────┬─────────────┘ -14)┌─────────────┴─────────────┐ -15)│ RepartitionExec │ -16)└─────────────┬─────────────┘ -17)┌─────────────┴─────────────┐ -18)│ StreamingTableExec │ -19)│ -------------------- │ -20)│ infinite: true │ -21)│ limit: None │ -22)└───────────────────────────┘ +10)│ target_batch_size: │ +11)│ 8192 │ +12)└─────────────┬─────────────┘ +13)┌─────────────┴─────────────┐ +14)│ FilterExec │ +15)│ -------------------- │ +16)│ predicate: │ +17)│ ticker = A AND CAST(time │ +18)│ AS Date32) = date │ +19)└─────────────┬─────────────┘ +20)┌─────────────┴─────────────┐ +21)│ RepartitionExec │ +22)│ -------------------- │ +23)│ output_partition_count: │ +24)│ 1 │ +25)│ │ +26)│ partitioning_scheme: │ +27)│ RoundRobinBatch(4) │ +28)└─────────────┬─────────────┘ +29)┌─────────────┴─────────────┐ +30)│ StreamingTableExec │ +31)│ -------------------- │ +32)│ infinite: true │ +33)│ limit: None │ +34)└───────────────────────────┘ @@ -1335,22 +1791,295 @@ ORDER BY "ticker", "time"; physical_plan 01)┌───────────────────────────┐ 02)│ SortPreservingMergeExec │ -03)└─────────────┬─────────────┘ -04)┌─────────────┴─────────────┐ -05)│ CoalesceBatchesExec │ +03)│ -------------------- │ +04)│ ticker ASC NULLS LAST, │ +05)│ time ASC NULLS LAST │ 06)└─────────────┬─────────────┘ 07)┌─────────────┴─────────────┐ -08)│ FilterExec │ +08)│ CoalesceBatchesExec │ 09)│ -------------------- │ -10)│ predicate: │ -11)│ date@0 = 2006-01-02 │ +10)│ target_batch_size: │ +11)│ 8192 │ 12)└─────────────┬─────────────┘ 13)┌─────────────┴─────────────┐ -14)│ RepartitionExec │ +14)│ FilterExec │ +15)│ -------------------- │ +16)│ 
predicate: │ +17)│ date = 2006-01-02 │ +18)└─────────────┬─────────────┘ +19)┌─────────────┴─────────────┐ +20)│ RepartitionExec │ +21)│ -------------------- │ +22)│ output_partition_count: │ +23)│ 1 │ +24)│ │ +25)│ partitioning_scheme: │ +26)│ RoundRobinBatch(4) │ +27)└─────────────┬─────────────┘ +28)┌─────────────┴─────────────┐ +29)│ StreamingTableExec │ +30)│ -------------------- │ +31)│ infinite: true │ +32)│ limit: None │ +33)└───────────────────────────┘ + + + +# Test explain tree for WorkTableExec +query TT +EXPLAIN WITH RECURSIVE nodes AS ( + SELECT 1 as id + UNION ALL + SELECT id + 1 as id + FROM nodes + WHERE id < 10 +) +SELECT * FROM nodes +---- +physical_plan +01)┌───────────────────────────┐ +02)│ RecursiveQueryExec ├──────────────┐ +03)└─────────────┬─────────────┘ │ +04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +05)│ ProjectionExec ││ CoalescePartitionsExec │ +06)│ -------------------- ││ │ +07)│ id: 1 ││ │ +08)└─────────────┬─────────────┘└─────────────┬─────────────┘ +09)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +10)│ PlaceholderRowExec ││ ProjectionExec │ +11)│ ││ -------------------- │ +12)│ ││ id: id + 1 │ +13)└───────────────────────────┘└─────────────┬─────────────┘ +14)-----------------------------┌─────────────┴─────────────┐ +15)-----------------------------│ CoalesceBatchesExec │ +16)-----------------------------│ -------------------- │ +17)-----------------------------│ target_batch_size: │ +18)-----------------------------│ 8192 │ +19)-----------------------------└─────────────┬─────────────┘ +20)-----------------------------┌─────────────┴─────────────┐ +21)-----------------------------│ FilterExec │ +22)-----------------------------│ -------------------- │ +23)-----------------------------│ predicate: id < 10 │ +24)-----------------------------└─────────────┬─────────────┘ +25)-----------------------------┌─────────────┴─────────────┐ +26)-----------------------------│ RepartitionExec │ +27)-----------------------------│ -------------------- │ +28)-----------------------------│ output_partition_count: │ +29)-----------------------------│ 1 │ +30)-----------------------------│ │ +31)-----------------------------│ partitioning_scheme: │ +32)-----------------------------│ RoundRobinBatch(4) │ +33)-----------------------------└─────────────┬─────────────┘ +34)-----------------------------┌─────────────┴─────────────┐ +35)-----------------------------│ WorkTableExec │ +36)-----------------------------│ -------------------- │ +37)-----------------------------│ name: nodes │ +38)-----------------------------└───────────────────────────┘ + +query TT +explain COPY (VALUES (1, 'foo', 1, '2023-01-01'), (2, 'bar', 2, '2023-01-02'), (3, 'baz', 3, '2023-01-03')) +TO 'test_files/scratch/explain_tree/1.json'; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ DataSinkExec │ +03)│ -------------------- │ +04)│ file: │ +05)│ test_files/scratch │ +06)│ /explain_tree/1 │ +07)│ .json │ +08)│ │ +09)│ format: json │ +10)└─────────────┬─────────────┘ +11)┌─────────────┴─────────────┐ +12)│ DataSourceExec │ +13)│ -------------------- │ +14)│ bytes: 2672 │ +15)│ format: memory │ +16)│ rows: 1 │ +17)└───────────────────────────┘ + +query TT +explain COPY (VALUES (1, 'foo', 1, '2023-01-01'), (2, 'bar', 2, '2023-01-02'), (3, 'baz', 3, '2023-01-03')) +TO 'test_files/scratch/explain_tree/2.csv'; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ DataSinkExec │ +03)│ -------------------- │ +04)│ file: │ +05)│ test_files/scratch │ +06)│ 
/explain_tree/2 │ +07)│ .csv │ +08)│ │ +09)│ format: csv │ +10)└─────────────┬─────────────┘ +11)┌─────────────┴─────────────┐ +12)│ DataSourceExec │ +13)│ -------------------- │ +14)│ bytes: 2672 │ +15)│ format: memory │ +16)│ rows: 1 │ +17)└───────────────────────────┘ + +query TT +explain COPY (VALUES (1, 'foo', 1, '2023-01-01'), (2, 'bar', 2, '2023-01-02'), (3, 'baz', 3, '2023-01-03')) +TO 'test_files/scratch/explain_tree/3.arrow'; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ DataSinkExec │ +03)│ -------------------- │ +04)│ file: │ +05)│ test_files/scratch │ +06)│ /explain_tree/3 │ +07)│ .arrow │ +08)│ │ +09)│ format: arrow │ +10)└─────────────┬─────────────┘ +11)┌─────────────┴─────────────┐ +12)│ DataSourceExec │ +13)│ -------------------- │ +14)│ bytes: 2672 │ +15)│ format: memory │ +16)│ rows: 1 │ +17)└───────────────────────────┘ + + +# Test explain tree rendering for CoalesceBatchesExec with limit +statement ok +CREATE TABLE IF NOT EXISTS t1 (a INT) AS VALUES(1),(2),(3),(4),(5),(6),(7),(8),(9),(10); + +query TT +EXPLAIN SELECT COUNT(*) FROM (SELECT a FROM t1 WHERE a > 3 LIMIT 3 OFFSET 6); +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ count(*): │ +05)│ count(Int64(1)) │ +06)└─────────────┬─────────────┘ +07)┌─────────────┴─────────────┐ +08)│ AggregateExec │ +09)│ -------------------- │ +10)│ aggr: count(1) │ +11)│ mode: Final │ +12)└─────────────┬─────────────┘ +13)┌─────────────┴─────────────┐ +14)│ CoalescePartitionsExec │ 15)└─────────────┬─────────────┘ 16)┌─────────────┴─────────────┐ -17)│ StreamingTableExec │ +17)│ AggregateExec │ 18)│ -------------------- │ -19)│ infinite: true │ -20)│ limit: None │ -21)└───────────────────────────┘ +19)│ aggr: count(1) │ +20)│ mode: Partial │ +21)└─────────────┬─────────────┘ +22)┌─────────────┴─────────────┐ +23)│ RepartitionExec │ +24)│ -------------------- │ +25)│ output_partition_count: │ +26)│ 1 │ +27)│ │ +28)│ partitioning_scheme: │ +29)│ RoundRobinBatch(4) │ +30)└─────────────┬─────────────┘ +31)┌─────────────┴─────────────┐ +32)│ ProjectionExec │ +33)└─────────────┬─────────────┘ +34)┌─────────────┴─────────────┐ +35)│ GlobalLimitExec │ +36)│ -------------------- │ +37)│ limit: 3 │ +38)│ skip: 6 │ +39)└─────────────┬─────────────┘ +40)┌─────────────┴─────────────┐ +41)│ CoalesceBatchesExec │ +42)│ -------------------- │ +43)│ limit: 9 │ +44)│ │ +45)│ target_batch_size: │ +46)│ 8192 │ +47)└─────────────┬─────────────┘ +48)┌─────────────┴─────────────┐ +49)│ FilterExec │ +50)│ -------------------- │ +51)│ predicate: a > 3 │ +52)└─────────────┬─────────────┘ +53)┌─────────────┴─────────────┐ +54)│ DataSourceExec │ +55)│ -------------------- │ +56)│ bytes: 160 │ +57)│ format: memory │ +58)│ rows: 1 │ +59)└───────────────────────────┘ + +# clean up +statement ok +drop table t1; + +# Test explain tree for LazyMemoryExec +query TT +EXPLAIN SELECT * FROM generate_series(1, 100) +---- +physical_plan +01)┌───────────────────────────┐ +02)│ LazyMemoryExec │ +03)│ -------------------- │ +04)│ batch_generators: │ +05)│ generate_series: start=1, │ +06)│ end=100, batch_size=8192 │ +07)└───────────────────────────┘ + +# Test explain tree for CoalescePartitionsExec +query TT +EXPLAIN SELECT c1, c2, c3 FROM sink_table WHERE c3 > 0 LIMIT 5; +---- +physical_plan +01)┌───────────────────────────┐ +02)│ CoalescePartitionsExec │ +03)│ -------------------- │ +04)│ limit: 5 │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ CoalesceBatchesExec │ 
+08)│ -------------------- │ +09)│ limit: 5 │ +10)│ │ +11)│ target_batch_size: │ +12)│ 8192 │ +13)└─────────────┬─────────────┘ +14)┌─────────────┴─────────────┐ +15)│ FilterExec │ +16)│ -------------------- │ +17)│ predicate: c3 > 0 │ +18)└─────────────┬─────────────┘ +19)┌─────────────┴─────────────┐ +20)│ RepartitionExec │ +21)│ -------------------- │ +22)│ output_partition_count: │ +23)│ 1 │ +24)│ │ +25)│ partitioning_scheme: │ +26)│ RoundRobinBatch(4) │ +27)└─────────────┬─────────────┘ +28)┌─────────────┴─────────────┐ +29)│ StreamingTableExec │ +30)│ -------------------- │ +31)│ infinite: true │ +32)│ limit: None │ +33)└───────────────────────────┘ + +# Test explain tree for PlaceholderRowExec +query TT +EXPLAIN select count(*) from (values ('a', 'b'), ('c', 'd')) as t (c1, c2) +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ count(*): 2 │ +05)└─────────────┬─────────────┘ +06)┌─────────────┴─────────────┐ +07)│ PlaceholderRowExec │ +08)└───────────────────────────┘ diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 74e9fe065a73..24f7c3ea15c6 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -478,8 +478,14 @@ a statement ok create table foo (a varchar, b varchar) as values ('a', 'b'); + +query T +SELECT concat_ws('', a, b,'c') from foo +---- +abc + query T -SELECT concat_ws('',a,b,'c') from foo +SELECT concat_ws('',arrow_cast(a, 'Utf8View'),arrow_cast(b, 'Utf8View'),'c') from foo ---- abc diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index df468a85af82..d1c557bdeb85 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -3468,7 +3468,7 @@ SELECT r.sn, SUM(l.amount), r.amount # to associate it with other fields, aggregate should contain all the composite columns # if any of the composite column is missing, we cannot use associated indices, inside select expression # below query should fail -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.amount could not be resolved from available columns: r\.sn, sum\(l\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.amount" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, sum\(l\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, SUM(l.amount), r.amount FROM sales_global_with_composite_pk AS l JOIN sales_global_with_composite_pk AS r @@ -3496,7 +3496,7 @@ NULL NULL NULL # left join shouldn't propagate right side constraint, # if right side is a unique key (unique and can contain null) # Please note that, above query and this one is same except the constraint in the table. 
-statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.amount could not be resolved from available columns: r\.sn, sum\(r\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.amount" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, sum\(r\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, r.amount, SUM(r.amount) FROM (SELECT * FROM sales_global_with_unique as l @@ -3542,7 +3542,7 @@ SELECT column1, COUNT(*) as column2 FROM (VALUES (['a', 'b'], 1), (['c', 'd', 'e # primary key should be aware from which columns it is associated -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression r\.sn could not be resolved from available columns: l\.sn, l\.zip_code, l\.country, l\.ts, l\.currency, l\.amount, sum\(l\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "r\.sn" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "l\.sn, l\.zip_code, l\.country, l\.ts, l\.currency, l\.amount, sum\(l\.amount\)" appears in the SELECT clause satisfies this requirement SELECT l.sn, r.sn, SUM(l.amount), r.amount FROM sales_global_with_pk AS l JOIN sales_global_with_pk AS r @@ -3633,7 +3633,7 @@ ORDER BY r.sn 4 100 2022-01-03T10:00:00 # after join, new window expressions shouldn't be associated with primary keys -statement error DataFusion error: Error during planning: Projection references non\-aggregate values: Expression rn1 could not be resolved from available columns: r\.sn, r\.ts, r\.amount, sum\(r\.amount\) +statement error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "rn1" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "r\.sn, r\.ts, r\.amount, sum\(r\.amount\)" appears in the SELECT clause satisfies this requirement SELECT r.sn, SUM(r.amount), rn1 FROM (SELECT r.ts, r.sn, r.amount, @@ -5135,7 +5135,7 @@ statement ok CREATE TABLE test_case_expr(a INT, b TEXT) AS VALUES (1,'hello'), (2,'world') query T -SELECT (CASE WHEN CONCAT(b, 'hello') = 'test' THEN 'good' ELSE 'bad' END) AS c +SELECT (CASE WHEN CONCAT(b, 'hello') = 'test' THEN 'good' ELSE 'bad' END) AS c FROM test_case_expr GROUP BY c; ---- bad @@ -5537,3 +5537,32 @@ drop view t statement ok drop table source; + + +# test select_wildcard_with_groupby +statement count 0 +create table t(a int, b int, c int, "😀" int); + +query TT +explain select * from t group by a, b, c, "😀"; +---- +logical_plan +01)Aggregate: groupBy=[[t.a, t.b, t.c, t.😀]], aggr=[[]] +02)--TableScan: t projection=[a, b, c, 😀] +physical_plan +01)AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b, c@2 as c, 😀@3 as 😀], aggr=[] +02)--DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain select * from (select a, b from t) as c group by a, b; +---- +logical_plan +01)Aggregate: groupBy=[[c.a, c.b]], aggr=[[]] +02)--SubqueryAlias: c +03)----TableScan: t projection=[a, b] +physical_plan +01)AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b], aggr=[] +02)--DataSourceExec: partitions=1, partition_sizes=[0] + +statement count 0 +drop table t; diff --git 
a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 454055b53930..496f24abf6ed 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -263,6 +263,7 @@ datafusion.sql_parser.collect_spans false datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false +datafusion.sql_parser.map_varchar_to_utf8view false datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.recursion_limit 50 datafusion.sql_parser.support_varchar_with_length true @@ -361,6 +362,7 @@ datafusion.sql_parser.collect_spans false When set to true, the source locations datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. +datafusion.sql_parser.map_varchar_to_utf8view false If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. 
diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 0397e0c367b1..50af06dc40fc 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -4541,3 +4541,269 @@ DROP TABLE test statement ok set datafusion.execution.target_partitions = 1; + +# test using_join_multiple_keys_subquery +statement count 0 +create table person(id int, age int, state int); + +statement count 0 +create table lineitem(c1 int); + +query TT +explain SELECT * FROM person a join person b using (id, age); +---- +logical_plan +01)Projection: a.id, a.age, a.state, b.state +02)--Inner Join: a.id = b.id, a.age = b.age +03)----SubqueryAlias: a +04)------TableScan: person projection=[id, age, state] +05)----SubqueryAlias: b +06)------TableScan: person projection=[id, age, state] +physical_plan +01)CoalesceBatchesExec: target_batch_size=3 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)], projection=[id@0, age@1, state@2, state@5] +03)----DataSourceExec: partitions=1, partition_sizes=[0] +04)----DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain SELECT age FROM (SELECT * FROM person a join person b using (id, age, state)); +---- +logical_plan +01)Projection: a.age +02)--Inner Join: a.id = b.id, a.age = b.age, a.state = b.state +03)----SubqueryAlias: a +04)------TableScan: person projection=[id, age, state] +05)----SubqueryAlias: b +06)------TableScan: person projection=[id, age, state] +physical_plan +01)CoalesceBatchesExec: target_batch_size=3 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[age@1] +03)----DataSourceExec: partitions=1, partition_sizes=[0] +04)----DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain SELECT a.* FROM person a join person b using (id, age); +---- +logical_plan +01)Projection: a.id, a.age, a.state +02)--Inner Join: a.id = b.id, a.age = b.age +03)----SubqueryAlias: a +04)------TableScan: person projection=[id, age, state] +05)----SubqueryAlias: b +06)------TableScan: person projection=[id, age] +physical_plan +01)CoalesceBatchesExec: target_batch_size=3 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)], projection=[id@0, age@1, state@2] +03)----DataSourceExec: partitions=1, partition_sizes=[0] +04)----DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain SELECT a.*, b.* FROM person a join person b using (id, age); +---- +logical_plan +01)Inner Join: a.id = b.id, a.age = b.age +02)--SubqueryAlias: a +03)----TableScan: person projection=[id, age, state] +04)--SubqueryAlias: b +05)----TableScan: person projection=[id, age, state] +physical_plan +01)CoalesceBatchesExec: target_batch_size=3 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)] +03)----DataSourceExec: partitions=1, partition_sizes=[0] +04)----DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain SELECT * FROM person a join person b using (id, age, state) join person c using (id, age, state); +---- +logical_plan +01)Projection: a.id, a.age, a.state +02)--Inner Join: a.id = c.id, a.age = c.age, a.state = c.state +03)----Projection: a.id, a.age, a.state +04)------Inner Join: a.id = b.id, a.age = b.age, a.state = b.state +05)--------SubqueryAlias: a +06)----------TableScan: person projection=[id, age, state] +07)--------SubqueryAlias: b +08)----------TableScan: person 
projection=[id, age, state] +09)----SubqueryAlias: c +10)------TableScan: person projection=[id, age, state] +physical_plan +01)CoalesceBatchesExec: target_batch_size=3 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[id@0, age@1, state@2] +03)----CoalesceBatchesExec: target_batch_size=3 +04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[id@0, age@1, state@2] +05)--------DataSourceExec: partitions=1, partition_sizes=[0] +06)--------DataSourceExec: partitions=1, partition_sizes=[0] +07)----DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain SELECT * FROM person a NATURAL JOIN lineitem b; +---- +logical_plan +01)Cross Join: +02)--SubqueryAlias: a +03)----TableScan: person projection=[id, age, state] +04)--SubqueryAlias: b +05)----TableScan: lineitem projection=[c1] +physical_plan +01)CrossJoinExec +02)--DataSourceExec: partitions=1, partition_sizes=[0] +03)--DataSourceExec: partitions=1, partition_sizes=[0] + +query TT +explain SELECT * FROM lineitem JOIN lineitem as lineitem2 USING (c1) +---- +logical_plan +01)Projection: lineitem.c1 +02)--Inner Join: lineitem.c1 = lineitem2.c1 +03)----TableScan: lineitem projection=[c1] +04)----SubqueryAlias: lineitem2 +05)------TableScan: lineitem projection=[c1] +physical_plan +01)CoalesceBatchesExec: target_batch_size=3 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c1@0, c1@0)], projection=[c1@0] +03)----DataSourceExec: partitions=1, partition_sizes=[0] +04)----DataSourceExec: partitions=1, partition_sizes=[0] + +statement count 0 +drop table person; + +statement count 0 +drop table lineitem; + +statement count 0 +create table j1(j1_string varchar, j1_id int); + +statement count 0 +create table j2(j2_string varchar, j2_id int); + +statement count 0 +create table j3(j3_string varchar, j3_id int); + +statement count 0 +create table j4(j4_string varchar, j4_id int); + +query TT +explain SELECT j1_string, j2_string FROM j1, LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2; +---- +logical_plan +01)Cross Join: +02)--TableScan: j1 projection=[j1_string] +03)--SubqueryAlias: j2 +04)----Projection: j2.j2_string +05)------Subquery: +06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id +07)----------TableScan: j2 projection=[j2_string, j2_id] +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) + +query TT +explain SELECT * FROM j1 JOIN (j2 JOIN j3 ON(j2_id = j3_id - 2)) ON(j1_id = j2_id), LATERAL (SELECT * FROM j3 WHERE j3_string = j2_string) as j4 +---- +logical_plan +01)Cross Join: +02)--Inner Join: CAST(j2.j2_id AS Int64) = CAST(j3.j3_id AS Int64) - Int64(2) +03)----Inner Join: j1.j1_id = j2.j2_id +04)------TableScan: j1 projection=[j1_string, j1_id] +05)------TableScan: j2 projection=[j2_string, j2_id] +06)----TableScan: j3 projection=[j3_string, j3_id] +07)--SubqueryAlias: j4 +08)----Subquery: +09)------Filter: j3.j3_string = outer_ref(j2.j2_string) +10)--------TableScan: j3 projection=[j3_string, j3_id] +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Utf8, Column { relation: Some(Bare { table: "j2" }), name: "j2_string" }) + +query TT +explain SELECT * FROM j1, LATERAL (SELECT * FROM j1, LATERAL (SELECT * FROM j2 WHERE j1_id = j2_id) as j2) as j2; +---- 
+logical_plan +01)Cross Join: +02)--TableScan: j1 projection=[j1_string, j1_id] +03)--SubqueryAlias: j2 +04)----Subquery: +05)------Cross Join: +06)--------TableScan: j1 projection=[j1_string, j1_id] +07)--------SubqueryAlias: j2 +08)----------Subquery: +09)------------Filter: outer_ref(j1.j1_id) = j2.j2_id +10)--------------TableScan: j2 projection=[j2_string, j2_id] +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) + +query TT +explain SELECT j1_string, j2_string FROM j1 LEFT JOIN LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2 ON(true); +---- +logical_plan +01)Left Join: +02)--TableScan: j1 projection=[j1_string] +03)--SubqueryAlias: j2 +04)----Projection: j2.j2_string +05)------Subquery: +06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id +07)----------TableScan: j2 projection=[j2_string, j2_id] +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) + +query TT +explain SELECT * FROM j1, (j2 LEFT JOIN LATERAL (SELECT * FROM j3 WHERE j1_id + j2_id = j3_id) AS j3 ON(true)); +---- +logical_plan +01)Cross Join: +02)--TableScan: j1 projection=[j1_string, j1_id] +03)--Left Join: +04)----TableScan: j2 projection=[j2_string, j2_id] +05)----SubqueryAlias: j3 +06)------Subquery: +07)--------Filter: outer_ref(j1.j1_id) + outer_ref(j2.j2_id) = j3.j3_id +08)----------TableScan: j3 projection=[j3_string, j3_id] +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) + +query TT +explain SELECT * FROM j1, LATERAL (SELECT 1) AS j2; +---- +logical_plan +01)Cross Join: +02)--TableScan: j1 projection=[j1_string, j1_id] +03)--SubqueryAlias: j2 +04)----Projection: Int64(1) +05)------EmptyRelation +physical_plan +01)CrossJoinExec +02)--DataSourceExec: partitions=1, partition_sizes=[0] +03)--ProjectionExec: expr=[1 as Int64(1)] +04)----PlaceholderRowExec + +statement count 0 +drop table j1; + +statement count 0 +drop table j2; + +statement count 0 +drop table j3; + +statement count 0 +drop table j4; + +statement count 0 +create table person(id int); + +statement count 0 +create table orders(customer_id int); + +query TT +explain SELECT * FROM person INNER JOIN orders ON orders.customer_id * 2 = person.id + 10 +---- +logical_plan +01)Inner Join: CAST(person.id AS Int64) + Int64(10) = CAST(orders.customer_id AS Int64) * Int64(2) +02)--TableScan: person projection=[id] +03)--TableScan: orders projection=[customer_id] +physical_plan +01)CoalesceBatchesExec: target_batch_size=3 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(person.id + Int64(10)@1, orders.customer_id * Int64(2)@1)], projection=[id@0, customer_id@2] +03)----ProjectionExec: expr=[id@0 as id, CAST(id@0 AS Int64) + 10 as person.id + Int64(10)] +04)------DataSourceExec: partitions=1, partition_sizes=[0] +05)----ProjectionExec: expr=[customer_id@0 as customer_id, CAST(customer_id@0 AS Int64) * 2 as orders.customer_id * Int64(2)] +06)------DataSourceExec: partitions=1, partition_sizes=[0] + +statement count 0 +drop table person; + +statement count 0 +drop table orders; diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 067b23ac2fb0..93ffa313b8f7 100644 --- 
a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -654,7 +654,7 @@ explain select * FROM ( ---- logical_plan 01)Limit: skip=4, fetch=10 -02)--Sort: ordered_table.c DESC NULLS FIRST, fetch=14 +02)--Sort: c DESC NULLS FIRST, fetch=14 03)----Union 04)------Projection: CAST(ordered_table.c AS Int64) AS c 05)--------TableScan: ordered_table projection=[c] diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index f088e071d7e7..4e8be56f3377 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -774,7 +774,7 @@ SELECT * FROM v ORDER BY 1, 2; ---- logical_plan -01)Sort: u.m ASC NULLS LAST, u.t ASC NULLS LAST +01)Sort: m ASC NULLS LAST, t ASC NULLS LAST 02)--Union 03)----SubqueryAlias: u 04)------Projection: Int64(0) AS m, m0.t @@ -1248,7 +1248,7 @@ order by d, c, a, a0, b limit 2; ---- logical_plan -01)Sort: t1.d ASC NULLS LAST, t1.c ASC NULLS LAST, t1.a ASC NULLS LAST, t1.a0 ASC NULLS LAST, t1.b ASC NULLS LAST, fetch=2 +01)Sort: d ASC NULLS LAST, c ASC NULLS LAST, a ASC NULLS LAST, a0 ASC NULLS LAST, b ASC NULLS LAST, fetch=2 02)--Union 03)----SubqueryAlias: t1 04)------Projection: ordered_table.b, ordered_table.c, ordered_table.a, Int32(NULL) AS a0, ordered_table.d diff --git a/datafusion/sqllogictest/test_files/prepare.slt b/datafusion/sqllogictest/test_files/prepare.slt index 5d0f417640ec..33df0d26f361 100644 --- a/datafusion/sqllogictest/test_files/prepare.slt +++ b/datafusion/sqllogictest/test_files/prepare.slt @@ -312,3 +312,18 @@ SET datafusion.explain.logical_plan_only=false; statement ok DROP TABLE person; + +statement ok +SET datafusion.explain.logical_plan_only=true; + +statement count 0 +PREPARE my_plan(STRING, STRING) AS SELECT * FROM (VALUES(1, $1), (2, $2)) AS t (num, letter); + +statement count 5 +explain PREPARE my_plan(STRING, STRING) AS SELECT * FROM (VALUES(1, $1), (2, $2)) AS t (num, letter); + +query IT +EXECUTE my_plan('a', 'b'); +---- +1 a +2 b diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index f1ac0696bff9..aa14faf984e4 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1820,6 +1820,9 @@ query I select a from t; ---- +statement count 0 +drop table t; + statement ok set datafusion.optimizer.max_passes=3; @@ -1842,3 +1845,29 @@ SELECT t1.v1 FROM (SELECT 1 AS "t1.v1"); # Test issue: https://github.com/apache/datafusion/issues/14124 query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 10000 \* 100000000000000000000000000000000000 SELECT ('0.54321543215432154321543215432154321'::DECIMAL(35,35) + 10000)::VARCHAR + +# where_selection_with_ambiguous_column +statement ok +CREATE TABLE t(a int, b int, id int); + +query error DataFusion error: Schema error: Ambiguous reference to unqualified field id +select * from t a, t b where id = id + 1; + +statement count 0 +drop table t; + + +# test wildcard + other columns +statement count 0 +create table t(a int) as values (1), (2), (3), (1); + +query II +select *, count(*) over() as ta from t; +---- +1 4 +2 4 +3 4 +1 4 + +statement count 0 +drop table t; diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index d10e603ea5f3..43193fb41cfa 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ 
b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -statement count 0 -create table t(a int) as values (1); +statement ok +create table t(a int, b string) as values (1, 'a'), (2, NULL), (NULL, 'c'); # test between simplification query TT @@ -30,5 +30,38 @@ physical_plan 02)--FilterExec: a@0 = 3 03)----DataSourceExec: partitions=1, partition_sizes=[1] -statement count 0 +# test regex exprs +query TT +explain select b from t where b ~ '.*' +---- +logical_plan +01)Filter: t.b IS NOT NULL +02)--TableScan: t projection=[b] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: b@0 IS NOT NULL +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select b from t where b !~ '.*' +---- +logical_plan +01)Filter: t.b = Utf8("") +02)--TableScan: t projection=[b] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: b@0 = +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +query T +select b from t where b ~ '.*' +---- +a +c + +query T +select b from t where b !~ '.*' +---- + +statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 69c4b9bfcb4b..96fb2477598c 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -1100,7 +1100,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) LIKE Utf8("%an%") AS c1 +01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1 02)--TableScan: test projection=[column1_utf8view] # `~*` operator (regex match case-insensitive) @@ -1110,7 +1110,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) ~* Utf8("^a.{3}e") AS c1 +01)Projection: test.column1_utf8view ~* Utf8View("^a.{3}e") AS c1 02)--TableScan: test projection=[column1_utf8view] # `!~~` operator (not like match) @@ -1120,7 +1120,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) !~~ Utf8("xia_g%g") AS c1 +01)Projection: test.column1_utf8view !~~ Utf8View("xia_g%g") AS c1 02)--TableScan: test projection=[column1_utf8view] # `!~~*` operator (not like match case-insensitive) @@ -1130,7 +1130,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) !~~* Utf8("xia_g%g") AS c1 +01)Projection: test.column1_utf8view !~~* Utf8View("xia_g%g") AS c1 02)--TableScan: test projection=[column1_utf8view] # coercions between stringview and date types diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index b547271925aa..bdba73876103 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -286,7 +286,7 @@ drop table struct_values; statement ok CREATE OR REPLACE VIEW complex_view AS SELECT { - 'user': { + 'user_information': { 'info': { 'personal': { 'name': 'John Doe', @@ -347,22 +347,22 @@ SELECT { } AS complex_data; query T -SELECT complex_data.user.info.personal.name FROM complex_view; +SELECT complex_data.user_information.info.personal.name FROM complex_view; ---- John Doe query I -SELECT complex_data.user.info.personal.age FROM complex_view; +SELECT complex_data.user_information.info.personal.age FROM complex_view; ---- 30 query T -SELECT 
complex_data.user.info.address.city FROM complex_view; +SELECT complex_data.user_information.info.address.city FROM complex_view; ---- Anytown query T -SELECT complex_data.user.preferences.languages[2] FROM complex_view; +SELECT complex_data.user_information.preferences.languages[2] FROM complex_view; ---- es @@ -595,3 +595,40 @@ Struct([Field { name: "r", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ statement ok drop table t; + + +# Test struct field access with subscript notation +# This tests accessing struct fields using the subscript notation with string literals + +statement ok +create table test (struct_field struct(substruct int)) as values (struct(1)); + +query ?? +select * +from test as test1, test as test2 where +test1.struct_field['substruct'] = test2.struct_field['substruct']; +---- +{substruct: 1} {substruct: 1} + +statement ok +DROP TABLE test; + +statement ok +create table test (struct_field struct(substruct struct(subsubstruct int))) as values (struct(struct(1))); + +query ?? +select * +from test as test1, test as test2 where +test1.struct_field.substruct['subsubstruct'] = test2.struct_field.substruct['subsubstruct']; +---- +{substruct: {subsubstruct: 1}} {substruct: {subsubstruct: 1}} + +query ?? +select * +from test AS test1, test AS test2 where +test1.struct_field['substruct']['subsubstruct'] = test2.struct_field['substruct']['subsubstruct']; +---- +{substruct: {subsubstruct: 1}} {substruct: {subsubstruct: 1}} + +statement ok +drop table test; diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 207bb72fd549..4c1565c7f033 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -438,7 +438,7 @@ logical_plan 08)----------TableScan: t1 projection=[t1_int] #invalid_scalar_subquery -statement error DataFusion error: Invalid \(non-executable\) plan after Analyzer\ncaused by\nError during planning: Scalar subquery should only return one column, but found 2: t2.t2_id, t2.t2_name +statement error DataFusion error: Error during planning: Too many columns! 
The subquery should only return one column: t2.t2_id, t2.t2_name SELECT t1_id, t1_name, t1_int, (select t2_id, t2_name FROM t2 WHERE t2.t2_id = t1.t1_int) FROM t1 #subquery_not_allowed @@ -1431,3 +1431,23 @@ drop table t1; statement count 0 drop table t2; + + +# test exists_subquery_wildcard +statement count 0 +create table person(id int, last_name int, state int); + +query TT +explain SELECT id FROM person p WHERE EXISTS + (SELECT * FROM person WHERE last_name = p.last_name AND state = p.state) +---- +logical_plan +01)Projection: p.id +02)--LeftSemi Join: p.last_name = __correlated_sq_1.last_name, p.state = __correlated_sq_1.state +03)----SubqueryAlias: p +04)------TableScan: person projection=[id, last_name, state] +05)----SubqueryAlias: __correlated_sq_1 +06)------TableScan: person projection=[last_name, state] + +statement count 0 +drop table person; diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt index 4ca19c0b6af7..5d22bf92e7e6 100644 --- a/datafusion/sqllogictest/test_files/subquery_sort.slt +++ b/datafusion/sqllogictest/test_files/subquery_sort.slt @@ -104,6 +104,35 @@ physical_plan 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true +#Test with utf8view for window function +statement ok +CREATE TABLE sink_table_with_utf8view AS +SELECT arrow_cast(c1, 'Utf8View') AS c1, c2, c3, c9 +FROM sink_table; + + +query TT +EXPLAIN SELECT t2.c1, t2.r FROM (SELECT c1, RANK() OVER (ORDER BY c1 DESC) AS r, c3, c9 FROM sink_table_with_utf8view ORDER BY c1, c3 LIMIT 2) AS t2 ORDER BY t2.c1, t2.c3, t2.c9; +---- +logical_plan +01)Projection: t2.c1, t2.r +02)--Sort: t2.c1 ASC NULLS LAST, t2.c3 ASC NULLS LAST, t2.c9 ASC NULLS LAST +03)----SubqueryAlias: t2 +04)------Sort: sink_table_with_utf8view.c1 ASC NULLS LAST, sink_table_with_utf8view.c3 ASC NULLS LAST, fetch=2 +05)--------Projection: sink_table_with_utf8view.c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS r, sink_table_with_utf8view.c3, sink_table_with_utf8view.c9 +06)----------WindowAggr: windowExpr=[[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +07)------------TableScan: sink_table_with_utf8view projection=[c1, c3, c9] +physical_plan +01)ProjectionExec: expr=[c1@0 as c1, r@1 as r] +02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false] +03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9] +04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Utf8View(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +05)--------SortPreservingMergeExec: [c1@0 DESC] +06)----------SortExec: expr=[c1@0 DESC], preserve_partitioning=[true] +07)------------DataSourceExec: partitions=4, 
partition_sizes=[1, 0, 0, 0] + +statement ok +DROP TABLE sink_table_with_utf8view; query TT EXPLAIN SELECT c1, c2 FROM (SELECT DISTINCT ON (c1) c1, c2, c3, c9 FROM sink_table ORDER BY c1, c3 DESC, c9) AS t2 ORDER BY t2.c1, t2.c3 DESC, t2.c9 diff --git a/datafusion/sqllogictest/test_files/type_coercion.slt b/datafusion/sqllogictest/test_files/type_coercion.slt index 0900c88c15c0..2c6079bc7039 100644 --- a/datafusion/sqllogictest/test_files/type_coercion.slt +++ b/datafusion/sqllogictest/test_files/type_coercion.slt @@ -187,7 +187,7 @@ EXPLAIN SELECT a FROM (select 1 a) x GROUP BY 1 (SELECT a FROM (select 1.1 a) x GROUP BY 1) ORDER BY 1 ---- logical_plan -01)Sort: x.a ASC NULLS LAST +01)Sort: a ASC NULLS LAST 02)--Union 03)----Projection: CAST(x.a AS Float64) AS a 04)------Aggregate: groupBy=[[x.a]], aggr=[[]] diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 9ab732c65533..b229d89f4165 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -226,7 +226,7 @@ query TT EXPLAIN SELECT name FROM t1 UNION (SELECT name from t2 UNION SELECT name || '_new' from t2) ---- logical_plan -01)Aggregate: groupBy=[[t1.name]], aggr=[[]] +01)Aggregate: groupBy=[[name]], aggr=[[]] 02)--Union 03)----TableScan: t1 projection=[name] 04)----TableScan: t2 projection=[name] @@ -411,7 +411,7 @@ query TT explain SELECT c1, c9 FROM aggregate_test_100 UNION ALL SELECT c1, c3 FROM aggregate_test_100 ORDER BY c9 DESC LIMIT 5 ---- logical_plan -01)Sort: aggregate_test_100.c9 DESC NULLS FIRST, fetch=5 +01)Sort: c9 DESC NULLS FIRST, fetch=5 02)--Union 03)----Projection: aggregate_test_100.c1, CAST(aggregate_test_100.c9 AS Decimal128(20, 0)) AS c9 04)------TableScan: aggregate_test_100 projection=[c1, c9] @@ -449,7 +449,7 @@ SELECT count(*) FROM ( ---- logical_plan 01)Projection: count(Int64(1)) AS count(*) -02)--Aggregate: groupBy=[[t1.name]], aggr=[[count(Int64(1))]] +02)--Aggregate: groupBy=[[name]], aggr=[[count(Int64(1))]] 03)----Union 04)------Aggregate: groupBy=[[t1.name]], aggr=[[]] 05)--------TableScan: t1 projection=[name] @@ -601,7 +601,7 @@ UNION ALL ORDER BY c1 ---- logical_plan -01)Sort: t1.c1 ASC NULLS LAST +01)Sort: c1 ASC NULLS LAST 02)--Union 03)----TableScan: t1 projection=[c1] 04)----Projection: t2.c1a AS c1 @@ -709,6 +709,25 @@ SELECT t1.v2, t1.v0 FROM t2 NATURAL JOIN t1 SELECT t1.v2, t1.v0 FROM t2 NATURAL JOIN t1 WHERE (t1.v2 IS NULL); ---- +query IR +SELECT t1.v0, t2.v0 FROM t1,t2 + UNION ALL +SELECT t1.v0, t2.v0 FROM t1,t2 +ORDER BY v0; +---- +-1493773377 0.280145772929 +-1493773377 0.280145772929 +-1229445667 0.280145772929 +-1229445667 0.280145772929 +1541512604 0.280145772929 +1541512604 0.280145772929 +NULL 0.280145772929 +NULL 0.280145772929 +NULL 0.280145772929 +NULL 0.280145772929 +NULL 0.280145772929 +NULL 0.280145772929 + statement ok CREATE TABLE t3 ( id INT @@ -814,7 +833,7 @@ UNION ALL ORDER BY c1 ---- logical_plan -01)Sort: aggregate_test_100.c1 ASC NULLS LAST +01)Sort: c1 ASC NULLS LAST 02)--Union 03)----Filter: aggregate_test_100.c1 = Utf8("a") 04)------TableScan: aggregate_test_100 projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], partial_filters=[aggregate_test_100.c1 = Utf8("a")] @@ -860,3 +879,84 @@ FROM ( GROUP BY combined ---- AB + + +# Test union in view +statement ok +CREATE TABLE u1 (x INT, y INT); + +statement ok +INSERT INTO u1 VALUES (3, 3), (3, 3), (1, 1); + +statement ok +CREATE TABLE u2 (y BIGINT, z BIGINT); + +statement ok +INSERT INTO u2 
VALUES (20, 20), (40, 40); + +statement ok +CREATE VIEW v1 AS +SELECT y FROM u1 UNION ALL SELECT y FROM u2 ORDER BY y; + +query I rowsort +SELECT * FROM (SELECT y FROM u1 UNION ALL SELECT y FROM u2) ORDER BY y; +---- +1 +20 +3 +3 +40 + +query TT +explain SELECT * FROM (SELECT y FROM u1 UNION ALL SELECT y FROM u2) ORDER BY y; +---- +logical_plan +01)Sort: y ASC NULLS LAST +02)--Union +03)----Projection: CAST(u1.y AS Int64) AS y +04)------TableScan: u1 projection=[y] +05)----TableScan: u2 projection=[y] +physical_plan +01)SortPreservingMergeExec: [y@0 ASC NULLS LAST] +02)--UnionExec +03)----SortExec: expr=[y@0 ASC NULLS LAST], preserve_partitioning=[true] +04)------ProjectionExec: expr=[CAST(y@0 AS Int64) as y] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------DataSourceExec: partitions=1, partition_sizes=[1] +07)----SortExec: expr=[y@0 ASC NULLS LAST], preserve_partitioning=[false] +08)------DataSourceExec: partitions=1, partition_sizes=[1] + +# optimize_subquery_sort in create_relation removes Sort so the result is not sorted. +query I +SELECT * FROM v1; +---- +20 +40 +3 +3 +1 + +query TT +explain SELECT * FROM v1; +---- +logical_plan +01)SubqueryAlias: v1 +02)--Union +03)----Projection: CAST(u1.y AS Int64) AS y +04)------TableScan: u1 projection=[y] +05)----TableScan: u2 projection=[y] +physical_plan +01)UnionExec +02)--ProjectionExec: expr=[CAST(y@0 AS Int64) as y] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)--DataSourceExec: partitions=1, partition_sizes=[1] + +statement count 0 +drop view v1; + +statement count 0 +drop table u1; + +statement count 0 +drop table u2; diff --git a/datafusion/sqllogictest/test_files/union_by_name.slt b/datafusion/sqllogictest/test_files/union_by_name.slt index 63a43a36ff16..3844dba68079 100644 --- a/datafusion/sqllogictest/test_files/union_by_name.slt +++ b/datafusion/sqllogictest/test_files/union_by_name.slt @@ -54,13 +54,13 @@ INSERT INTO t2 VALUES (2, 2), (4, 4); # Test binding query I -SELECT t1.x FROM t1 UNION BY NAME SELECT x FROM t1 ORDER BY t1.x; +SELECT t1.x FROM t1 UNION BY NAME SELECT x FROM t1 ORDER BY x; ---- 1 3 query I -SELECT t1.x FROM t1 UNION ALL BY NAME SELECT x FROM t1 ORDER BY t1.x; +SELECT t1.x FROM t1 UNION ALL BY NAME SELECT x FROM t1 ORDER BY x; ---- 1 1 @@ -70,13 +70,13 @@ SELECT t1.x FROM t1 UNION ALL BY NAME SELECT x FROM t1 ORDER BY t1.x; 3 query I -SELECT x FROM t1 UNION BY NAME SELECT x FROM t1 ORDER BY t1.x; +SELECT x FROM t1 UNION BY NAME SELECT x FROM t1 ORDER BY x; ---- 1 3 query I -SELECT x FROM t1 UNION ALL BY NAME SELECT x FROM t1 ORDER BY t1.x; +SELECT x FROM t1 UNION ALL BY NAME SELECT x FROM t1 ORDER BY x; ---- 1 1 @@ -124,8 +124,8 @@ NULL 3 # Ambiguous name -statement error DataFusion error: Schema error: No field named t1.x. Valid fields are a, b. -SELECT x AS a FROM t1 UNION BY NAME SELECT x AS b FROM t1 ORDER BY t1.x; +statement error DataFusion error: Schema error: No field named x. Valid fields are a, b. 
+SELECT x AS a FROM t1 UNION BY NAME SELECT x AS b FROM t1 ORDER BY x; query II (SELECT y FROM t1 UNION ALL SELECT x FROM t1) UNION BY NAME (SELECT z FROM t2 UNION ALL SELECT y FROM t2) ORDER BY y, z; diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index f68fd993ddd0..f9e741bf3ca3 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -32,14 +32,14 @@ AS VALUES statement ok CREATE TABLE nested_unnest_table -AS VALUES +AS VALUES (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20])), [struct('a', 'b')]), (struct('d', 'e', struct('f')), (struct('x', 'y', [30,40, 50])), null) ; statement ok CREATE TABLE recursive_unnest_table -AS VALUES +AS VALUES (struct([1], 'a'), [[[1],[2]],[[1,1]]], [struct([1],[[1,2]])]), (struct([2], 'b'), [[[3,4],[5]],[[null,6],null,[7,8]]], [struct([2],[[3],[4]])]) ; @@ -264,9 +264,9 @@ NULL NULL 17 NULL NULL 18 query IIIT -select - unnest(column1), unnest(column2) + 2, - column3 * 10, unnest(array_remove(column1, '4')) +select + unnest(column1), unnest(column2) + 2, + column3 * 10, unnest(array_remove(column1, '4')) from unnest_table; ---- 1 9 10 1 @@ -795,7 +795,7 @@ select unnest(unnest(column2)) c2, count(column3) from recursive_unnest_table gr [NULL, 6] 1 NULL 1 -query error DataFusion error: Error during planning: Projection references non\-aggregate values +query error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "nested_unnest_table\.column1" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "UNNEST\(nested_unnest_table\.column1\)\[c0\]" appears in the SELECT clause satisfies this requirement select unnest(column1) c1 from nested_unnest_table group by c1.c0; # TODO: this query should work. 
see issue: https://github.com/apache/datafusion/issues/12794 @@ -875,7 +875,7 @@ query TT explain select * from unnest_table u, unnest(u.column1); ---- logical_plan -01)Cross Join: +01)Cross Join: 02)--SubqueryAlias: u 03)----TableScan: unnest_table projection=[column1, column2, column3, column4, column5] 04)--Subquery: diff --git a/datafusion/sqllogictest/test_files/update.slt b/datafusion/sqllogictest/test_files/update.slt index 0f9582b04c58..908d2b34aea4 100644 --- a/datafusion/sqllogictest/test_files/update.slt +++ b/datafusion/sqllogictest/test_files/update.slt @@ -78,8 +78,8 @@ physical_plan_error This feature is not implemented: Unsupported logical plan: D statement ok create table t3(a int, b varchar, c double, d int); -# set from multiple tables, sqlparser only supports from one table -query error DataFusion error: SQL error: ParserError\("Expected end of statement, found: ,"\) +# set from multiple tables, DataFusion only supports from one table +query error DataFusion error: Error during planning: Multiple tables in UPDATE SET FROM not yet supported explain update t1 set b = t2.b, c = t3.a, d = 1 from t2, t3 where t1.a = t2.a and t1.a = t3.a; # test table alias diff --git a/datafusion/sqllogictest/test_files/wildcard.slt b/datafusion/sqllogictest/test_files/wildcard.slt index 7c076f040feb..1a480eac0cc3 100644 --- a/datafusion/sqllogictest/test_files/wildcard.slt +++ b/datafusion/sqllogictest/test_files/wildcard.slt @@ -145,3 +145,18 @@ DROP TABLE t2; statement ok DROP TABLE aggregate_simple; + +statement ok +create table t(a int, b int, c int) as values (1, 2, 3); + +query error DataFusion error: Error during planning: Projections require unique expression names but the expression "t\.a" at position 0 and "t\.a" at position 3 have the same name\. Consider aliasing \("AS"\) one of them\. 
+select *, a from t; + +# a is aliased to other name so the query is valid +query IIII +select *, a as aka from t; +---- +1 2 3 1 + +statement count 0 +drop table t; diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 6c0e69a467e1..d3c31e31ede2 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -5536,3 +5536,62 @@ physical_plan 01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5] 02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true + +# Testing Utf8View with window +statement ok +CREATE TABLE aggregate_test_100_utf8view AS SELECT + arrow_cast(c1, 'Utf8View') as c1, + c9, + c13 +FROM aggregate_test_100; + + +#fn window_frame_ranges_string_check +query II +SELECT +SUM(LENGTH(c13)) OVER(ORDER BY c13), +SUM(LENGTH(c1)) OVER(ORDER BY c1) +FROM aggregate_test_100_utf8view +ORDER BY c9 +LIMIT 5 +---- +2100 100 +510 79 +1440 21 +1830 61 +2010 21 + + +#fn test_window_rank +query IIIIIRR +SELECT + c9, + RANK() OVER(ORDER BY c1) AS rank1, + RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rank2, + DENSE_RANK() OVER(ORDER BY c1) as dense_rank1, + DENSE_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as dense_rank2, + PERCENT_RANK() OVER(ORDER BY c1) as percent_rank1, + PERCENT_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as percent_rank2 + FROM aggregate_test_100_utf8view + ORDER BY c9 + LIMIT 5 +---- +28774375 80 80 5 5 0.79797979798 0.79797979798 +63044568 62 62 4 4 0.616161616162 0.616161616162 +141047417 1 1 1 1 0 0 +141680161 41 41 3 3 0.40404040404 0.40404040404 +145294611 1 1 1 1 0 0 + + +# CTAS with NTILE function +statement ok +CREATE TABLE new_table AS SELECT NTILE(2) OVER(ORDER BY c1) AS ntile_2 FROM aggregate_test_100_utf8view; + +statement ok +DROP TABLE new_table; + +statement ok +DROP TABLE aggregate_test_100_utf8view; + +statement ok +DROP TABLE aggregate_test_100 diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 3e3ea7843ac9..edc3b8d2f214 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -39,7 +39,7 @@ itertools = { workspace = true } object_store = { workspace = true } pbjson-types = { workspace = true } prost = { workspace = true } -substrait = { version = "0.53", features = ["serde"] } +substrait = { version = "0.55", features = ["serde"] } url = { workspace = true } tokio = { workspace = true, features = ["fs"] } diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs index a7493a48e4c5..0f2fbf199be3 100644 --- a/datafusion/substrait/src/lib.rs +++ b/datafusion/substrait/src/lib.rs @@ -20,6 +20,9 @@ html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg" )] #![cfg_attr(docsrs, 
feature(doc_auto_cfg))] +// Make sure fast / cheap clones on Arc are explicit: +// https://github.com/apache/datafusion/issues/11143 +#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))] //! Serialize / Deserialize DataFusion Plans to [Substrait.io] //! diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 7c6c45f44db7..61f3379735c7 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -24,7 +24,8 @@ use datafusion::arrow::datatypes::{ }; use datafusion::common::{ not_impl_datafusion_err, not_impl_err, plan_datafusion_err, plan_err, - substrait_datafusion_err, substrait_err, DFSchema, DFSchemaRef, TableReference, + substrait_datafusion_err, substrait_err, DFSchema, DFSchemaRef, Spans, + TableReference, }; use datafusion::datasource::provider_as_source; use datafusion::logical_expr::expr::{Exists, InSubquery, Sort, WindowFunctionParams}; @@ -104,10 +105,11 @@ use substrait::proto::{ rel::RelType, rel_common, sort_field::{SortDirection, SortKind::*}, - AggregateFunction, AggregateRel, ConsistentPartitionWindowRel, CrossRel, ExchangeRel, - Expression, ExtendedExpression, ExtensionLeafRel, ExtensionMultiRel, - ExtensionSingleRel, FetchRel, FilterRel, FunctionArgument, JoinRel, NamedStruct, - Plan, ProjectRel, ReadRel, Rel, RelCommon, SetRel, SortField, SortRel, Type, + AggregateFunction, AggregateRel, ConsistentPartitionWindowRel, CrossRel, + DynamicParameter, ExchangeRel, Expression, ExtendedExpression, ExtensionLeafRel, + ExtensionMultiRel, ExtensionSingleRel, FetchRel, FilterRel, FunctionArgument, + JoinRel, NamedStruct, Plan, ProjectRel, ReadRel, Rel, RelCommon, SetRel, SortField, + SortRel, Type, }; #[async_trait] @@ -392,6 +394,14 @@ pub trait SubstraitConsumer: Send + Sync + Sized { not_impl_err!("Enum expression not supported") } + async fn consume_dynamic_parameter( + &self, + _expr: &DynamicParameter, + _input_schema: &DFSchema, + ) -> Result { + not_impl_err!("Dynamic Parameter expression not supported") + } + // User-Defined Functionality // The details of extension relations, and how to handle them, are fully up to users to specify. @@ -766,7 +776,7 @@ pub async fn from_substrait_plan_with_consumer( return Ok(plan); } let renamed_schema = make_renamed_schema(plan.schema(), &root.names)?; - if renamed_schema.equivalent_names_and_types(plan.schema()) { + if renamed_schema.has_equivalent_names_and_types(plan.schema()).is_ok() { // Nothing to do if the schema is already equivalent return Ok(plan); } @@ -1049,8 +1059,8 @@ pub async fn from_project_rel( p: &ProjectRel, ) -> Result { if let Some(input) = p.input.as_ref() { - let mut input = LogicalPlanBuilder::from(consumer.consume_rel(input).await?); - let original_schema = input.schema().clone(); + let input = consumer.consume_rel(input).await?; + let original_schema = Arc::clone(input.schema()); // Ensure that all expressions have a unique display name, so that // validate_unique_names does not fail when constructing the project. @@ -1065,6 +1075,10 @@ pub async fn from_project_rel( // leaving only explicit expressions. let mut explicit_exprs: Vec = vec![]; + // For WindowFunctions, we need to wrap them in a Window relation. If there are duplicates, + // we can do the window'ing only once, then the project will duplicate the result. + // Order here doesn't matter since LPB::window_plan sorts the expressions. 
+ let mut window_exprs: HashSet = HashSet::new(); for expr in &p.expressions { let e = consumer .consume_expression(expr, input.clone().schema()) @@ -1074,18 +1088,24 @@ pub async fn from_project_rel( // Adding the same expression here and in the project below // works because the project's builder uses columnize_expr(..) // to transform it into a column reference - input = input.window(vec![e.clone()])? + window_exprs.insert(e.clone()); } explicit_exprs.push(name_tracker.get_uniquely_named_expr(e)?); } + let input = if !window_exprs.is_empty() { + LogicalPlanBuilder::window_plan(input, window_exprs)? + } else { + input + }; + let mut final_exprs: Vec = vec![]; for index in 0..original_schema.fields().len() { let e = Expr::Column(Column::from(original_schema.qualified_field(index))); final_exprs.push(name_tracker.get_uniquely_named_expr(e)?); } final_exprs.append(&mut explicit_exprs); - input.project(final_exprs)?.build() + project(input, final_exprs) } else { not_impl_err!("Projection without an input is not supported") } @@ -1616,7 +1636,7 @@ fn apply_emit_kind( .get(field as usize) .ok_or_else(|| substrait_datafusion_err!( "Emit output field {} cannot be resolved in input schema {}", - field, proj.input.schema().clone() + field, proj.input.schema() ))?; exprs.push(name_tracker.get_uniquely_named_expr(expr.clone())?); } @@ -1955,6 +1975,15 @@ pub async fn from_substrait_agg_func( let args = from_substrait_func_args(consumer, &f.arguments, input_schema).await?; + // Datafusion does not support aggregate functions with no arguments, so + // we inject a dummy argument that does not affect the query, but allows + // us to bypass this limitation. + let args = if udaf.name() == "count" && args.is_empty() { + vec![Expr::Literal(ScalarValue::Int64(Some(1)))] + } else { + args + }; + Ok(Arc::new(Expr::AggregateFunction( expr::AggregateFunction::new_udf(udaf, args, distinct, filter, order_by, None), ))) @@ -1999,6 +2028,9 @@ pub async fn from_substrait_rex( } RexType::Nested(expr) => consumer.consume_nested(expr, input_schema).await, RexType::Enum(expr) => consumer.consume_enum(expr, input_schema).await, + RexType::DynamicParameter(expr) => { + consumer.consume_dynamic_parameter(expr, input_schema).await + } }, None => substrait_err!("Expression must set rex_type: {:?}", expression), } @@ -2225,11 +2257,19 @@ pub async fn from_window_function( window_frame.regularize_order_bys(&mut order_by)?; + // Datafusion does not support aggregate functions with no arguments, so + // we inject a dummy argument that does not affect the query, but allows + // us to bypass this limitation. + let args = if fun.name() == "count" && window.arguments.is_empty() { + vec![Expr::Literal(ScalarValue::Int64(Some(1)))] + } else { + from_substrait_func_args(consumer, &window.arguments, input_schema).await? 
+ }; + Ok(Expr::WindowFunction(expr::WindowFunction { fun, params: WindowFunctionParams { - args: from_substrait_func_args(consumer, &window.arguments, input_schema) - .await?, + args, partition_by: from_substrait_rex_vec( consumer, &window.partitions, @@ -2268,6 +2308,7 @@ pub async fn from_subquery( subquery: Subquery { subquery: Arc::new(haystack_expr), outer_ref_columns: outer_refs, + spans: Spans::new(), }, negated: false, })) @@ -2286,6 +2327,7 @@ pub async fn from_subquery( Ok(Expr::ScalarSubquery(Subquery { subquery: Arc::new(plan), outer_ref_columns, + spans: Spans::new(), })) } SubqueryType::SetPredicate(predicate) => { @@ -2301,6 +2343,7 @@ pub async fn from_subquery( Subquery { subquery: Arc::new(plan), outer_ref_columns, + spans: Spans::new(), }, false, ))) @@ -3380,4 +3423,31 @@ mod test { Ok(()) } + + #[tokio::test] + async fn window_function_with_count() -> Result<()> { + let substrait = substrait::proto::Expression { + rex_type: Some(substrait::proto::expression::RexType::WindowFunction( + substrait::proto::expression::WindowFunction { + function_reference: 0, + ..Default::default() + }, + )), + }; + + let mut consumer = test_consumer(); + + let mut extensions = Extensions::default(); + extensions.register_function("count".to_string()); + consumer.extensions = &extensions; + + match from_substrait_rex(&consumer, &substrait, &DFSchema::empty()).await? { + Expr::WindowFunction(window_function) => { + assert_eq!(window_function.params.args.len(), 1) + } + _ => panic!("expr was not a WindowFunction"), + }; + + Ok(()) + } } diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index cc7efed419c2..44baf277786d 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -15,9 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::config::ConfigOptions; -use datafusion::optimizer::analyzer::expand_wildcard_rule::ExpandWildcardRule; -use datafusion::optimizer::AnalyzerRule; use std::sync::Arc; use substrait::proto::expression_reference::ExprType; @@ -435,14 +432,10 @@ pub fn to_substrait_plan(plan: &LogicalPlan, state: &SessionState) -> Result Result= CAST(Utf8(\"1993-07-01\") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8(\"1993-10-01\") AS Date32) AND EXISTS ()\ \n Subquery:\ @@ -269,10 +270,10 @@ mod tests { let plan_str = tpch_plan_to_string(13).await?; assert_eq!( plan_str, - "Projection: count(ORDERS.O_ORDERKEY) AS C_COUNT, count() AS CUSTDIST\ - \n Sort: count() DESC NULLS FIRST, count(ORDERS.O_ORDERKEY) DESC NULLS FIRST\ - \n Projection: count(ORDERS.O_ORDERKEY), count()\ - \n Aggregate: groupBy=[[count(ORDERS.O_ORDERKEY)]], aggr=[[count()]]\ + "Projection: count(ORDERS.O_ORDERKEY) AS C_COUNT, count(Int64(1)) AS CUSTDIST\ + \n Sort: count(Int64(1)) DESC NULLS FIRST, count(ORDERS.O_ORDERKEY) DESC NULLS FIRST\ + \n Projection: count(ORDERS.O_ORDERKEY), count(Int64(1))\ + \n Aggregate: groupBy=[[count(ORDERS.O_ORDERKEY)]], aggr=[[count(Int64(1))]]\ \n Projection: count(ORDERS.O_ORDERKEY)\ \n Aggregate: groupBy=[[CUSTOMER.C_CUSTKEY]], aggr=[[count(ORDERS.O_ORDERKEY)]]\ \n Projection: CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY\ @@ -410,10 +411,10 @@ mod tests { let plan_str = tpch_plan_to_string(21).await?; assert_eq!( plan_str, - "Projection: SUPPLIER.S_NAME, count() AS NUMWAIT\ + "Projection: SUPPLIER.S_NAME, count(Int64(1)) AS NUMWAIT\ \n Limit: skip=0, fetch=100\ - \n Sort: count() DESC NULLS FIRST, SUPPLIER.S_NAME ASC NULLS LAST\ - \n Aggregate: groupBy=[[SUPPLIER.S_NAME]], aggr=[[count()]]\ + \n Sort: count(Int64(1)) DESC NULLS FIRST, SUPPLIER.S_NAME ASC NULLS LAST\ + \n Aggregate: groupBy=[[SUPPLIER.S_NAME]], aggr=[[count(Int64(1))]]\ \n Projection: SUPPLIER.S_NAME\ \n Filter: SUPPLIER.S_SUPPKEY = LINEITEM.L_SUPPKEY AND ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY AND ORDERS.O_ORDERSTATUS = Utf8(\"F\") AND LINEITEM.L_RECEIPTDATE > LINEITEM.L_COMMITDATE AND EXISTS () AND NOT EXISTS () AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8(\"SAUDI ARABIA\")\ \n Subquery:\ @@ -438,9 +439,9 @@ mod tests { let plan_str = tpch_plan_to_string(22).await?; assert_eq!( plan_str, - "Projection: substr(CUSTOMER.C_PHONE,Int32(1),Int32(2)) AS CNTRYCODE, count() AS NUMCUST, sum(CUSTOMER.C_ACCTBAL) AS TOTACCTBAL\ + "Projection: substr(CUSTOMER.C_PHONE,Int32(1),Int32(2)) AS CNTRYCODE, count(Int64(1)) AS NUMCUST, sum(CUSTOMER.C_ACCTBAL) AS TOTACCTBAL\ \n Sort: substr(CUSTOMER.C_PHONE,Int32(1),Int32(2)) ASC NULLS LAST\ - \n Aggregate: groupBy=[[substr(CUSTOMER.C_PHONE,Int32(1),Int32(2))]], aggr=[[count(), sum(CUSTOMER.C_ACCTBAL)]]\ + \n Aggregate: groupBy=[[substr(CUSTOMER.C_PHONE,Int32(1),Int32(2))]], aggr=[[count(Int64(1)), sum(CUSTOMER.C_ACCTBAL)]]\ \n Projection: substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)), CUSTOMER.C_ACCTBAL\ \n Filter: (substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"13\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"31\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"23\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"29\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"30\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"18\") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8(\"17\") AS Utf8)) AND CUSTOMER.C_ACCTBAL > () AND 
NOT EXISTS ()\ \n Subquery:\ @@ -455,4 +456,43 @@ mod tests { ); Ok(()) } + + async fn test_plan_to_string(name: &str) -> Result { + let path = format!("tests/testdata/test_plans/{name}"); + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto)?; + let plan = from_substrait_plan(&ctx.state(), &proto).await?; + ctx.state().create_physical_plan(&plan).await?; + Ok(format!("{}", plan)) + } + + #[tokio::test] + async fn test_select_count_from_select_1() -> Result<()> { + let plan_str = + test_plan_to_string("select_count_from_select_1.substrait.json").await?; + + assert_eq!( + plan_str, + "Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]\ + \n Values: (Int64(0))" + ); + Ok(()) + } + + #[tokio::test] + async fn test_select_window_count() -> Result<()> { + let plan_str = test_plan_to_string("select_window_count.substrait.json").await?; + + assert_eq!( + plan_str, + "Projection: count(Int64(1)) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING AS LEAD_EXPR\ + \n WindowAggr: windowExpr=[[count(Int64(1)) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]]\ + \n TableScan: DATA" + ); + Ok(()) + } } diff --git a/datafusion/substrait/tests/cases/logical_plans.rs b/datafusion/substrait/tests/cases/logical_plans.rs index 6f5899595548..579e3535f16d 100644 --- a/datafusion/substrait/tests/cases/logical_plans.rs +++ b/datafusion/substrait/tests/cases/logical_plans.rs @@ -45,6 +45,10 @@ mod tests { "Projection: NOT DATA.D AS EXPR$0\ \n TableScan: DATA" ); + + // Trigger execution to ensure plan validity + DataFrame::new(ctx.state(), plan).show().await?; + Ok(()) } @@ -71,6 +75,63 @@ mod tests { \n WindowAggr: windowExpr=[[sum(DATA.D) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]]\ \n TableScan: DATA" ); + + // Trigger execution to ensure plan validity + DataFrame::new(ctx.state(), plan).show().await?; + + Ok(()) + } + + #[tokio::test] + async fn double_window_function() -> Result<()> { + // Confirms a WindowExpr can be repeated in the same project. + // This wouldn't normally happen with DF-created plans since CSE would eliminate the duplicate. + + // File generated with substrait-java's Isthmus: + // ./isthmus-cli/build/graal/isthmus --create "create table data (a int)" "select ROW_NUMBER() OVER (), ROW_NUMBER() OVER () AS aliased from data"; + let proto_plan = + read_json("tests/testdata/test_plans/double_window.substrait.json"); + let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?; + let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?; + + assert_eq!( + format!("{}", plan), + "Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW__temp__0 AS ALIASED\ + \n WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\ + \n TableScan: DATA" + ); + + // Trigger execution to ensure plan validity + DataFrame::new(ctx.state(), plan).show().await?; + + Ok(()) + } + + #[tokio::test] + async fn double_window_function_distinct_windows() -> Result<()> { + // Confirms a single project can have multiple window functions with separate windows in it. 
+ // This wouldn't normally happen with DF-created plans since logical optimizer would + // separate them out. + + // File generated with substrait-java's Isthmus: + // ./isthmus-cli/build/graal/isthmus --create "create table data (a int)" "select ROW_NUMBER() OVER (), ROW_NUMBER() OVER (PARTITION BY a) from data"; + let proto_plan = read_json( + "tests/testdata/test_plans/double_window_distinct_windows.substrait.json", + ); + let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?; + let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?; + + assert_eq!( + format!("{}", plan), + "Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$1\ + \n WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\ + \n WindowAggr: windowExpr=[[row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\ + \n TableScan: DATA" + ); + + // Trigger execution to ensure plan validity + DataFrame::new(ctx.state(), plan).show().await?; + Ok(()) } @@ -86,7 +147,7 @@ mod tests { assert_eq!(format!("{}", &plan), "Values: (List([1, 2]))"); - // Need to trigger execution to ensure that Arrow has validated the plan + // Trigger execution to ensure plan validity DataFrame::new(ctx.state(), plan).show().await?; Ok(()) @@ -107,6 +168,9 @@ mod tests { \n TableScan: sales" ); + // Trigger execution to ensure plan validity + DataFrame::new(ctx.state(), plan).show().await?; + Ok(()) } } diff --git a/datafusion/substrait/tests/testdata/test_plans/double_window.substrait.json b/datafusion/substrait/tests/testdata/test_plans/double_window.substrait.json new file mode 100644 index 000000000000..880f6fcae6cb --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/double_window.substrait.json @@ -0,0 +1,126 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_arithmetic.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "row_number:" + } + } + ], + "relations": [ + { + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 1, + 2 + ] + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "A" + ], + "struct": { + "types": [ + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": [ + "DATA" + ] + } + } + }, + "expressions": [ + { + "windowFunction": { + "functionReference": 0, + "partitions": [], + "sorts": [], + "upperBound": { + "currentRow": { + } + }, + "lowerBound": { + "unbounded": { + } + }, + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "args": [], + "arguments": [], + "invocation": "AGGREGATION_INVOCATION_ALL", + "options": [], + "boundsType": "BOUNDS_TYPE_ROWS" + } + }, + { + "windowFunction": { + "functionReference": 0, + "partitions": [], + "sorts": [], + "upperBound": { + "currentRow": { + } + }, + "lowerBound": { + "unbounded": { + } + }, + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "args": [], + "arguments": [], + "invocation": 
"AGGREGATION_INVOCATION_ALL", + "options": [], + "boundsType": "BOUNDS_TYPE_ROWS" + } + } + ] + } + }, + "names": [ + "EXPR$0", + "ALIASED" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/test_plans/double_window_distinct_windows.substrait.json b/datafusion/substrait/tests/testdata/test_plans/double_window_distinct_windows.substrait.json new file mode 100644 index 000000000000..a8906e94c666 --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/double_window_distinct_windows.substrait.json @@ -0,0 +1,138 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_arithmetic.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "row_number:" + } + } + ], + "relations": [ + { + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 1, + 2 + ] + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "A" + ], + "struct": { + "types": [ + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": [ + "DATA" + ] + } + } + }, + "expressions": [ + { + "windowFunction": { + "functionReference": 0, + "partitions": [], + "sorts": [], + "upperBound": { + "currentRow": { + } + }, + "lowerBound": { + "unbounded": { + } + }, + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "args": [], + "arguments": [], + "invocation": "AGGREGATION_INVOCATION_ALL", + "options": [], + "boundsType": "BOUNDS_TYPE_ROWS" + } + }, + { + "windowFunction": { + "functionReference": 0, + "partitions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ], + "sorts": [], + "upperBound": { + "currentRow": { + } + }, + "lowerBound": { + "unbounded": { + } + }, + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "args": [], + "arguments": [], + "invocation": "AGGREGATION_INVOCATION_ALL", + "options": [], + "boundsType": "BOUNDS_TYPE_ROWS" + } + } + ] + } + }, + "names": [ + "EXPR$0", + "EXPR$1" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/test_plans/select_count_from_select_1.substrait.json b/datafusion/substrait/tests/testdata/test_plans/select_count_from_select_1.substrait.json new file mode 100644 index 000000000000..e9f679588018 --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/select_count_from_select_1.substrait.json @@ -0,0 +1,92 @@ +{ + "extensionUris": [ + { + "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_aggregate_generic.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "functionAnchor": 185, + "name": "count:any" + } + } + ], + "relations": [ + { + "root": { + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "dummy" + ], + "struct": { + "types": [ + { + "i64": { + "nullability": "NULLABILITY_REQUIRED" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "virtualTable": { + "values": [ + { + "fields": [ + { + "i64": "0", + "nullable": 
false + } + ] + } + ] + } + } + }, + "groupings": [ + { + "groupingExpressions": [], + "expressionReferences": [] + } + ], + "measures": [ + { + "measure": { + "functionReference": 185, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [], + "options": [] + } + } + ], + "groupingExpressions": [] + } + }, + "names": [ + "count(*)" + ] + } + } + ] +} \ No newline at end of file diff --git a/datafusion/substrait/tests/testdata/test_plans/select_window_count.substrait.json b/datafusion/substrait/tests/testdata/test_plans/select_window_count.substrait.json new file mode 100644 index 000000000000..5b50145e13d6 --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/select_window_count.substrait.json @@ -0,0 +1,137 @@ +{ + "extensionUris": [ + { + "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_aggregate_generic.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "functionAnchor": 185, + "name": "count:any" + } + } + ], + "relations": [ + { + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 3 + ] + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "D", + "PART", + "ORD" + ], + "struct": { + "types": [ + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": [ + "DATA" + ] + } + } + }, + "expressions": [ + { + "windowFunction": { + "functionReference": 185, + "partitions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ], + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + } + ], + "upperBound": { + "unbounded": { + } + }, + "lowerBound": { + "preceding": { + "offset": "1" + } + }, + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "args": [], + "arguments": [], + "invocation": "AGGREGATION_INVOCATION_ALL", + "options": [], + "boundsType": "BOUNDS_TYPE_ROWS" + } + } + ] + } + }, + "names": [ + "LEAD_EXPR" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/utils.rs b/datafusion/substrait/tests/utils.rs index 0034cc27bf6e..e3e3ec3fab01 100644 --- a/datafusion/substrait/tests/utils.rs +++ b/datafusion/substrait/tests/utils.rs @@ -480,6 +480,7 @@ pub mod test { } } } + RexType::DynamicParameter(_) => {} // Enum is deprecated RexType::Enum(_) => {} } diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index 30d5bcaedcb7..94515c6754a7 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -45,7 +45,7 @@ chrono = { version = "0.4", features = ["wasmbind"] } # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for # code size when deploying. 
console_error_panic_hook = { version = "0.1.1", optional = true } -datafusion = { workspace = true } +datafusion = { workspace = true, features = ["parquet"] } datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } diff --git a/datafusion/wasmtest/src/lib.rs b/datafusion/wasmtest/src/lib.rs index e2ba50beb657..df0d9d6cbf37 100644 --- a/datafusion/wasmtest/src/lib.rs +++ b/datafusion/wasmtest/src/lib.rs @@ -182,4 +182,29 @@ mod test { let task_ctx = ctx.task_ctx(); let _ = collect(physical_plan, task_ctx).await.unwrap(); } + + #[wasm_bindgen_test(unsupported = tokio::test)] + async fn test_parquet_write() { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("value", DataType::Utf8, false), + ])); + + let data: Vec = vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec!["a"])), + ]; + + let batch = RecordBatch::try_new(schema.clone(), data).unwrap(); + let mut buffer = Vec::new(); + let mut writer = datafusion::parquet::arrow::ArrowWriter::try_new( + &mut buffer, + schema.clone(), + None, + ) + .unwrap(); + + writer.write(&batch).unwrap(); + writer.close().unwrap(); + } } diff --git a/dev/changelog/46.0.1.md b/dev/changelog/46.0.1.md new file mode 100644 index 000000000000..17308bea87ac --- /dev/null +++ b/dev/changelog/46.0.1.md @@ -0,0 +1,38 @@ + + +# Apache DataFusion 46.0.1 Changelog + +This release consists of 3 commits from 1 contributors. See credits at the end of this changelog for more information. + +**Other:** + +- [branch-46] Update ring to v0.17.13 (#15063) [#15228](https://github.com/apache/datafusion/pull/15228) (alamb) +- [branch-46] Fix broken `serde` feature (#15124) [#15227](https://github.com/apache/datafusion/pull/15227) (alamb) +- [branch-46] Fix wasm32 build on version 46 [#15229](https://github.com/apache/datafusion/pull/15229) (alamb) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 3 Andrew Lamb +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/docs/source/contributor-guide/testing.md b/docs/source/contributor-guide/testing.md index 2a9f22d22d66..2868125c7f3d 100644 --- a/docs/source/contributor-guide/testing.md +++ b/docs/source/contributor-guide/testing.md @@ -58,6 +58,18 @@ Like similar systems such as [DuckDB](https://duckdb.org/dev/testing), DataFusio DataFusion has integrated [sqlite's test suite](https://sqlite.org/sqllogictest/doc/trunk/about.wiki) as a supplemental test suite that is run whenever a PR is merged into DataFusion. To run it manually please refer to the [README](https://github.com/apache/datafusion/blob/main/datafusion/sqllogictest/README.md#running-tests-sqlite) file for instructions. +## Snapshot testing + +[Insta](https://github.com/mitsuhiko/insta) is used for snapshot testing. Snapshots are generated +and compared on each test run. If the output changes, tests will fail. 
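For illustration, here is a minimal sketch of what a snapshot assertion can look like with Insta; the test name and the snapshotted string are hypothetical examples, not taken from the DataFusion test suite:

```rust
// Requires `insta` as a dev-dependency.
#[test]
fn snapshot_of_plan_text() {
    // Any value with a string / Display representation can be snapshotted.
    let plan_text = "Projection: t.a\n  TableScan: t";
    // The first run records the value in a snapshot file; later runs fail
    // if the rendered value no longer matches the stored snapshot.
    insta::assert_snapshot!(plan_text);
}
```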
+ +To review the changes, you can use Insta CLI: + +```shell +cargo install cargo-insta +cargo insta review +``` + ## Extended Tests In addition to the standard CI test suite that is run on all PRs prior to merge, diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index a6679cbea9ad..11fd49566522 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -212,4 +212,99 @@ To include special characters (such as newlines via `\n`) you can use an `E` lit Elapsed 0.005 seconds. ``` +### Changes to array scalar function signatures + +DataFusion 46 has changed the way scalar array function signatures are +declared. Previously, functions needed to select from a list of predefined +signatures within the `ArrayFunctionSignature` enum. Now the signatures +can be defined via a `Vec` of psuedo-types, which each correspond to a +single argument. Those psuedo-types are the variants of the +`ArrayFunctionArgument` enum and are as follows: + +- `Array`: An argument of type List/LargeList/FixedSizeList. All Array + arguments must be coercible to the same type. +- `Element`: An argument that is coercible to the inner type of the `Array` + arguments. +- `Index`: An `Int64` argument. + +Each of the old variants can be converted to the new format as follows: + +`TypeSignature::ArraySignature(ArrayFunctionSignature::ArrayAndElement)`: + +```rust +# use datafusion::common::utils::ListCoercion; +# use datafusion_expr_common::signature::{ArrayFunctionArgument, ArrayFunctionSignature, TypeSignature}; + +TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Array, ArrayFunctionArgument::Element], + array_coercion: Some(ListCoercion::FixedSizedListToList), +}); +``` + +`TypeSignature::ArraySignature(ArrayFunctionSignature::ElementAndArray)`: + +```rust +# use datafusion::common::utils::ListCoercion; +# use datafusion_expr_common::signature::{ArrayFunctionArgument, ArrayFunctionSignature, TypeSignature}; + +TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Element, ArrayFunctionArgument::Array], + array_coercion: Some(ListCoercion::FixedSizedListToList), +}); +``` + +`TypeSignature::ArraySignature(ArrayFunctionSignature::ArrayAndIndex)`: + +```rust +# use datafusion::common::utils::ListCoercion; +# use datafusion_expr_common::signature::{ArrayFunctionArgument, ArrayFunctionSignature, TypeSignature}; + +TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Array, ArrayFunctionArgument::Index], + array_coercion: None, +}); +``` + +`TypeSignature::ArraySignature(ArrayFunctionSignature::ArrayAndElementAndOptionalIndex)`: + +```rust +# use datafusion::common::utils::ListCoercion; +# use datafusion_expr_common::signature::{ArrayFunctionArgument, ArrayFunctionSignature, TypeSignature}; + +TypeSignature::OneOf(vec![ + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Array, ArrayFunctionArgument::Element], + array_coercion: None, + }), + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Element, + ArrayFunctionArgument::Index, + ], + array_coercion: None, + }), +]); +``` + +`TypeSignature::ArraySignature(ArrayFunctionSignature::Array)`: + +```rust +# use datafusion::common::utils::ListCoercion; +# use 
datafusion_expr_common::signature::{ArrayFunctionArgument, ArrayFunctionSignature, TypeSignature};
+
+TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+    arguments: vec![ArrayFunctionArgument::Array],
+    array_coercion: None,
+});
+```
+
+Alternatively, you can switch to using one of the following functions which
+take care of constructing the `TypeSignature` for you:
+
+- `Signature::array_and_element`
+- `Signature::array_and_element_and_optional_index`
+- `Signature::array_and_index`
+- `Signature::array`
+
 [ticket]: https://github.com/apache/datafusion/issues/13286
diff --git a/docs/source/library-user-guide/working-with-exprs.md b/docs/source/library-user-guide/working-with-exprs.md
index 1a6e9123086d..df4e5e3940aa 100644
--- a/docs/source/library-user-guide/working-with-exprs.md
+++ b/docs/source/library-user-guide/working-with-exprs.md
@@ -50,6 +50,25 @@ As another example, the SQL expression `a + b * c` would be represented as an `E
 As the writer of a library, you can use `Expr`s to represent computations that you want to perform. This guide will walk you through how to make your own scalar UDF as an `Expr` and how to rewrite `Expr`s to inline the simple UDF.
 
+## Arrow Schema and DataFusion DFSchema
+
+Apache Arrow's `Schema` provides a lightweight structure for describing data, and Apache DataFusion's `DFSchema` extends it with extra information such as column qualifiers and functional dependencies. A column qualifier is the multi-part path to a column's table, e.g. catalog, schema, table. A functional dependency describes how the attributes (columns) of a table relate to one another.
+
+### Difference between Schema and DFSchema
+
+- Schema: A fundamental component of Apache Arrow, `Schema` defines a dataset's structure, specifying column names and their data types.
+
+  > Please see [Struct Schema](https://docs.rs/arrow-schema/54.2.1/arrow_schema/struct.Schema.html) for the detailed documentation of the Arrow Schema.
+
+- DFSchema: Extending `Schema`, `DFSchema` incorporates qualifiers such as table names, enabling it to carry additional context when required. This is particularly valuable for managing queries across multiple tables.
+  > Please see [Struct DFSchema](https://docs.rs/datafusion/latest/datafusion/common/struct.DFSchema.html) for the detailed documentation of DFSchema.
+
+### How to convert between Schema and DFSchema
+
+From Schema to DFSchema: use `DFSchema::try_from_qualified_schema` with a table name and the original schema; for a detailed code example, please see [creating-qualified-schemas](https://docs.rs/datafusion/latest/datafusion/common/struct.DFSchema.html#creating-qualified-schemas). A short round-trip sketch also follows below.
+
+From DFSchema to Schema: `DFSchema` implements the `Into` trait for converting into an Arrow `Schema`; for a detailed code example, please see [converting-back-to-arrow-schema](https://docs.rs/datafusion/latest/datafusion/common/struct.DFSchema.html#converting-back-to-arrow-schema).
+
 ## Creating and Evaluating `Expr`s
 
 Please see [expr_api.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs) for well commented code for creating, evaluating, simplifying, and analyzing `Expr`s.
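A minimal sketch of the Schema / DFSchema round trip described in the section above might look like this; the table name `t1` and the field `c0` are arbitrary illustration values:

```rust
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::{DFSchema, Result};

fn schema_roundtrip() -> Result<()> {
    // A plain Arrow schema: field names and types, but no table qualifiers.
    let arrow_schema = Schema::new(vec![Field::new("c0", DataType::Int64, true)]);

    // Schema -> DFSchema: qualify every field with the table name "t1".
    let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema)?;

    // DFSchema -> Schema: converting back drops the qualifiers again.
    let unqualified: Schema = df_schema.into();
    assert_eq!(unqualified.fields().len(), 1);

    Ok(())
}
```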
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 635eb2b0a67f..68e21183938b 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -68,7 +68,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | -| datafusion.execution.parquet.created_by | datafusion version 46.0.0 | (writing) Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 46.0.1 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | | datafusion.execution.parquet.statistics_truncate_length | NULL | (writing) Sets statictics truncate length. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | @@ -128,5 +128,6 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | +| datafusion.sql_parser.map_varchar_to_utf8view | false | If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. | | datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | diff --git a/docs/source/user-guide/sql/explain.md b/docs/source/user-guide/sql/explain.md index 3f2c7de43eac..f89e854ebffd 100644 --- a/docs/source/user-guide/sql/explain.md +++ b/docs/source/user-guide/sql/explain.md @@ -21,41 +21,227 @@ The `EXPLAIN` command shows the logical and physical execution plan for the specified SQL statement. 
-See the [Reading Explain Plans](../explain-usage.md) page for more information on how to interpret these plans. +## Syntax
-EXPLAIN [ANALYZE] [VERBOSE] statement
+EXPLAIN [ANALYZE] [VERBOSE] [FORMAT format] statement
 
-## EXPLAIN +## `EXPLAIN` Shows the execution plan of a statement. If you need more detailed output, use `EXPLAIN VERBOSE`. +Note that `EXPLAIN VERBOSE` only supports the `indent` format. + +The optional `[FORMAT format]` clause controls how the plan is displayed as +explained below. If this clause is not specified, the plan is displayed using +the format from the [configuration value] `datafusion.explain.format`. + +[configuration value]: ../configs.md + +### `indent` format (default) + +The `indent` format shows both the logical and physical plan, with one line for +each operator in the plan. Child plans are indented to show the hierarchy. + +See [Reading Explain Plans](../explain-usage.md) for more information on how to interpret these plans. + +```sql +> CREATE TABLE t(x int, b int) AS VALUES (1, 2), (2, 3); +0 row(s) fetched. +Elapsed 0.004 seconds. + +> EXPLAIN SELECT SUM(x) FROM t GROUP BY b; ++---------------+-------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-------------------------------------------------------------------------------+ +| logical_plan | Projection: sum(t.x) | +| | Aggregate: groupBy=[[t.b]], aggr=[[sum(CAST(t.x AS Int64))]] | +| | TableScan: t projection=[x, b] | +| physical_plan | ProjectionExec: expr=[sum(t.x)@1 as sum(t.x)] | +| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[sum(t.x)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([b@0], 16), input_partitions=16 | +| | RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1 | +| | AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[sum(t.x)] | +| | DataSourceExec: partitions=1, partition_sizes=[1] | +| | | ++---------------+-------------------------------------------------------------------------------+ +2 row(s) fetched. +Elapsed 0.004 seconds. 
+``` + +### `tree` format + +The `tree` format is modeled after [DuckDB plans] and is designed to be easier +to see the high level structure of the plan + +[duckdb plans]: https://duckdb.org/docs/stable/guides/meta/explain.html + +```sql +> EXPLAIN FORMAT TREE SELECT SUM(x) FROM t GROUP BY b; ++---------------+-------------------------------+ +| plan_type | plan | ++---------------+-------------------------------+ +| physical_plan | ┌───────────────────────────┐ | +| | │ ProjectionExec │ | +| | │ -------------------- │ | +| | │ sum(t.x): sum(t.x)@1 │ | +| | └─────────────┬─────────────┘ | +| | ┌─────────────┴─────────────┐ | +| | │ AggregateExec │ | +| | │ -------------------- │ | +| | │ aggr: sum(t.x) │ | +| | │ group_by: b@0 as b │ | +| | │ │ | +| | │ mode: │ | +| | │ FinalPartitioned │ | +| | └─────────────┬─────────────┘ | +| | ┌─────────────┴─────────────┐ | +| | │ CoalesceBatchesExec │ | +| | └─────────────┬─────────────┘ | +| | ┌─────────────┴─────────────┐ | +| | │ RepartitionExec │ | +| | │ -------------------- │ | +| | │ output_partition_count: │ | +| | │ 16 │ | +| | │ │ | +| | │ partitioning_scheme: │ | +| | │ Hash([b@0], 16) │ | +| | └─────────────┬─────────────┘ | +| | ┌─────────────┴─────────────┐ | +| | │ RepartitionExec │ | +| | │ -------------------- │ | +| | │ output_partition_count: │ | +| | │ 1 │ | +| | │ │ | +| | │ partitioning_scheme: │ | +| | │ RoundRobinBatch(16) │ | +| | └─────────────┬─────────────┘ | +| | ┌─────────────┴─────────────┐ | +| | │ AggregateExec │ | +| | │ -------------------- │ | +| | │ aggr: sum(t.x) │ | +| | │ group_by: b@1 as b │ | +| | │ mode: Partial │ | +| | └─────────────┬─────────────┘ | +| | ┌─────────────┴─────────────┐ | +| | │ DataSourceExec │ | +| | │ -------------------- │ | +| | │ bytes: 224 │ | +| | │ format: memory │ | +| | │ rows: 1 │ | +| | └───────────────────────────┘ | +| | | ++---------------+-------------------------------+ +1 row(s) fetched. +Elapsed 0.016 seconds. +``` + +### `pgjson` format + +The `pgjson` format is modeled after [Postgres JSON] format. + +You can use this format to visualize the plan in existing plan visualization +tools, such as [dalibo] + +[postgres json]: https://www.postgresql.org/docs/current/sql-explain.html +[dalibo]: https://explain.dalibo.com/ + +```sql +> EXPLAIN FORMAT PGJSON SELECT SUM(x) FROM t GROUP BY b; ++--------------+----------------------------------------------------+ +| plan_type | plan | ++--------------+----------------------------------------------------+ +| logical_plan | [ | +| | { | +| | "Plan": { | +| | "Expressions": [ | +| | "sum(t.x)" | +| | ], | +| | "Node Type": "Projection", | +| | "Output": [ | +| | "sum(t.x)" | +| | ], | +| | "Plans": [ | +| | { | +| | "Aggregates": "sum(CAST(t.x AS Int64))", | +| | "Group By": "t.b", | +| | "Node Type": "Aggregate", | +| | "Output": [ | +| | "b", | +| | "sum(t.x)" | +| | ], | +| | "Plans": [ | +| | { | +| | "Node Type": "TableScan", | +| | "Output": [ | +| | "x", | +| | "b" | +| | ], | +| | "Plans": [], | +| | "Relation Name": "t" | +| | } | +| | ] | +| | } | +| | ] | +| | } | +| | } | +| | ] | ++--------------+----------------------------------------------------+ +1 row(s) fetched. +Elapsed 0.008 seconds. +``` + +### `graphviz` format + +The `graphviz` format uses the [DOT language] that can be used with [Graphviz] to +generate a visual representation of the plan. 
+ +[dot language]: https://graphviz.org/doc/info/lang.html +[graphviz]: https://graphviz.org/ ```sql -EXPLAIN SELECT SUM(x) FROM table GROUP BY b; - -+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #SUM(table.x) | -| | Aggregate: groupBy=[[#table.b]], aggr=[[SUM(#table.x)]] | -| | TableScan: table projection=[x, b] | -| physical_plan | ProjectionExec: expr=[SUM(table.x)@1 as SUM(table.x)] | -| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[SUM(table.x)] | -| | CoalesceBatchesExec: target_batch_size=4096 | -| | RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 16) | -| | AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[SUM(table.x)] | -| | RepartitionExec: partitioning=RoundRobinBatch(16) | -| | DataSourceExec: file_groups={1 group: [[/tmp/table.csv]]}, projection=[x, b], has_header=false | -| | | -+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ +> EXPLAIN FORMAT GRAPHVIZ SELECT SUM(x) FROM t GROUP BY b; ++--------------+------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++--------------+------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | | +| | // Begin DataFusion GraphViz Plan, | +| | // display it online here: https://dreampuf.github.io/GraphvizOnline | +| | | +| | digraph { | +| | subgraph cluster_1 | +| | { | +| | graph[label="LogicalPlan"] | +| | 2[shape=box label="Projection: sum(t.x)"] | +| | 3[shape=box label="Aggregate: groupBy=[[t.b]], aggr=[[sum(CAST(t.x AS Int64))]]"] | +| | 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back] | +| | 4[shape=box label="TableScan: t projection=[x, b]"] | +| | 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back] | +| | } | +| | subgraph cluster_5 | +| | { | +| | graph[label="Detailed LogicalPlan"] | +| | 6[shape=box label="Projection: sum(t.x)\nSchema: [sum(t.x):Int64;N]"] | +| | 7[shape=box label="Aggregate: groupBy=[[t.b]], aggr=[[sum(CAST(t.x AS Int64))]]\nSchema: [b:Int32;N, sum(t.x):Int64;N]"] | +| | 6 -> 7 [arrowhead=none, arrowtail=normal, dir=back] | +| | 8[shape=box label="TableScan: t projection=[x, b]\nSchema: [x:Int32;N, b:Int32;N]"] | +| | 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back] | +| | } | +| | } | +| | // End DataFusion GraphViz Plan | +| | | ++--------------+------------------------------------------------------------------------------------------------------------------------------+ +1 row(s) fetched. +Elapsed 0.010 seconds. ``` -## EXPLAIN ANALYZE +## `EXPLAIN ANALYZE` -Shows the execution plan and metrics of a statement. -If you need more information output, use `EXPLAIN ANALYZE VERBOSE`. +Shows the execution plan and metrics of a statement. If you need more +information output, use `EXPLAIN ANALYZE VERBOSE`. Note that `EXPLAIN ANALYZE` +only supports the `indent` format. 
```sql EXPLAIN ANALYZE SELECT SUM(x) FROM table GROUP BY b; diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index 4ad6e213cda3..6e42593cc238 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -27,7 +27,7 @@ workspace = true [dependencies] arrow = { workspace = true } -chrono-tz = { version = "0.10.0", default-features = false } +chrono-tz = { version = "0.10.2", default-features = false } datafusion-common = { workspace = true, default-features = true } env_logger = { workspace = true } rand = { workspace = true } From b9cf11728d16a829ebd855f6d9208c8ffd249ad6 Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 24 Mar 2025 17:18:35 +0000 Subject: [PATCH 7/7] Remove extra alias --- datafusion/optimizer/tests/optimizer_integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index 5e66c7ec0313..76549235e1f8 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -310,7 +310,7 @@ fn eliminate_redundant_null_check_on_count() { GROUP BY col_int32 HAVING c IS NOT NULL"; let plan = test_sql(sql).unwrap(); - let expected = "Projection: test.col_int32, count(Int64(1)) AS count(*) AS c\ + let expected = "Projection: test.col_int32, count(Int64(1)) AS c\ \n Aggregate: groupBy=[[test.col_int32]], aggr=[[count(Int64(1))]]\ \n TableScan: test projection=[col_int32]"; assert_eq!(expected, format!("{plan}"));