From f1a01eb6f27e76d6494c5e6cd2ac3f999748d3e0 Mon Sep 17 00:00:00 2001 From: wiedld Date: Mon, 18 Mar 2024 10:54:04 -0700 Subject: [PATCH 1/9] test(9678): reproducer of short-circuiting causing expr elimination to error --- .../test_files/common_subexpr_eliminate.slt | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt diff --git a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt new file mode 100644 index 000000000000..0d332dd42321 --- /dev/null +++ b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############# +## Common Subexpr Eliminate Tests +############# + +statement ok +CREATE TABLE doubles ( + f64 DOUBLE +) as VALUES + (10.1) +; + +# common subexpr with alias +query RRR rowsort +select f64, round(1.0 / f64) as i64_1, acos(round(1.0 / f64)) from doubles; +---- +10.1 0 1.570796326795 + +# common subexpr with coalesce (short-circuited) +query RRR rowsort +select f64, coalesce(1.0 / f64, 0.0), acos(coalesce(1.0 / f64, 0.0)) from doubles; +---- +10.1 0.09900990099 1.471623942989 + +# BUG: should be 1.471623942989 +# common subexpr with coalesce (short-circuited) and alias +query RRR rowsort +select f64, coalesce(1.0 / f64, 0.0) as f64_1, acos(coalesce(1.0 / f64, 0.0)) from doubles; +---- +10.1 0.09900990099 0.09900990099 + +# BUG: should be 1.471623942989 +# common subexpr with case (short-circuited) +query RRR rowsort +select f64, case when f64 > 0 then 1.0 / f64 else null end, acos(case when f64 > 0 then 1.0 / f64 else null end) from doubles; +---- +10.1 0.09900990099 0.09900990099 From ff466149231a3c72dcceebf54d43d59f00d9c415 Mon Sep 17 00:00:00 2001 From: wiedld Date: Mon, 18 Mar 2024 15:45:23 -0700 Subject: [PATCH 2/9] fix(9678): populate visited stack for short-circuited expressions, during the common-expr elimination optimization --- .../optimizer/src/common_subexpr_eliminate.rs | 22 +++++++++---------- .../test_files/common_subexpr_eliminate.slt | 6 ++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 7b8eccad5133..e7fa8e564995 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -582,20 +582,20 @@ impl ExprIdentifierVisitor<'_> { /// Find the first `EnterMark` in the stack, and accumulates every `ExprItem` /// before it. - fn pop_enter_mark(&mut self) -> Option<(usize, Identifier)> { + fn pop_enter_mark(&mut self) -> (usize, Identifier) { let mut desc = String::new(); while let Some(item) = self.visit_stack.pop() { match item { VisitRecord::EnterMark(idx) => { - return Some((idx, desc)); + return (idx, desc); } VisitRecord::ExprItem(s) => { desc.push_str(&s); } } } - None + unreachable!("Enter mark should paired with node number"); } } @@ -603,25 +603,25 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { type Node = Expr; fn f_down(&mut self, expr: &Expr) -> Result { - // related to https://github.com/apache/arrow-datafusion/issues/8814 - // If the expr contain volatile expression or is a short-circuit expression, skip it. - if expr.short_circuits() || is_volatile_expression(expr)? { - return Ok(TreeNodeRecursion::Jump); - } self.visit_stack .push(VisitRecord::EnterMark(self.node_count)); self.node_count += 1; // put placeholder self.id_array.push((0, "".to_string())); + + // related to https://github.com/apache/arrow-datafusion/issues/8814 + // If the expr contain volatile expression or is a short-circuit expression, skip it. + if expr.short_circuits() || is_volatile_expression(expr)? { + return Ok(TreeNodeRecursion::Jump); + } Ok(TreeNodeRecursion::Continue) } fn f_up(&mut self, expr: &Expr) -> Result { self.series_number += 1; - let Some((idx, sub_expr_desc)) = self.pop_enter_mark() else { - return Ok(TreeNodeRecursion::Continue); - }; + let (idx, sub_expr_desc) = self.pop_enter_mark(); + // skip exprs should not be recognize. if self.expr_mask.ignores(expr) { self.id_array[idx].0 = self.series_number; diff --git a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt index 0d332dd42321..33e87f43f7aa 100644 --- a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt +++ b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt @@ -38,16 +38,14 @@ select f64, coalesce(1.0 / f64, 0.0), acos(coalesce(1.0 / f64, 0.0)) from double ---- 10.1 0.09900990099 1.471623942989 -# BUG: should be 1.471623942989 # common subexpr with coalesce (short-circuited) and alias query RRR rowsort select f64, coalesce(1.0 / f64, 0.0) as f64_1, acos(coalesce(1.0 / f64, 0.0)) from doubles; ---- -10.1 0.09900990099 0.09900990099 +10.1 0.09900990099 1.471623942989 -# BUG: should be 1.471623942989 # common subexpr with case (short-circuited) query RRR rowsort select f64, case when f64 > 0 then 1.0 / f64 else null end, acos(case when f64 > 0 then 1.0 / f64 else null end) from doubles; ---- -10.1 0.09900990099 0.09900990099 +10.1 0.09900990099 1.471623942989 From de09f72842c78020e8f691a4a67134a1aaf6fe8c Mon Sep 17 00:00:00 2001 From: wiedld Date: Mon, 18 Mar 2024 23:49:39 -0700 Subject: [PATCH 3/9] test(9678): reproducer for optimizer error (in common_subexpr_eliminate), as seen in other test case --- .../test_files/common_subexpr_eliminate.slt | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt index 33e87f43f7aa..7ba28e1ff735 100644 --- a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt +++ b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt @@ -49,3 +49,68 @@ query RRR rowsort select f64, case when f64 > 0 then 1.0 / f64 else null end, acos(case when f64 > 0 then 1.0 / f64 else null end) from doubles; ---- 10.1 0.09900990099 1.471623942989 + + +statement ok +CREATE EXTERNAL TABLE aggregate_test_100 ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT, + c5 INT, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT UNSIGNED NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '../../testing/data/csv/aggregate_test_100.csv' + +# starting complex expression +query TIB +SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 +FROM aggregate_test_100 +WHERE c1 IN ('a', 'b') +ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC; +---- +D 0 false +D 1 true +E 0 false +E 1 false + +# BUG: fix this +# added to boolean expr in the projections `AND c2 % 2 = 1` +# `Error during planning: No function matches the given name and argument types 'chr(Utf8)'` +# +query error DataFusion error: Optimizer rule 'common_sub_expression_eliminate' failed +SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 AND c2 % 2 = 1 +FROM aggregate_test_100 +WHERE c1 IN ('a', 'b') +ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC; + +# another boolean is fine. It's the subexpr `c2 % 2 = 1`. +query TIB +SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 AND true +FROM aggregate_test_100 +WHERE c1 IN ('a', 'b') +ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC; +---- +D 0 false +D 1 true +E 0 false +E 1 false + +# The projection `c2 % 2 = 1` is fine. +query B +SELECT c2 % 2 = 1 +FROM aggregate_test_100 +WHERE c1 IN ('a', 'b') +ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC +LIMIT 1; +---- +false \ No newline at end of file From 97938ada2e49cfe4e7bdf1ea9de7bb2d9296b8d9 Mon Sep 17 00:00:00 2001 From: wiedld Date: Tue, 19 Mar 2024 14:56:02 -0700 Subject: [PATCH 4/9] chore: extract id_array into abstraction, to make it more clear the relationship between the two visitors --- .../optimizer/src/common_subexpr_eliminate.rs | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index e7fa8e564995..ed09eddbe48c 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -42,6 +42,19 @@ use datafusion_expr::{col, Expr, ExprSchemable}; /// - DataType of this expression. type ExprSet = HashMap; +/// An ordered map of Identifiers encountered during visitation. +/// +/// Is created in the ExprIdentifierVisitor, which identifies the common expressions. +/// Is consumed in the CommonSubexprRewriter, which performs mutations. +/// +/// Vec idx is ordered by expr nodes visited on f_down. +/// - series_number. +/// - Incr in fn_up, start from 1. +/// - the higher idx have the lower series_number. +/// - Identifier. +/// - is empty ("") if expr should not be considered for common elimation. +type IdArray = Vec<(usize, Identifier)>; + /// Identifier type. Current implementation use describe of an expression (type String) as /// Identifier. /// @@ -532,18 +545,18 @@ impl ExprMask { /// Go through an expression tree and generate identifier. /// /// An identifier contains information of the expression itself and its sub-expression. -/// This visitor implementation use a stack `visit_stack` to track traversal, which +/// This visitor implementation use a stack `f_down()` to track traversal, which /// lets us know when a sub-tree's visiting is finished. When `pre_visit` is called /// (traversing to a new node), an `EnterMark` and an `ExprItem` will be pushed into stack. -/// And try to pop out a `EnterMark` on leaving a node (`post_visit()`). All `ExprItem` +/// And try to pop out a `EnterMark` on leaving a node (`f_up()`). All `ExprItem` /// before the first `EnterMark` is considered to be sub-tree of the leaving node. /// /// This visitor also records identifier in `id_array`. Makes the following traverse /// pass can get the identifier of a node without recalculate it. We assign each node /// in the expr tree a series number, start from 1, maintained by `series_number`. -/// Series number represents the order we left (`post_visit`) a node. Has the property +/// Series number represents the order we left (`f_up()`) a node. Has the property /// that child node's series number always smaller than parent's. While `id_array` is -/// organized in the order we enter (`pre_visit`) a node. `node_count` helps us to +/// organized in the order we enter (`f_down()`) a node. `node_count` helps us to /// get the index of `id_array` for each node. /// /// `Expr` without sub-expr (column, literal etc.) will not have identifier @@ -552,15 +565,15 @@ struct ExprIdentifierVisitor<'a> { // param expr_set: &'a mut ExprSet, /// series number (usize) and identifier. - id_array: &'a mut Vec<(usize, Identifier)>, + id_array: &'a mut IdArray, /// input schema for the node that we're optimizing, so we can determine the correct datatype /// for each subexpression input_schema: DFSchemaRef, // inner states visit_stack: Vec, - /// increased in pre_visit, start from 0. + /// increased in fn_down, start from 0. node_count: usize, - /// increased in post_visit, start from 1. + /// increased in fn_up, start from 1. series_number: usize, /// which expression should be skipped? expr_mask: ExprMask, @@ -606,7 +619,7 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { self.visit_stack .push(VisitRecord::EnterMark(self.node_count)); self.node_count += 1; - // put placeholder + // put placeholder, sets the proper array length self.id_array.push((0, "".to_string())); // related to https://github.com/apache/arrow-datafusion/issues/8814 @@ -671,7 +684,7 @@ fn expr_to_identifier( /// evaluate result of replaced expression. struct CommonSubexprRewriter<'a> { expr_set: &'a ExprSet, - id_array: &'a [(usize, Identifier)], + id_array: &'a IdArray, /// Which identifier is replaced. affected_id: &'a mut BTreeSet, @@ -765,7 +778,7 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { fn replace_common_expr( expr: Expr, - id_array: &[(usize, Identifier)], + id_array: &IdArray, expr_set: &ExprSet, affected_id: &mut BTreeSet, ) -> Result { From 1931b3b439e6386ec1bdfe5616d89cfe3c7e3300 Mon Sep 17 00:00:00 2001 From: wiedld Date: Tue, 19 Mar 2024 15:08:58 -0700 Subject: [PATCH 5/9] refactor: tweak the fix and make code more explicit (JumpMark, node_to_identifier) --- .../optimizer/src/common_subexpr_eliminate.rs | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index ed09eddbe48c..359438face6c 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -284,8 +284,9 @@ impl CommonSubexprEliminate { agg_exprs.push(expr.alias(&name)); proj_exprs.push(Expr::Column(Column::from_name(name))); } else { - let id = - ExprIdentifierVisitor::<'static>::desc_expr(&expr_rewritten); + let id = ExprIdentifierVisitor::<'static>::expr_identifier( + &expr_rewritten, + ); let out_name = expr_rewritten.to_field(&new_input_schema)?.qualified_name(); agg_exprs.push(expr_rewritten.alias(&id)); @@ -584,12 +585,14 @@ enum VisitRecord { /// `usize` is the monotone increasing series number assigned in pre_visit(). /// Starts from 0. Is used to index the identifier array `id_array` in post_visit(). EnterMark(usize), + /// the node's children were skipped => jump to f_up on same node + JumpMark(usize), /// Accumulated identifier of sub expression. ExprItem(Identifier), } impl ExprIdentifierVisitor<'_> { - fn desc_expr(expr: &Expr) -> String { + fn expr_identifier(expr: &Expr) -> Identifier { format!("{expr}") } @@ -600,11 +603,11 @@ impl ExprIdentifierVisitor<'_> { while let Some(item) = self.visit_stack.pop() { match item { - VisitRecord::EnterMark(idx) => { + VisitRecord::EnterMark(idx) | VisitRecord::JumpMark(idx) => { return (idx, desc); } - VisitRecord::ExprItem(s) => { - desc.push_str(&s); + VisitRecord::ExprItem(id) => { + desc.push_str(&id); } } } @@ -616,34 +619,39 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { type Node = Expr; fn f_down(&mut self, expr: &Expr) -> Result { - self.visit_stack - .push(VisitRecord::EnterMark(self.node_count)); - self.node_count += 1; // put placeholder, sets the proper array length self.id_array.push((0, "".to_string())); // related to https://github.com/apache/arrow-datafusion/issues/8814 // If the expr contain volatile expression or is a short-circuit expression, skip it. if expr.short_circuits() || is_volatile_expression(expr)? { - return Ok(TreeNodeRecursion::Jump); + self.visit_stack + .push(VisitRecord::JumpMark(self.node_count)); + return Ok(TreeNodeRecursion::Jump); // go to f_up } + + self.visit_stack + .push(VisitRecord::EnterMark(self.node_count)); + self.node_count += 1; + Ok(TreeNodeRecursion::Continue) } fn f_up(&mut self, expr: &Expr) -> Result { self.series_number += 1; - let (idx, sub_expr_desc) = self.pop_enter_mark(); + let (idx, sub_expr_identifier) = self.pop_enter_mark(); // skip exprs should not be recognize. if self.expr_mask.ignores(expr) { - self.id_array[idx].0 = self.series_number; - let desc = Self::desc_expr(expr); - self.visit_stack.push(VisitRecord::ExprItem(desc)); + let curr_expr_identifier = Self::expr_identifier(expr); + self.visit_stack + .push(VisitRecord::ExprItem(curr_expr_identifier)); + self.id_array[idx].0 = self.series_number; // leave Identifer as empty "", since will not use as common expr return Ok(TreeNodeRecursion::Continue); } - let mut desc = Self::desc_expr(expr); - desc.push_str(&sub_expr_desc); + let mut desc = Self::expr_identifier(expr); + desc.push_str(&sub_expr_identifier); self.id_array[idx] = (self.series_number, desc.clone()); self.visit_stack.push(VisitRecord::ExprItem(desc.clone())); From 552e0f4cbc4178d375c4b7d4096d462ecad130d9 Mon Sep 17 00:00:00 2001 From: wiedld Date: Tue, 19 Mar 2024 23:22:09 -0700 Subject: [PATCH 6/9] fix: get the series_number and curr_id with the correct self.current_idx, before the various incr/decr --- .../optimizer/src/common_subexpr_eliminate.rs | 17 ++++++++++------- .../test_files/common_subexpr_eliminate.slt | 14 ++------------ 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 359438face6c..2b44daf64e07 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -714,16 +714,19 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { if expr.short_circuits() || is_volatile_expression(&expr)? { return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump)); } + + let (series_number, curr_id) = &self.id_array[self.curr_index]; + + // halting conditions if self.curr_index >= self.id_array.len() - || self.max_series_number > self.id_array[self.curr_index].0 + || self.max_series_number > *series_number { return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump)); } - let curr_id = &self.id_array[self.curr_index].1; // skip `Expr`s without identifier (empty identifier). if curr_id.is_empty() { - self.curr_index += 1; + self.curr_index += 1; // incr idx for id_array, when not jumping return Ok(Transformed::no(expr)); } match self.expr_set.get(curr_id) { @@ -740,14 +743,14 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { )); } - let (series_number, id) = &self.id_array[self.curr_index]; + // incr idx for id_array, when not jumping self.curr_index += 1; // Skip sub-node of a replaced tree, or without identifier, or is not repeated expr. - let expr_set_item = self.expr_set.get(id).ok_or_else(|| { + let expr_set_item = self.expr_set.get(curr_id).ok_or_else(|| { internal_datafusion_err!("expr_set invalid state") })?; if *series_number < self.max_series_number - || id.is_empty() + || curr_id.is_empty() || expr_set_item.1 <= 1 { return Ok(Transformed::new( @@ -770,7 +773,7 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { // `projection_push_down` optimizer use "expr name" to eliminate useless // projections. Ok(Transformed::new( - col(id).alias(expr_name), + col(curr_id).alias(expr_name), true, TreeNodeRecursion::Jump, )) diff --git a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt index 7ba28e1ff735..8ff73d441d30 100644 --- a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt +++ b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt @@ -83,19 +83,9 @@ D 1 true E 0 false E 1 false -# BUG: fix this -# added to boolean expr in the projections `AND c2 % 2 = 1` -# `Error during planning: No function matches the given name and argument types 'chr(Utf8)'` -# -query error DataFusion error: Optimizer rule 'common_sub_expression_eliminate' failed -SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 AND c2 % 2 = 1 -FROM aggregate_test_100 -WHERE c1 IN ('a', 'b') -ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC; - -# another boolean is fine. It's the subexpr `c2 % 2 = 1`. +# regression test query TIB -SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 AND true +SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 AND c2 % 2 = 1 FROM aggregate_test_100 WHERE c1 IN ('a', 'b') ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC; From a55ced48d691613446f2f53020b0e9935de595fc Mon Sep 17 00:00:00 2001 From: wiedld Date: Tue, 19 Mar 2024 23:40:18 -0700 Subject: [PATCH 7/9] chore: remove unneeded conditional check (already done earlier), and add code comments --- .../optimizer/src/common_subexpr_eliminate.rs | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 2b44daf64e07..33b6699a2d38 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -29,8 +29,7 @@ use datafusion_common::tree_node::{ TreeNodeVisitor, }; use datafusion_common::{ - internal_datafusion_err, internal_err, Column, DFField, DFSchema, DFSchemaRef, - DataFusionError, Result, + internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; @@ -729,8 +728,11 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { self.curr_index += 1; // incr idx for id_array, when not jumping return Ok(Transformed::no(expr)); } + + // lookup previously visited expression match self.expr_set.get(curr_id) { Some((_, counter, _)) => { + // if has a commonly used (a.k.a. 1+ use) expr if *counter > 1 { self.affected_id.insert(curr_id.clone()); @@ -745,21 +747,8 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { // incr idx for id_array, when not jumping self.curr_index += 1; - // Skip sub-node of a replaced tree, or without identifier, or is not repeated expr. - let expr_set_item = self.expr_set.get(curr_id).ok_or_else(|| { - internal_datafusion_err!("expr_set invalid state") - })?; - if *series_number < self.max_series_number - || curr_id.is_empty() - || expr_set_item.1 <= 1 - { - return Ok(Transformed::new( - expr, - false, - TreeNodeRecursion::Jump, - )); - } + // series_number was the inverse number ordering (when doing f_up) self.max_series_number = *series_number; // step index to skip all sub-node (which has smaller series number). while self.curr_index < self.id_array.len() From 04ddb6eb08988c54782bd808e110e7889b6ec170 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 20 Mar 2024 09:52:42 -0400 Subject: [PATCH 8/9] Refine documentation in common_subexpr_eliminate.rs --- .../optimizer/src/common_subexpr_eliminate.rs | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 33b6699a2d38..0f3a3874c553 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -41,21 +41,36 @@ use datafusion_expr::{col, Expr, ExprSchemable}; /// - DataType of this expression. type ExprSet = HashMap; -/// An ordered map of Identifiers encountered during visitation. +/// An ordered map of Identifiers assigned by `ExprIdentifierVisitor` in an +/// initial expression walk. /// -/// Is created in the ExprIdentifierVisitor, which identifies the common expressions. -/// Is consumed in the CommonSubexprRewriter, which performs mutations. +/// Used by `CommonSubexprRewriter`, which rewrites the expressions to remove +/// common subexpressions. /// -/// Vec idx is ordered by expr nodes visited on f_down. +/// Elements in this array are created on the walk down the expression tree +/// during `f_down`. Thus element 0 is the root of the expression tree. The +/// tuple contains: /// - series_number. -/// - Incr in fn_up, start from 1. -/// - the higher idx have the lower series_number. -/// - Identifier. -/// - is empty ("") if expr should not be considered for common elimation. +/// - Incremented during `f_up`, start from 1. +/// - Thus, items with higher idx have the lower series_number. +/// - [`Identifier`] +/// - Identifier of the expression. If empty (`""`), expr should not be considered for common elimination. +/// +/// # Example +/// An expression like `(a + b)` would have the following `IdArray`: +/// ```text +/// [ +/// (3, "a + b"), +/// (2, "a"), +/// (1, "b") +/// ] +/// ``` type IdArray = Vec<(usize, Identifier)>; -/// Identifier type. Current implementation use describe of an expression (type String) as -/// Identifier. +/// Identifier for each subexpression. +/// +/// Note that the current implementation uses the `Display` of an expression +/// (a `String`) as `Identifier`. /// /// An identifier should (ideally) be able to "hash", "accumulate", "equal" and "have no /// collision (as low as possible)" From 6223239a2adcf65d95adf32b2bd16ea2a535137a Mon Sep 17 00:00:00 2001 From: wiedld Date: Wed, 20 Mar 2024 10:31:08 -0700 Subject: [PATCH 9/9] chore: cleanup -- fix 1 doc comment and consolidate common-expr-elimination test with other expr test --- .../optimizer/src/common_subexpr_eliminate.rs | 2 +- .../test_files/common_subexpr_eliminate.slt | 106 ------------------ datafusion/sqllogictest/test_files/expr.slt | 36 ++++++ 3 files changed, 37 insertions(+), 107 deletions(-) delete mode 100644 datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 0f3a3874c553..ae0d37f5e724 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -560,7 +560,7 @@ impl ExprMask { /// Go through an expression tree and generate identifier. /// /// An identifier contains information of the expression itself and its sub-expression. -/// This visitor implementation use a stack `f_down()` to track traversal, which +/// This visitor implementation use a stack `visit_stack` to track traversal, which /// lets us know when a sub-tree's visiting is finished. When `pre_visit` is called /// (traversing to a new node), an `EnterMark` and an `ExprItem` will be pushed into stack. /// And try to pop out a `EnterMark` on leaving a node (`f_up()`). All `ExprItem` diff --git a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt b/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt deleted file mode 100644 index 8ff73d441d30..000000000000 --- a/datafusion/sqllogictest/test_files/common_subexpr_eliminate.slt +++ /dev/null @@ -1,106 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -############# -## Common Subexpr Eliminate Tests -############# - -statement ok -CREATE TABLE doubles ( - f64 DOUBLE -) as VALUES - (10.1) -; - -# common subexpr with alias -query RRR rowsort -select f64, round(1.0 / f64) as i64_1, acos(round(1.0 / f64)) from doubles; ----- -10.1 0 1.570796326795 - -# common subexpr with coalesce (short-circuited) -query RRR rowsort -select f64, coalesce(1.0 / f64, 0.0), acos(coalesce(1.0 / f64, 0.0)) from doubles; ----- -10.1 0.09900990099 1.471623942989 - -# common subexpr with coalesce (short-circuited) and alias -query RRR rowsort -select f64, coalesce(1.0 / f64, 0.0) as f64_1, acos(coalesce(1.0 / f64, 0.0)) from doubles; ----- -10.1 0.09900990099 1.471623942989 - -# common subexpr with case (short-circuited) -query RRR rowsort -select f64, case when f64 > 0 then 1.0 / f64 else null end, acos(case when f64 > 0 then 1.0 / f64 else null end) from doubles; ----- -10.1 0.09900990099 1.471623942989 - - -statement ok -CREATE EXTERNAL TABLE aggregate_test_100 ( - c1 VARCHAR NOT NULL, - c2 TINYINT NOT NULL, - c3 SMALLINT NOT NULL, - c4 SMALLINT, - c5 INT, - c6 BIGINT NOT NULL, - c7 SMALLINT NOT NULL, - c8 INT NOT NULL, - c9 BIGINT UNSIGNED NOT NULL, - c10 VARCHAR NOT NULL, - c11 FLOAT NOT NULL, - c12 DOUBLE NOT NULL, - c13 VARCHAR NOT NULL -) -STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv' - -# starting complex expression -query TIB -SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 -FROM aggregate_test_100 -WHERE c1 IN ('a', 'b') -ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC; ----- -D 0 false -D 1 true -E 0 false -E 1 false - -# regression test -query TIB -SELECT DISTINCT ON (chr(ascii(c1) + 3), c2 % 2) chr(ascii(upper(c1)) + 3), c2 % 2, c3 > 80 AND c2 % 2 = 1 -FROM aggregate_test_100 -WHERE c1 IN ('a', 'b') -ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC; ----- -D 0 false -D 1 true -E 0 false -E 1 false - -# The projection `c2 % 2 = 1` is fine. -query B -SELECT c2 % 2 = 1 -FROM aggregate_test_100 -WHERE c1 IN ('a', 'b') -ORDER BY chr(ascii(c1) + 3), c2 % 2, c3 DESC -LIMIT 1; ----- -false \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 73fb5eec97d5..8a898633bc76 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1918,3 +1918,39 @@ false true false true NULL NULL NULL NULL false false true true false false true false + + +############# +## Common Subexpr Eliminate Tests +############# + +statement ok +CREATE TABLE doubles ( + f64 DOUBLE +) as VALUES + (10.1) +; + +# common subexpr with alias +query RRR rowsort +select f64, round(1.0 / f64) as i64_1, acos(round(1.0 / f64)) from doubles; +---- +10.1 0 1.570796326795 + +# common subexpr with coalesce (short-circuited) +query RRR rowsort +select f64, coalesce(1.0 / f64, 0.0), acos(coalesce(1.0 / f64, 0.0)) from doubles; +---- +10.1 0.09900990099 1.471623942989 + +# common subexpr with coalesce (short-circuited) and alias +query RRR rowsort +select f64, coalesce(1.0 / f64, 0.0) as f64_1, acos(coalesce(1.0 / f64, 0.0)) from doubles; +---- +10.1 0.09900990099 1.471623942989 + +# common subexpr with case (short-circuited) +query RRR rowsort +select f64, case when f64 > 0 then 1.0 / f64 else null end, acos(case when f64 > 0 then 1.0 / f64 else null end) from doubles; +---- +10.1 0.09900990099 1.471623942989