From 539b81b97c22722a262cf5871e9278ebf7b903a3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Jun 2021 14:03:28 -0400 Subject: [PATCH 1/4] Add test for not eq pruning --- datafusion/src/physical_optimizer/pruning.rs | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index a7e1fb00c230..0c64f0530b14 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -1190,6 +1190,34 @@ mod tests { assert_eq!(result, expected); } + #[test] + fn prune_not_eq_data() { + let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)])); + + // Prune using s1 != 'M' + let expr = col("s1").not_eq(lit("M")); + + let statistics = TestStatistics::new().with( + "s1", + ContainerStats::new_utf8( + vec![Some("A"), Some("A"), Some("N"), None, Some("A")], // min + vec![Some("Z"), Some("L"), Some("Z"), None, None], // max + ), + ); + + // s1 [A, Z] ==> might have values that pass predicate + // s1 [A, L] ==> all rows pass the predicate + // s1 [N, Z] ==> all rows pass the predicate + // No stats for s2 ==> some rows could pass + // s2 [3, None] (null max) ==> some rows could pass + + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + let expected = vec![true, true, true, true, true]; + + assert_eq!(result, expected); + } + /// Creates setup for boolean chunk pruning /// /// For predicate "b1" (boolean expr) From 840cb0f3d7749eb74d811796976190be060c1a86 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Jun 2021 14:03:35 -0400 Subject: [PATCH 2/4] Revert "#420: Support for not_eq predicate in pruning predicates (#544)" This reverts commit 2568323dbd85e05f2bf3e6e484f7cc39983ff26c.
--- datafusion/src/physical_optimizer/pruning.rs | 28 -------------------- 1 file changed, 28 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 0c64f0530b14..556a16207bc8 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -552,14 +552,6 @@ fn build_predicate_expression( }; let corrected_op = expr_builder.correct_operator(op); let statistics_expr = match corrected_op { - Operator::NotEq => { - // column != literal => (min, max) = literal => min > literal || literal > max - let min_column_expr = expr_builder.min_column_expr()?; - let max_column_expr = expr_builder.max_column_expr()?; - min_column_expr - .gt(expr_builder.scalar_expr().clone()) - .or(expr_builder.scalar_expr().clone().gt(max_column_expr)) - } Operator::Eq => { // column = literal => (min, max) = literal => min <= literal && literal <= max // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) @@ -937,26 +929,6 @@ mod tests { Ok(()) } - #[test] - fn row_group_predicate_not_eq() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_min Gt Int32(1) Or Int32(1) Gt #c1_max"; - - // test column on the left - let expr = col("c1").not_eq(lit(1)); - let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - // test column on the right - let expr = lit(1).not_eq(col("c1")); - let predicate_expr = - build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - Ok(()) - } - #[test] fn row_group_predicate_gt() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); From b1d348fad96972ade1b82be62c16d2c147fd7d46 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Jun 2021 14:07:47 
-0400 Subject: [PATCH 3/4] fix test --- datafusion/src/physical_optimizer/pruning.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 556a16207bc8..be0ba10c48e6 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -1184,10 +1184,17 @@ mod tests { // s2 [3, None] (null max) ==> some rows could pass let p = PruningPredicate::try_new(&expr, schema).unwrap(); - let result = p.prune(&statistics).unwrap(); - let expected = vec![true, true, true, true, true]; + let result = p.prune(&statistics).unwrap_err(); + assert!( + result + .to_string() + .contains("Invalid argument error: at least one column must be defined to create a record batch"), + "{}", + result + ); - assert_eq!(result, expected); + //let expected = vec![true, true, true, true, true]; + //assert_eq!(result, expected); } /// Creates setup for boolean chunk pruning From 54c4270770e779c7d5488ee7cd14095423cf7aea Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Jun 2021 14:21:27 -0400 Subject: [PATCH 4/4] fix logic --- datafusion/src/physical_optimizer/pruning.rs | 49 ++++++++++++++------ 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index be0ba10c48e6..9e8d9fa77858 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -552,6 +552,16 @@ fn build_predicate_expression( }; let corrected_op = expr_builder.correct_operator(op); let statistics_expr = match corrected_op { + Operator::NotEq => { + // column != literal => (min, max) = literal => + // !(min == literal && max == literal) ==> + // min != literal || literal != max + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + min_column_expr +
.not_eq(expr_builder.scalar_expr().clone()) + .or(expr_builder.scalar_expr().clone().not_eq(max_column_expr)) + } Operator::Eq => { // column = literal => (min, max) = literal => min <= literal && literal <= max // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) @@ -929,6 +939,26 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_not_eq() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "#c1_min NotEq Int32(1) Or Int32(1) NotEq #c1_max"; + + // test column on the left + let expr = col("c1").not_eq(lit(1)); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + // test column on the right + let expr = lit(1).not_eq(col("c1")); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_gt() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); @@ -1172,29 +1202,22 @@ mod tests { let statistics = TestStatistics::new().with( "s1", ContainerStats::new_utf8( - vec![Some("A"), Some("A"), Some("N"), None, Some("A")], // min - vec![Some("Z"), Some("L"), Some("Z"), None, None], // max + vec![Some("A"), Some("A"), Some("N"), Some("M"), None, Some("A")], // min + vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None], // max ), ); // s1 [A, Z] ==> might have values that pass predicate // s1 [A, L] ==> all rows pass the predicate // s1 [N, Z] ==> all rows pass the predicate + // s1 [M, M] ==> all rows do not pass the predicate // No stats for s2 ==> some rows could pass // s2 [3, None] (null max) ==> some rows could pass let p = PruningPredicate::try_new(&expr, schema).unwrap(); - let result = p.prune(&statistics).unwrap_err(); - assert!( - result - .to_string() - .contains("Invalid 
argument error: at least one column must be defined to create a record batch"), - "{}", - result - ); - - //let expected = vec![true, true, true, true, true]; - //assert_eq!(result, expected); + let result = p.prune(&statistics).unwrap(); + let expected = vec![true, true, true, false, true, true]; + assert_eq!(result, expected); } /// Creates setup for boolean chunk pruning