From 3e2a299479d4cd96a88fe055c340a2a35038eaca Mon Sep 17 00:00:00 2001 From: acking-you Date: Wed, 3 Jul 2024 16:12:18 +0800 Subject: [PATCH 01/23] [draft] add shot circuit in BinaryExpr --- .../physical-expr/src/expressions/binary.rs | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index f21d3e7652cd..1f4b1243ed79 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -359,8 +359,47 @@ impl PhysicalExpr for BinaryExpr { use arrow::compute::kernels::numeric::*; let lhs = self.left.evaluate(batch)?; - let rhs = self.right.evaluate(batch)?; let left_data_type = lhs.data_type(); + + if left_data_type == DataType::Boolean && self.op == Operator::And { + match &lhs { + ColumnarValue::Array(array) => { + if let Ok(array) = as_boolean_array(&array) { + if array.true_count() == 0 { + return Ok(lhs); + } + } + } + ColumnarValue::Scalar(scalar) => { + if let ScalarValue::Boolean(Some(value)) = scalar { + if !value { + return Ok(lhs); + } + } + } + } + } + + if left_data_type == DataType::Boolean && self.op == Operator::Or { + match &lhs { + ColumnarValue::Array(array) => { + if let Ok(array) = as_boolean_array(&array) { + if array.true_count() == array.len() { + return Ok(lhs); + } + } + } + ColumnarValue::Scalar(scalar) => { + if let ScalarValue::Boolean(Some(value)) = scalar { + if *value { + return Ok(lhs); + } + } + } + } + } + + let rhs = self.right.evaluate(batch)?; let right_data_type = rhs.data_type(); let schema = batch.schema(); From 57d66456dca922b733748ce5f7493146ab9173ce Mon Sep 17 00:00:00 2001 From: acking-you Date: Wed, 3 Jul 2024 17:08:00 +0800 Subject: [PATCH 02/23] refactor: add check_short_circuit function --- .../physical-expr/src/expressions/binary.rs | 72 +++++++++++-------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 1f4b1243ed79..8d75eb7109e3 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -358,48 +358,58 @@ impl PhysicalExpr for BinaryExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { use arrow::compute::kernels::numeric::*; - let lhs = self.left.evaluate(batch)?; - let left_data_type = lhs.data_type(); - - if left_data_type == DataType::Boolean && self.op == Operator::And { - match &lhs { - ColumnarValue::Array(array) => { - if let Ok(array) = as_boolean_array(&array) { - if array.true_count() == 0 { - return Ok(lhs); + #[inline] + fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool { + let data_type = arg.data_type(); + if data_type == DataType::Boolean { + if *op == Operator::And { + match arg { + ColumnarValue::Array(array) => { + if let Ok(array) = as_boolean_array(&array) { + if array.true_count() == 0 { + return true; + } + } } - } - } - ColumnarValue::Scalar(scalar) => { - if let ScalarValue::Boolean(Some(value)) = scalar { - if !value { - return Ok(lhs); + ColumnarValue::Scalar(scalar) => { + if let ScalarValue::Boolean(Some(value)) = scalar { + if !value { + return true; + } + } } } - } - } - } - - if left_data_type == DataType::Boolean && self.op == Operator::Or { - match &lhs { - ColumnarValue::Array(array) => { - if let Ok(array) = as_boolean_array(&array) { - if array.true_count() == array.len() { - return Ok(lhs); + } else if *op == Operator::Or { + match arg { + ColumnarValue::Array(array) => { + if let Ok(array) = as_boolean_array(&array) { + if array.true_count() == array.len() { + return true; + } + } } - } - } - ColumnarValue::Scalar(scalar) => { - if let ScalarValue::Boolean(Some(value)) = scalar { - if *value { - return Ok(lhs); + ColumnarValue::Scalar(scalar) => { + if let ScalarValue::Boolean(Some(value)) = scalar { + if *value { + return true; + } + } } } } } + false + } + + let lhs = self.left.evaluate(batch)?; + + // Optimize for short-circuiting `Operator::And` or `Operator::Or` operations and return early. + if check_short_circuit(&lhs, &self.op) { + return Ok(lhs); } let rhs = self.right.evaluate(batch)?; + let left_data_type = lhs.data_type(); let right_data_type = rhs.data_type(); let schema = batch.schema(); From b85807dc8d7118003910e1d87187493e4ebb5903 Mon Sep 17 00:00:00 2001 From: acking-you Date: Thu, 4 Jul 2024 16:33:18 +0800 Subject: [PATCH 03/23] refactor: change if condition to match --- .../physical-expr/src/expressions/binary.rs | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 8d75eb7109e3..e59e538aeb62 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -358,47 +358,41 @@ impl PhysicalExpr for BinaryExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { use arrow::compute::kernels::numeric::*; - #[inline] fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool { let data_type = arg.data_type(); - if data_type == DataType::Boolean { - if *op == Operator::And { + match (data_type, op) { + (DataType::Boolean, Operator::And) => { match arg { ColumnarValue::Array(array) => { if let Ok(array) = as_boolean_array(&array) { - if array.true_count() == 0 { - return true; - } + return array.true_count() == 0; } } ColumnarValue::Scalar(scalar) => { if let ScalarValue::Boolean(Some(value)) = scalar { - if !value { - return true; - } + return !value; } } } - } else if *op == Operator::Or { + false + } + (DataType::Boolean, Operator::Or) => { match arg { ColumnarValue::Array(array) => { if let Ok(array) = as_boolean_array(&array) { - if array.true_count() == array.len() { - return true; - } + return array.true_count() == array.len(); } } ColumnarValue::Scalar(scalar) => { if let ScalarValue::Boolean(Some(value)) = scalar { - if *value { - return true; - } + return *value; } } } + false } + _ => false, } - false } let lhs = self.left.evaluate(batch)?; From bf4e2180b508febf7ca22db6e028a2e1b9213d12 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 14 Feb 2025 22:35:54 +0800 Subject: [PATCH 04/23] feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option From c41465eccc44227d9a95ae3e61d6283f95e8a426 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 11:21:57 -0500 Subject: [PATCH 05/23] Chore/Add additional FFI unit tests (#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb From f171227ae668172b3ba63d2265c25850db26f150 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 17 Mar 2025 11:35:14 -0400 Subject: [PATCH 06/23] Improve feature flag CI coverage `datafusion` and `datafusion-functions` (#15203) From ac3c918f2871d95b56eb38d1f657554de5e939eb Mon Sep 17 00:00:00 2001 From: ackingliu Date: Fri, 28 Mar 2025 02:34:33 +0800 Subject: [PATCH 07/23] add extend sql & docs --- benchmarks/queries/clickbench/README.md | 33 +++++++++++++++++++++- benchmarks/queries/clickbench/extended.sql | 3 +- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md index 6797797409c1..2032427e1ef2 100644 --- a/benchmarks/queries/clickbench/README.md +++ b/benchmarks/queries/clickbench/README.md @@ -93,12 +93,14 @@ LIMIT 10; Results look like +``` +-------------+---------------------+---+------+------+------+ | ClientIP | WatchID | c | tmin | tmed | tmax | +-------------+---------------------+---+------+------+------+ | 1611957945 | 6655575552203051303 | 2 | 0 | 0 | 0 | | -1402644643 | 8566928176839891583 | 2 | 0 | 0 | 0 | +-------------+---------------------+---+------+------+------+ +``` ### Q5: Response start time distribution analysis (p95) @@ -120,13 +122,42 @@ LIMIT 10; ``` Results look like - +``` +-------------+---------------------+---+------+------+------+ | ClientIP | WatchID | c | tmin | tp95 | tmax | +-------------+---------------------+---+------+------+------+ | 1611957945 | 6655575552203051303 | 2 | 0 | 0 | 0 | | -1402644643 | 8566928176839891583 | 2 | 0 | 0 | 0 | +-------------+---------------------+---+------+------+------+ +``` + +### Q6: How many social shares meet complex multi-stage filtering criteria? +**Question**: What is the count of sharing actions from iPhone mobile users on specific social networks, within common timezones, participating in seasonal campaigns, with high screen resolutions and closely matched UTM parameters? +**Important Query Properties**: Simple filter with high-selectivity, Costly string matching, A large number of filters with high overhead are positioned relatively later in the process + +```sql +SELECT COUNT(*) AS ShareCount +FROM hits +WHERE + -- Stage 1: High-selectivity filters (fast) + "IsMobile" = 1 -- Filter mobile users + AND "MobilePhoneModel" LIKE 'iPhone%' -- Match iPhone models + AND "SocialAction" = 'share' -- Identify social sharing actions + + -- Stage 2: Moderate filters (cheap) + AND "SocialSourceNetworkID" IN (5, 12) -- Filter specific social networks + AND "ClientTimeZone" BETWEEN -5 AND 5 -- Restrict to common timezones + + -- Stage 3: Heavy computations (expensive) + AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL -- Find campaign-specific referrers + AND CASE + WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' + THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT + ELSE 0 + END > 1920 -- Extract and validate resolution parameter + AND levenshtein("UTMSource", "UTMCampaign") < 3 -- Verify UTM parameter similarity +``` +Result is empty,Since it has already been filtered by `"SocialAction" = 'share'`. ## Data Notes diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql index fbabaf2a7021..ef3a409c9c02 100644 --- a/benchmarks/queries/clickbench/extended.sql +++ b/benchmarks/queries/clickbench/extended.sql @@ -3,4 +3,5 @@ SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTI SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10; SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10; SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10; -SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; \ No newline at end of file +SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; +SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein("UTMSource", "UTMCampaign") < 3; \ No newline at end of file From a64239443b8147f1b87f1db98938076d2bee15a5 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 14 Feb 2025 22:35:54 +0800 Subject: [PATCH 08/23] feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option From 39855d3dfe24e852d3d711311e49659197b5e859 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 11:21:57 -0500 Subject: [PATCH 09/23] Chore/Add additional FFI unit tests (#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb From 2205edba001d1b64a4c7904347bfa5ba8e9fd3bc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 17 Mar 2025 11:35:14 -0400 Subject: [PATCH 10/23] Improve feature flag CI coverage `datafusion` and `datafusion-functions` (#15203) From d79a75aaffd991193b60e2982d15edc094f4c55f Mon Sep 17 00:00:00 2001 From: ackingliu Date: Sat, 29 Mar 2025 01:01:35 +0800 Subject: [PATCH 11/23] fix: incorrect false judgment --- datafusion/physical-expr/src/expressions/binary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index e59e538aeb62..08619f14cdbd 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -365,7 +365,7 @@ impl PhysicalExpr for BinaryExpr { match arg { ColumnarValue::Array(array) => { if let Ok(array) = as_boolean_array(&array) { - return array.true_count() == 0; + return array.false_count() == array.len(); } } ColumnarValue::Scalar(scalar) => { From 575c3f309162b9fbfd6f3f3dc572d872d8689181 Mon Sep 17 00:00:00 2001 From: ackingliu Date: Sun, 30 Mar 2025 04:06:46 +0800 Subject: [PATCH 12/23] add test --- .../physical-expr/src/expressions/binary.rs | 113 ++++++++++++------ 1 file changed, 76 insertions(+), 37 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 08619f14cdbd..e920d29c87c4 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -358,43 +358,6 @@ impl PhysicalExpr for BinaryExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { use arrow::compute::kernels::numeric::*; - fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool { - let data_type = arg.data_type(); - match (data_type, op) { - (DataType::Boolean, Operator::And) => { - match arg { - ColumnarValue::Array(array) => { - if let Ok(array) = as_boolean_array(&array) { - return array.false_count() == array.len(); - } - } - ColumnarValue::Scalar(scalar) => { - if let ScalarValue::Boolean(Some(value)) = scalar { - return !value; - } - } - } - false - } - (DataType::Boolean, Operator::Or) => { - match arg { - ColumnarValue::Array(array) => { - if let Ok(array) = as_boolean_array(&array) { - return array.true_count() == array.len(); - } - } - ColumnarValue::Scalar(scalar) => { - if let ScalarValue::Boolean(Some(value)) = scalar { - return *value; - } - } - } - false - } - _ => false, - } - } - let lhs = self.left.evaluate(batch)?; // Optimize for short-circuiting `Operator::And` or `Operator::Or` operations and return early. @@ -848,6 +811,47 @@ impl BinaryExpr { } } +/// Check if it meets the short-circuit condition +/// 1. For the `AND` operator, if the `lhs` result all are `false` +/// 2. For the `OR` operator, if the `lhs` result all are `true` +/// 3. Otherwise, it does not meet the short-circuit condition +fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool { + let data_type = arg.data_type(); + match (data_type, op) { + (DataType::Boolean, Operator::And) => { + match arg { + ColumnarValue::Array(array) => { + if let Ok(array) = as_boolean_array(&array) { + return array.false_count() == array.len(); + } + } + ColumnarValue::Scalar(scalar) => { + if let ScalarValue::Boolean(Some(value)) = scalar { + return !value; + } + } + } + false + } + (DataType::Boolean, Operator::Or) => { + match arg { + ColumnarValue::Array(array) => { + if let Ok(array) = as_boolean_array(&array) { + return array.true_count() == array.len(); + } + } + ColumnarValue::Scalar(scalar) => { + if let ScalarValue::Boolean(Some(value)) = scalar { + return *value; + } + } + } + false + } + _ => false, + } +} + fn concat_elements(left: Arc, right: Arc) -> Result { Ok(match left.data_type() { DataType::Utf8 => Arc::new(concat_elements_utf8( @@ -4875,4 +4879,39 @@ mod tests { Ok(()) } + + #[test] + fn test_check_short_circuit() { + use crate::planner::logical2physical; + use datafusion_expr::col as logical_col; + use datafusion_expr::lit; + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])); + let a_array = Int32Array::from(vec![1, 3, 4, 5, 6]); + let b_array = Int32Array::from(vec![1, 2, 3, 4, 5]); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(a_array), Arc::new(b_array)], + ) + .unwrap(); + + // op: AND left: all false + let left_expr = logical2physical(&logical_col("a").eq(lit(2)), &schema); + let left_value = left_expr.evaluate(&batch).unwrap(); + assert!(check_short_circuit(&left_value, &Operator::And)); + // op: AND left: not all false + let left_expr = logical2physical(&logical_col("a").eq(lit(3)), &schema); + let left_value = left_expr.evaluate(&batch).unwrap(); + assert!(!check_short_circuit(&left_value, &Operator::And)); + // op: OR left: all true + let left_expr = logical2physical(&logical_col("a").gt(lit(0)), &schema); + let left_value = left_expr.evaluate(&batch).unwrap(); + assert!(check_short_circuit(&left_value, &Operator::Or)); + // op: OR left: not all true + let left_expr = logical2physical(&logical_col("a").gt(lit(2)), &schema); + let left_value = left_expr.evaluate(&batch).unwrap(); + assert!(!check_short_circuit(&left_value, &Operator::Or)); + } } From 880106344cd2b8af1bac464f347b8c5299b7945a Mon Sep 17 00:00:00 2001 From: ackingliu Date: Mon, 31 Mar 2025 11:17:18 +0800 Subject: [PATCH 13/23] separate q6 to new PR --- benchmarks/queries/clickbench/README.md | 33 +--------------------- benchmarks/queries/clickbench/extended.sql | 3 +- 2 files changed, 2 insertions(+), 34 deletions(-) diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md index 2032427e1ef2..6797797409c1 100644 --- a/benchmarks/queries/clickbench/README.md +++ b/benchmarks/queries/clickbench/README.md @@ -93,14 +93,12 @@ LIMIT 10; Results look like -``` +-------------+---------------------+---+------+------+------+ | ClientIP | WatchID | c | tmin | tmed | tmax | +-------------+---------------------+---+------+------+------+ | 1611957945 | 6655575552203051303 | 2 | 0 | 0 | 0 | | -1402644643 | 8566928176839891583 | 2 | 0 | 0 | 0 | +-------------+---------------------+---+------+------+------+ -``` ### Q5: Response start time distribution analysis (p95) @@ -122,42 +120,13 @@ LIMIT 10; ``` Results look like -``` + +-------------+---------------------+---+------+------+------+ | ClientIP | WatchID | c | tmin | tp95 | tmax | +-------------+---------------------+---+------+------+------+ | 1611957945 | 6655575552203051303 | 2 | 0 | 0 | 0 | | -1402644643 | 8566928176839891583 | 2 | 0 | 0 | 0 | +-------------+---------------------+---+------+------+------+ -``` - -### Q6: How many social shares meet complex multi-stage filtering criteria? -**Question**: What is the count of sharing actions from iPhone mobile users on specific social networks, within common timezones, participating in seasonal campaigns, with high screen resolutions and closely matched UTM parameters? -**Important Query Properties**: Simple filter with high-selectivity, Costly string matching, A large number of filters with high overhead are positioned relatively later in the process - -```sql -SELECT COUNT(*) AS ShareCount -FROM hits -WHERE - -- Stage 1: High-selectivity filters (fast) - "IsMobile" = 1 -- Filter mobile users - AND "MobilePhoneModel" LIKE 'iPhone%' -- Match iPhone models - AND "SocialAction" = 'share' -- Identify social sharing actions - - -- Stage 2: Moderate filters (cheap) - AND "SocialSourceNetworkID" IN (5, 12) -- Filter specific social networks - AND "ClientTimeZone" BETWEEN -5 AND 5 -- Restrict to common timezones - - -- Stage 3: Heavy computations (expensive) - AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL -- Find campaign-specific referrers - AND CASE - WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' - THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT - ELSE 0 - END > 1920 -- Extract and validate resolution parameter - AND levenshtein("UTMSource", "UTMCampaign") < 3 -- Verify UTM parameter similarity -``` -Result is empty,Since it has already been filtered by `"SocialAction" = 'share'`. ## Data Notes diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql index ef3a409c9c02..fbabaf2a7021 100644 --- a/benchmarks/queries/clickbench/extended.sql +++ b/benchmarks/queries/clickbench/extended.sql @@ -3,5 +3,4 @@ SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTI SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10; SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10; SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10; -SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; -SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein("UTMSource", "UTMCampaign") < 3; \ No newline at end of file +SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; \ No newline at end of file From e190119f11b565916e3f281a2080eecad0a00e89 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 14 Feb 2025 22:35:54 +0800 Subject: [PATCH 14/23] feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option From f2c4caa0d110ee73cb9652384326ad819e8cf21c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 11:21:57 -0500 Subject: [PATCH 15/23] Chore/Add additional FFI unit tests (#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb From 0ef29b1ba44029d4ed0c0e2a77ec154482e18c95 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 17 Mar 2025 11:35:14 -0400 Subject: [PATCH 16/23] Improve feature flag CI coverage `datafusion` and `datafusion-functions` (#15203) From 6ea250223b454adb95ab7d5c732d104b4f500a71 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 14 Feb 2025 22:35:54 +0800 Subject: [PATCH 17/23] feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option From f8f4d6fb2dbec1774850ec93486959f6371c8e59 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 11:21:57 -0500 Subject: [PATCH 18/23] Chore/Add additional FFI unit tests (#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb From 13742d2a838b285e9a615f8d97706f48779832d8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 17 Mar 2025 11:35:14 -0400 Subject: [PATCH 19/23] Improve feature flag CI coverage `datafusion` and `datafusion-functions` (#15203) From 1394f3fb23613a445efa6eec9d87ec0de1727297 Mon Sep 17 00:00:00 2001 From: ackingliu Date: Mon, 7 Apr 2025 18:13:37 +0800 Subject: [PATCH 20/23] add benchmark for boolean_op --- datafusion/physical-expr/Cargo.toml | 4 + .../physical-expr/benches/boolean_op.rs | 187 ++++++++++++++++++ .../physical-expr/src/expressions/binary.rs | 21 +- 3 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 datafusion/physical-expr/benches/boolean_op.rs diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 72baa0db00a2..a8d18db2ff9c 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -71,3 +71,7 @@ name = "case_when" [[bench]] harness = false name = "is_null" + +[[bench]] +harness = false +name = "boolean_op" diff --git a/datafusion/physical-expr/benches/boolean_op.rs b/datafusion/physical-expr/benches/boolean_op.rs new file mode 100644 index 000000000000..a6b73322e59a --- /dev/null +++ b/datafusion/physical-expr/benches/boolean_op.rs @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::{ + array::BooleanArray, + compute::{bool_and, bool_or}, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use std::sync::{Arc, LazyLock}; + +/// Generates BooleanArrays with different true/false distributions for benchmarking. +/// +/// Returns a vector of tuples containing scenario name and corresponding BooleanArray. +/// +/// # Arguments +/// - `TEST_ALL_FALSE` - Used to generate what kind of test data +/// - `len` - Length of the BooleanArray to generate +fn generate_boolean_cases( + len: usize, +) -> Vec<(String, BooleanArray)> { + let mut cases = Vec::with_capacity(6); + + // Scenario 1: All elements false or all elements true + if TEST_ALL_FALSE { + let all_false = BooleanArray::from(vec![false; len]); + cases.push(("all_false".to_string(), all_false)); + } else { + let all_true = BooleanArray::from(vec![true; len]); + cases.push(("all_true".to_string(), all_true)); + } + + // Scenario 2: Single true at first position or single false at first position + if TEST_ALL_FALSE { + let mut first_true = vec![false; len]; + first_true[0] = true; + cases.push(("one_true_first".to_string(), BooleanArray::from(first_true))); + } else { + let mut first_false = vec![true; len]; + first_false[0] = false; + cases.push(( + "one_false_first".to_string(), + BooleanArray::from(first_false), + )); + } + + // Scenario 3: Single true at last position or single false at last position + if TEST_ALL_FALSE { + let mut last_true = vec![false; len]; + last_true[len - 1] = true; + cases.push(("one_true_last".to_string(), BooleanArray::from(last_true))); + } else { + let mut last_false = vec![true; len]; + last_false[len - 1] = false; + cases.push(("one_false_last".to_string(), BooleanArray::from(last_false))); + } + + // Scenario 4: Single true at exact middle or single false at exact middle + let mid = len / 2; + if TEST_ALL_FALSE { + let mut mid_true = vec![false; len]; + mid_true[mid] = true; + cases.push(("one_true_middle".to_string(), BooleanArray::from(mid_true))); + } else { + let mut mid_false = vec![true; len]; + mid_false[mid] = false; + cases.push(( + "one_false_middle".to_string(), + BooleanArray::from(mid_false), + )); + } + + // Scenario 5: Single true at 25% position or single false at 25% position + let mid_left = len / 4; + if TEST_ALL_FALSE { + let mut mid_left_true = vec![false; len]; + mid_left_true[mid_left] = true; + cases.push(( + "one_true_middle_left".to_string(), + BooleanArray::from(mid_left_true), + )); + } else { + let mut mid_left_false = vec![true; len]; + mid_left_false[mid_left] = false; + cases.push(( + "one_false_middle_left".to_string(), + BooleanArray::from(mid_left_false), + )); + } + + // Scenario 6: Single true at 75% position or single false at 75% position + let mid_right = (3 * len) / 4; + if TEST_ALL_FALSE { + let mut mid_right_true = vec![false; len]; + mid_right_true[mid_right] = true; + cases.push(( + "one_true_middle_right".to_string(), + BooleanArray::from(mid_right_true), + )); + } else { + let mut mid_right_false = vec![true; len]; + mid_right_false[mid_right] = false; + cases.push(( + "one_false_middle_right".to_string(), + BooleanArray::from(mid_right_false), + )); + } + + cases +} + +fn benchmark_boolean_ops(c: &mut Criterion) { + let len = 1_000_000; // Use one million elements for clear performance differentiation + static TEST_BOOL_COUNT: LazyLock = + LazyLock::new(|| match std::env::var("TEST_BOOL_COUNT") { + Ok(_) => { + println!("TEST_BOOL_COUNT=ON"); + true + } + Err(_) => { + println!("TEST_BOOL_COUNT=OFF"); + false + } + }); + + // Determine the test function to be executed based on the ENV `TEST_BOOL_COUNT` + fn test_func(array: &BooleanArray) -> bool { + // Use false_count for all false and true_count for all true + if *TEST_BOOL_COUNT { + if TEST_ALL_FALSE { + array.false_count() == array.len() + } else { + array.true_count() == array.len() + } + } + // Use bool_or for all false and bool_and for all true + else if TEST_ALL_FALSE { + match bool_or(array) { + Some(v) => !v, + None => false, + } + } else { + bool_and(array).unwrap_or(false) + } + } + + // Test cases for false_count and bool_or + { + let test_cases = generate_boolean_cases::(len); + for (scenario, array) in test_cases { + let arr_ref = Arc::new(array); + + // Benchmark test_func across different scenarios + c.bench_function(&scenario, |b| { + b.iter(|| test_func::(black_box(&arr_ref))) + }); + } + } + // Test cases for true_count and bool_and + { + let test_cases = generate_boolean_cases::(len); + for (scenario, array) in test_cases { + let arr_ref = Arc::new(array); + + // Benchmark test_func across different scenarios + c.bench_function(&scenario, |b| { + b.iter(|| test_func::(black_box(&arr_ref))) + }); + } + } +} + +criterion_group!(benches, benchmark_boolean_ops); +criterion_main!(benches); diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index e920d29c87c4..609efcedfa7a 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -811,10 +811,23 @@ impl BinaryExpr { } } -/// Check if it meets the short-circuit condition -/// 1. For the `AND` operator, if the `lhs` result all are `false` -/// 2. For the `OR` operator, if the `lhs` result all are `true` -/// 3. Otherwise, it does not meet the short-circuit condition +/// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation based on the left-hand side (lhs) result. +/// +/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes unnecessary: +/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be `false` regardless of rhs. +/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be `true` regardless of rhs. +/// +/// Returns `true` if short-circuiting is possible, `false` otherwise. +/// +/// # Arguments +/// * `arg` - The left-hand side (lhs) columnar value (array or scalar) +/// * `op` - The logical operator (`AND` or `OR`) +/// +/// # Implementation Notes +/// 1. Only works with Boolean-typed arguments (other types automatically return `false`) +/// 2. Handles both scalar values and array values +/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from arrow-rs. +/// `bool_or`/`bool_and` maybe a better choice too,for detailed discussion,see: https://github.com/apache/datafusion/pull/15462#discussion_r2020558418) fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool { let data_type = arg.data_type(); match (data_type, op) { From 59cfced52e46a87231983294fd75298837f85cde Mon Sep 17 00:00:00 2001 From: ackingliu Date: Mon, 7 Apr 2025 19:31:45 +0800 Subject: [PATCH 21/23] fix cargo doc --- datafusion/physical-expr/src/expressions/binary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 609efcedfa7a..84374f4a2970 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -827,7 +827,7 @@ impl BinaryExpr { /// 1. Only works with Boolean-typed arguments (other types automatically return `false`) /// 2. Handles both scalar values and array values /// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from arrow-rs. -/// `bool_or`/`bool_and` maybe a better choice too,for detailed discussion,see: https://github.com/apache/datafusion/pull/15462#discussion_r2020558418) +/// `bool_or`/`bool_and` maybe a better choice too,for detailed discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418) fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool { let data_type = arg.data_type(); match (data_type, op) { From 775e70a932a1dbf5315d527b3b8e8cbdfe1fbbb5 Mon Sep 17 00:00:00 2001 From: ackingliu Date: Mon, 7 Apr 2025 22:23:24 +0800 Subject: [PATCH 22/23] add binary_op bench --- datafusion/physical-expr/Cargo.toml | 2 +- .../benches/{boolean_op.rs => binary_op.rs} | 163 +++++++++++++++++- 2 files changed, 163 insertions(+), 2 deletions(-) rename datafusion/physical-expr/benches/{boolean_op.rs => binary_op.rs} (50%) diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index a8d18db2ff9c..97d028897b0b 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -74,4 +74,4 @@ name = "is_null" [[bench]] harness = false -name = "boolean_op" +name = "binary_op" diff --git a/datafusion/physical-expr/benches/boolean_op.rs b/datafusion/physical-expr/benches/binary_op.rs similarity index 50% rename from datafusion/physical-expr/benches/boolean_op.rs rename to datafusion/physical-expr/benches/binary_op.rs index a6b73322e59a..990260e2dd24 100644 --- a/datafusion/physical-expr/benches/boolean_op.rs +++ b/datafusion/physical-expr/benches/binary_op.rs @@ -18,8 +18,16 @@ use arrow::{ array::BooleanArray, compute::{bool_and, bool_or}, + datatypes::{DataType, Field, Schema}, }; +use arrow::{array::StringArray, record_batch::RecordBatch}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::{and, binary_expr, col, lit, or, Operator}; +use datafusion_physical_expr::{ + expressions::{BinaryExpr, Column}, + planner::logical2physical, + PhysicalExpr, +}; use std::sync::{Arc, LazyLock}; /// Generates BooleanArrays with different true/false distributions for benchmarking. @@ -183,5 +191,158 @@ fn benchmark_boolean_ops(c: &mut Criterion) { } } -criterion_group!(benches, benchmark_boolean_ops); +/// Benchmarks the performance of binary logical operators (AND/OR) with short-circuit behavior. +/// +/// This function evaluates the execution time of complex logical expressions when: +/// 1. AND operator short-circuits (all left values are false) +/// 2. OR operator short-circuits (all left values are true) +fn benchmark_binary_op_in_short_circuit(c: &mut Criterion) { + // Create schema with three columns + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Boolean, false), + Field::new("b", DataType::Utf8, false), + Field::new("c", DataType::Utf8, false), + ])); + + // Generate test data with extended content + let (b_values, c_values) = generate_test_strings(8192); + + // Create two RecordBatches with different boolean values + let batch_false = + create_record_batch(schema.clone(), false, &b_values, &c_values).unwrap(); + let batch_true = + create_record_batch(schema.clone(), true, &b_values, &c_values).unwrap(); + + // Build complex string matching conditions + let right_condition_and = and( + // Check for API endpoint pattern in URLs + binary_expr( + col("b"), + Operator::RegexMatch, + lit(r#"^https://(\w+\.)?example\.(com|org)/"#), + ), + // Check for markdown code blocks and summary section + binary_expr( + col("c"), + Operator::RegexMatch, + lit("```(rust|python|go)\nfn? main$$"), + ), + ); + + let right_condition_or = or( + // Check for secure HTTPS protocol + binary_expr( + col("b"), + Operator::RegexMatch, + lit(r#"^https://(\w+\.)?example\.(com|org)/"#), + ), + // Check for Rust code examples + binary_expr( + col("c"), + Operator::RegexMatch, + lit("```(rust|python|go)\nfn? main$$"), + ), + ); + + // Create physical binary expressions + let expr_and = BinaryExpr::new( + Arc::new(Column::new("a", 0)), + Operator::And, + logical2physical(&right_condition_and, &schema), + ); + + let expr_or = BinaryExpr::new( + Arc::new(Column::new("a", 0)), + Operator::Or, + logical2physical(&right_condition_or, &schema), + ); + + // Benchmark all false and op is and + { + c.bench_function("bench_all_false_and", |b| { + b.iter(|| expr_and.evaluate(black_box(&batch_false)).unwrap()) + }); + } + // Benchmark all true and op is or + { + c.bench_function("bench_all_true_or", |b| { + b.iter(|| expr_or.evaluate(black_box(&batch_true)).unwrap()) + }); + } +} + +/// Generate test data with computationally expensive patterns +fn generate_test_strings(num_rows: usize) -> (Vec, Vec) { + // Extended URL patterns with query parameters and paths + let base_urls = [ + "https://api.example.com/v2/users/12345/posts?category=tech&sort=date&lang=en-US", + "https://cdn.example.net/assets/images/2023/08/15/sample-image-highres.jpg?width=1920&quality=85", + "http://service.demo.org:8080/api/data/transactions/20230815123456.csv", + "ftp://legacy.archive.example/backups/2023/Q3/database-dump.sql.gz", + "https://docs.example.co.uk/reference/advanced-topics/concurrency/parallel-processing.md#implementation-details", + ]; + + // Extended markdown content with code blocks and structure + let base_markdowns = [ + concat!( + "# Advanced Topics in Computer Science\n\n", + "## Summary\nThis article explores complex system design patterns and...\n\n", + "```rust\nfn process_data(data: &mut [i32]) {\n // Parallel processing example\n data.par_iter_mut().for_each(|x| *x *= 2);\n}\n```\n\n", + "## Performance Considerations\nWhen implementing concurrent systems...\n" + ), + concat!( + "## API Documentation\n\n", + "```json\n{\n \"endpoint\": \"/api/v2/users\",\n \"methods\": [\"GET\", \"POST\"],\n \"parameters\": {\n \"page\": \"number\"\n }\n}\n```\n\n", + "# Authentication Guide\nSecure your API access using OAuth 2.0...\n" + ), + concat!( + "# Data Processing Pipeline\n\n", + "```python\nfrom multiprocessing import Pool\n\ndef main():\n with Pool(8) as p:\n results = p.map(process_item, data)\n```\n\n", + "## Summary of Optimizations\n1. Batch processing\n2. Memory pooling\n3. Concurrent I/O operations\n" + ), + concat!( + "# System Architecture Overview\n\n", + "## Components\n- Load Balancer\n- Database Cluster\n- Cache Service\n\n", + "```go\nfunc main() {\n router := gin.Default()\n router.GET(\"/api/health\", healthCheck)\n router.Run(\":8080\")\n}\n```\n" + ), + concat!( + "## Configuration Reference\n\n", + "```yaml\nserver:\n port: 8080\n max_threads: 32\n\ndatabase:\n url: postgres://user@prod-db:5432/main\n```\n\n", + "# Deployment Strategies\nBlue-green deployment patterns with...\n" + ), + ]; + + let mut urls = Vec::with_capacity(num_rows); + let mut markdowns = Vec::with_capacity(num_rows); + + for i in 0..num_rows { + urls.push(base_urls[i % 5].to_string()); + markdowns.push(base_markdowns[i % 5].to_string()); + } + + (urls, markdowns) +} + +/// Create RecordBatch with specified boolean values +fn create_record_batch( + schema: Arc, + a_value: bool, + b_values: &[String], + c_values: &[String], +) -> arrow::error::Result { + let a_array = BooleanArray::from(vec![a_value; b_values.len()]); + let b_array = StringArray::from(b_values.to_vec()); + let c_array = StringArray::from(c_values.to_vec()); + + RecordBatch::try_new( + schema, + vec![Arc::new(a_array), Arc::new(b_array), Arc::new(c_array)], + ) +} + +criterion_group!( + benches, + benchmark_boolean_ops, + benchmark_binary_op_in_short_circuit +); criterion_main!(benches); From 11816e93e9ba569aa91e93257628db455486c306 Mon Sep 17 00:00:00 2001 From: ackingliu Date: Tue, 8 Apr 2025 12:52:37 +0800 Subject: [PATCH 23/23] Better comments --- datafusion/physical-expr/benches/binary_op.rs | 87 ++++++++++++------- 1 file changed, 56 insertions(+), 31 deletions(-) diff --git a/datafusion/physical-expr/benches/binary_op.rs b/datafusion/physical-expr/benches/binary_op.rs index 990260e2dd24..7ac5c0485203 100644 --- a/datafusion/physical-expr/benches/binary_op.rs +++ b/datafusion/physical-expr/benches/binary_op.rs @@ -130,6 +130,14 @@ fn generate_boolean_cases( cases } +/// Benchmarks boolean operations `false_count/bool_or` and `true_count/bool_and` on [`BooleanArray`] +/// You can run this benchmark with: +/// ```sh +/// # test true_count/false_count +/// TEST_BOOL_COUNT=1 cargo bench --bench binary_op -- boolean_ops +/// # test bool_or/bool_and +/// cargo bench --bench binary_op -- boolean_ops +/// ``` fn benchmark_boolean_ops(c: &mut Criterion) { let len = 1_000_000; // Use one million elements for clear performance differentiation static TEST_BOOL_COUNT: LazyLock = @@ -172,7 +180,7 @@ fn benchmark_boolean_ops(c: &mut Criterion) { let arr_ref = Arc::new(array); // Benchmark test_func across different scenarios - c.bench_function(&scenario, |b| { + c.bench_function(&format!("boolean_ops/or/{}", scenario), |b| { b.iter(|| test_func::(black_box(&arr_ref))) }); } @@ -184,18 +192,23 @@ fn benchmark_boolean_ops(c: &mut Criterion) { let arr_ref = Arc::new(array); // Benchmark test_func across different scenarios - c.bench_function(&scenario, |b| { + c.bench_function(&format!("boolean_ops/and/{}", scenario), |b| { b.iter(|| test_func::(black_box(&arr_ref))) }); } } } -/// Benchmarks the performance of binary logical operators (AND/OR) with short-circuit behavior. +/// Benchmarks AND/OR operator short-circuiting by evaluating complex regex conditions. /// -/// This function evaluates the execution time of complex logical expressions when: -/// 1. AND operator short-circuits (all left values are false) -/// 2. OR operator short-circuits (all left values are true) +/// Creates 6 test scenarios per operator: +/// 1. All values enable short-circuit (all_true/all_false) +/// 2. 2-6 Single true/false value at different positions to measure early exit +/// +/// You can run this benchmark with: +/// ```sh +/// cargo bench --bench binary_op -- short_circuit +/// ``` fn benchmark_binary_op_in_short_circuit(c: &mut Criterion) { // Create schema with three columns let schema = Arc::new(Schema::new(vec![ @@ -207,11 +220,10 @@ fn benchmark_binary_op_in_short_circuit(c: &mut Criterion) { // Generate test data with extended content let (b_values, c_values) = generate_test_strings(8192); - // Create two RecordBatches with different boolean values - let batch_false = - create_record_batch(schema.clone(), false, &b_values, &c_values).unwrap(); - let batch_true = - create_record_batch(schema.clone(), true, &b_values, &c_values).unwrap(); + let batches_and = + create_record_batch::(schema.clone(), &b_values, &c_values).unwrap(); + let batches_or = + create_record_batch::(schema.clone(), &b_values, &c_values).unwrap(); // Build complex string matching conditions let right_condition_and = and( @@ -257,17 +269,21 @@ fn benchmark_binary_op_in_short_circuit(c: &mut Criterion) { logical2physical(&right_condition_or, &schema), ); - // Benchmark all false and op is and + // Each scenario when the test operator is `and` { - c.bench_function("bench_all_false_and", |b| { - b.iter(|| expr_and.evaluate(black_box(&batch_false)).unwrap()) - }); + for (name, batch) in batches_and { + c.bench_function(&format!("short_circuit/and/{}", name), |b| { + b.iter(|| expr_and.evaluate(black_box(&batch)).unwrap()) + }); + } } - // Benchmark all true and op is or + // Each scenario when the test operator is `or` { - c.bench_function("bench_all_true_or", |b| { - b.iter(|| expr_or.evaluate(black_box(&batch_true)).unwrap()) - }); + for (name, batch) in batches_or { + c.bench_function(&format!("short_circuit/or/{}", name), |b| { + b.iter(|| expr_or.evaluate(black_box(&batch)).unwrap()) + }); + } } } @@ -323,21 +339,29 @@ fn generate_test_strings(num_rows: usize) -> (Vec, Vec) { (urls, markdowns) } -/// Create RecordBatch with specified boolean values -fn create_record_batch( +/// Creates record batches with boolean arrays that test different short-circuit scenarios. +/// When TEST_ALL_FALSE = true: creates data for AND operator benchmarks (needs early false exit) +/// When TEST_ALL_FALSE = false: creates data for OR operator benchmarks (needs early true exit) +fn create_record_batch( schema: Arc, - a_value: bool, b_values: &[String], c_values: &[String], -) -> arrow::error::Result { - let a_array = BooleanArray::from(vec![a_value; b_values.len()]); - let b_array = StringArray::from(b_values.to_vec()); - let c_array = StringArray::from(c_values.to_vec()); - - RecordBatch::try_new( - schema, - vec![Arc::new(a_array), Arc::new(b_array), Arc::new(c_array)], - ) +) -> arrow::error::Result> { + // Generate data for six scenarios, but only the data for the "all_false" and "all_true" cases can be optimized through short-circuiting + let boolean_array = generate_boolean_cases::(b_values.len()); + let mut rbs = Vec::with_capacity(boolean_array.len()); + for (name, a_array) in boolean_array { + let b_array = StringArray::from(b_values.to_vec()); + let c_array = StringArray::from(c_values.to_vec()); + rbs.push(( + name, + RecordBatch::try_new( + schema.clone(), + vec![Arc::new(a_array), Arc::new(b_array), Arc::new(c_array)], + )?, + )); + } + Ok(rbs) } criterion_group!( @@ -345,4 +369,5 @@ criterion_group!( benchmark_boolean_ops, benchmark_binary_op_in_short_circuit ); + criterion_main!(benches);