diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 2b66de641b67..cc59b7803036 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -92,6 +92,15 @@ pub struct RunOpt { #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, + /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join + /// True by default. + #[structopt( + short = "j", + long = "enable_piecewise_merge_join", + default_value = "false" + )] + enable_piecewise_merge_join: BoolDefaultTrue, + /// Mark the first column of each table as sorted in ascending order. /// The tables should have been created with the `--sort` option for this to have any effect. #[structopt(short = "t", long = "sorted")] @@ -112,6 +121,8 @@ impl RunOpt { .config()? .with_collect_statistics(!self.disable_statistics); config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join; + config.options_mut().optimizer.enable_piecewise_merge_join = + self.enable_piecewise_merge_join; let rt_builder = self.common.runtime_env_builder()?; let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); // register tables @@ -379,6 +390,7 @@ mod tests { output_path: None, disable_statistics: false, prefer_hash_join: true, + enable_piecewise_merge_join: false, sorted: false, }; opt.register_tables(&ctx).await?; @@ -416,6 +428,7 @@ mod tests { output_path: None, disable_statistics: false, prefer_hash_join: true, + enable_piecewise_merge_join: false, sorted: false, }; opt.register_tables(&ctx).await?; diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh index ed3e699c1413..90bbc5d3bad0 100755 --- a/dev/update_config_docs.sh +++ b/dev/update_config_docs.sh @@ -175,6 +175,66 @@ SET datafusion.execution.batch_size = 1024; [`FairSpillPool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html +## Join Queries + +Currently Apache Datafusion supports the following join algorithms: + +- Nested Loop Join +- Sort Merge Join +- Hash Join +- Symmetric Hash Join +- Piecewise Merge Join (experimental) + +The physical planner will choose the appropriate algorithm based on the statistics + join +condition of the two tables. + +# Join Algorithm Optimizer Configurations + +You can modify join optimization behavior in your queries by setting specific configuration values. +Use the following command to update a configuration: + +``` sql +SET datafusion.optimizer.; +``` + +Example + +``` sql +SET datafusion.optimizer.prefer_hash_join = false; +``` + +Adjusting the following configuration values influences how the optimizer selects the join algorithm +used to execute your SQL query: + +## Join Optimizer Configurations + +Adjusting the following configuration values influences how the optimizer selects the join algorithm +used to execute your SQL query. + +### allow_symmetric_joins_without_pruning (bool, default = true) + +Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs +lack ordering or filtering. + +- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution. + +### prefer_hash_join (bool, default = true) + +Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection. + +- true: favors HashJoin for faster execution when sufficient memory is available. +- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed. + +### enable_piecewise_merge_join (bool, default = false) + +Enables the experimental Piecewise Merge Join algorithm. + +- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range + filter in the join condition. +- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter + except for cases where it is joining two large tables (num_rows > 100,000) that are approximately + equal in size. + EOF diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 7ec1864b4667..436c019efbb9 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -253,3 +253,63 @@ SET datafusion.execution.batch_size = 1024; ``` [`fairspillpool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html + +## Join Queries + +Currently Apache Datafusion supports the following join algorithms: + +- Nested Loop Join +- Sort Merge Join +- Hash Join +- Symmetric Hash Join +- Piecewise Merge Join (experimental) + +The physical planner will choose the appropriate algorithm based on the statistics + join +condition of the two tables. + +# Join Algorithm Optimizer Configurations + +You can modify join optimization behavior in your queries by setting specific configuration values. +Use the following command to update a configuration: + +```sql +SET datafusion.optimizer.; +``` + +Example + +```sql +SET datafusion.optimizer.prefer_hash_join = false; +``` + +Adjusting the following configuration values influences how the optimizer selects the join algorithm +used to execute your SQL query: + +## Join Optimizer Configurations + +Adjusting the following configuration values influences how the optimizer selects the join algorithm +used to execute your SQL query. + +### allow_symmetric_joins_without_pruning (bool, default = true) + +Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs +lack ordering or filtering. + +- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution. + +### prefer_hash_join (bool, default = true) + +Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection. + +- true: favors HashJoin for faster execution when sufficient memory is available. +- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed. + +### enable_piecewise_merge_join (bool, default = false) + +Enables the experimental Piecewise Merge Join algorithm. + +- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range + filter in the join condition. +- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter + except for cases where it is joining two large tables (num_rows > 100,000) that are approximately + equal in size.