From 1751e10ac9ce34582ffac5bcd84c527faed5fa56 Mon Sep 17 00:00:00 2001
From: Jonathan <chenleejonathan@gmail.com>
Date: Tue, 21 Oct 2025 16:51:28 -0400
Subject: [PATCH 1/6] doc: Add Join Physical Plan configuration information

---
 datafusion/physical-plan/src/joins/README.md | 65 ++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 datafusion/physical-plan/src/joins/README.md

diff --git a/datafusion/physical-plan/src/joins/README.md b/datafusion/physical-plan/src/joins/README.md
new file mode 100644
index 000000000000..4bd16c7222b5
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/README.md
@@ -0,0 +1,65 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Join Physical Plans
+
+Currently Apache Datafusion supports the following join algorithms:
+ - Nested Loop Join
+ - Sort Merge Join
+ - Hash Join
+ - Symmetric Hash Join 
+ - Piecewise Merge Join (experimental) 
+
+The physical planner will choose the appropriate algorithm based on the statistics + join 
+condition of the two tables.
+
+# Join Algorithm Optimizer Configurations
+
+You can modify join optimization behavior in your queries by setting specific configuration values.
+Use the following command to update a configuration:
+
+```
+set datafusion.optimizer.<configuration_name>
+```
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm 
+used to execute your SQL query:
+
+## Join Optimizer Configurations
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query.
+
+### allow_symmetric_joins_without_pruning (bool, default = true)
+Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
+lack ordering or filtering.
+ - If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
+
+### prefer_hash_join (bool, default = true)
+Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
+ - true: favors HashJoin for faster execution when sufficient memory is available.
+ - false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+### enable_piecewise_merge_join (bool, default = false)
+Enables the experimental Piecewise Merge Join algorithm.
+ - When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range 
+   filter in the join condition.
+ - Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter 
+   except for cases where it is joining two large tables (num_rows > 100,000) that are approximately 
+   equal in size.

From 62f8789b5bfc1a3d0b3c04d82e5a4e064ff83a58 Mon Sep 17 00:00:00 2001
From: Jonathan <chenleejonathan@gmail.com>
Date: Tue, 21 Oct 2025 16:55:08 -0400
Subject: [PATCH 2/6] fmt

---
 datafusion/physical-plan/src/joins/README.md | 37 ++++++++++++--------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/datafusion/physical-plan/src/joins/README.md b/datafusion/physical-plan/src/joins/README.md
index 4bd16c7222b5..5d09ebe033ec 100644
--- a/datafusion/physical-plan/src/joins/README.md
+++ b/datafusion/physical-plan/src/joins/README.md
@@ -20,13 +20,14 @@
 # Join Physical Plans
 
 Currently Apache Datafusion supports the following join algorithms:
- - Nested Loop Join
- - Sort Merge Join
- - Hash Join
- - Symmetric Hash Join 
- - Piecewise Merge Join (experimental) 
 
-The physical planner will choose the appropriate algorithm based on the statistics + join 
+- Nested Loop Join
+- Sort Merge Join
+- Hash Join
+- Symmetric Hash Join
+- Piecewise Merge Join (experimental)
+
+The physical planner will choose the appropriate algorithm based on the statistics + join
 condition of the two tables.
 
 # Join Algorithm Optimizer Configurations
@@ -38,7 +39,7 @@ Use the following command to update a configuration:
 set datafusion.optimizer.<configuration_name>
 ```
 
-Adjusting the following configuration values influences how the optimizer selects the join algorithm 
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
 used to execute your SQL query:
 
 ## Join Optimizer Configurations
@@ -47,19 +48,25 @@ Adjusting the following configuration values influences how the optimizer select
 used to execute your SQL query.
 
 ### allow_symmetric_joins_without_pruning (bool, default = true)
+
 Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
 lack ordering or filtering.
- - If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
+
+- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
 
 ### prefer_hash_join (bool, default = true)
+
 Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
- - true: favors HashJoin for faster execution when sufficient memory is available.
- - false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+- true: favors HashJoin for faster execution when sufficient memory is available.
+- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
 
 ### enable_piecewise_merge_join (bool, default = false)
+
 Enables the experimental Piecewise Merge Join algorithm.
- - When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range 
-   filter in the join condition.
- - Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter 
-   except for cases where it is joining two large tables (num_rows > 100,000) that are approximately 
-   equal in size.
+
+- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
+  filter in the join condition.
+- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter
+  except for cases where it is joining two large tables (num_rows > 100,000) that are approximately
+  equal in size.

From ed31b10dc16be11ff2cc73150a87e03258dd4d5f Mon Sep 17 00:00:00 2001
From: Jonathan <chenleejonathan@gmail.com>
Date: Wed, 22 Oct 2025 10:10:46 -0400
Subject: [PATCH 3/6] move

---
 datafusion/physical-plan/src/joins/README.md | 72 --------------------
 docs/source/user-guide/configs.md            | 54 +++++++++++++++
 2 files changed, 54 insertions(+), 72 deletions(-)
 delete mode 100644 datafusion/physical-plan/src/joins/README.md

diff --git a/datafusion/physical-plan/src/joins/README.md b/datafusion/physical-plan/src/joins/README.md
deleted file mode 100644
index 5d09ebe033ec..000000000000
--- a/datafusion/physical-plan/src/joins/README.md
+++ /dev/null
@@ -1,72 +0,0 @@
-<!---
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-# Join Physical Plans
-
-Currently Apache Datafusion supports the following join algorithms:
-
-- Nested Loop Join
-- Sort Merge Join
-- Hash Join
-- Symmetric Hash Join
-- Piecewise Merge Join (experimental)
-
-The physical planner will choose the appropriate algorithm based on the statistics + join
-condition of the two tables.
-
-# Join Algorithm Optimizer Configurations
-
-You can modify join optimization behavior in your queries by setting specific configuration values.
-Use the following command to update a configuration:
-
-```
-set datafusion.optimizer.<configuration_name>
-```
-
-Adjusting the following configuration values influences how the optimizer selects the join algorithm
-used to execute your SQL query:
-
-## Join Optimizer Configurations
-
-Adjusting the following configuration values influences how the optimizer selects the join algorithm
-used to execute your SQL query.
-
-### allow_symmetric_joins_without_pruning (bool, default = true)
-
-Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
-lack ordering or filtering.
-
-- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
-
-### prefer_hash_join (bool, default = true)
-
-Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
-
-- true: favors HashJoin for faster execution when sufficient memory is available.
-- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
-
-### enable_piecewise_merge_join (bool, default = false)
-
-Enables the experimental Piecewise Merge Join algorithm.
-
-- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
-  filter in the join condition.
-- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter
-  except for cases where it is joining two large tables (num_rows > 100,000) that are approximately
-  equal in size.
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 7ec1864b4667..fbd9216441b8 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -253,3 +253,57 @@ SET datafusion.execution.batch_size = 1024;
 ```
 
 [`fairspillpool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html
+
+## Join Queries
+
+Currently Apache Datafusion supports the following join algorithms:
+
+- Nested Loop Join
+- Sort Merge Join
+- Hash Join
+- Symmetric Hash Join
+- Piecewise Merge Join (experimental)
+
+The physical planner will choose the appropriate algorithm based on the statistics + join
+condition of the two tables.
+
+# Join Algorithm Optimizer Configurations
+
+You can modify join optimization behavior in your queries by setting specific configuration values.
+Use the following command to update a configuration:
+
+```
+set datafusion.optimizer.<configuration_name>
+```
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query:
+
+## Join Optimizer Configurations
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query.
+
+### allow_symmetric_joins_without_pruning (bool, default = true)
+
+Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
+lack ordering or filtering.
+
+- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
+
+### prefer_hash_join (bool, default = true)
+
+Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
+
+- true: favors HashJoin for faster execution when sufficient memory is available.
+- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+### enable_piecewise_merge_join (bool, default = false)
+
+Enables the experimental Piecewise Merge Join algorithm.
+
+- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
+  filter in the join condition.
+- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter
+  except for cases where it is joining two large tables (num_rows > 100,000) that are approximately
+  equal in size.

From 3e4032f66c241f93ba4ff98236144d34522f6691 Mon Sep 17 00:00:00 2001
From: Jonathan <chenleejonathan@gmail.com>
Date: Wed, 22 Oct 2025 23:36:48 -0400
Subject: [PATCH 4/6] add to tpch

---
 benchmarks/src/tpch/run.rs        | 13 +++++++++++++
 docs/source/user-guide/configs.md |  8 +++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
index 2b66de641b67..cc59b7803036 100644
--- a/benchmarks/src/tpch/run.rs
+++ b/benchmarks/src/tpch/run.rs
@@ -92,6 +92,15 @@ pub struct RunOpt {
     #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")]
     prefer_hash_join: BoolDefaultTrue,
 
+    /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join
+    /// True by default.
+    #[structopt(
+        short = "j",
+        long = "enable_piecewise_merge_join",
+        default_value = "false"
+    )]
+    enable_piecewise_merge_join: BoolDefaultTrue,
+
     /// Mark the first column of each table as sorted in ascending order.
     /// The tables should have been created with the `--sort` option for this to have any effect.
     #[structopt(short = "t", long = "sorted")]
@@ -112,6 +121,8 @@ impl RunOpt {
             .config()?
             .with_collect_statistics(!self.disable_statistics);
         config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
+        config.options_mut().optimizer.enable_piecewise_merge_join =
+            self.enable_piecewise_merge_join;
         let rt_builder = self.common.runtime_env_builder()?;
         let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
         // register tables
@@ -379,6 +390,7 @@ mod tests {
             output_path: None,
             disable_statistics: false,
             prefer_hash_join: true,
+            enable_piecewise_merge_join: false,
             sorted: false,
         };
         opt.register_tables(&ctx).await?;
@@ -416,6 +428,7 @@ mod tests {
             output_path: None,
             disable_statistics: false,
             prefer_hash_join: true,
+            enable_piecewise_merge_join: false,
             sorted: false,
         };
         opt.register_tables(&ctx).await?;
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index fbd9216441b8..f5f5d8dbad4c 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -273,7 +273,13 @@ You can modify join optimization behavior in your queries by setting specific co
 Use the following command to update a configuration:
 
 ```
-set datafusion.optimizer.<configuration_name>
+SET datafusion.optimizer.<configuration_name>
+```
+
+Example
+
+``` sql
+SET datafusion.optimizer.prefer_hash_join = false;
 ```
 
 Adjusting the following configuration values influences how the optimizer selects the join algorithm

From 21fbffcb0153b11abbad13bfe9ad68e5eb2541a4 Mon Sep 17 00:00:00 2001
From: Jonathan <chenleejonathan@gmail.com>
Date: Thu, 23 Oct 2025 11:09:44 -0400
Subject: [PATCH 5/6] move to config docs

---
 dev/update_config_docs.sh         | 60 +++++++++++++++++++++++++++++++
 docs/source/user-guide/configs.md |  2 +-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh
index ed3e699c1413..22f9208510b6 100755
--- a/dev/update_config_docs.sh
+++ b/dev/update_config_docs.sh
@@ -175,6 +175,66 @@ SET datafusion.execution.batch_size = 1024;
 
 [`FairSpillPool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html
 
+## Join Queries
+
+Currently Apache Datafusion supports the following join algorithms:
+
+- Nested Loop Join
+- Sort Merge Join
+- Hash Join
+- Symmetric Hash Join
+- Piecewise Merge Join (experimental)
+
+The physical planner will choose the appropriate algorithm based on the statistics + join
+condition of the two tables.
+
+# Join Algorithm Optimizer Configurations
+
+You can modify join optimization behavior in your queries by setting specific configuration values.
+Use the following command to update a configuration:
+
+```
+SET datafusion.optimizer.<configuration_name>
+```
+
+Example
+
+``` sql
+SET datafusion.optimizer.prefer_hash_join = false;
+```
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query:
+
+## Join Optimizer Configurations
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query.
+
+### allow_symmetric_joins_without_pruning (bool, default = true)
+
+Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
+lack ordering or filtering.
+
+- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
+
+### prefer_hash_join (bool, default = true)
+
+Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
+
+- true: favors HashJoin for faster execution when sufficient memory is available.
+- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+### enable_piecewise_merge_join (bool, default = false)
+
+Enables the experimental Piecewise Merge Join algorithm.
+
+- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
+  filter in the join condition.
+- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter
+  except for cases where it is joining two large tables (num_rows > 100,000) that are approximately
+  equal in size.
+
 EOF
 
 
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index f5f5d8dbad4c..4d610ea95e07 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -278,7 +278,7 @@ SET datafusion.optimizer.<configuration_name>
 
 Example
 
-``` sql
+```sql
 SET datafusion.optimizer.prefer_hash_join = false;
 ```
 

From 5c65077e6b2ce27c3f956750883ff9baf3028d60 Mon Sep 17 00:00:00 2001
From: Jonathan <chenleejonathan@gmail.com>
Date: Thu, 23 Oct 2025 20:51:39 -0400
Subject: [PATCH 6/6] update

---
 dev/update_config_docs.sh         | 4 ++--
 docs/source/user-guide/configs.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh
index 22f9208510b6..90bbc5d3bad0 100755
--- a/dev/update_config_docs.sh
+++ b/dev/update_config_docs.sh
@@ -193,8 +193,8 @@ condition of the two tables.
 You can modify join optimization behavior in your queries by setting specific configuration values.
 Use the following command to update a configuration:
 
-```
-SET datafusion.optimizer.<configuration_name>
+``` sql
+SET datafusion.optimizer.<configuration_name>;
 ```
 
 Example
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 4d610ea95e07..436c019efbb9 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -272,8 +272,8 @@ condition of the two tables.
 You can modify join optimization behavior in your queries by setting specific configuration values.
 Use the following command to update a configuration:
 
-```
-SET datafusion.optimizer.<configuration_name>
+```sql
+SET datafusion.optimizer.<configuration_name>;
 ```
 
 Example