Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
cd6433c
Use BatchCoaleser in sort merge join instead of calling coalesce_batc…
mbutrovich Nov 21, 2025
d29fd29
Merge branch 'main' into smj
mbutrovich Nov 21, 2025
c1b58b9
Merge branch 'main' into smj
mbutrovich Nov 25, 2025
a655212
stash
mbutrovich Nov 25, 2025
4ed5cd4
Stash with assertions.
mbutrovich Nov 25, 2025
4364656
Stash with assertions.
mbutrovich Nov 25, 2025
7a41fe6
encapsulate
mbutrovich Nov 25, 2025
b986fd7
encapsulate
mbutrovich Nov 25, 2025
387c882
encapsulate
mbutrovich Dec 1, 2025
efa2996
pre-refactor
mbutrovich Dec 1, 2025
a5c926f
get rid of confusing output_size
mbutrovich Dec 1, 2025
f725308
refactor
mbutrovich Dec 1, 2025
4cc21e8
refactor
mbutrovich Dec 1, 2025
f6430db
fix double concat for filtered joins
mbutrovich Dec 1, 2025
32021cb
more elided concats
mbutrovich Dec 2, 2025
2e0f211
remove dead code
mbutrovich Dec 2, 2025
37bb875
passes
mbutrovich Dec 2, 2025
2ac80f6
Merge branch 'main' into smj5
mbutrovich Dec 2, 2025
8c69056
comments
mbutrovich Dec 2, 2025
67877e6
clippy, comments
mbutrovich Dec 2, 2025
e7b94e5
Remove unused import
mbutrovich Dec 2, 2025
7c55ad9
optimize concat_batches call
mbutrovich Dec 2, 2025
ad583d2
Merge branch 'main' into smj
mbutrovich Dec 2, 2025
43a945f
fix metrics collection filtered joins
mbutrovich Dec 2, 2025
6a4e664
pass through batches that are batch_size / 2 similar to LimitedBatchC…
mbutrovich Dec 3, 2025
36a73e5
Merge branch 'main' into smj
mbutrovich Dec 3, 2025
1000afa
Merge branch 'main' into smj
mbutrovich Dec 4, 2025
66ea027
Address PR feedback.
mbutrovich Dec 4, 2025
eb5637e
Merge branch 'main' into smj
mbutrovich Dec 4, 2025
86cbc5c
Remove stray import.
mbutrovich Dec 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ Different queries are included to test nested loop joins under various workloads

## Hash Join

This benchmark focuses on the performance of queries with nested hash joins, minimizing other overheads such as scanning data sources or evaluating predicates.
This benchmark focuses on the performance of queries with hash joins, minimizing other overheads such as scanning data sources or evaluating predicates.

Several queries are included to test hash joins under various workloads.

Expand All @@ -757,6 +757,19 @@ Several queries are included to test hash joins under various workloads.
./bench.sh run hj
```

## Sort Merge Join

This benchmark focuses on the performance of queries with sort merge joins, minimizing other overheads such as scanning data sources or evaluating predicates.

Several queries are included to test sort merge joins under various workloads.

### Example Run

```bash
# No need to generate data: this benchmark uses table function `range()` as the data source

./bench.sh run smj
```

## Cancellation

Test performance of cancelling queries.
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ imdb: Join Order Benchmark (JOB) using the IMDB dataset conver
cancellation: How long cancelling a query takes
nlj: Benchmark for simple nested loop joins, testing various join scenarios
hj: Benchmark for simple hash joins, testing various join scenarios
smj: Benchmark for simple sort merge joins, testing various join scenarios
compile_profile: Compile and execute TPC-H across selected Cargo profiles, reporting timing and binary size


Expand Down Expand Up @@ -315,6 +316,10 @@ main() {
# hj uses range() function, no data generation needed
echo "HJ benchmark does not require data generation"
;;
smj)
# smj uses range() function, no data generation needed
echo "SMJ benchmark does not require data generation"
;;
compile_profile)
data_tpch "1" "parquet"
;;
Expand Down Expand Up @@ -388,6 +393,7 @@ main() {
run_external_aggr
run_nlj
run_hj
run_smj
;;
tpch)
run_tpch "1" "parquet"
Expand Down Expand Up @@ -498,6 +504,9 @@ main() {
hj)
run_hj
;;
smj)
run_smj
;;
compile_profile)
run_compile_profile "${PROFILE_ARGS[@]}"
;;
Expand Down Expand Up @@ -1166,6 +1175,14 @@ run_hj() {
debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the smj (sort merge join) benchmark through the dfbench binary.
# Sets RESULTS_FILE to "${RESULTS_DIR}/smj.json", prints it, and hands it to
# dfbench via -o together with a fixed --iterations 5.
run_smj() {
RESULTS_FILE="${RESULTS_DIR}/smj.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running smj benchmark..."
# NOTE(review): ${QUERY_ARG} is left unquoted, presumably so an empty value adds no argument -- confirm where it is set.
debug_run $CARGO_COMMAND --bin dfbench -- smj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG}
}


compare_benchmarks() {
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/bin/dfbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

use datafusion_benchmarks::{
cancellation, clickbench, h2o, hj, imdb, nlj, sort_tpch, tpch,
cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_tpch, tpch,
};

#[derive(Debug, StructOpt)]
Expand All @@ -46,6 +46,7 @@ enum Options {
HJ(hj::RunOpt),
Imdb(imdb::RunOpt),
Nlj(nlj::RunOpt),
Smj(smj::RunOpt),
SortTpch(sort_tpch::RunOpt),
Tpch(tpch::RunOpt),
}
Expand All @@ -62,6 +63,7 @@ pub async fn main() -> Result<()> {
Options::HJ(opt) => opt.run().await,
Options::Imdb(opt) => Box::pin(opt.run()).await,
Options::Nlj(opt) => opt.run().await,
Options::Smj(opt) => opt.run().await,
Options::SortTpch(opt) => opt.run().await,
Options::Tpch(opt) => Box::pin(opt.run()).await,
}
Expand Down
1 change: 1 addition & 0 deletions benchmarks/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pub mod h2o;
pub mod hj;
pub mod imdb;
pub mod nlj;
pub mod smj;
pub mod sort_tpch;
pub mod tpch;
pub mod util;
Loading