Skip to content

Commit 03d1974

Browse files
logan-keede and alamb authored
Separating Benchmarks for physical sorted union over large columns in SQL planner based on Datatype (apache#18599)
## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

- As discussed in apache#17261

## Rationale for this change

The logical plans for the Int64 and UInt64 data types differ: in the UInt64 logical plan the Unions are wrapped in Projections, so the EliminateNestedUnion optimizer rule is not applied, leading to significantly longer execution time.

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

## What changes are included in this PR?

Separating the benchmarks based on data type, and converting a datatype-specific function into a generic one.

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

## Are these changes tested?

Yes.

<!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

## Are there any user-facing changes?

No, benchmarks only.

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent a8f0d59 commit 03d1974

File tree

1 file changed

+55
-13
lines changed

1 file changed

+55
-13
lines changed

datafusion/core/benches/sql_planner.rs

Lines changed: 55 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,17 @@ extern crate datafusion;
2323
mod data_utils;
2424

2525
use crate::criterion::Criterion;
26+
use arrow::array::PrimitiveArray;
2627
use arrow::array::{ArrayRef, RecordBatch};
28+
use arrow::datatypes::ArrowNativeTypeOp;
29+
use arrow::datatypes::ArrowPrimitiveType;
2730
use arrow::datatypes::{DataType, Field, Fields, Schema};
2831
use criterion::Bencher;
2932
use datafusion::datasource::MemTable;
3033
use datafusion::execution::context::SessionContext;
3134
use datafusion_common::{config::Dialect, ScalarValue};
3235
use datafusion_expr::col;
36+
use rand_distr::num_traits::NumCast;
3337
use std::hint::black_box;
3438
use std::path::PathBuf;
3539
use std::sync::Arc;
@@ -155,18 +159,30 @@ fn benchmark_with_param_values_many_columns(
155159
/// 0,100...9900
156160
/// 0,200...19800
157161
/// 0,300...29700
158-
fn register_union_order_table(ctx: &SessionContext, num_columns: usize, num_rows: usize) {
159-
// ("c0", [0, 0, ...])
160-
// ("c1": [100, 200, ...])
161-
// etc
162-
let iter = (0..num_columns).map(|i| i as u64).map(|i| {
163-
let array: ArrayRef = Arc::new(arrow::array::UInt64Array::from_iter_values(
164-
(0..num_rows)
165-
.map(|j| j as u64 * 100 + i)
166-
.collect::<Vec<_>>(),
167-
));
162+
fn register_union_order_table_generic<T>(
163+
ctx: &SessionContext,
164+
num_columns: usize,
165+
num_rows: usize,
166+
) where
167+
T: ArrowPrimitiveType,
168+
T::Native: ArrowNativeTypeOp + NumCast,
169+
{
170+
let iter = (0..num_columns).map(|i| {
171+
let array_data: Vec<T::Native> = (0..num_rows)
172+
.map(|j| {
173+
let value = (j as u64) * 100 + (i as u64);
174+
<T::Native as NumCast>::from(value).unwrap_or_else(|| {
175+
panic!("Failed to cast numeric value to Native type")
176+
})
177+
})
178+
.collect();
179+
180+
// Use PrimitiveArray which is generic over the ArrowPrimitiveType T
181+
let array: ArrayRef = Arc::new(PrimitiveArray::<T>::from_iter_values(array_data));
182+
168183
(format!("c{i}"), array)
169184
});
185+
170186
let batch = RecordBatch::try_from_iter(iter).unwrap();
171187
let schema = batch.schema();
172188
let partitions = vec![vec![batch]];
@@ -183,7 +199,6 @@ fn register_union_order_table(ctx: &SessionContext, num_columns: usize, num_rows
183199

184200
ctx.register_table("t", Arc::new(table)).unwrap();
185201
}
186-
187202
/// return a query like
188203
/// ```sql
189204
/// select c1, 2 as c2, ... n as cn from t ORDER BY c1
@@ -403,13 +418,40 @@ fn criterion_benchmark(c: &mut Criterion) {
403418

404419
// -- Sorted Queries --
405420
// Column counts of 100, 200 and 300 take too long - https://github.com/apache/datafusion/issues/18366
421+
// The logical plans for Int64 and UInt64 differ: in the UInt64 plan the Unions are wrapped
422+
// up in Projection, and the EliminateNestedUnion optimizer rule is not applied, leading to significantly
423+
// longer execution time.
424+
// https://github.com/apache/datafusion/issues/17261
425+
406426
for column_count in [10, 50 /* 100, 200, 300 */] {
407-
register_union_order_table(&ctx, column_count, 1000);
427+
register_union_order_table_generic::<arrow::datatypes::Int64Type>(
428+
&ctx,
429+
column_count,
430+
1000,
431+
);
408432

409433
// this query has many expressions in its sort order so stresses
410434
// order equivalence validation
411435
c.bench_function(
412-
&format!("physical_sorted_union_order_by_{column_count}"),
436+
&format!("physical_sorted_union_order_by_{column_count}_int64"),
437+
|b| {
438+
// SELECT ... UNION ALL ...
439+
let query = union_orderby_query(column_count);
440+
b.iter(|| physical_plan(&ctx, &rt, &query))
441+
},
442+
);
443+
444+
let _ = ctx.deregister_table("t");
445+
}
446+
447+
for column_count in [10, 50 /* 100, 200, 300 */] {
448+
register_union_order_table_generic::<arrow::datatypes::UInt64Type>(
449+
&ctx,
450+
column_count,
451+
1000,
452+
);
453+
c.bench_function(
454+
&format!("physical_sorted_union_order_by_{column_count}_uint64"),
413455
|b| {
414456
// SELECT ... UNION ALL ...
415457
let query = union_orderby_query(column_count);

0 commit comments

Comments
 (0)