diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs index f1d93644bf693..b4d13aff54828 100644 --- a/datafusion/physical-expr/benches/in_list.rs +++ b/datafusion/physical-expr/benches/in_list.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, ArrayRef, Float32Array, Int32Array, StringArray}; +use arrow::array::{ + Array, ArrayRef, Float32Array, Int32Array, StringArray, StringViewArray, +}; use arrow::datatypes::{Field, Schema}; use arrow::record_batch::RecordBatch; use criterion::{Criterion, criterion_group, criterion_main}; @@ -23,9 +25,11 @@ use datafusion_common::ScalarValue; use datafusion_physical_expr::expressions::{col, in_list, lit}; use rand::distr::Alphanumeric; use rand::prelude::*; +use std::any::TypeId; use std::hint::black_box; use std::sync::Arc; +/// Measures how long `in_list(col("a"), exprs)` takes to evaluate against a single RecordBatch. fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValue]) { let schema = Schema::new(vec![Field::new("a", values.data_type().clone(), true)]); let exprs = exprs.iter().map(|s| lit(s.clone())).collect(); @@ -37,79 +41,128 @@ fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValu }); } +/// Generates a random alphanumeric string of the specified length. fn random_string(rng: &mut StdRng, len: usize) -> String { let value = rng.sample_iter(&Alphanumeric).take(len).collect(); String::from_utf8(value).unwrap() } -fn do_benches( - c: &mut Criterion, - array_length: usize, - in_list_length: usize, - null_percent: f64, -) { - let mut rng = StdRng::seed_from_u64(120320); - let non_null_percent = 1.0 - null_percent; - - for string_length in [5, 10, 20] { - let values: StringArray = (0..array_length) - .map(|_| { - rng.random_bool(non_null_percent) - .then(|| random_string(&mut rng, string_length)) - }) - .collect(); - - let in_list: Vec<_> = (0..in_list_length) - .map(|_| ScalarValue::from(random_string(&mut rng, string_length))) - .collect(); - - do_bench( - c, - &format!( - "in_list_utf8({string_length}) ({array_length}, {null_percent}) IN ({in_list_length}, 0)" - ), - Arc::new(values), - &in_list, - ) +const IN_LIST_LENGTHS: [usize; 3] = [3, 8, 100]; +const NULL_PERCENTS: [f64; 2] = [0., 0.2]; +const STRING_LENGTHS: [usize; 3] = [3, 12, 100]; +const ARRAY_LENGTH: usize = 1024; + +/// Returns a friendly type name for the array type. +fn array_type_name() -> &'static str { + let id = TypeId::of::(); + if id == TypeId::of::() { + "Utf8" + } else if id == TypeId::of::() { + "Utf8View" + } else if id == TypeId::of::() { + "Float32" + } else if id == TypeId::of::() { + "Int32" + } else { + "Unknown" } +} - let values: Float32Array = (0..array_length) - .map(|_| rng.random_bool(non_null_percent).then(|| rng.random())) - .collect(); +/// Builds a benchmark name from array type, list size, and null percentage. +fn bench_name(in_list_length: usize, null_percent: f64) -> String { + format!( + "in_list/{}/list={in_list_length}/nulls={}%", + array_type_name::(), + (null_percent * 100.0) as u32 + ) +} - let in_list: Vec<_> = (0..in_list_length) - .map(|_| ScalarValue::Float32(Some(rng.random()))) - .collect(); +/// Runs in_list benchmarks for a string array type across all list-size × null-ratio × string-length combinations. +fn bench_string_type( + c: &mut Criterion, + rng: &mut StdRng, + make_scalar: fn(String) -> ScalarValue, +) where + A: Array + FromIterator> + 'static, +{ + for in_list_length in IN_LIST_LENGTHS { + for null_percent in NULL_PERCENTS { + for string_length in STRING_LENGTHS { + let values: A = (0..ARRAY_LENGTH) + .map(|_| { + rng.random_bool(1.0 - null_percent) + .then(|| random_string(rng, string_length)) + }) + .collect(); + + let in_list: Vec<_> = (0..in_list_length) + .map(|_| make_scalar(random_string(rng, string_length))) + .collect(); + + do_bench( + c, + &format!( + "{}/str={string_length}", + bench_name::(in_list_length, null_percent) + ), + Arc::new(values), + &in_list, + ) + } + } + } +} - do_bench( - c, - &format!("in_list_f32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"), - Arc::new(values), - &in_list, - ); +/// Runs in_list benchmarks for a numeric array type across all list-size × null-ratio combinations. +fn bench_numeric_type( + c: &mut Criterion, + rng: &mut StdRng, + mut gen_value: impl FnMut(&mut StdRng) -> T, + make_scalar: fn(T) -> ScalarValue, +) where + A: Array + FromIterator> + 'static, +{ + for in_list_length in IN_LIST_LENGTHS { + for null_percent in NULL_PERCENTS { + let values: A = (0..ARRAY_LENGTH) + .map(|_| rng.random_bool(1.0 - null_percent).then(|| gen_value(rng))) + .collect(); + + let in_list: Vec<_> = (0..in_list_length) + .map(|_| make_scalar(gen_value(rng))) + .collect(); + + do_bench( + c, + &bench_name::(in_list_length, null_percent), + Arc::new(values), + &in_list, + ); + } + } +} - let values: Int32Array = (0..array_length) - .map(|_| rng.random_bool(non_null_percent).then(|| rng.random())) - .collect(); +/// Entry point: registers in_list benchmarks for Utf8, Utf8View, Float32, and Int32 arrays. +fn criterion_benchmark(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(120320); - let in_list: Vec<_> = (0..in_list_length) - .map(|_| ScalarValue::Int32(Some(rng.random()))) - .collect(); + // Benchmarks for string array types (Utf8, Utf8View) + bench_string_type::(c, &mut rng, |s| ScalarValue::Utf8(Some(s))); + bench_string_type::(c, &mut rng, |s| ScalarValue::Utf8View(Some(s))); - do_bench( + // Benchmarks for numeric types + bench_numeric_type::( c, - &format!("in_list_i32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"), - Arc::new(values), - &in_list, - ) -} - -fn criterion_benchmark(c: &mut Criterion) { - for in_list_length in [1, 3, 10, 100] { - for null_percent in [0., 0.2] { - do_benches(c, 1024, in_list_length, null_percent) - } - } + &mut rng, + |rng| rng.random(), + |v| ScalarValue::Float32(Some(v)), + ); + bench_numeric_type::( + c, + &mut rng, + |rng| rng.random(), + |v| ScalarValue::Int32(Some(v)), + ); } criterion_group!(benches, criterion_benchmark);