Skip to content

Commit

Permalink
Add lexsort benchmark (apache#2871)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Oct 26, 2022
1 parent bca8445 commit b1ffa13
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 5 deletions.
5 changes: 5 additions & 0 deletions arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -259,3 +259,8 @@ required-features = ["test_utils"]
name = "bitwise_kernel"
harness = false
required-features = ["test_utils"]

[[bench]]
name = "lexsort"
harness = false
required-features = ["test_utils"]
166 changes: 166 additions & 0 deletions arrow/benches/lexsort.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::compute::{lexsort_to_indices, SortColumn};
use arrow::row::{RowConverter, SortField};
use arrow::util::bench_util::{
create_dict_from_values, create_primitive_array, create_string_array_with_len,
};
use arrow_array::types::Int32Type;
use arrow_array::{Array, ArrayRef, UInt32Array};
use criterion::{criterion_group, criterion_main, Criterion};
use std::sync::Arc;

/// The kinds of column a benchmark case can be assembled from.
///
/// A case is a slice of these; each entry is materialised into an array by
/// `Column::generate`, and its `Debug` output is the short label that appears
/// in the benchmark name.
#[derive(Clone, Copy)]
enum Column {
    /// `i32` values, declared as required (non-nullable).
    RequiredI32,
    /// `i32` values, declared as optional (nullable).
    OptionalI32,
    /// 16-character strings, declared as required (non-nullable).
    Required16CharString,
    /// 16-character strings, declared as optional (nullable).
    Optional16CharString,
    /// 50-character strings, declared as optional (nullable).
    Optional50CharString,
    /// Optional dictionary-encoded strings backed by 100 values of 50 characters.
    Optional100Value50CharStringDict,
}

impl std::fmt::Debug for Column {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
Column::RequiredI32 => "i32",
Column::OptionalI32 => "i32_opt",
Column::Required16CharString => "str(16)",
Column::Optional16CharString => "str_opt(16)",
Column::Optional50CharString => "str_opt(50)",
Column::Optional100Value50CharStringDict => "dict(100,str_opt(50))",
};
f.write_str(s)
}
}

impl Column {
    /// Builds an array of `size` elements for this column kind.
    ///
    /// Required columns are generated with a null density of `0.`; optional
    /// primitive and string columns use `0.2`; the dictionary column uses
    /// `0.1` for its keys over a dictionary of 100 non-null 50-character
    /// strings.
    fn generate(self, size: usize) -> ArrayRef {
        match self {
            Column::RequiredI32 => {
                Arc::new(create_primitive_array::<Int32Type>(size, 0.))
            }
            Column::OptionalI32 => {
                Arc::new(create_primitive_array::<Int32Type>(size, 0.2))
            }
            Column::Required16CharString => {
                Arc::new(create_string_array_with_len::<i32>(size, 0., 16))
            }
            Column::Optional16CharString => {
                Arc::new(create_string_array_with_len::<i32>(size, 0.2, 16))
            }
            Column::Optional50CharString => {
                // Must contain nulls like the other optional columns: this
                // variant is labelled `str_opt(50)` in benchmark names, so a
                // density of `0.` would benchmark a different workload than
                // the one reported.
                Arc::new(create_string_array_with_len::<i32>(size, 0.2, 50))
            }
            Column::Optional100Value50CharStringDict => {
                Arc::new(create_dict_from_values::<Int32Type>(
                    size,
                    0.1,
                    &create_string_array_with_len::<i32>(100, 0., 50),
                ))
            }
        }
    }
}

fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) {
let arrays: Vec<_> = columns.iter().map(|x| x.generate(len)).collect();
let sort_columns: Vec<_> = arrays
.iter()
.cloned()
.map(|values| SortColumn {
values,
options: None,
})
.collect();

c.bench_function(&format!("lexsort_to_indices({:?}): {}", columns, len), |b| {
b.iter(|| criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap()))
});

c.bench_function(&format!("lexsort_rows({:?}): {}", columns, len), |b| {
b.iter(|| {
criterion::black_box({
let fields = arrays
.iter()
.map(|a| SortField::new(a.data_type().clone()))
.collect();
let mut converter = RowConverter::new(fields);
let rows = converter.convert_columns(&arrays).unwrap();
let mut sort: Vec<_> = rows.iter().enumerate().collect();
sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32))
})
})
});
}

/// Registers every lexsort benchmark case with Criterion.
///
/// Each case is a list of column kinds sorted together; every case is run at
/// 4096 and 32768 (`4096 * 8`) rows.
fn add_benchmark(c: &mut Criterion) {
    // Every case appears exactly once: a repeated entry would only re-run an
    // identical benchmark under the same name.
    let cases: &[&[Column]] = &[
        &[Column::RequiredI32, Column::OptionalI32],
        &[Column::RequiredI32, Column::Optional16CharString],
        &[Column::RequiredI32, Column::Required16CharString],
        &[Column::Optional16CharString, Column::Required16CharString],
        &[
            Column::Optional16CharString,
            Column::Optional50CharString,
            Column::Required16CharString,
        ],
        &[
            Column::Optional16CharString,
            Column::Required16CharString,
            Column::Optional16CharString,
            Column::Optional16CharString,
            Column::Optional16CharString,
        ],
        &[
            Column::OptionalI32,
            Column::Optional100Value50CharStringDict,
        ],
        &[
            Column::Optional100Value50CharStringDict,
            Column::Optional100Value50CharStringDict,
        ],
        &[
            Column::Optional100Value50CharStringDict,
            Column::Optional100Value50CharStringDict,
            Column::Optional100Value50CharStringDict,
            Column::Required16CharString,
        ],
        &[
            Column::Optional100Value50CharStringDict,
            Column::Optional100Value50CharStringDict,
            Column::Optional100Value50CharStringDict,
            Column::Optional50CharString,
        ],
    ];

    for &case in cases {
        for len in [4096, 4096 * 8] {
            do_bench(c, case, len);
        }
    }
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);
14 changes: 9 additions & 5 deletions arrow/src/compute/kernels/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,10 @@ pub struct SortColumn {
/// assert_eq!(as_primitive_array::<Int64Type>(&sorted_columns[0]).value(1), -64);
/// assert!(sorted_columns[0].is_null(0));
/// ```
///
/// Note: for multi-column sorts without a limit, using the [row format][crate::row]
/// may be significantly faster.
///
pub fn lexsort(columns: &[SortColumn], limit: Option<usize>) -> Result<Vec<ArrayRef>> {
let indices = lexsort_to_indices(columns, limit)?;
columns
Expand All @@ -907,6 +911,9 @@ pub fn lexsort(columns: &[SortColumn], limit: Option<usize>) -> Result<Vec<Array

/// Sort elements lexicographically from a list of `ArrayRef` into an unsigned integer
/// (`UInt32Array`) of indices.
///
/// Note: for multi-column sorts without a limit, using the [row format][crate::row]
/// may be significantly faster.
pub fn lexsort_to_indices(
columns: &[SortColumn],
limit: Option<usize>,
Expand Down Expand Up @@ -942,11 +949,8 @@ pub fn lexsort_to_indices(
lexicographical_comparator.compare(a, b)
});

Ok(UInt32Array::from(
(&value_indices)[0..len]
.iter()
.map(|i| *i as u32)
.collect::<Vec<u32>>(),
Ok(UInt32Array::from_iter_values(
value_indices.iter().map(|i| *i as u32),
))
}

Expand Down
18 changes: 18 additions & 0 deletions arrow/src/row/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,24 @@
//! assert_eq!(&c2_values, &["a", "f", "c", "e"]);
//! ```
//!
//! It can also be used to implement a fast lexicographic sort:
//!
//! ```
//! # use arrow::row::{RowConverter, SortField};
//! # use arrow_array::{ArrayRef, UInt32Array};
//! fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array {
//! let fields = arrays
//! .iter()
//! .map(|a| SortField::new(a.data_type().clone()))
//! .collect();
//! let mut converter = RowConverter::new(fields);
//! let rows = converter.convert_columns(&arrays).unwrap();
//! let mut sort: Vec<_> = rows.iter().enumerate().collect();
//! sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
//! UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32))
//! }
//! ```
//!
//! [non-comparison sorts]: https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts
//! [radix sort]: https://en.wikipedia.org/wiki/Radix_sort
//! [normalized for sorting]: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf
Expand Down
36 changes: 36 additions & 0 deletions arrow/src/util/bench_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
use crate::array::*;
use crate::datatypes::*;
use crate::util::test_util::seedable_rng;
use arrow_buffer::Buffer;
use rand::distributions::uniform::SampleUniform;
use rand::Rng;
use rand::SeedableRng;
use rand::{
Expand Down Expand Up @@ -187,3 +189,37 @@ pub fn create_fsb_array(
}))
.unwrap()
}

/// Creates a random [`DictionaryArray`] with `size` keys that index into `values`.
///
/// Keys are drawn from the half-open range `0..values.len()`. When
/// `null_density` is non-zero, each slot is null with probability
/// `null_density`; when it is exactly `0.`, no validity bitmap is attached.
///
/// # Panics
///
/// Panics if `values.len()` is not representable in the key type, if
/// `values` is empty while `size > 0` (the key range is empty), if
/// `null_density` is outside `[0, 1]`, or if the assembled array data fails
/// validation.
pub fn create_dict_from_values<K>(
    size: usize,
    null_density: f32,
    values: &dyn Array,
) -> DictionaryArray<K>
where
    K: ArrowDictionaryKeyType,
    Standard: Distribution<K::Native>,
    K::Native: SampleUniform,
{
    let mut rng = seedable_rng();
    let data_type = DataType::Dictionary(
        Box::new(K::DATA_TYPE),
        Box::new(values.data_type().clone()),
    );

    let min_key = K::Native::from_usize(0).unwrap();
    let max_key = K::Native::from_usize(values.len()).unwrap();
    let keys: Buffer = (0..size).map(|_| rng.gen_range(min_key..max_key)).collect();

    // The buffer handed to `null_bit_buffer` is a validity bitmap: a set bit
    // marks a valid slot. A slot must therefore be valid with probability
    // `1 - null_density` for `null_density` to be the fraction of nulls.
    let valid_probability = 1.0 - f64::from(null_density);
    let nulls: Option<Buffer> = (null_density != 0.)
        .then(|| (0..size).map(|_| rng.gen_bool(valid_probability)).collect());

    let data = ArrayDataBuilder::new(data_type)
        .len(size)
        .null_bit_buffer(nulls)
        .add_buffer(keys)
        .add_child_data(values.data().clone())
        .build()
        .unwrap();

    DictionaryArray::from(data)
}

0 comments on commit b1ffa13

Please sign in to comment.