-
Notifications
You must be signed in to change notification settings - Fork 1k
Perf: Make sort string view fast(1.5X ~ 3X faster) #7792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d724386
e1ccab4
5000e8a
463ff19
d6814a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,7 +24,7 @@ use arrow_array::types::*; | |
use arrow_array::*; | ||
use arrow_buffer::ArrowNativeType; | ||
use arrow_buffer::BooleanBufferBuilder; | ||
use arrow_data::ArrayDataBuilder; | ||
use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; | ||
use arrow_schema::{ArrowError, DataType}; | ||
use arrow_select::take::take; | ||
use std::cmp::Ordering; | ||
|
@@ -310,11 +310,72 @@ fn sort_byte_view<T: ByteViewType>( | |
options: SortOptions, | ||
limit: Option<usize>, | ||
) -> UInt32Array { | ||
let mut valids = value_indices | ||
// 1. Build a list of (index, raw_view, length) | ||
let mut valids: Vec<_> = value_indices | ||
.into_iter() | ||
.map(|index| (index, values.value(index as usize).as_ref())) | ||
.collect::<Vec<(u32, &[u8])>>(); | ||
sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() | ||
.map(|idx| { | ||
// SAFETY: we know idx < values.len() | ||
let raw = unsafe { *values.views().get_unchecked(idx as usize) }; | ||
let len = raw as u32; // lower 32 bits encode length | ||
(idx, raw, len) | ||
}) | ||
.collect(); | ||
|
||
// 2. Compute the number of non-null entries to partially sort | ||
let vlimit = match (limit, options.nulls_first) { | ||
(Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), | ||
_ => valids.len(), | ||
}; | ||
|
||
// 3. Mixed comparator: first prefix, then inline vs full comparison | ||
let cmp_mixed = |a: &(u32, u128, u32), b: &(u32, u128, u32)| { | ||
let (_, raw_a, len_a) = *a; | ||
let (_, raw_b, len_b) = *b; | ||
|
||
// 3.1 Both inline (≤12 bytes): compare full 128-bit key including length | ||
if len_a <= MAX_INLINE_VIEW_LEN && len_b <= MAX_INLINE_VIEW_LEN { | ||
return GenericByteViewArray::<T>::inline_key_fast(raw_a) | ||
.cmp(&GenericByteViewArray::<T>::inline_key_fast(raw_b)); | ||
} | ||
|
||
// 3.2 Compare 4-byte prefix in big-endian order | ||
let pref_a = ByteView::from(raw_a).prefix.swap_bytes(); | ||
let pref_b = ByteView::from(raw_b).prefix.swap_bytes(); | ||
if pref_a != pref_b { | ||
return pref_a.cmp(&pref_b); | ||
} | ||
|
||
// 3.3 Fallback to full byte-slice comparison | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isn't this only valid if both views are not inlined (len > 12)? I am not sure it is ok to lexographically compare an inlined view and a non inline view 🤔 Won't that potentially compare the buffer offset to part of the strings? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Update: I double checked and Though it makes me wonder if we could squeeze more out of this function by handling the three cases explicitly (short, long), (long, short) and (long, long) However, that might have a minimal payback |
||
let full_a: &[u8] = unsafe { values.value_unchecked(a.0 as usize).as_ref() }; | ||
let full_b: &[u8] = unsafe { values.value_unchecked(b.0 as usize).as_ref() }; | ||
full_a.cmp(full_b) | ||
}; | ||
|
||
// 4. Partially sort according to ascending/descending | ||
if !options.descending { | ||
sort_unstable_by(&mut valids, vlimit, cmp_mixed); | ||
} else { | ||
sort_unstable_by(&mut valids, vlimit, |x, y| cmp_mixed(x, y).reverse()); | ||
} | ||
|
||
// 5. Assemble nulls and sorted indices into final output | ||
let total = valids.len() + nulls.len(); | ||
let out_limit = limit.unwrap_or(total).min(total); | ||
let mut out = Vec::with_capacity(total); | ||
|
||
if options.nulls_first { | ||
// Place null indices first | ||
out.extend_from_slice(&nulls[..nulls.len().min(out_limit)]); | ||
let rem = out_limit - out.len(); | ||
out.extend(valids.iter().map(|&(i, _, _)| i).take(rem)); | ||
} else { | ||
// Place non-null indices first | ||
out.extend(valids.iter().map(|&(i, _, _)| i).take(out_limit)); | ||
let rem = out_limit - out.len(); | ||
out.extend_from_slice(&nulls[..rem]); | ||
} | ||
|
||
out.into() | ||
} | ||
|
||
fn sort_fixed_size_binary( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think ByteView can only be used if the view len is greater than 12:
arrow-rs/arrow-data/src/byte_view.rs
Lines 29 to 30 in 10d9714
Isn't this potentially comparing a inline view and a non inline view?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you @alamb for review, this actually copy from another PR:
#7748
And i add the comments to it here, i also can change to the raw convert instead of using ByteView::from, but i think it's safe that we only using the prefix here, and inline/unlined all have the prefix.
https://github.com/apache/arrow-rs/pull/7748/files#diff-160ecd8082d5d28081f01cdb08a898cb8f49b17149c7118bf96746ddaae24b4fR560
May be we can make another PR ready to merge, then i can just copy the same code from there, thanks a lot!