Re-use TimSort analysis for small slices
Add a new loop-based mini merge sort for small sizes. This extends
allocation-free sorting for random inputs of types that qualify up to
length 32.
Voultapher committed Nov 4, 2022
1 parent 05a6b9c commit bad7e05
Showing 1 changed file with 81 additions and 99 deletions.
180 changes: 81 additions & 99 deletions library/alloc/src/slice.rs
@@ -822,90 +822,6 @@ where
merge_sort(v, &mut is_less);
}

// Sort a small number of elements as fast as possible, without allocations.
#[cfg(not(no_global_oom_handling))]
fn stable_sort_small<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();

// This implementation is really not fit for anything beyond this size, and such a call is
// probably a bug.
debug_assert!(len <= 40);

if len < 2 {
return;
}

// It's not clear that using custom code for specific sizes is worth it here.
// So we go with the simpler code.
let offset = if len <= 6 || !qualifies_for_branchless_sort::<T>() {
1
} else {
// Once a certain threshold is reached, it becomes worth it to analyze the input and do
// branchless swapping for the first 5 elements.

// SAFETY: We just checked that len > 6, so elements 0..=4 are in bounds.
unsafe {
let arr_ptr = v.as_mut_ptr();

let should_swap_0_1 = is_less(&*arr_ptr.add(1), &*arr_ptr.add(0));
let should_swap_1_2 = is_less(&*arr_ptr.add(2), &*arr_ptr.add(1));
let should_swap_2_3 = is_less(&*arr_ptr.add(3), &*arr_ptr.add(2));
let should_swap_3_4 = is_less(&*arr_ptr.add(4), &*arr_ptr.add(3));

let swap_count = should_swap_0_1 as usize
+ should_swap_1_2 as usize
+ should_swap_2_3 as usize
+ should_swap_3_4 as usize;

if swap_count == 0 {
// Potentially already sorted. No need to swap, we know the first 5 elements are
// already in the right order.
5
} else if swap_count == 4 {
// Potentially reversed.
let mut rev_i = 4;
while rev_i < (len - 1) {
if !is_less(&*arr_ptr.add(rev_i + 1), &*arr_ptr.add(rev_i)) {
break;
}
rev_i += 1;
}
rev_i += 1;
v[..rev_i].reverse();
insertion_sort_shift_left(v, rev_i, is_less);
return;
} else {
// Potentially random pattern.
branchless_swap(arr_ptr.add(0), arr_ptr.add(1), should_swap_0_1);
branchless_swap(arr_ptr.add(2), arr_ptr.add(3), should_swap_2_3);

if len >= 12 {
// This aims to strike a balance between generating more code, which is bad
// for cold loops, and improving hot code, while not increasing the mean
// comparison count too much.
sort8_stable(&mut v[4..12], is_less);
insertion_sort_shift_left(&mut v[4..], 8, is_less);
insertion_sort_shift_right(v, 4, is_less);
return;
} else {
// Complete the sort network for the first 4 elements.
swap_next_if_less(arr_ptr.add(1), is_less);
swap_next_if_less(arr_ptr.add(2), is_less);
swap_next_if_less(arr_ptr.add(0), is_less);
swap_next_if_less(arr_ptr.add(1), is_less);

4
}
}
}
};

insertion_sort_shift_left(v, offset, is_less);
}
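// For context — the helpers used above are referenced but lie outside this diff, and
// `qualifies_for_branchless_sort` is assumed to gate this path to types that are cheap
// to copy and compare. The following is a minimal illustrative sketch (assumed names
// and bodies, not the actual library code) of how a branchless conditional swap can
// work: perform the same three copies unconditionally and let `should_swap` only
// select the source pointers, so the compiler can emit a conditional move instead of
// a branch.
unsafe fn branchless_swap_sketch<T>(a_ptr: *mut T, b_ptr: *mut T, should_swap: bool) {
    // SAFETY (assumed): `a_ptr` and `b_ptr` are valid, aligned and distinct.
    let mut tmp = core::mem::MaybeUninit::<T>::uninit();
    // Select sources without branching; the copies themselves are unconditional.
    let a_src = if should_swap { b_ptr } else { a_ptr };
    let b_src = if should_swap { a_ptr } else { b_ptr };
    core::ptr::copy_nonoverlapping(b_src, tmp.as_mut_ptr(), 1);
    core::ptr::copy(a_src, a_ptr, 1);
    core::ptr::copy_nonoverlapping(tmp.as_ptr(), b_ptr, 1);
}

// `swap_next_if_less` can then be phrased as one comparison plus one such swap:
unsafe fn swap_next_if_less_sketch<T, F>(a_ptr: *mut T, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY (assumed): `a_ptr` and `a_ptr.add(1)` point into the same slice.
    let should_swap = is_less(&*a_ptr.add(1), &*a_ptr);
    branchless_swap_sketch(a_ptr, a_ptr.add(1), should_swap);
}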

#[cfg(not(no_global_oom_handling))]
fn merge_sort<T, F>(v: &mut [T], is_less: &mut F)
where
@@ -918,12 +834,7 @@ where

let len = v.len();

// Slices of up to this length get sorted using insertion sort.
const MAX_NO_ALLOC_SIZE: usize = 20;

// Short arrays get sorted in-place via insertion sort to avoid allocations.
if len <= MAX_NO_ALLOC_SIZE {
stable_sort_small(v, is_less);
if len < 2 {
return;
}

@@ -963,6 +874,11 @@ where
// return without allocating.
return;
} else if buf_ptr.is_null() {
// Short arrays get sorted in-place via insertion sort to avoid allocations.
if sort_small_stable(v, start, is_less) {
return;
}

// Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
// shallow copies of the contents of `v` without risking the dtors running on copies if
// `is_less` panics. When merging two sorted runs, this buffer holds a copy of the
@@ -1016,11 +932,7 @@ where
|| (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
|| (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
{
if n >= 3 && runs[n - 3].len < runs[n - 1].len {
Some(n - 3)
} else {
Some(n - 2)
}
if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
} else {
None
}
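// For intuition — a standalone restatement (illustrative, not library code) of the
// collapse rule above, over run lengths only. It omits the "last run reaches the end
// of the slice" trigger that sits just above this excerpt. `Some(i)` means "merge
// runs `i` and `i + 1`"; picking the side closer in length keeps merges balanced.
fn collapse_demo(runs: &[usize]) -> Option<usize> {
    let n = runs.len();
    if n >= 2
        && (runs[n - 2] <= runs[n - 1]
            || (n >= 3 && runs[n - 3] <= runs[n - 2] + runs[n - 1])
            || (n >= 4 && runs[n - 4] <= runs[n - 3] + runs[n - 2]))
    {
        if n >= 3 && runs[n - 3] < runs[n - 1] { Some(n - 3) } else { Some(n - 2) }
    } else {
        None
    }
}

// E.g. with run lengths [5, 3, 4]: 3 <= 4 triggers a collapse, and since 5 is not
// smaller than 4 the two topmost runs merge first, giving [5, 7]; then 5 <= 7
// collapses again, leaving [12].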
@@ -1033,6 +945,67 @@
}
}

/// Check whether `v` qualifies for the small-slice sort optimization and, if so, sort it
/// without allocating. Returns `true` if `v` was sorted. `v[start..]` is assumed to be
/// already sorted.
#[cfg(not(no_global_oom_handling))]
fn sort_small_stable<T, F>(v: &mut [T], start: usize, is_less: &mut F) -> bool
where
F: FnMut(&T, &T) -> bool,
{
let len = v.len();

if qualifies_for_branchless_sort::<T>() {
// Testing showed that even though this incurs more comparisons, up to size 32 (4 * 8),
// avoiding the allocation and sticking with simple code is worth it. Going further, e.g.
// to 40, is still worth it for u64 or even types with more expensive comparisons, but
// risks incurring too many more comparisons than the regular TimSort would.
const MAX_NO_ALLOC_SIZE: usize = 32;
if len <= MAX_NO_ALLOC_SIZE {
if len < 8 {
insertion_sort_shift_right(v, start, is_less);
return true;
}

let mut merge_count = 0;
for chunk in v.chunks_exact_mut(8) {
// SAFETY: chunks_exact_mut promised to give us slices of len 8.
unsafe {
sort8_stable(chunk, is_less);
}
merge_count += 1;
}

let mut swap = mem::MaybeUninit::<[T; 8]>::uninit();
let swap_ptr = swap.as_mut_ptr() as *mut T;

let mut i = 8;
while merge_count > 1 {
// SAFETY: Both sides are non-empty: the left side `v[..i]` has length `i >= 8`
// and the right side `v[i..i + 8]` has length exactly 8. The shorter side is
// therefore always exactly 8 elements long, the size of `swap`.
unsafe {
merge(&mut v[0..(i + 8)], i, swap_ptr, is_less);
}
i += 8;
merge_count -= 1;
}

insertion_sort_shift_left(v, i, is_less);

return true;
}
} else {
const MAX_NO_ALLOC_SIZE: usize = 20;
if len <= MAX_NO_ALLOC_SIZE {
insertion_sort_shift_right(v, start, is_less);
return true;
}
}

false
}
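// For illustration — a safe, self-contained sketch (assumed names; not the library
// code) of the same scheme: sort blocks of 8, then merge them left to right so that
// `v[..i]` stays sorted. Unlike `sort_small_stable` above, which reuses an 8-element
// stack buffer and insertion-sorts the tail, this sketch allocates a `Vec` per merge
// and merges the tail block too, trading efficiency for clarity.
fn mini_merge_sort_sketch<T: Ord + Copy>(v: &mut [T]) {
    const BLOCK: usize = 8;
    for chunk in v.chunks_mut(BLOCK) {
        chunk.sort(); // stable stand-in for `sort8_stable`
    }
    let mut i = BLOCK;
    while i < v.len() {
        let end = (i + BLOCK).min(v.len());
        // Two-pointer merge of the sorted ranges v[..i] and v[i..end].
        let mut merged = Vec::with_capacity(end);
        let (mut a, mut b) = (0, i);
        while a < i && b < end {
            // `<=` keeps the left element on ties, preserving stability.
            if v[a] <= v[b] {
                merged.push(v[a]);
                a += 1;
            } else {
                merged.push(v[b]);
                b += 1;
            }
        }
        merged.extend_from_slice(&v[a..i]);
        merged.extend_from_slice(&v[b..end]);
        v[..end].copy_from_slice(&merged);
        i = end;
    }
}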

/// Takes a range denoted by `start` and `end` that is already sorted, and extends it if
/// necessary with sorts optimized for smaller ranges, such as insertion sort.
#[cfg(not(no_global_oom_handling))]
@@ -1042,8 +1015,7 @@ where
{
debug_assert!(end > start);

// Testing showed that using MAX_INSERTION here yields the best performance for many types, but
// incurs more total comparisons. A balance between least comparisons and best performance, as
// This value is a balance between least comparisons and best performance, as
// influenced by, for example, cache locality.
const MIN_INSERTION_RUN: usize = 10;

@@ -1115,6 +1087,7 @@ impl<T> Drop for InsertionHole<T> {

/// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]`
/// becomes sorted.
#[cfg(not(no_global_oom_handling))]
unsafe fn insert_tail<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
@@ -1167,11 +1140,12 @@ where
}
}
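// A safe sketch of the same tail insertion (assumed name; the real code above works
// with raw pointers and a panic-safety guard, this illustrative version simply swaps):
fn insert_tail_sketch<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    debug_assert!(!v.is_empty());
    // Bubble the last element left until the prefix ordering is restored.
    let mut i = v.len() - 1;
    while i > 0 && is_less(&v[i], &v[i - 1]) {
        v.swap(i - 1, i);
        i -= 1;
    }
}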

/// Sort v assuming v[..offset] is already sorted.
/// Sort `v` assuming `v[..offset]` is already sorted.
///
/// Never inline this function to avoid code bloat. It still optimizes nicely, has practically no
/// performance impact, and even improves performance in some cases.
#[inline(never)]
#[cfg(not(no_global_oom_handling))]
fn insertion_sort_shift_left<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
@@ -1195,11 +1169,12 @@ where
}
}
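// The same function as a safe sketch, phrased in terms of `insert_tail_sketch` from
// above (assumed names, for illustration): grow the sorted prefix one element at a time.
fn insertion_sort_shift_left_sketch<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // A prefix of at least one element is trivially sorted.
    for i in offset.max(1)..v.len() {
        // v[..i] is sorted; insert v[i] at its position within it.
        insert_tail_sketch(&mut v[..i + 1], is_less);
    }
}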

/// Sort v assuming v[offset..] is already sorted.
/// Sort `v` assuming `v[offset..]` is already sorted.
///
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
/// performance impact. Even improving performance in some cases.
#[inline(never)]
#[cfg(not(no_global_oom_handling))]
fn insertion_sort_shift_right<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
@@ -1227,6 +1202,7 @@ where
/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
///
/// This is the integral subroutine of insertion sort.
#[cfg(not(no_global_oom_handling))]
unsafe fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
@@ -1287,6 +1263,10 @@ where
///
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
///
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
/// performance impact.
#[inline(never)]
#[cfg(not(no_global_oom_handling))]
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
where
@@ -1506,6 +1486,7 @@ where
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
/// performance impact.
#[inline(never)]
#[cfg(not(no_global_oom_handling))]
unsafe fn sort8_stable<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
Expand Down Expand Up @@ -1559,6 +1540,7 @@ where
}
}

#[cfg(not(no_global_oom_handling))]
unsafe fn sort24_stable<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
