From bad7e053082f8a5d1df6e5b4396ce741e8734800 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll
Date: Tue, 1 Nov 2022 13:42:19 +0100
Subject: [PATCH] Re-use TimSort analysis for small slices

Add a new loop-based mini merge sort for small sizes. This extends
allocation-free sorting for random inputs of qualifying types up to
length 32.
---
 library/alloc/src/slice.rs | 180 +++++++++++++++++--------------------
 1 file changed, 81 insertions(+), 99 deletions(-)

diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs
index 882559b33fd05..57a3a3e96f36b 100644
--- a/library/alloc/src/slice.rs
+++ b/library/alloc/src/slice.rs
@@ -822,90 +822,6 @@ where
     merge_sort(v, &mut is_less);
 }
 
-// Sort a small number of elements as fast as possible, without allocations.
-#[cfg(not(no_global_oom_handling))]
-fn stable_sort_small<T, F>(v: &mut [T], is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    let len = v.len();
-
-    // This implementation is really not fit for anything beyond that, and the call is probably a
-    // bug.
-    debug_assert!(len <= 40);
-
-    if len < 2 {
-        return;
-    }
-
-    // It's not clear that using custom code for specific sizes is worth it here.
-    // So we go with the simpler code.
-    let offset = if len <= 6 || !qualifies_for_branchless_sort::<T>() {
-        1
-    } else {
-        // Once a certain threshold is reached, it becomes worth it to analyze the input and do
-        // branchless swapping for the first 5 elements.
-
-        // SAFETY: We just checked that len >= 5
-        unsafe {
-            let arr_ptr = v.as_mut_ptr();
-
-            let should_swap_0_1 = is_less(&*arr_ptr.add(1), &*arr_ptr.add(0));
-            let should_swap_1_2 = is_less(&*arr_ptr.add(2), &*arr_ptr.add(1));
-            let should_swap_2_3 = is_less(&*arr_ptr.add(3), &*arr_ptr.add(2));
-            let should_swap_3_4 = is_less(&*arr_ptr.add(4), &*arr_ptr.add(3));
-
-            let swap_count = should_swap_0_1 as usize
-                + should_swap_1_2 as usize
-                + should_swap_2_3 as usize
-                + should_swap_3_4 as usize;
-
-            if swap_count == 0 {
-                // Potentially already sorted. No need to swap, we know the first 5 elements are
-                // already in the right order.
-                5
-            } else if swap_count == 4 {
-                // Potentially reversed.
-                let mut rev_i = 4;
-                while rev_i < (len - 1) {
-                    if !is_less(&*arr_ptr.add(rev_i + 1), &*arr_ptr.add(rev_i)) {
-                        break;
-                    }
-                    rev_i += 1;
-                }
-                rev_i += 1;
-                v[..rev_i].reverse();
-                insertion_sort_shift_left(v, rev_i, is_less);
-                return;
-            } else {
-                // Potentially random pattern.
-                branchless_swap(arr_ptr.add(0), arr_ptr.add(1), should_swap_0_1);
-                branchless_swap(arr_ptr.add(2), arr_ptr.add(3), should_swap_2_3);
-
-                if len >= 12 {
-                    // This aims to find a good balance between generating more code, which is bad
-                    // for cold loops and improving hot code while not increasing mean comparison
-                    // count too much.
-                    sort8_stable(&mut v[4..12], is_less);
-                    insertion_sort_shift_left(&mut v[4..], 8, is_less);
-                    insertion_sort_shift_right(v, 4, is_less);
-                    return;
-                } else {
-                    // Complete the sort network for the first 4 elements.
-                    swap_next_if_less(arr_ptr.add(1), is_less);
-                    swap_next_if_less(arr_ptr.add(2), is_less);
-                    swap_next_if_less(arr_ptr.add(0), is_less);
-                    swap_next_if_less(arr_ptr.add(1), is_less);
-
-                    4
-                }
-            }
-        }
-    };
-
-    insertion_sort_shift_left(v, offset, is_less);
-}
-
 #[cfg(not(no_global_oom_handling))]
 fn merge_sort<T, F>(v: &mut [T], is_less: &mut F)
 where
@@ -918,12 +834,7 @@ where
 
     let len = v.len();
 
-    // Slices of up to this length get sorted using insertion sort.
-    const MAX_NO_ALLOC_SIZE: usize = 20;
-
-    // Short arrays get sorted in-place via insertion sort to avoid allocations.
-    if len <= MAX_NO_ALLOC_SIZE {
-        stable_sort_small(v, is_less);
+    if len < 2 {
         return;
     }
 
@@ -963,6 +874,11 @@ where
             // return without allocating.
             return;
         } else if buf_ptr.is_null() {
+            // Short arrays get sorted in-place via insertion sort to avoid allocations.
+            if sort_small_stable(v, start, is_less) {
+                return;
+            }
+
             // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
             // shallow copies of the contents of `v` without risking the dtors running on copies if
             // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the
@@ -1016,11 +932,7 @@ where
             || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
             || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
         {
-            if n >= 3 && runs[n - 3].len < runs[n - 1].len {
-                Some(n - 3)
-            } else {
-                Some(n - 2)
-            }
+            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
         } else {
             None
         }
@@ -1033,6 +945,67 @@ where
     }
 }
 
+/// Check whether `v` qualifies for the small-sort optimization and sort it if so.
+/// `v[start..]` is assumed to already be sorted.
+#[cfg(not(no_global_oom_handling))]
+fn sort_small_stable<T, F>(v: &mut [T], start: usize, is_less: &mut F) -> bool
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    if qualifies_for_branchless_sort::<T>() {
+        // Testing showed that even though this incurs more comparisons, up to size 32 (4 * 8),
+        // avoiding the allocation and sticking with simple code is worth it. Going further, e.g.
+        // to 40, is still worth it for u64 or even types with more expensive comparisons, but
+        // risks incurring too many more comparisons than the regular TimSort would.
+        const MAX_NO_ALLOC_SIZE: usize = 32;
+        if len <= MAX_NO_ALLOC_SIZE {
+            if len < 8 {
+                insertion_sort_shift_right(v, start, is_less);
+                return true;
+            }
+
+            let mut merge_count = 0;
+            for chunk in v.chunks_exact_mut(8) {
+                // SAFETY: chunks_exact_mut promised to give us slices of len 8.
+                unsafe {
+                    sort8_stable(chunk, is_less);
+                }
+                merge_count += 1;
+            }
+
+            let mut swap = mem::MaybeUninit::<[T; 8]>::uninit();
+            let swap_ptr = swap.as_mut_ptr() as *mut T;
+
+            let mut i = 8;
+            while merge_count > 1 {
+                // SAFETY: We know the smaller side will be of size 8 because mid is 8. Both
+                // sides are non-empty because of merge_count, the right side is always exactly
+                // 8 elements long, and the left side is 8 or longer. Thus the smaller side is
+                // always exactly 8 elements, the size of swap.
+                unsafe {
+                    merge(&mut v[0..(i + 8)], i, swap_ptr, is_less);
+                }
+                i += 8;
+                merge_count -= 1;
+            }
+
+            insertion_sort_shift_left(v, i, is_less);
+
+            return true;
+        }
+    } else {
+        const MAX_NO_ALLOC_SIZE: usize = 20;
+        if len <= MAX_NO_ALLOC_SIZE {
+            insertion_sort_shift_right(v, start, is_less);
+            return true;
+        }
+    }
+
+    false
+}
+
 /// Takes a range as denoted by start and end, that is already sorted and extends it if necessary
 /// with sorts optimized for smaller ranges such as insertion sort.
 #[cfg(not(no_global_oom_handling))]
@@ -1042,8 +1015,7 @@ where
 {
     debug_assert!(end > start);
 
-    // Testing showed that using MAX_INSERTION here yields the best performance for many types, but
-    // incurs more total comparisons. A balance between least comparisons and best performance, as
+    // This value is a balance between least comparisons and best performance, as
     // influenced by for example cache locality.
     const MIN_INSERTION_RUN: usize = 10;
 
@@ -1115,6 +1087,7 @@ impl<T> Drop for InsertionHole<T> {
 
 /// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]`
 /// becomes sorted.
+#[cfg(not(no_global_oom_handling))]
 unsafe fn insert_tail<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1167,11 +1140,12 @@ where
     }
 }
 
-/// Sort v assuming v[..offset] is already sorted.
+/// Sort `v` assuming `v[..offset]` is already sorted.
 ///
 /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
 /// performance impact. Even improving performance in some cases.
 #[inline(never)]
+#[cfg(not(no_global_oom_handling))]
 fn insertion_sort_shift_left<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1195,11 +1169,12 @@ where
     }
 }
 
-/// Sort v assuming v[offset..] is already sorted.
+/// Sort `v` assuming `v[offset..]` is already sorted.
 ///
 /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
 /// performance impact. Even improving performance in some cases.
 #[inline(never)]
+#[cfg(not(no_global_oom_handling))]
 fn insertion_sort_shift_right<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1227,6 +1202,7 @@ where
 /// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
 ///
 /// This is the integral subroutine of insertion sort.
+#[cfg(not(no_global_oom_handling))]
 unsafe fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1287,6 +1263,10 @@ where
 ///
 /// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
 /// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
+///
+/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
+/// performance impact.
+#[inline(never)]
 #[cfg(not(no_global_oom_handling))]
 unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
 where
@@ -1506,6 +1486,7 @@ where
 /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
 /// performance impact.
 #[inline(never)]
+#[cfg(not(no_global_oom_handling))]
 unsafe fn sort8_stable<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1559,6 +1540,7 @@ where
     }
 }
 
+#[cfg(not(no_global_oom_handling))]
 unsafe fn sort24_stable<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
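
For orientation, a standalone sketch (not part of the patch) of the block-then-merge strategy that the new `sort_small_stable` follows: stable-sort fixed blocks of 8, then merge each following block into the already-sorted prefix. It is built from safe std pieces (`slice::sort` on chunks plus a scratch `Vec`) rather than the internal `sort8_stable`/`merge` helpers, so unlike the patch it allocates; the function name and all details below are illustrative only.

// Illustrative only: stable sort via fixed blocks of 8 plus repeated merges,
// mirroring the shape of `sort_small_stable` but built from safe std pieces.
fn mini_merge_sort_sketch<T: Ord + Clone>(v: &mut [T]) {
    const BLOCK: usize = 8;

    if v.len() < BLOCK {
        // The patch uses insertion sort here; `slice::sort` is also stable.
        v.sort();
        return;
    }

    // Stable-sort every block of up to 8 elements (the patch uses the
    // branchless `sort8_stable` on exact blocks of 8 instead).
    for chunk in v.chunks_mut(BLOCK) {
        chunk.sort();
    }

    // Merge each following block into the already-sorted prefix `v[..i]`.
    // The patch does the same, but merges using an 8-element stack buffer.
    let mut i = BLOCK;
    while i < v.len() {
        let end = usize::min(i + BLOCK, v.len());
        let merged: Vec<T> = {
            let (left, rest) = v.split_at(i);
            let right = &rest[..end - i];
            let mut out = Vec::with_capacity(end);
            let (mut l, mut r) = (0, 0);
            while l < left.len() && r < right.len() {
                // Taking from the left on ties keeps the sort stable.
                if left[l] <= right[r] {
                    out.push(left[l].clone());
                    l += 1;
                } else {
                    out.push(right[r].clone());
                    r += 1;
                }
            }
            out.extend_from_slice(&left[l..]);
            out.extend_from_slice(&right[r..]);
            out
        };
        v[..end].clone_from_slice(&merged);
        i = end;
    }
}

Calling `mini_merge_sort_sketch(&mut data)` on 32 elements mirrors the patch's behavior of four sorted blocks of 8 merged into one run; the real patch additionally stays allocation-free and finishes any short tail with insertion sort instead of merging it.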