From 1805c29ca128b07ac13c2555f6b1f9567c50ba8c Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sun, 25 Aug 2024 19:14:00 +0200 Subject: [PATCH 1/9] Add binary-size optimized variants for stable and unstable sort as well as select_nth_unstable - Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function. - Unstable sort jumps directly to the branchless heapsort fallback. - select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a custom tiny smallsort and partition impl. Some code is duplicated but de-duplication would bring it's own problems. For example `swap_if_less` is critical for performance, if the sorting networks don't inline it perf drops drastically, however `#[inline(always)]` is also a poor fit, if the provided comparison function is huge, it gives the compiler an out to only instantiate `swap_if_less` once and call it. Another aspect that would suffer when making `swap_if_less` pub, is having to cfg out dozens of functions in in smallsort module. --- core/src/slice/sort/mod.rs | 1 + core/src/slice/sort/select.rs | 125 +++++++++++++++++++++- core/src/slice/sort/shared/smallsort.rs | 2 +- core/src/slice/sort/stable/mod.rs | 58 +++++++--- core/src/slice/sort/stable/tiny.rs | 75 +++++++++++++ core/src/slice/sort/unstable/mod.rs | 37 +++++-- core/src/slice/sort/unstable/quicksort.rs | 19 +++- 7 files changed, 284 insertions(+), 33 deletions(-) create mode 100644 core/src/slice/sort/stable/tiny.rs diff --git a/core/src/slice/sort/mod.rs b/core/src/slice/sort/mod.rs index 79852708b81ea..81b99a4364b2e 100644 --- a/core/src/slice/sort/mod.rs +++ b/core/src/slice/sort/mod.rs @@ -5,4 +5,5 @@ pub mod stable; pub mod unstable; pub(crate) mod select; +#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod shared; diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index f6529f23bcb3f..b60e33b3a2f0d 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -6,9 +6,13 @@ //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with //! better performance than one would get using heapsort as fallback. +use crate::intrinsics; use crate::mem::{self, SizedTypeProperties}; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::unstable::quicksort::partition; /// Reorders the slice such that the element at `index` is at its final sorted position. @@ -40,7 +44,15 @@ where let min_idx = min_index(v, &mut is_less).unwrap(); v.swap(min_idx, index); } else { - partition_at_index_loop(v, index, None, &mut is_less); + #[cfg(not(feature = "optimize_for_size"))] + { + partition_at_index_loop(v, index, None, &mut is_less); + } + + #[cfg(feature = "optimize_for_size")] + { + median_of_medians(v, &mut is_less, index); + } } let (left, right) = v.split_at_mut(index); @@ -53,6 +65,7 @@ where // most once, it doesn't make sense to use something more sophisticated than insertion-sort. 
const INSERTION_SORT_THRESHOLD: usize = 16; +#[cfg(not(feature = "optimize_for_size"))] fn partition_at_index_loop<'a, T, F>( mut v: &'a mut [T], mut index: usize, @@ -167,8 +180,17 @@ fn median_of_medians bool>(mut v: &mut [T], is_less: &mut loop { if v.len() <= INSERTION_SORT_THRESHOLD { if v.len() >= 2 { - insertion_sort_shift_left(v, 1, is_less); + #[cfg(not(feature = "optimize_for_size"))] + { + insertion_sort_shift_left(v, 1, is_less); + } + + #[cfg(feature = "optimize_for_size")] + { + bubble_sort(v, is_less); + } } + return; } @@ -230,7 +252,15 @@ fn median_of_ninthers bool>(v: &mut [T], is_less: &mut F) median_of_medians(&mut v[lo..lo + frac], is_less, pivot); - partition(v, lo + pivot, is_less) + #[cfg(not(feature = "optimize_for_size"))] + { + partition(v, lo + pivot, is_less) + } + + #[cfg(feature = "optimize_for_size")] + { + partition_size_opt(v, lo + pivot, is_less) + } } /// Moves around the 9 elements at the indices a..i, such that @@ -298,3 +328,92 @@ fn median_idx bool>( } b } + +// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it +// would clutter that module with cfg statements and make it generally harder to read and develop. +// So to decouple things and simplify it, we use a an even smaller bubble sort. +#[cfg(feature = "optimize_for_size")] +fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { + let mut n = v.len(); + let mut did_swap = true; + + while did_swap && n > 1 { + did_swap = false; + for i in 1..n { + // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds. + unsafe { + if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) { + v.swap_unchecked(i - 1, i); + did_swap = true; + } + } + } + n -= 1; + } +} + +#[cfg(feature = "optimize_for_size")] +fn partition_size_opt(v: &mut [T], pivot: usize, is_less: &mut F) -> usize +where + F: FnMut(&T, &T) -> bool, +{ + let len = v.len(); + + // Allows for panic-free code-gen by proving this property to the compiler. + if len == 0 { + return 0; + } + + if pivot >= len { + intrinsics::abort(); + } + + // SAFETY: We checked that `pivot` is in-bounds. + unsafe { + // Place the pivot at the beginning of slice. + v.swap_unchecked(0, pivot); + } + let (pivot, v_without_pivot) = v.split_at_mut(1); + + // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function + // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias. + // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value + // into a stack value, but this creates issues for types with interior mutability mandating + // a drop guard. + let pivot = &mut pivot[0]; + + let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less); + + if num_lt >= len { + intrinsics::abort(); + } + + // SAFETY: We checked that `num_lt` is in-bounds. + unsafe { + // Place the pivot between the two partitions. + v.swap_unchecked(0, num_lt); + } + + num_lt +} + +#[cfg(feature = "optimize_for_size")] +fn partition_lomuto_branchless_simple bool>( + v: &mut [T], + pivot: &T, + is_less: &mut F, +) -> usize { + let mut left = 0; + + for right in 0..v.len() { + // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that + // left <= right and that both are in-bounds. 
+ unsafe { + let right_is_lt = is_less(v.get_unchecked(right), pivot); + v.swap_unchecked(left, right); + left += right_is_lt as usize; + } + } + + left +} diff --git a/core/src/slice/sort/shared/smallsort.rs b/core/src/slice/sort/shared/smallsort.rs index db0c5c72822c0..5027962ccb415 100644 --- a/core/src/slice/sort/shared/smallsort.rs +++ b/core/src/slice/sort/shared/smallsort.rs @@ -378,7 +378,7 @@ where /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the /// value at position `b_pos` is less than the one at position `a_pos`. -pub unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) +unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) where F: FnMut(&T, &T) -> bool, { diff --git a/core/src/slice/sort/stable/mod.rs b/core/src/slice/sort/stable/mod.rs index a383b0f589ccf..a61a95a225455 100644 --- a/core/src/slice/sort/stable/mod.rs +++ b/core/src/slice/sort/stable/mod.rs @@ -1,15 +1,24 @@ //! This module contains the entry points for `slice::sort`. +#[cfg(not(feature = "optimize_for_size"))] +use crate::cmp; +use crate::intrinsics; use crate::mem::{self, MaybeUninit, SizedTypeProperties}; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::{ insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN, }; -use crate::{cmp, intrinsics}; -pub(crate) mod drift; pub(crate) mod merge; + +#[cfg(not(feature = "optimize_for_size"))] +pub(crate) mod drift; +#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod quicksort; +#[cfg(feature = "optimize_for_size")] +pub(crate) mod tiny; + /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll. /// Design document: /// @@ -30,25 +39,48 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less return; } - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; + #[cfg(not(feature = "optimize_for_size"))] + { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + driftsort_main::(v, is_less); } - driftsort_main::(v, is_less); + #[cfg(feature = "optimize_for_size")] + { + let alloc_len = len / 2; + + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. 
+ let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + + tiny::mergesort(v, scratch, is_less); + } } /// See [`sort`] /// /// Deliberately don't inline the main sorting routine entrypoint to ensure the /// inlined insertion sort i-cache footprint remains minimal. +#[cfg(not(feature = "optimize_for_size"))] #[inline(never)] fn driftsort_main bool, BufT: BufGuard>(v: &mut [T], is_less: &mut F) { // By allocating n elements of memory we can ensure the entire input can diff --git a/core/src/slice/sort/stable/tiny.rs b/core/src/slice/sort/stable/tiny.rs new file mode 100644 index 0000000000000..307011f3ee8c4 --- /dev/null +++ b/core/src/slice/sort/stable/tiny.rs @@ -0,0 +1,75 @@ +//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs. + +use crate::mem::{ManuallyDrop, MaybeUninit}; +use crate::ptr; +use crate::slice::sort::stable::merge; + +/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever, +/// no run detection, etc. +#[inline(always)] +pub fn mergesort bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + is_less: &mut F, +) { + let len = v.len(); + + if len > 2 { + let mid = len / 2; + + // SAFETY: mid is in-bounds. + unsafe { + // Sort the left half recursively. + mergesort(v.get_unchecked_mut(..mid), scratch, is_less); + // Sort the right half recursively. + mergesort(v.get_unchecked_mut(mid..), scratch, is_less); + } + + merge::merge(v, scratch, mid, is_less); + } else if len == 2 { + // Branchless swap the two elements. This reduces the recursion depth and improves + // perf significantly at a small binary-size cost. Trades ~10% perf boost for integers + // for ~50 bytes in the binary. + + // SAFETY: We checked the len, the pointers we create are valid and don't overlap. + unsafe { + swap_if_less(v.as_mut_ptr(), 0, 1, is_less); + } + } +} + +/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the +/// value at position `b_pos` is less than the one at position `a_pos`. +unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + // SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid + // pointers into `v_base`, and are properly aligned, and part of the same allocation. + unsafe { + let v_a = v_base.add(a_pos); + let v_b = v_base.add(b_pos); + + // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be + // in a well defined state, without duplicates. + + // Important to only swap if it is more and not if it is equal. is_less should return false for + // equal, so we don't swap. + let should_swap = is_less(&*v_b, &*v_a); + + // This is a branchless version of swap if. + // The equivalent code with a branch would be: + // + // if should_swap { + // ptr::swap(left, right, 1); + // } + + // The goal is to generate cmov instructions here. 
+ let left_swap = if should_swap { v_b } else { v_a }; + let right_swap = if should_swap { v_a } else { v_b }; + + let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap)); + ptr::copy(left_swap, v_a, 1); + ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1); + } +} diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 932e01f4401e5..43b5cc79fc7f3 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -2,10 +2,13 @@ use crate::intrinsics; use crate::mem::SizedTypeProperties; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::find_existing_run; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; pub(crate) mod heapsort; +#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod quicksort; /// Unstable sort called ipnsort by Lukas Bergdoll and Orson Peters. @@ -28,25 +31,37 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { return; } - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; + #[cfg(not(feature = "optimize_for_size"))] + { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + ipnsort(v, is_less); } - ipnsort(v, is_less); + #[cfg(feature = "optimize_for_size")] + { + // SAFETY: We checked that `len >= 2`. + unsafe { + heapsort::heapsort(v, is_less); + } + } } /// See [`sort`] /// /// Deliberately don't inline the main sorting routine entrypoint to ensure the /// inlined insertion sort i-cache footprint remains minimal. +#[cfg(not(feature = "optimize_for_size"))] #[inline(never)] fn ipnsort(v: &mut [T], is_less: &mut F) where diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index cd53656e9b4b8..e55ebe67e1306 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -98,13 +98,15 @@ where return 0; } - // Allows for panic-free code-gen by proving this property to the compiler. if pivot >= len { intrinsics::abort(); } - // Place the pivot at the beginning of slice. - v.swap(0, pivot); + // SAFETY: We checked that `pivot` is in-bounds. + unsafe { + // Place the pivot at the beginning of slice. 
+ v.swap_unchecked(0, pivot); + } let (pivot, v_without_pivot) = v.split_at_mut(1); // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function @@ -118,8 +120,15 @@ where // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn. let num_lt = (const { inst_partition::() })(v_without_pivot, pivot, is_less); - // Place the pivot between the two partitions. - v.swap(0, num_lt); + if num_lt >= len { + intrinsics::abort(); + } + + // SAFETY: We checked that `num_lt` is in-bounds. + unsafe { + // Place the pivot between the two partitions. + v.swap_unchecked(0, num_lt); + } num_lt } From f46fcfed64476c113e4caf7ffcd87c32761b60cd Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 26 Aug 2024 11:17:38 +0200 Subject: [PATCH 2/9] Reduce code duplication by moving partition_lomuto_branchless_simple into quicksort module --- core/src/slice/sort/select.rs | 78 +---------------------- core/src/slice/sort/unstable/mod.rs | 1 - core/src/slice/sort/unstable/quicksort.rs | 39 +++++++++++- 3 files changed, 39 insertions(+), 79 deletions(-) diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index b60e33b3a2f0d..2b3e14755302c 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -6,13 +6,11 @@ //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with //! better performance than one would get using heapsort as fallback. -use crate::intrinsics; use crate::mem::{self, SizedTypeProperties}; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; -#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::unstable::quicksort::partition; /// Reorders the slice such that the element at `index` is at its final sorted position. @@ -252,15 +250,7 @@ fn median_of_ninthers bool>(v: &mut [T], is_less: &mut F) median_of_medians(&mut v[lo..lo + frac], is_less, pivot); - #[cfg(not(feature = "optimize_for_size"))] - { - partition(v, lo + pivot, is_less) - } - - #[cfg(feature = "optimize_for_size")] - { - partition_size_opt(v, lo + pivot, is_less) - } + partition(v, lo + pivot, is_less) } /// Moves around the 9 elements at the indices a..i, such that @@ -351,69 +341,3 @@ fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { n -= 1; } } - -#[cfg(feature = "optimize_for_size")] -fn partition_size_opt(v: &mut [T], pivot: usize, is_less: &mut F) -> usize -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - - // Allows for panic-free code-gen by proving this property to the compiler. - if len == 0 { - return 0; - } - - if pivot >= len { - intrinsics::abort(); - } - - // SAFETY: We checked that `pivot` is in-bounds. - unsafe { - // Place the pivot at the beginning of slice. - v.swap_unchecked(0, pivot); - } - let (pivot, v_without_pivot) = v.split_at_mut(1); - - // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function - // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias. - // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value - // into a stack value, but this creates issues for types with interior mutability mandating - // a drop guard. 
- let pivot = &mut pivot[0]; - - let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less); - - if num_lt >= len { - intrinsics::abort(); - } - - // SAFETY: We checked that `num_lt` is in-bounds. - unsafe { - // Place the pivot between the two partitions. - v.swap_unchecked(0, num_lt); - } - - num_lt -} - -#[cfg(feature = "optimize_for_size")] -fn partition_lomuto_branchless_simple bool>( - v: &mut [T], - pivot: &T, - is_less: &mut F, -) -> usize { - let mut left = 0; - - for right in 0..v.len() { - // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that - // left <= right and that both are in-bounds. - unsafe { - let right_is_lt = is_less(v.get_unchecked(right), pivot); - v.swap_unchecked(left, right); - left += right_is_lt as usize; - } - } - - left -} diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 43b5cc79fc7f3..faac97eab02b8 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -8,7 +8,6 @@ use crate::slice::sort::shared::find_existing_run; use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; pub(crate) mod heapsort; -#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod quicksort; /// Unstable sort called ipnsort by Lukas Bergdoll and Orson Peters. diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index e55ebe67e1306..83751d99cc261 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -1,7 +1,9 @@ //! This module contains an unstable quicksort and two partition implementations. use crate::mem::{self, ManuallyDrop}; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl; use crate::{intrinsics, ptr}; @@ -11,6 +13,7 @@ use crate::{intrinsics, ptr}; /// /// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero, /// this function will immediately switch to heapsort. +#[cfg(not(feature = "optimize_for_size"))] pub(crate) fn quicksort<'a, T, F>( mut v: &'a mut [T], mut ancestor_pivot: Option<&'a T>, @@ -138,7 +141,16 @@ const fn inst_partition bool>() -> fn(&mut [T], &T, &mut if mem::size_of::() <= MAX_BRANCHLESS_PARTITION_SIZE { // Specialize for types that are relatively cheap to copy, where branchless optimizations // have large leverage e.g. `u64` and `String`. - partition_lomuto_branchless_cyclic:: + + #[cfg(not(feature = "optimize_for_size"))] + { + partition_lomuto_branchless_cyclic:: + } + + #[cfg(feature = "optimize_for_size")] + { + partition_lomuto_branchless_simple:: + } } else { partition_hoare_branchy_cyclic:: } @@ -224,6 +236,7 @@ where } } +#[cfg(not(feature = "optimize_for_size"))] struct PartitionState { // The current element that is being looked at, scans left to right through slice. 
right: *mut T, @@ -234,6 +247,7 @@ struct PartitionState { gap: GapGuardRaw, } +#[cfg(not(feature = "optimize_for_size"))] fn partition_lomuto_branchless_cyclic(v: &mut [T], pivot: &T, is_less: &mut F) -> usize where F: FnMut(&T, &T) -> bool, @@ -325,6 +339,27 @@ where } } +#[cfg(feature = "optimize_for_size")] +fn partition_lomuto_branchless_simple bool>( + v: &mut [T], + pivot: &T, + is_less: &mut F, +) -> usize { + let mut left = 0; + + for right in 0..v.len() { + // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that + // left <= right and that both are in-bounds. + unsafe { + let right_is_lt = is_less(v.get_unchecked(right), pivot); + v.swap_unchecked(left, right); + left += right_is_lt as usize; + } + } + + left +} + struct GapGuard { pos: *mut T, value: ManuallyDrop, @@ -342,11 +377,13 @@ impl Drop for GapGuard { /// Ideally this wouldn't be needed and we could just use the regular GapGuard. /// See comment in [`partition_lomuto_branchless_cyclic`]. +#[cfg(not(feature = "optimize_for_size"))] struct GapGuardRaw { pos: *mut T, value: *mut T, } +#[cfg(not(feature = "optimize_for_size"))] impl Drop for GapGuardRaw { fn drop(&mut self) { // SAFETY: `self` MUST be constructed in a way that makes copying the gap value into From 2fa330eedddb7cba1c8e5210166a734246a582d5 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 26 Aug 2024 12:21:37 +0200 Subject: [PATCH 3/9] Convert cfg blocks to cfg_if --- core/src/slice/sort/select.rs | 28 +++++----- core/src/slice/sort/stable/mod.rs | 64 +++++++++++------------ core/src/slice/sort/unstable/mod.rs | 40 +++++++------- core/src/slice/sort/unstable/quicksort.rs | 15 +++--- 4 files changed, 68 insertions(+), 79 deletions(-) diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index 2b3e14755302c..28ca9dcad39a7 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -42,14 +42,12 @@ where let min_idx = min_index(v, &mut is_less).unwrap(); v.swap(min_idx, index); } else { - #[cfg(not(feature = "optimize_for_size"))] - { - partition_at_index_loop(v, index, None, &mut is_less); - } - - #[cfg(feature = "optimize_for_size")] - { - median_of_medians(v, &mut is_less, index); + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + median_of_medians(v, &mut is_less, index); + } else { + partition_at_index_loop(v, index, None, &mut is_less); + } } } @@ -178,14 +176,12 @@ fn median_of_medians bool>(mut v: &mut [T], is_less: &mut loop { if v.len() <= INSERTION_SORT_THRESHOLD { if v.len() >= 2 { - #[cfg(not(feature = "optimize_for_size"))] - { - insertion_sort_shift_left(v, 1, is_less); - } - - #[cfg(feature = "optimize_for_size")] - { - bubble_sort(v, is_less); + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + bubble_sort(v, is_less); + } else { + insertion_sort_shift_left(v, 1, is_less); + } } } diff --git a/core/src/slice/sort/stable/mod.rs b/core/src/slice/sort/stable/mod.rs index a61a95a225455..3472401c4dcf8 100644 --- a/core/src/slice/sort/stable/mod.rs +++ b/core/src/slice/sort/stable/mod.rs @@ -39,40 +39,38 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less return; } - #[cfg(not(feature = "optimize_for_size"))] - { - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. 
The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; - } - - driftsort_main::(v, is_less); - } - - #[cfg(feature = "optimize_for_size")] - { - let alloc_len = len / 2; - - // For small inputs 4KiB of stack storage suffices, which allows us to avoid - // calling the (de-)allocator. Benchmarks showed this was quite beneficial. - let mut stack_buf = AlignedStorage::::new(); - let stack_scratch = stack_buf.as_uninit_slice_mut(); - let mut heap_buf; - let scratch = if stack_scratch.len() >= alloc_len { - stack_scratch + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + let alloc_len = len / 2; + + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. + let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + + tiny::mergesort(v, scratch, is_less); } else { - heap_buf = BufT::with_capacity(alloc_len); - heap_buf.as_uninit_slice_mut() - }; - - tiny::mergesort(v, scratch, is_less); + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + driftsort_main::(v, is_less); + } } } diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index faac97eab02b8..130be21ee3fe8 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -30,28 +30,26 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { return; } - #[cfg(not(feature = "optimize_for_size"))] - { - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; - } - - ipnsort(v, is_less); - } + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + // SAFETY: We checked that `len >= 2`. 
+ unsafe { + heapsort::heapsort(v, is_less); + } + } else { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } - #[cfg(feature = "optimize_for_size")] - { - // SAFETY: We checked that `len >= 2`. - unsafe { - heapsort::heapsort(v, is_less); + ipnsort(v, is_less); } } } diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index 83751d99cc261..9c59ccdb70005 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -141,15 +141,12 @@ const fn inst_partition bool>() -> fn(&mut [T], &T, &mut if mem::size_of::() <= MAX_BRANCHLESS_PARTITION_SIZE { // Specialize for types that are relatively cheap to copy, where branchless optimizations // have large leverage e.g. `u64` and `String`. - - #[cfg(not(feature = "optimize_for_size"))] - { - partition_lomuto_branchless_cyclic:: - } - - #[cfg(feature = "optimize_for_size")] - { - partition_lomuto_branchless_simple:: + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + partition_lomuto_branchless_simple:: + } else { + partition_lomuto_branchless_cyclic:: + } } } else { partition_hoare_branchy_cyclic:: From ae57bdf05e4066321f2841f9a604868d3525675e Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 27 Aug 2024 10:01:48 +0200 Subject: [PATCH 4/9] Use last swap optimization in bubblesort --- core/src/slice/sort/select.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index 28ca9dcad39a7..7bda54e792528 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -317,23 +317,32 @@ fn median_idx bool>( // It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it // would clutter that module with cfg statements and make it generally harder to read and develop. -// So to decouple things and simplify it, we use a an even smaller bubble sort. +// So to decouple things and simplify it, we use an even smaller bubble sort. #[cfg(feature = "optimize_for_size")] fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { + use crate::ptr; + let mut n = v.len(); - let mut did_swap = true; - while did_swap && n > 1 { - did_swap = false; - for i in 1..n { + let v_base = v.as_mut_ptr(); + + while n > 1 { + let loop_n = n; + n = 0; + for i in 1..loop_n { // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds. unsafe { - if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) { - v.swap_unchecked(i - 1, i); - did_swap = true; + // Even if `is_less` erroneously always returns true, we are guaranteed that `n` + // reduces by one each out loop iteration, because `1..n` is exclusive. This + // guarantees a bounded run-time should `Ord` be implemented incorrectly. 
+ let v_i = v_base.add(i); + let v_i_minus_one = v_base.add(i - 1); + + if is_less(&*v_i, &*v_i_minus_one) { + ptr::swap_nonoverlapping(v_i, v_i_minus_one, 1); + n = i; } } } - n -= 1; } } From 717e3aa5f1fa580c1984c3976d18ef66364c8185 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Thu, 29 Aug 2024 10:32:59 +0200 Subject: [PATCH 5/9] Use simpler branchy swap logic in tiny merge sort Avoids the code duplication issue and results in smaller binary size, which after all is the purpose of the feature. --- core/src/slice/sort/shared/smallsort.rs | 5 +++ core/src/slice/sort/stable/tiny.rs | 50 ++++--------------------- 2 files changed, 13 insertions(+), 42 deletions(-) diff --git a/core/src/slice/sort/shared/smallsort.rs b/core/src/slice/sort/shared/smallsort.rs index 5027962ccb415..aaa9f9f1c7fd1 100644 --- a/core/src/slice/sort/shared/smallsort.rs +++ b/core/src/slice/sort/shared/smallsort.rs @@ -378,6 +378,11 @@ where /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the /// value at position `b_pos` is less than the one at position `a_pos`. +/// +/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integers like +/// types. `is_less` could be a huge function and we want to give the compiler an option to +/// not inline this function. For the same reasons that this function is very perf critical +/// it should be in the same module as the functions that use it. unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) where F: FnMut(&T, &T) -> bool, diff --git a/core/src/slice/sort/stable/tiny.rs b/core/src/slice/sort/stable/tiny.rs index 307011f3ee8c4..071ab8e107fe3 100644 --- a/core/src/slice/sort/stable/tiny.rs +++ b/core/src/slice/sort/stable/tiny.rs @@ -1,6 +1,6 @@ //! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs. -use crate::mem::{ManuallyDrop, MaybeUninit}; +use crate::mem::MaybeUninit; use crate::ptr; use crate::slice::sort::stable::merge; @@ -27,49 +27,15 @@ pub fn mergesort bool>( merge::merge(v, scratch, mid, is_less); } else if len == 2 { - // Branchless swap the two elements. This reduces the recursion depth and improves - // perf significantly at a small binary-size cost. Trades ~10% perf boost for integers - // for ~50 bytes in the binary. - // SAFETY: We checked the len, the pointers we create are valid and don't overlap. unsafe { - swap_if_less(v.as_mut_ptr(), 0, 1, is_less); - } - } -} - -/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the -/// value at position `b_pos` is less than the one at position `a_pos`. -unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - // SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid - // pointers into `v_base`, and are properly aligned, and part of the same allocation. - unsafe { - let v_a = v_base.add(a_pos); - let v_b = v_base.add(b_pos); + let v_base = v.as_mut_ptr(); + let v_a = v_base; + let v_b = v_base.add(1); - // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be - // in a well defined state, without duplicates. - - // Important to only swap if it is more and not if it is equal. is_less should return false for - // equal, so we don't swap. - let should_swap = is_less(&*v_b, &*v_a); - - // This is a branchless version of swap if. 
- // The equivalent code with a branch would be: - // - // if should_swap { - // ptr::swap(left, right, 1); - // } - - // The goal is to generate cmov instructions here. - let left_swap = if should_swap { v_b } else { v_a }; - let right_swap = if should_swap { v_a } else { v_b }; - - let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap)); - ptr::copy(left_swap, v_a, 1); - ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1); + if is_less(&*v_b, &*v_a) { + ptr::swap_nonoverlapping(v_a, v_b, 1); + } + } } } From bea61dafef2da35f347f7cdfb09e8dc6be428fb9 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 18:58:44 +0200 Subject: [PATCH 6/9] Drop bubble_sort While using it results in slightly slammer binaries, it's not deemed worth it to add yet another sort algorithm to the standard library. select_nth_unstable has bigger binary-size problems. --- core/src/slice/sort/mod.rs | 1 - core/src/slice/sort/select.rs | 41 +------------------------------ core/src/slice/sort/shared/mod.rs | 2 ++ 3 files changed, 3 insertions(+), 41 deletions(-) diff --git a/core/src/slice/sort/mod.rs b/core/src/slice/sort/mod.rs index 81b99a4364b2e..79852708b81ea 100644 --- a/core/src/slice/sort/mod.rs +++ b/core/src/slice/sort/mod.rs @@ -5,5 +5,4 @@ pub mod stable; pub mod unstable; pub(crate) mod select; -#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod shared; diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index 7bda54e792528..3358c03d30a9b 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -9,7 +9,6 @@ use crate::mem::{self, SizedTypeProperties}; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; -#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; use crate::slice::sort::unstable::quicksort::partition; @@ -176,13 +175,7 @@ fn median_of_medians bool>(mut v: &mut [T], is_less: &mut loop { if v.len() <= INSERTION_SORT_THRESHOLD { if v.len() >= 2 { - cfg_if! { - if #[cfg(feature = "optimize_for_size")] { - bubble_sort(v, is_less); - } else { - insertion_sort_shift_left(v, 1, is_less); - } - } + insertion_sort_shift_left(v, 1, is_less); } return; @@ -314,35 +307,3 @@ fn median_idx bool>( } b } - -// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it -// would clutter that module with cfg statements and make it generally harder to read and develop. -// So to decouple things and simplify it, we use an even smaller bubble sort. -#[cfg(feature = "optimize_for_size")] -fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { - use crate::ptr; - - let mut n = v.len(); - - let v_base = v.as_mut_ptr(); - - while n > 1 { - let loop_n = n; - n = 0; - for i in 1..loop_n { - // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds. - unsafe { - // Even if `is_less` erroneously always returns true, we are guaranteed that `n` - // reduces by one each out loop iteration, because `1..n` is exclusive. This - // guarantees a bounded run-time should `Ord` be implemented incorrectly. 
- let v_i = v_base.add(i); - let v_i_minus_one = v_base.add(i - 1); - - if is_less(&*v_i, &*v_i_minus_one) { - ptr::swap_nonoverlapping(v_i, v_i_minus_one, 1); - n = i; - } - } - } - } -} diff --git a/core/src/slice/sort/shared/mod.rs b/core/src/slice/sort/shared/mod.rs index ad1171bfc6a0a..e0f8d475a2e30 100644 --- a/core/src/slice/sort/shared/mod.rs +++ b/core/src/slice/sort/shared/mod.rs @@ -1,3 +1,5 @@ +#![cfg_attr(feature = "optimize_for_size", allow(dead_code))] + use crate::marker::Freeze; pub(crate) mod pivot; From cd3d6e88bc9d02ed5e00536e3a1a70d2472d2f9b Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 19:37:49 +0200 Subject: [PATCH 7/9] Shrink heapsort further by combining sift_down loops --- core/src/slice/sort/unstable/heapsort.rs | 36 +++++++++++------------ core/src/slice/sort/unstable/mod.rs | 5 +--- core/src/slice/sort/unstable/quicksort.rs | 7 ++--- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/core/src/slice/sort/unstable/heapsort.rs b/core/src/slice/sort/unstable/heapsort.rs index 27e2ad588ea09..35bfe2959cd78 100644 --- a/core/src/slice/sort/unstable/heapsort.rs +++ b/core/src/slice/sort/unstable/heapsort.rs @@ -1,46 +1,46 @@ //! This module contains a branchless heapsort as fallback for unstable quicksort. -use crate::{intrinsics, ptr}; +use crate::{cmp, intrinsics, ptr}; /// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case. /// /// Never inline this, it sits the main hot-loop in `recurse` and is meant as unlikely algorithmic /// fallback. -/// -/// SAFETY: The caller has to guarantee that `v.len()` >= 2. #[inline(never)] -pub(crate) unsafe fn heapsort(v: &mut [T], is_less: &mut F) +pub(crate) fn heapsort(v: &mut [T], is_less: &mut F) where F: FnMut(&T, &T) -> bool, { - // SAFETY: See function safety. - unsafe { - intrinsics::assume(v.len() >= 2); - - // Build the heap in linear time. - for i in (0..v.len() / 2).rev() { - sift_down(v, i, is_less); - } + let len = v.len(); - // Pop maximal elements from the heap. - for i in (1..v.len()).rev() { + for i in (0..len + len / 2).rev() { + let sift_idx = if i >= len { + i - len + } else { v.swap(0, i); - sift_down(&mut v[..i], 0, is_less); + 0 + }; + + // SAFETY: The above calculation ensures that `sift_idx` is either 0 or + // `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`. + // This guarantees the required `sift_idx <= len`. + unsafe { + sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less); } } } // This binary heap respects the invariant `parent >= child`. // -// SAFETY: The caller has to guarantee that node < `v.len()`. -#[inline(never)] +// SAFETY: The caller has to guarantee that `node <= v.len()`. +#[inline(always)] unsafe fn sift_down(v: &mut [T], mut node: usize, is_less: &mut F) where F: FnMut(&T, &T) -> bool, { // SAFETY: See function safety. unsafe { - intrinsics::assume(node < v.len()); + intrinsics::assume(node <= v.len()); } let len = v.len(); diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 130be21ee3fe8..953c27ab6f417 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -32,10 +32,7 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { cfg_if! { if #[cfg(feature = "optimize_for_size")] { - // SAFETY: We checked that `len >= 2`. 
- unsafe { - heapsort::heapsort(v, is_less); - } + heapsort::heapsort(v, is_less); } else { // More advanced sorting methods than insertion sort are faster if called in // a hot loop for small inputs, but for general-purpose code the small diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index 9c59ccdb70005..4feef5deeb0fb 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -5,6 +5,8 @@ use crate::mem::{self, ManuallyDrop}; use crate::slice::sort::shared::pivot::choose_pivot; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl; +#[cfg(not(feature = "optimize_for_size"))] +use crate::slice::sort::unstable::heapsort; use crate::{intrinsics, ptr}; /// Sorts `v` recursively. @@ -31,10 +33,7 @@ pub(crate) fn quicksort<'a, T, F>( // If too many bad pivot choices were made, simply fall back to heapsort in order to // guarantee `O(N x log(N))` worst-case. if limit == 0 { - // SAFETY: We assume the `small_sort` threshold is at least 1. - unsafe { - crate::slice::sort::unstable::heapsort::heapsort(v, is_less); - } + heapsort::heapsort(v, is_less); return; } From 670630da181204654adbb34ea06a128f56b0d7c3 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 19:53:56 +0200 Subject: [PATCH 8/9] Select tiny sorts for 16-bit platforms Also skips stack alloc in stable sort if 16-bit target platform. --- core/src/slice/sort/stable/mod.rs | 31 ++++++++++++++++++----------- core/src/slice/sort/unstable/mod.rs | 2 +- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/core/src/slice/sort/stable/mod.rs b/core/src/slice/sort/stable/mod.rs index 3472401c4dcf8..00eb3785e0f25 100644 --- a/core/src/slice/sort/stable/mod.rs +++ b/core/src/slice/sort/stable/mod.rs @@ -40,20 +40,27 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less } cfg_if! { - if #[cfg(feature = "optimize_for_size")] { + if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] { let alloc_len = len / 2; - // For small inputs 4KiB of stack storage suffices, which allows us to avoid - // calling the (de-)allocator. Benchmarks showed this was quite beneficial. - let mut stack_buf = AlignedStorage::::new(); - let stack_scratch = stack_buf.as_uninit_slice_mut(); - let mut heap_buf; - let scratch = if stack_scratch.len() >= alloc_len { - stack_scratch - } else { - heap_buf = BufT::with_capacity(alloc_len); - heap_buf.as_uninit_slice_mut() - }; + cfg_if! { + if #[cfg(target_pointer_width = "16")] { + let heap_buf = BufT::with_capacity(alloc_len); + let scratch = heap_buf.as_uninit_slice_mut(); + } else { + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. + let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + } + } tiny::mergesort(v, scratch, is_less); } else { diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 953c27ab6f417..8bbd85443d478 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -31,7 +31,7 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { } cfg_if! 
{ - if #[cfg(feature = "optimize_for_size")] { + if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] { heapsort::heapsort(v, is_less); } else { // More advanced sorting methods than insertion sort are faster if called in From 5446229e7af30ff64a7e322e059bc514509c8fbe Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 19:54:46 +0200 Subject: [PATCH 9/9] Use non-overlapping swap for inner heapsort loop This regresses binary-size slightly for normal builds, but the important release_lto_thin_opt_level_s config sees a small improvement in binary-size and a larger types such as string and 1k see 2-3% run-time improvements with this change. --- core/src/slice/sort/unstable/heapsort.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/core/src/slice/sort/unstable/heapsort.rs b/core/src/slice/sort/unstable/heapsort.rs index 35bfe2959cd78..85231779d031f 100644 --- a/core/src/slice/sort/unstable/heapsort.rs +++ b/core/src/slice/sort/unstable/heapsort.rs @@ -69,9 +69,7 @@ where break; } - // Swap `node` with the greater child, move one step down, and continue sifting. This - // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size. - ptr::swap(v_base.add(node), v_base.add(child)); + ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1); } node = child;
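
As a compact reference for the partition strategy used on the size-optimized path, below is a minimal, self-contained sketch that mirrors `partition_lomuto_branchless_simple` from this series. It is illustrative only, not the in-tree code: it uses safe indexing and an explicit branch for clarity, whereas the in-tree version works on raw pointers, swaps unconditionally and advances `left` by `right_is_lt as usize` to stay branchless, and leaves placing the pivot to its caller (`partition`). The function name and the `main` driver are made up for this sketch.

fn partition_lomuto_simple<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    pivot_idx: usize,
    is_less: &mut F,
) -> usize {
    if v.is_empty() {
        return 0;
    }
    // Move the pivot to the front so it stays out of the way of the scan.
    v.swap(0, pivot_idx);

    // Invariant: v[1..left] holds the elements seen so far that are less than the pivot.
    let mut left = 1;
    for right in 1..v.len() {
        if is_less(&v[right], &v[0]) {
            v.swap(left, right);
            left += 1;
        }
    }

    // Place the pivot between the two partitions and return its final index, which
    // equals the number of elements that compared less than it (num_lt in the patch).
    v.swap(0, left - 1);
    left - 1
}

fn main() {
    let mut data = [5, 1, 4, 2, 3];
    let mid = partition_lomuto_simple(&mut data, 2, &mut |a, b| a < b);
    assert!(data[..mid].iter().all(|x| *x < data[mid]));
    assert!(data[mid + 1..].iter().all(|x| *x >= data[mid]));
}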