From 1805c29ca128b07ac13c2555f6b1f9567c50ba8c Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sun, 25 Aug 2024 19:14:00 +0200 Subject: [PATCH 1/9] Add binary-size optimized variants for stable and unstable sort as well as select_nth_unstable - Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function. - Unstable sort jumps directly to the branchless heapsort fallback. - select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a custom tiny smallsort and partition impl. Some code is duplicated but de-duplication would bring it's own problems. For example `swap_if_less` is critical for performance, if the sorting networks don't inline it perf drops drastically, however `#[inline(always)]` is also a poor fit, if the provided comparison function is huge, it gives the compiler an out to only instantiate `swap_if_less` once and call it. Another aspect that would suffer when making `swap_if_less` pub, is having to cfg out dozens of functions in in smallsort module. --- core/src/slice/sort/mod.rs | 1 + core/src/slice/sort/select.rs | 125 +++++++++++++++++++++- core/src/slice/sort/shared/smallsort.rs | 2 +- core/src/slice/sort/stable/mod.rs | 58 +++++++--- core/src/slice/sort/stable/tiny.rs | 75 +++++++++++++ core/src/slice/sort/unstable/mod.rs | 37 +++++-- core/src/slice/sort/unstable/quicksort.rs | 19 +++- 7 files changed, 284 insertions(+), 33 deletions(-) create mode 100644 core/src/slice/sort/stable/tiny.rs diff --git a/core/src/slice/sort/mod.rs b/core/src/slice/sort/mod.rs index 79852708b81ea..81b99a4364b2e 100644 --- a/core/src/slice/sort/mod.rs +++ b/core/src/slice/sort/mod.rs @@ -5,4 +5,5 @@ pub mod stable; pub mod unstable; pub(crate) mod select; +#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod shared; diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index f6529f23bcb3f..b60e33b3a2f0d 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -6,9 +6,13 @@ //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with //! better performance than one would get using heapsort as fallback. +use crate::intrinsics; use crate::mem::{self, SizedTypeProperties}; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::unstable::quicksort::partition; /// Reorders the slice such that the element at `index` is at its final sorted position. @@ -40,7 +44,15 @@ where let min_idx = min_index(v, &mut is_less).unwrap(); v.swap(min_idx, index); } else { - partition_at_index_loop(v, index, None, &mut is_less); + #[cfg(not(feature = "optimize_for_size"))] + { + partition_at_index_loop(v, index, None, &mut is_less); + } + + #[cfg(feature = "optimize_for_size")] + { + median_of_medians(v, &mut is_less, index); + } } let (left, right) = v.split_at_mut(index); @@ -53,6 +65,7 @@ where // most once, it doesn't make sense to use something more sophisticated than insertion-sort. 
const INSERTION_SORT_THRESHOLD: usize = 16; +#[cfg(not(feature = "optimize_for_size"))] fn partition_at_index_loop<'a, T, F>( mut v: &'a mut [T], mut index: usize, @@ -167,8 +180,17 @@ fn median_of_medians bool>(mut v: &mut [T], is_less: &mut loop { if v.len() <= INSERTION_SORT_THRESHOLD { if v.len() >= 2 { - insertion_sort_shift_left(v, 1, is_less); + #[cfg(not(feature = "optimize_for_size"))] + { + insertion_sort_shift_left(v, 1, is_less); + } + + #[cfg(feature = "optimize_for_size")] + { + bubble_sort(v, is_less); + } } + return; } @@ -230,7 +252,15 @@ fn median_of_ninthers bool>(v: &mut [T], is_less: &mut F) median_of_medians(&mut v[lo..lo + frac], is_less, pivot); - partition(v, lo + pivot, is_less) + #[cfg(not(feature = "optimize_for_size"))] + { + partition(v, lo + pivot, is_less) + } + + #[cfg(feature = "optimize_for_size")] + { + partition_size_opt(v, lo + pivot, is_less) + } } /// Moves around the 9 elements at the indices a..i, such that @@ -298,3 +328,92 @@ fn median_idx bool>( } b } + +// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it +// would clutter that module with cfg statements and make it generally harder to read and develop. +// So to decouple things and simplify it, we use a an even smaller bubble sort. +#[cfg(feature = "optimize_for_size")] +fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { + let mut n = v.len(); + let mut did_swap = true; + + while did_swap && n > 1 { + did_swap = false; + for i in 1..n { + // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds. + unsafe { + if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) { + v.swap_unchecked(i - 1, i); + did_swap = true; + } + } + } + n -= 1; + } +} + +#[cfg(feature = "optimize_for_size")] +fn partition_size_opt(v: &mut [T], pivot: usize, is_less: &mut F) -> usize +where + F: FnMut(&T, &T) -> bool, +{ + let len = v.len(); + + // Allows for panic-free code-gen by proving this property to the compiler. + if len == 0 { + return 0; + } + + if pivot >= len { + intrinsics::abort(); + } + + // SAFETY: We checked that `pivot` is in-bounds. + unsafe { + // Place the pivot at the beginning of slice. + v.swap_unchecked(0, pivot); + } + let (pivot, v_without_pivot) = v.split_at_mut(1); + + // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function + // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias. + // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value + // into a stack value, but this creates issues for types with interior mutability mandating + // a drop guard. + let pivot = &mut pivot[0]; + + let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less); + + if num_lt >= len { + intrinsics::abort(); + } + + // SAFETY: We checked that `num_lt` is in-bounds. + unsafe { + // Place the pivot between the two partitions. + v.swap_unchecked(0, num_lt); + } + + num_lt +} + +#[cfg(feature = "optimize_for_size")] +fn partition_lomuto_branchless_simple bool>( + v: &mut [T], + pivot: &T, + is_less: &mut F, +) -> usize { + let mut left = 0; + + for right in 0..v.len() { + // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that + // left <= right and that both are in-bounds. 
+ unsafe { + let right_is_lt = is_less(v.get_unchecked(right), pivot); + v.swap_unchecked(left, right); + left += right_is_lt as usize; + } + } + + left +} diff --git a/core/src/slice/sort/shared/smallsort.rs b/core/src/slice/sort/shared/smallsort.rs index db0c5c72822c0..5027962ccb415 100644 --- a/core/src/slice/sort/shared/smallsort.rs +++ b/core/src/slice/sort/shared/smallsort.rs @@ -378,7 +378,7 @@ where /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the /// value at position `b_pos` is less than the one at position `a_pos`. -pub unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) +unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) where F: FnMut(&T, &T) -> bool, { diff --git a/core/src/slice/sort/stable/mod.rs b/core/src/slice/sort/stable/mod.rs index a383b0f589ccf..a61a95a225455 100644 --- a/core/src/slice/sort/stable/mod.rs +++ b/core/src/slice/sort/stable/mod.rs @@ -1,15 +1,24 @@ //! This module contains the entry points for `slice::sort`. +#[cfg(not(feature = "optimize_for_size"))] +use crate::cmp; +use crate::intrinsics; use crate::mem::{self, MaybeUninit, SizedTypeProperties}; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::{ insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN, }; -use crate::{cmp, intrinsics}; -pub(crate) mod drift; pub(crate) mod merge; + +#[cfg(not(feature = "optimize_for_size"))] +pub(crate) mod drift; +#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod quicksort; +#[cfg(feature = "optimize_for_size")] +pub(crate) mod tiny; + /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll. /// Design document: /// @@ -30,25 +39,48 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less return; } - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; + #[cfg(not(feature = "optimize_for_size"))] + { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + driftsort_main::(v, is_less); } - driftsort_main::(v, is_less); + #[cfg(feature = "optimize_for_size")] + { + let alloc_len = len / 2; + + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. 
+ let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + + tiny::mergesort(v, scratch, is_less); + } } /// See [`sort`] /// /// Deliberately don't inline the main sorting routine entrypoint to ensure the /// inlined insertion sort i-cache footprint remains minimal. +#[cfg(not(feature = "optimize_for_size"))] #[inline(never)] fn driftsort_main bool, BufT: BufGuard>(v: &mut [T], is_less: &mut F) { // By allocating n elements of memory we can ensure the entire input can diff --git a/core/src/slice/sort/stable/tiny.rs b/core/src/slice/sort/stable/tiny.rs new file mode 100644 index 0000000000000..307011f3ee8c4 --- /dev/null +++ b/core/src/slice/sort/stable/tiny.rs @@ -0,0 +1,75 @@ +//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs. + +use crate::mem::{ManuallyDrop, MaybeUninit}; +use crate::ptr; +use crate::slice::sort::stable::merge; + +/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever, +/// no run detection, etc. +#[inline(always)] +pub fn mergesort bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + is_less: &mut F, +) { + let len = v.len(); + + if len > 2 { + let mid = len / 2; + + // SAFETY: mid is in-bounds. + unsafe { + // Sort the left half recursively. + mergesort(v.get_unchecked_mut(..mid), scratch, is_less); + // Sort the right half recursively. + mergesort(v.get_unchecked_mut(mid..), scratch, is_less); + } + + merge::merge(v, scratch, mid, is_less); + } else if len == 2 { + // Branchless swap the two elements. This reduces the recursion depth and improves + // perf significantly at a small binary-size cost. Trades ~10% perf boost for integers + // for ~50 bytes in the binary. + + // SAFETY: We checked the len, the pointers we create are valid and don't overlap. + unsafe { + swap_if_less(v.as_mut_ptr(), 0, 1, is_less); + } + } +} + +/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the +/// value at position `b_pos` is less than the one at position `a_pos`. +unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + // SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid + // pointers into `v_base`, and are properly aligned, and part of the same allocation. + unsafe { + let v_a = v_base.add(a_pos); + let v_b = v_base.add(b_pos); + + // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be + // in a well defined state, without duplicates. + + // Important to only swap if it is more and not if it is equal. is_less should return false for + // equal, so we don't swap. + let should_swap = is_less(&*v_b, &*v_a); + + // This is a branchless version of swap if. + // The equivalent code with a branch would be: + // + // if should_swap { + // ptr::swap(left, right, 1); + // } + + // The goal is to generate cmov instructions here. 
+ let left_swap = if should_swap { v_b } else { v_a }; + let right_swap = if should_swap { v_a } else { v_b }; + + let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap)); + ptr::copy(left_swap, v_a, 1); + ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1); + } +} diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 932e01f4401e5..43b5cc79fc7f3 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -2,10 +2,13 @@ use crate::intrinsics; use crate::mem::SizedTypeProperties; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::find_existing_run; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; pub(crate) mod heapsort; +#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod quicksort; /// Unstable sort called ipnsort by Lukas Bergdoll and Orson Peters. @@ -28,25 +31,37 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { return; } - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; + #[cfg(not(feature = "optimize_for_size"))] + { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + ipnsort(v, is_less); } - ipnsort(v, is_less); + #[cfg(feature = "optimize_for_size")] + { + // SAFETY: We checked that `len >= 2`. + unsafe { + heapsort::heapsort(v, is_less); + } + } } /// See [`sort`] /// /// Deliberately don't inline the main sorting routine entrypoint to ensure the /// inlined insertion sort i-cache footprint remains minimal. +#[cfg(not(feature = "optimize_for_size"))] #[inline(never)] fn ipnsort(v: &mut [T], is_less: &mut F) where diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index cd53656e9b4b8..e55ebe67e1306 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -98,13 +98,15 @@ where return 0; } - // Allows for panic-free code-gen by proving this property to the compiler. if pivot >= len { intrinsics::abort(); } - // Place the pivot at the beginning of slice. - v.swap(0, pivot); + // SAFETY: We checked that `pivot` is in-bounds. + unsafe { + // Place the pivot at the beginning of slice. 
+ v.swap_unchecked(0, pivot); + } let (pivot, v_without_pivot) = v.split_at_mut(1); // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function @@ -118,8 +120,15 @@ where // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn. let num_lt = (const { inst_partition::() })(v_without_pivot, pivot, is_less); - // Place the pivot between the two partitions. - v.swap(0, num_lt); + if num_lt >= len { + intrinsics::abort(); + } + + // SAFETY: We checked that `num_lt` is in-bounds. + unsafe { + // Place the pivot between the two partitions. + v.swap_unchecked(0, num_lt); + } num_lt } From f46fcfed64476c113e4caf7ffcd87c32761b60cd Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 26 Aug 2024 11:17:38 +0200 Subject: [PATCH 2/9] Reduce code duplication by moving partition_lomuto_branchless_simple into quicksort module --- core/src/slice/sort/select.rs | 78 +---------------------- core/src/slice/sort/unstable/mod.rs | 1 - core/src/slice/sort/unstable/quicksort.rs | 39 +++++++++++- 3 files changed, 39 insertions(+), 79 deletions(-) diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index b60e33b3a2f0d..2b3e14755302c 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -6,13 +6,11 @@ //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with //! better performance than one would get using heapsort as fallback. -use crate::intrinsics; use crate::mem::{self, SizedTypeProperties}; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; -#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::unstable::quicksort::partition; /// Reorders the slice such that the element at `index` is at its final sorted position. @@ -252,15 +250,7 @@ fn median_of_ninthers bool>(v: &mut [T], is_less: &mut F) median_of_medians(&mut v[lo..lo + frac], is_less, pivot); - #[cfg(not(feature = "optimize_for_size"))] - { - partition(v, lo + pivot, is_less) - } - - #[cfg(feature = "optimize_for_size")] - { - partition_size_opt(v, lo + pivot, is_less) - } + partition(v, lo + pivot, is_less) } /// Moves around the 9 elements at the indices a..i, such that @@ -351,69 +341,3 @@ fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { n -= 1; } } - -#[cfg(feature = "optimize_for_size")] -fn partition_size_opt(v: &mut [T], pivot: usize, is_less: &mut F) -> usize -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - - // Allows for panic-free code-gen by proving this property to the compiler. - if len == 0 { - return 0; - } - - if pivot >= len { - intrinsics::abort(); - } - - // SAFETY: We checked that `pivot` is in-bounds. - unsafe { - // Place the pivot at the beginning of slice. - v.swap_unchecked(0, pivot); - } - let (pivot, v_without_pivot) = v.split_at_mut(1); - - // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function - // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias. - // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value - // into a stack value, but this creates issues for types with interior mutability mandating - // a drop guard. 
- let pivot = &mut pivot[0]; - - let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less); - - if num_lt >= len { - intrinsics::abort(); - } - - // SAFETY: We checked that `num_lt` is in-bounds. - unsafe { - // Place the pivot between the two partitions. - v.swap_unchecked(0, num_lt); - } - - num_lt -} - -#[cfg(feature = "optimize_for_size")] -fn partition_lomuto_branchless_simple bool>( - v: &mut [T], - pivot: &T, - is_less: &mut F, -) -> usize { - let mut left = 0; - - for right in 0..v.len() { - // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that - // left <= right and that both are in-bounds. - unsafe { - let right_is_lt = is_less(v.get_unchecked(right), pivot); - v.swap_unchecked(left, right); - left += right_is_lt as usize; - } - } - - left -} diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 43b5cc79fc7f3..faac97eab02b8 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -8,7 +8,6 @@ use crate::slice::sort::shared::find_existing_run; use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; pub(crate) mod heapsort; -#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod quicksort; /// Unstable sort called ipnsort by Lukas Bergdoll and Orson Peters. diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index e55ebe67e1306..83751d99cc261 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -1,7 +1,9 @@ //! This module contains an unstable quicksort and two partition implementations. use crate::mem::{self, ManuallyDrop}; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; +#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl; use crate::{intrinsics, ptr}; @@ -11,6 +13,7 @@ use crate::{intrinsics, ptr}; /// /// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero, /// this function will immediately switch to heapsort. +#[cfg(not(feature = "optimize_for_size"))] pub(crate) fn quicksort<'a, T, F>( mut v: &'a mut [T], mut ancestor_pivot: Option<&'a T>, @@ -138,7 +141,16 @@ const fn inst_partition bool>() -> fn(&mut [T], &T, &mut if mem::size_of::() <= MAX_BRANCHLESS_PARTITION_SIZE { // Specialize for types that are relatively cheap to copy, where branchless optimizations // have large leverage e.g. `u64` and `String`. - partition_lomuto_branchless_cyclic:: + + #[cfg(not(feature = "optimize_for_size"))] + { + partition_lomuto_branchless_cyclic:: + } + + #[cfg(feature = "optimize_for_size")] + { + partition_lomuto_branchless_simple:: + } } else { partition_hoare_branchy_cyclic:: } @@ -224,6 +236,7 @@ where } } +#[cfg(not(feature = "optimize_for_size"))] struct PartitionState { // The current element that is being looked at, scans left to right through slice. 
right: *mut T, @@ -234,6 +247,7 @@ struct PartitionState { gap: GapGuardRaw, } +#[cfg(not(feature = "optimize_for_size"))] fn partition_lomuto_branchless_cyclic(v: &mut [T], pivot: &T, is_less: &mut F) -> usize where F: FnMut(&T, &T) -> bool, @@ -325,6 +339,27 @@ where } } +#[cfg(feature = "optimize_for_size")] +fn partition_lomuto_branchless_simple bool>( + v: &mut [T], + pivot: &T, + is_less: &mut F, +) -> usize { + let mut left = 0; + + for right in 0..v.len() { + // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that + // left <= right and that both are in-bounds. + unsafe { + let right_is_lt = is_less(v.get_unchecked(right), pivot); + v.swap_unchecked(left, right); + left += right_is_lt as usize; + } + } + + left +} + struct GapGuard { pos: *mut T, value: ManuallyDrop, @@ -342,11 +377,13 @@ impl Drop for GapGuard { /// Ideally this wouldn't be needed and we could just use the regular GapGuard. /// See comment in [`partition_lomuto_branchless_cyclic`]. +#[cfg(not(feature = "optimize_for_size"))] struct GapGuardRaw { pos: *mut T, value: *mut T, } +#[cfg(not(feature = "optimize_for_size"))] impl Drop for GapGuardRaw { fn drop(&mut self) { // SAFETY: `self` MUST be constructed in a way that makes copying the gap value into From 2fa330eedddb7cba1c8e5210166a734246a582d5 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 26 Aug 2024 12:21:37 +0200 Subject: [PATCH 3/9] Convert cfg blocks to cfg_if --- core/src/slice/sort/select.rs | 28 +++++----- core/src/slice/sort/stable/mod.rs | 64 +++++++++++------------ core/src/slice/sort/unstable/mod.rs | 40 +++++++------- core/src/slice/sort/unstable/quicksort.rs | 15 +++--- 4 files changed, 68 insertions(+), 79 deletions(-) diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index 2b3e14755302c..28ca9dcad39a7 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -42,14 +42,12 @@ where let min_idx = min_index(v, &mut is_less).unwrap(); v.swap(min_idx, index); } else { - #[cfg(not(feature = "optimize_for_size"))] - { - partition_at_index_loop(v, index, None, &mut is_less); - } - - #[cfg(feature = "optimize_for_size")] - { - median_of_medians(v, &mut is_less, index); + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + median_of_medians(v, &mut is_less, index); + } else { + partition_at_index_loop(v, index, None, &mut is_less); + } } } @@ -178,14 +176,12 @@ fn median_of_medians bool>(mut v: &mut [T], is_less: &mut loop { if v.len() <= INSERTION_SORT_THRESHOLD { if v.len() >= 2 { - #[cfg(not(feature = "optimize_for_size"))] - { - insertion_sort_shift_left(v, 1, is_less); - } - - #[cfg(feature = "optimize_for_size")] - { - bubble_sort(v, is_less); + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + bubble_sort(v, is_less); + } else { + insertion_sort_shift_left(v, 1, is_less); + } } } diff --git a/core/src/slice/sort/stable/mod.rs b/core/src/slice/sort/stable/mod.rs index a61a95a225455..3472401c4dcf8 100644 --- a/core/src/slice/sort/stable/mod.rs +++ b/core/src/slice/sort/stable/mod.rs @@ -39,40 +39,38 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less return; } - #[cfg(not(feature = "optimize_for_size"))] - { - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. 
The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; - } - - driftsort_main::(v, is_less); - } - - #[cfg(feature = "optimize_for_size")] - { - let alloc_len = len / 2; - - // For small inputs 4KiB of stack storage suffices, which allows us to avoid - // calling the (de-)allocator. Benchmarks showed this was quite beneficial. - let mut stack_buf = AlignedStorage::::new(); - let stack_scratch = stack_buf.as_uninit_slice_mut(); - let mut heap_buf; - let scratch = if stack_scratch.len() >= alloc_len { - stack_scratch + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + let alloc_len = len / 2; + + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. + let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + + tiny::mergesort(v, scratch, is_less); } else { - heap_buf = BufT::with_capacity(alloc_len); - heap_buf.as_uninit_slice_mut() - }; - - tiny::mergesort(v, scratch, is_less); + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + driftsort_main::(v, is_less); + } } } diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index faac97eab02b8..130be21ee3fe8 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -30,28 +30,26 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { return; } - #[cfg(not(feature = "optimize_for_size"))] - { - // More advanced sorting methods than insertion sort are faster if called in - // a hot loop for small inputs, but for general-purpose code the small - // binary size of insertion sort is more important. The instruction cache in - // modern processors is very valuable, and for a single sort call in general - // purpose code any gains from an advanced method are cancelled by i-cache - // misses during the sort, and thrashing the i-cache for surrounding code. - const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; - if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { - insertion_sort_shift_left(v, 1, is_less); - return; - } - - ipnsort(v, is_less); - } + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + // SAFETY: We checked that `len >= 2`. 
+ unsafe { + heapsort::heapsort(v, is_less); + } + } else { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } - #[cfg(feature = "optimize_for_size")] - { - // SAFETY: We checked that `len >= 2`. - unsafe { - heapsort::heapsort(v, is_less); + ipnsort(v, is_less); } } } diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index 83751d99cc261..9c59ccdb70005 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -141,15 +141,12 @@ const fn inst_partition bool>() -> fn(&mut [T], &T, &mut if mem::size_of::() <= MAX_BRANCHLESS_PARTITION_SIZE { // Specialize for types that are relatively cheap to copy, where branchless optimizations // have large leverage e.g. `u64` and `String`. - - #[cfg(not(feature = "optimize_for_size"))] - { - partition_lomuto_branchless_cyclic:: - } - - #[cfg(feature = "optimize_for_size")] - { - partition_lomuto_branchless_simple:: + cfg_if! { + if #[cfg(feature = "optimize_for_size")] { + partition_lomuto_branchless_simple:: + } else { + partition_lomuto_branchless_cyclic:: + } } } else { partition_hoare_branchy_cyclic:: From ae57bdf05e4066321f2841f9a604868d3525675e Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 27 Aug 2024 10:01:48 +0200 Subject: [PATCH 4/9] Use last swap optimization in bubblesort --- core/src/slice/sort/select.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index 28ca9dcad39a7..7bda54e792528 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -317,23 +317,32 @@ fn median_idx bool>( // It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it // would clutter that module with cfg statements and make it generally harder to read and develop. -// So to decouple things and simplify it, we use a an even smaller bubble sort. +// So to decouple things and simplify it, we use an even smaller bubble sort. #[cfg(feature = "optimize_for_size")] fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { + use crate::ptr; + let mut n = v.len(); - let mut did_swap = true; - while did_swap && n > 1 { - did_swap = false; - for i in 1..n { + let v_base = v.as_mut_ptr(); + + while n > 1 { + let loop_n = n; + n = 0; + for i in 1..loop_n { // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds. unsafe { - if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) { - v.swap_unchecked(i - 1, i); - did_swap = true; + // Even if `is_less` erroneously always returns true, we are guaranteed that `n` + // reduces by one each out loop iteration, because `1..n` is exclusive. This + // guarantees a bounded run-time should `Ord` be implemented incorrectly. 
+ let v_i = v_base.add(i); + let v_i_minus_one = v_base.add(i - 1); + + if is_less(&*v_i, &*v_i_minus_one) { + ptr::swap_nonoverlapping(v_i, v_i_minus_one, 1); + n = i; } } } - n -= 1; } } From 717e3aa5f1fa580c1984c3976d18ef66364c8185 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Thu, 29 Aug 2024 10:32:59 +0200 Subject: [PATCH 5/9] Use simpler branchy swap logic in tiny merge sort Avoids the code duplication issue and results in smaller binary size, which after all is the purpose of the feature. --- core/src/slice/sort/shared/smallsort.rs | 5 +++ core/src/slice/sort/stable/tiny.rs | 50 ++++--------------------- 2 files changed, 13 insertions(+), 42 deletions(-) diff --git a/core/src/slice/sort/shared/smallsort.rs b/core/src/slice/sort/shared/smallsort.rs index 5027962ccb415..aaa9f9f1c7fd1 100644 --- a/core/src/slice/sort/shared/smallsort.rs +++ b/core/src/slice/sort/shared/smallsort.rs @@ -378,6 +378,11 @@ where /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the /// value at position `b_pos` is less than the one at position `a_pos`. +/// +/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integers like +/// types. `is_less` could be a huge function and we want to give the compiler an option to +/// not inline this function. For the same reasons that this function is very perf critical +/// it should be in the same module as the functions that use it. unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) where F: FnMut(&T, &T) -> bool, diff --git a/core/src/slice/sort/stable/tiny.rs b/core/src/slice/sort/stable/tiny.rs index 307011f3ee8c4..071ab8e107fe3 100644 --- a/core/src/slice/sort/stable/tiny.rs +++ b/core/src/slice/sort/stable/tiny.rs @@ -1,6 +1,6 @@ //! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs. -use crate::mem::{ManuallyDrop, MaybeUninit}; +use crate::mem::MaybeUninit; use crate::ptr; use crate::slice::sort::stable::merge; @@ -27,49 +27,15 @@ pub fn mergesort bool>( merge::merge(v, scratch, mid, is_less); } else if len == 2 { - // Branchless swap the two elements. This reduces the recursion depth and improves - // perf significantly at a small binary-size cost. Trades ~10% perf boost for integers - // for ~50 bytes in the binary. - // SAFETY: We checked the len, the pointers we create are valid and don't overlap. unsafe { - swap_if_less(v.as_mut_ptr(), 0, 1, is_less); - } - } -} - -/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the -/// value at position `b_pos` is less than the one at position `a_pos`. -unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - // SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid - // pointers into `v_base`, and are properly aligned, and part of the same allocation. - unsafe { - let v_a = v_base.add(a_pos); - let v_b = v_base.add(b_pos); + let v_base = v.as_mut_ptr(); + let v_a = v_base; + let v_b = v_base.add(1); - // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be - // in a well defined state, without duplicates. - - // Important to only swap if it is more and not if it is equal. is_less should return false for - // equal, so we don't swap. - let should_swap = is_less(&*v_b, &*v_a); - - // This is a branchless version of swap if. 
- // The equivalent code with a branch would be: - // - // if should_swap { - // ptr::swap(left, right, 1); - // } - - // The goal is to generate cmov instructions here. - let left_swap = if should_swap { v_b } else { v_a }; - let right_swap = if should_swap { v_a } else { v_b }; - - let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap)); - ptr::copy(left_swap, v_a, 1); - ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1); + if is_less(&*v_b, &*v_a) { + ptr::swap_nonoverlapping(v_a, v_b, 1); + } + } } } From bea61dafef2da35f347f7cdfb09e8dc6be428fb9 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 18:58:44 +0200 Subject: [PATCH 6/9] Drop bubble_sort While using it results in slightly slammer binaries, it's not deemed worth it to add yet another sort algorithm to the standard library. select_nth_unstable has bigger binary-size problems. --- core/src/slice/sort/mod.rs | 1 - core/src/slice/sort/select.rs | 41 +------------------------------ core/src/slice/sort/shared/mod.rs | 2 ++ 3 files changed, 3 insertions(+), 41 deletions(-) diff --git a/core/src/slice/sort/mod.rs b/core/src/slice/sort/mod.rs index 81b99a4364b2e..79852708b81ea 100644 --- a/core/src/slice/sort/mod.rs +++ b/core/src/slice/sort/mod.rs @@ -5,5 +5,4 @@ pub mod stable; pub mod unstable; pub(crate) mod select; -#[cfg(not(feature = "optimize_for_size"))] pub(crate) mod shared; diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs index 7bda54e792528..3358c03d30a9b 100644 --- a/core/src/slice/sort/select.rs +++ b/core/src/slice/sort/select.rs @@ -9,7 +9,6 @@ use crate::mem::{self, SizedTypeProperties}; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::pivot::choose_pivot; -#[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; use crate::slice::sort::unstable::quicksort::partition; @@ -176,13 +175,7 @@ fn median_of_medians bool>(mut v: &mut [T], is_less: &mut loop { if v.len() <= INSERTION_SORT_THRESHOLD { if v.len() >= 2 { - cfg_if! { - if #[cfg(feature = "optimize_for_size")] { - bubble_sort(v, is_less); - } else { - insertion_sort_shift_left(v, 1, is_less); - } - } + insertion_sort_shift_left(v, 1, is_less); } return; @@ -314,35 +307,3 @@ fn median_idx bool>( } b } - -// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it -// would clutter that module with cfg statements and make it generally harder to read and develop. -// So to decouple things and simplify it, we use an even smaller bubble sort. -#[cfg(feature = "optimize_for_size")] -fn bubble_sort bool>(v: &mut [T], is_less: &mut F) { - use crate::ptr; - - let mut n = v.len(); - - let v_base = v.as_mut_ptr(); - - while n > 1 { - let loop_n = n; - n = 0; - for i in 1..loop_n { - // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds. - unsafe { - // Even if `is_less` erroneously always returns true, we are guaranteed that `n` - // reduces by one each out loop iteration, because `1..n` is exclusive. This - // guarantees a bounded run-time should `Ord` be implemented incorrectly. 
- let v_i = v_base.add(i); - let v_i_minus_one = v_base.add(i - 1); - - if is_less(&*v_i, &*v_i_minus_one) { - ptr::swap_nonoverlapping(v_i, v_i_minus_one, 1); - n = i; - } - } - } - } -} diff --git a/core/src/slice/sort/shared/mod.rs b/core/src/slice/sort/shared/mod.rs index ad1171bfc6a0a..e0f8d475a2e30 100644 --- a/core/src/slice/sort/shared/mod.rs +++ b/core/src/slice/sort/shared/mod.rs @@ -1,3 +1,5 @@ +#![cfg_attr(feature = "optimize_for_size", allow(dead_code))] + use crate::marker::Freeze; pub(crate) mod pivot; From cd3d6e88bc9d02ed5e00536e3a1a70d2472d2f9b Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 19:37:49 +0200 Subject: [PATCH 7/9] Shrink heapsort further by combining sift_down loops --- core/src/slice/sort/unstable/heapsort.rs | 36 +++++++++++------------ core/src/slice/sort/unstable/mod.rs | 5 +--- core/src/slice/sort/unstable/quicksort.rs | 7 ++--- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/core/src/slice/sort/unstable/heapsort.rs b/core/src/slice/sort/unstable/heapsort.rs index 27e2ad588ea09..35bfe2959cd78 100644 --- a/core/src/slice/sort/unstable/heapsort.rs +++ b/core/src/slice/sort/unstable/heapsort.rs @@ -1,46 +1,46 @@ //! This module contains a branchless heapsort as fallback for unstable quicksort. -use crate::{intrinsics, ptr}; +use crate::{cmp, intrinsics, ptr}; /// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case. /// /// Never inline this, it sits the main hot-loop in `recurse` and is meant as unlikely algorithmic /// fallback. -/// -/// SAFETY: The caller has to guarantee that `v.len()` >= 2. #[inline(never)] -pub(crate) unsafe fn heapsort(v: &mut [T], is_less: &mut F) +pub(crate) fn heapsort(v: &mut [T], is_less: &mut F) where F: FnMut(&T, &T) -> bool, { - // SAFETY: See function safety. - unsafe { - intrinsics::assume(v.len() >= 2); - - // Build the heap in linear time. - for i in (0..v.len() / 2).rev() { - sift_down(v, i, is_less); - } + let len = v.len(); - // Pop maximal elements from the heap. - for i in (1..v.len()).rev() { + for i in (0..len + len / 2).rev() { + let sift_idx = if i >= len { + i - len + } else { v.swap(0, i); - sift_down(&mut v[..i], 0, is_less); + 0 + }; + + // SAFETY: The above calculation ensures that `sift_idx` is either 0 or + // `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`. + // This guarantees the required `sift_idx <= len`. + unsafe { + sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less); } } } // This binary heap respects the invariant `parent >= child`. // -// SAFETY: The caller has to guarantee that node < `v.len()`. -#[inline(never)] +// SAFETY: The caller has to guarantee that `node <= v.len()`. +#[inline(always)] unsafe fn sift_down(v: &mut [T], mut node: usize, is_less: &mut F) where F: FnMut(&T, &T) -> bool, { // SAFETY: See function safety. unsafe { - intrinsics::assume(node < v.len()); + intrinsics::assume(node <= v.len()); } let len = v.len(); diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 130be21ee3fe8..953c27ab6f417 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -32,10 +32,7 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { cfg_if! { if #[cfg(feature = "optimize_for_size")] { - // SAFETY: We checked that `len >= 2`. 
- unsafe { - heapsort::heapsort(v, is_less); - } + heapsort::heapsort(v, is_less); } else { // More advanced sorting methods than insertion sort are faster if called in // a hot loop for small inputs, but for general-purpose code the small diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs index 9c59ccdb70005..4feef5deeb0fb 100644 --- a/core/src/slice/sort/unstable/quicksort.rs +++ b/core/src/slice/sort/unstable/quicksort.rs @@ -5,6 +5,8 @@ use crate::mem::{self, ManuallyDrop}; use crate::slice::sort::shared::pivot::choose_pivot; #[cfg(not(feature = "optimize_for_size"))] use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl; +#[cfg(not(feature = "optimize_for_size"))] +use crate::slice::sort::unstable::heapsort; use crate::{intrinsics, ptr}; /// Sorts `v` recursively. @@ -31,10 +33,7 @@ pub(crate) fn quicksort<'a, T, F>( // If too many bad pivot choices were made, simply fall back to heapsort in order to // guarantee `O(N x log(N))` worst-case. if limit == 0 { - // SAFETY: We assume the `small_sort` threshold is at least 1. - unsafe { - crate::slice::sort::unstable::heapsort::heapsort(v, is_less); - } + heapsort::heapsort(v, is_less); return; } From 670630da181204654adbb34ea06a128f56b0d7c3 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 19:53:56 +0200 Subject: [PATCH 8/9] Select tiny sorts for 16-bit platforms Also skips stack alloc in stable sort if 16-bit target platform. --- core/src/slice/sort/stable/mod.rs | 31 ++++++++++++++++++----------- core/src/slice/sort/unstable/mod.rs | 2 +- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/core/src/slice/sort/stable/mod.rs b/core/src/slice/sort/stable/mod.rs index 3472401c4dcf8..00eb3785e0f25 100644 --- a/core/src/slice/sort/stable/mod.rs +++ b/core/src/slice/sort/stable/mod.rs @@ -40,20 +40,27 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less } cfg_if! { - if #[cfg(feature = "optimize_for_size")] { + if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] { let alloc_len = len / 2; - // For small inputs 4KiB of stack storage suffices, which allows us to avoid - // calling the (de-)allocator. Benchmarks showed this was quite beneficial. - let mut stack_buf = AlignedStorage::::new(); - let stack_scratch = stack_buf.as_uninit_slice_mut(); - let mut heap_buf; - let scratch = if stack_scratch.len() >= alloc_len { - stack_scratch - } else { - heap_buf = BufT::with_capacity(alloc_len); - heap_buf.as_uninit_slice_mut() - }; + cfg_if! { + if #[cfg(target_pointer_width = "16")] { + let heap_buf = BufT::with_capacity(alloc_len); + let scratch = heap_buf.as_uninit_slice_mut(); + } else { + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. + let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + } + } tiny::mergesort(v, scratch, is_less); } else { diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs index 953c27ab6f417..8bbd85443d478 100644 --- a/core/src/slice/sort/unstable/mod.rs +++ b/core/src/slice/sort/unstable/mod.rs @@ -31,7 +31,7 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { } cfg_if! 
{ - if #[cfg(feature = "optimize_for_size")] { + if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] { heapsort::heapsort(v, is_less); } else { // More advanced sorting methods than insertion sort are faster if called in From 5446229e7af30ff64a7e322e059bc514509c8fbe Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 4 Sep 2024 19:54:46 +0200 Subject: [PATCH 9/9] Use non-overlapping swap for inner heapsort loop This regresses binary-size slightly for normal builds, but the important release_lto_thin_opt_level_s config sees a small improvement in binary-size and a larger types such as string and 1k see 2-3% run-time improvements with this change. --- core/src/slice/sort/unstable/heapsort.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/core/src/slice/sort/unstable/heapsort.rs b/core/src/slice/sort/unstable/heapsort.rs index 35bfe2959cd78..85231779d031f 100644 --- a/core/src/slice/sort/unstable/heapsort.rs +++ b/core/src/slice/sort/unstable/heapsort.rs @@ -69,9 +69,7 @@ where break; } - // Swap `node` with the greater child, move one step down, and continue sifting. This - // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size. - ptr::swap(v_base.add(node), v_base.add(child)); + ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1); } node = child;
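
As a compact reference for the partition strategy used on the size-optimized path, below is a minimal, self-contained sketch that mirrors `partition_lomuto_branchless_simple` from this series. It is illustrative only, not the in-tree code: it uses safe indexing and an explicit branch for clarity, whereas the in-tree version works on raw pointers, swaps unconditionally and advances `left` by `right_is_lt as usize` to stay branchless, and leaves placing the pivot to its caller (`partition`). The function name and the `main` driver are made up for this sketch.

fn partition_lomuto_simple<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    pivot_idx: usize,
    is_less: &mut F,
) -> usize {
    if v.is_empty() {
        return 0;
    }
    // Move the pivot to the front so it stays out of the way of the scan.
    v.swap(0, pivot_idx);

    // Invariant: v[1..left] holds the elements seen so far that are less than the pivot.
    let mut left = 1;
    for right in 1..v.len() {
        if is_less(&v[right], &v[0]) {
            v.swap(left, right);
            left += 1;
        }
    }

    // Place the pivot between the two partitions and return its final index, which
    // equals the number of elements that compared less than it (num_lt in the patch).
    v.swap(0, left - 1);
    left - 1
}

fn main() {
    let mut data = [5, 1, 4, 2, 3];
    let mid = partition_lomuto_simple(&mut data, 2, &mut |a, b| a < b);
    assert!(data[..mid].iter().all(|x| *x < data[mid]));
    assert!(data[mid + 1..].iter().all(|x| *x >= data[mid]));
}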