Commit 1805c29

Add binary-size optimized variants for stable and unstable sort as well as select_nth_unstable
- Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function.
- Unstable sort jumps directly to the branchless heapsort fallback.
- select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a custom tiny smallsort and partition impl.

Some code is duplicated, but de-duplication would bring its own problems. For example, `swap_if_less` is critical for performance: if the sorting networks don't inline it, perf drops drastically. However, `#[inline(always)]` is also a poor fit; if the provided comparison function is huge, it gives the compiler an out to only instantiate `swap_if_less` once and call it. Another aspect that would suffer when making `swap_if_less` pub is having to cfg out dozens of functions in the smallsort module.
1 parent e2614f2 commit 1805c29
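To illustrate the `swap_if_less` inlining concern from the commit message: a sorting network is a fixed sequence of compare-exchange steps, so any per-call overhead is paid once per comparator. The sketch below is hypothetical (a safe, branching `sort4`, not this commit's code; the real networks in `core/src/slice/sort/shared/smallsort.rs` use a branchless, unsafe swap), but it shows the call pattern that only performs well when `swap_if_less` is inlined into the network.

```rust
/// Hypothetical illustration only, not the committed implementation.
#[inline(always)]
fn swap_if_less<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], a: usize, b: usize, is_less: &mut F) {
    if is_less(&v[b], &v[a]) {
        v.swap(a, b);
    }
}

/// Optimal 5-comparator network for 4 elements: each call must collapse to a
/// handful of instructions, otherwise the network pays call overhead per step.
fn sort4<T, F: FnMut(&T, &T) -> bool>(v: &mut [T; 4], is_less: &mut F) {
    swap_if_less(v, 0, 1, is_less);
    swap_if_less(v, 2, 3, is_less);
    swap_if_less(v, 0, 2, is_less);
    swap_if_less(v, 1, 3, is_less);
    swap_if_less(v, 1, 2, is_less);
}

fn main() {
    let mut v = [3, 1, 4, 2];
    sort4(&mut v, &mut |a: &i32, b: &i32| a < b);
    assert_eq!(v, [1, 2, 3, 4]);
}
```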

7 files changed: +284 -33 lines changed


core/src/slice/sort/mod.rs

+1
@@ -5,4 +5,5 @@ pub mod stable;
 pub mod unstable;
 
 pub(crate) mod select;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod shared;

core/src/slice/sort/select.rs

+122-3
@@ -6,9 +6,13 @@
 //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
 //! better performance than one would get using heapsort as fallback.
 
+use crate::intrinsics;
 use crate::mem::{self, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::unstable::quicksort::partition;
 
 /// Reorders the slice such that the element at `index` is at its final sorted position.
@@ -40,7 +44,15 @@ where
         let min_idx = min_index(v, &mut is_less).unwrap();
         v.swap(min_idx, index);
     } else {
-        partition_at_index_loop(v, index, None, &mut is_less);
+        #[cfg(not(feature = "optimize_for_size"))]
+        {
+            partition_at_index_loop(v, index, None, &mut is_less);
+        }
+
+        #[cfg(feature = "optimize_for_size")]
+        {
+            median_of_medians(v, &mut is_less, index);
+        }
     }
 
     let (left, right) = v.split_at_mut(index);
@@ -53,6 +65,7 @@ where
 // most once, it doesn't make sense to use something more sophisticated than insertion-sort.
 const INSERTION_SORT_THRESHOLD: usize = 16;
 
+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_at_index_loop<'a, T, F>(
     mut v: &'a mut [T],
     mut index: usize,
@@ -167,8 +180,17 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
     loop {
         if v.len() <= INSERTION_SORT_THRESHOLD {
            if v.len() >= 2 {
-                insertion_sort_shift_left(v, 1, is_less);
+                #[cfg(not(feature = "optimize_for_size"))]
+                {
+                    insertion_sort_shift_left(v, 1, is_less);
+                }
+
+                #[cfg(feature = "optimize_for_size")]
+                {
+                    bubble_sort(v, is_less);
+                }
             }
+
             return;
         }
 
@@ -230,7 +252,15 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)
 
     median_of_medians(&mut v[lo..lo + frac], is_less, pivot);
 
-    partition(v, lo + pivot, is_less)
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        partition(v, lo + pivot, is_less)
+    }
+
+    #[cfg(feature = "optimize_for_size")]
+    {
+        partition_size_opt(v, lo + pivot, is_less)
+    }
 }
 
 /// Moves around the 9 elements at the indices a..i, such that
@@ -298,3 +328,92 @@ fn median_idx<T, F: FnMut(&T, &T) -> bool>(
     }
     b
 }
+
+// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it
+// would clutter that module with cfg statements and make it generally harder to read and develop.
+// So to decouple things and simplify it, we use an even smaller bubble sort.
+#[cfg(feature = "optimize_for_size")]
+fn bubble_sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
+    let mut n = v.len();
+    let mut did_swap = true;
+
+    while did_swap && n > 1 {
+        did_swap = false;
+        for i in 1..n {
+            // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds.
+            unsafe {
+                if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) {
+                    v.swap_unchecked(i - 1, i);
+                    did_swap = true;
+                }
+            }
+        }
+        n -= 1;
+    }
+}
+
+#[cfg(feature = "optimize_for_size")]
+fn partition_size_opt<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    // Allows for panic-free code-gen by proving this property to the compiler.
+    if len == 0 {
+        return 0;
+    }
+
+    if pivot >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `pivot` is in-bounds.
+    unsafe {
+        // Place the pivot at the beginning of slice.
+        v.swap_unchecked(0, pivot);
+    }
+    let (pivot, v_without_pivot) = v.split_at_mut(1);
+
+    // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
+    // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
+    // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
+    // into a stack value, but this creates issues for types with interior mutability mandating
+    // a drop guard.
+    let pivot = &mut pivot[0];
+
+    let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less);
+
+    if num_lt >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `num_lt` is in-bounds.
+    unsafe {
+        // Place the pivot between the two partitions.
+        v.swap_unchecked(0, num_lt);
+    }
+
+    num_lt
+}
+
+#[cfg(feature = "optimize_for_size")]
+fn partition_lomuto_branchless_simple<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    pivot: &T,
+    is_less: &mut F,
+) -> usize {
+    let mut left = 0;
+
+    for right in 0..v.len() {
+        // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that
+        // left <= right and that both are in-bounds.
+        unsafe {
+            let right_is_lt = is_less(v.get_unchecked(right), pivot);
+            v.swap_unchecked(left, right);
+            left += right_is_lt as usize;
+        }
+    }
+
+    left
+}
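For reference, here is a safe-Rust sketch of the branchless Lomuto scheme used by `partition_lomuto_branchless_simple` above (a hypothetical standalone helper, not the committed code): the swap happens unconditionally and the partition boundary only advances via an integer add, so the hot loop contains no data-dependent branch.

```rust
/// Hypothetical safe-Rust sketch of a branchless Lomuto partition. Returns the
/// number of elements less than `pivot`; those elements end up at the front.
fn partition_lomuto_branchless<T: Ord>(v: &mut [T], pivot: &T) -> usize {
    let mut left = 0;
    for right in 0..v.len() {
        // Invariant: v[..left] < pivot and v[left..right] >= pivot.
        let right_is_lt = v[right] < *pivot;
        // Swap unconditionally; if v[right] was >= pivot this exchanges two
        // >= elements, otherwise it moves the < element to the boundary.
        v.swap(left, right);
        // Branch-free boundary advance.
        left += right_is_lt as usize;
    }
    left
}

fn main() {
    // Partition around a pivot value held outside the slice.
    let pivot = 5;
    let mut data = [9, 1, 7, 3, 5, 2, 8];
    let num_lt = partition_lomuto_branchless(&mut data, &pivot);
    assert!(data[..num_lt].iter().all(|x| *x < pivot));
    assert!(data[num_lt..].iter().all(|x| *x >= pivot));
}
```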

core/src/slice/sort/shared/smallsort.rs

+1-1
@@ -378,7 +378,7 @@ where
 
 /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
 /// value at position `b_pos` is less than the one at position `a_pos`.
-pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {

core/src/slice/sort/stable/mod.rs

+45-13
@@ -1,15 +1,24 @@
 //! This module contains the entry points for `slice::sort`.
 
+#[cfg(not(feature = "optimize_for_size"))]
+use crate::cmp;
+use crate::intrinsics;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::{
     insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
 };
-use crate::{cmp, intrinsics};
 
-pub(crate) mod drift;
 pub(crate) mod merge;
+
+#[cfg(not(feature = "optimize_for_size"))]
+pub(crate) mod drift;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;
 
+#[cfg(feature = "optimize_for_size")]
+pub(crate) mod tiny;
+
 /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
 /// Design document:
 /// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
@@ -30,25 +39,48 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
         return;
     }
 
-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        // More advanced sorting methods than insertion sort are faster if called in
+        // a hot loop for small inputs, but for general-purpose code the small
+        // binary size of insertion sort is more important. The instruction cache in
+        // modern processors is very valuable, and for a single sort call in general
+        // purpose code any gains from an advanced method are cancelled by i-cache
+        // misses during the sort, and thrashing the i-cache for surrounding code.
+        const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+        if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+            insertion_sort_shift_left(v, 1, is_less);
+            return;
+        }
+
+        driftsort_main::<T, F, BufT>(v, is_less);
     }
 
-    driftsort_main::<T, F, BufT>(v, is_less);
+    #[cfg(feature = "optimize_for_size")]
+    {
+        let alloc_len = len / 2;
+
+        // For small inputs 4KiB of stack storage suffices, which allows us to avoid
+        // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
+        let mut stack_buf = AlignedStorage::<T, 4096>::new();
+        let stack_scratch = stack_buf.as_uninit_slice_mut();
+        let mut heap_buf;
+        let scratch = if stack_scratch.len() >= alloc_len {
+            stack_scratch
+        } else {
+            heap_buf = BufT::with_capacity(alloc_len);
+            heap_buf.as_uninit_slice_mut()
+        };
+
+        tiny::mergesort(v, scratch, is_less);
+    }
 }
 
 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
     // By allocating n elements of memory we can ensure the entire input can

core/src/slice/sort/stable/tiny.rs

+75
@@ -0,0 +1,75 @@
+//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs.
+
+use crate::mem::{ManuallyDrop, MaybeUninit};
+use crate::ptr;
+use crate::slice::sort::stable::merge;
+
+/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
+/// no run detection, etc.
+#[inline(always)]
+pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    scratch: &mut [MaybeUninit<T>],
+    is_less: &mut F,
+) {
+    let len = v.len();
+
+    if len > 2 {
+        let mid = len / 2;
+
+        // SAFETY: mid is in-bounds.
+        unsafe {
+            // Sort the left half recursively.
+            mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
+            // Sort the right half recursively.
+            mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
+        }
+
+        merge::merge(v, scratch, mid, is_less);
+    } else if len == 2 {
+        // Branchless swap the two elements. This reduces the recursion depth and improves
+        // perf significantly at a small binary-size cost. Trades ~10% perf boost for integers
+        // for ~50 bytes in the binary.
+
+        // SAFETY: We checked the len, the pointers we create are valid and don't overlap.
+        unsafe {
+            swap_if_less(v.as_mut_ptr(), 0, 1, is_less);
+        }
+    }
+}
+
+/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
+/// value at position `b_pos` is less than the one at position `a_pos`.
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    // SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid
+    // pointers into `v_base`, and are properly aligned, and part of the same allocation.
+    unsafe {
+        let v_a = v_base.add(a_pos);
+        let v_b = v_base.add(b_pos);
+
+        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
+        // in a well defined state, without duplicates.
+
+        // Important to only swap if it is more and not if it is equal. is_less should return false for
+        // equal, so we don't swap.
+        let should_swap = is_less(&*v_b, &*v_a);
+
+        // This is a branchless version of swap if.
+        // The equivalent code with a branch would be:
+        //
+        // if should_swap {
+        //     ptr::swap(left, right, 1);
+        // }
+
+        // The goal is to generate cmov instructions here.
+        let left_swap = if should_swap { v_b } else { v_a };
+        let right_swap = if should_swap { v_a } else { v_b };
+
+        let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
+        ptr::copy(left_swap, v_a, 1);
+        ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
+    }
+}
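A standalone sketch of the same top-down shape as `tiny::mergesort`, written with safe std types (hypothetical, not the committed code). It also illustrates why `stable/mod.rs` above only reserves `len / 2` elements of scratch: the merge only has to buffer one of the two runs, and with `mid = len / 2` that run never exceeds half the slice.

```rust
// Hypothetical standalone sketch; `T: Ord + Copy` keeps the merge trivially
// safe, unlike the MaybeUninit-based core implementation.
fn mergesort_sketch<T: Ord + Copy>(v: &mut [T], scratch: &mut Vec<T>) {
    let len = v.len();
    if len <= 1 {
        return;
    }
    let mid = len / 2;
    mergesort_sketch(&mut v[..mid], scratch);
    mergesort_sketch(&mut v[mid..], scratch);

    // Buffer the left run (at most len / 2 elements), then merge it with the
    // right run directly back into `v`. Ties take the left run, so it's stable.
    scratch.clear();
    scratch.extend_from_slice(&v[..mid]);
    let (mut i, mut j, mut out) = (0, mid, 0);
    while i < scratch.len() && j < len {
        if v[j] < scratch[i] {
            v[out] = v[j];
            j += 1;
        } else {
            v[out] = scratch[i];
            i += 1;
        }
        out += 1;
    }
    // Any leftover right-run elements are already in place; flush the left run.
    while i < scratch.len() {
        v[out] = scratch[i];
        i += 1;
        out += 1;
    }
}

fn main() {
    let mut data = vec![5, 3, 9, 1, 4, 8, 2];
    let mut scratch = Vec::with_capacity(data.len() / 2);
    mergesort_sketch(&mut data, &mut scratch);
    assert!(data.windows(2).all(|w| w[0] <= w[1]));
}
```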
