Commit 363ae41

Auto merge of rust-lang#129587 - Voultapher:opt-for-size-variants-of-sort-impls, r=cuviper

Add `optimize_for_size` variants for stable and unstable sort as well as select_nth_unstable

- Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function.
- Unstable sort jumps directly to the branchless heapsort fallback.
- select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a custom tiny smallsort and partition impl.

Some code is duplicated, but de-duplication would bring its own problems. For example, `swap_if_less` is critical for performance: if the sorting networks don't inline it, perf drops drastically. However, `#[inline(always)]` is also a poor fit, because when the provided comparison function is huge we want to leave the compiler an out to instantiate `swap_if_less` only once and call it. Another aspect that would suffer when making `swap_if_less` pub is having to cfg out dozens of functions in the smallsort module.

Part of rust-lang#125612

r? `@Kobzol`
2 parents 67bb749 + 5439198 commit 363ae41
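To make the `swap_if_less` trade-off concrete, here is a self-contained, hypothetical sketch of the pattern (illustrative only; the commit's helper works on raw pointers inside the smallsort module):

// Stand-in for the internal helper: compare two elements and put them in order.
// Written so that, for integer-like types, an inlined copy can lower to a pair
// of conditional moves instead of a branch.
fn swap_if_less<T: Copy, F: FnMut(&T, &T) -> bool>(v: &mut [T], a: usize, b: usize, is_less: &mut F) {
    let (x, y) = (v[a], v[b]);
    let should_swap = is_less(&y, &x);
    v[a] = if should_swap { y } else { x };
    v[b] = if should_swap { x } else { y };
}

// A 4-element sorting network: a fixed sequence of compare-and-swap steps. If
// `swap_if_less` is not inlined here, every step pays a function call.
fn sort4<T: Copy, F: FnMut(&T, &T) -> bool>(v: &mut [T; 4], is_less: &mut F) {
    swap_if_less(v, 0, 2, is_less);
    swap_if_less(v, 1, 3, is_less);
    swap_if_less(v, 0, 1, is_less);
    swap_if_less(v, 2, 3, is_less);
    swap_if_less(v, 1, 2, is_less);
}

Leaving the helper unannotated, rather than forcing `#[inline(always)]`, lets the compiler inline it when `is_less` is a trivial integer comparison, yet fall back to a single shared instantiation when a user-supplied comparator is huge.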

File tree

8 files changed: +202 -59 lines changed

library/core/src/slice/sort/select.rs

+10-1
@@ -7,6 +7,7 @@
 //! better performance than one would get using heapsort as fallback.
 
 use crate::mem::{self, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
 use crate::slice::sort::unstable::quicksort::partition;
@@ -40,7 +41,13 @@ where
         let min_idx = min_index(v, &mut is_less).unwrap();
         v.swap(min_idx, index);
     } else {
-        partition_at_index_loop(v, index, None, &mut is_less);
+        cfg_if! {
+            if #[cfg(feature = "optimize_for_size")] {
+                median_of_medians(v, &mut is_less, index);
+            } else {
+                partition_at_index_loop(v, index, None, &mut is_less);
+            }
+        }
     }
 
     let (left, right) = v.split_at_mut(index);
@@ -53,6 +60,7 @@
 // most once, it doesn't make sense to use something more sophisticated than insertion-sort.
 const INSERTION_SORT_THRESHOLD: usize = 16;
 
+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_at_index_loop<'a, T, F>(
     mut v: &'a mut [T],
     mut index: usize,
@@ -169,6 +177,7 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
         if v.len() >= 2 {
             insertion_sort_shift_left(v, 1, is_less);
         }
+
        return;
     }
 
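Both paths uphold the documented contract of `select_nth_unstable`: the element at `index` ends up in its final sorted position, with smaller-or-equal elements before it and greater-or-equal elements after it. For example:

let mut v = [5, 1, 4, 2, 3];
let (lesser, median, greater) = v.select_nth_unstable(2);
assert_eq!(*median, 3);
assert!(lesser.iter().all(|x| *x <= 3));
assert!(greater.iter().all(|x| *x >= 3));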

library/core/src/slice/sort/shared/mod.rs

+2
@@ -1,3 +1,5 @@
+#![cfg_attr(feature = "optimize_for_size", allow(dead_code))]
+
 use crate::marker::Freeze;
 
 pub(crate) mod pivot;
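With the feature enabled, several helpers in this module lose their only callers to `cfg`, so the module-level attribute silences the resulting dead-code warnings while the default build keeps full dead-code checking. The same mechanism on a single, hypothetical item:

// Expands to #[allow(dead_code)] only when the predicate holds; otherwise the
// attribute vanishes and an unused helper still warns.
#[cfg_attr(feature = "optimize_for_size", allow(dead_code))]
fn helper_used_only_by_the_full_sort_impls() {}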

library/core/src/slice/sort/shared/smallsort.rs

+6-1
@@ -378,7 +378,12 @@ where
 
 /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
 /// value at position `b_pos` is less than the one at position `a_pos`.
-pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+///
+/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integer-like
+/// types. `is_less` could be a huge function and we want to give the compiler an option to
+/// not inline this function. Because this function is very perf critical, it should be in the
+/// same module as the functions that use it.
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {

library/core/src/slice/sort/stable/mod.rs

+51-14
@@ -1,15 +1,24 @@
 //! This module contains the entry points for `slice::sort`.
 
+#[cfg(not(feature = "optimize_for_size"))]
+use crate::cmp;
+use crate::intrinsics;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::{
     SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
 };
-use crate::{cmp, intrinsics};
 
-pub(crate) mod drift;
 pub(crate) mod merge;
+
+#[cfg(not(feature = "optimize_for_size"))]
+pub(crate) mod drift;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;
 
+#[cfg(feature = "optimize_for_size")]
+pub(crate) mod tiny;
+
 /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
 /// Design document:
 /// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
@@ -30,25 +39,53 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
         return;
     }
 
-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
-    }
+    cfg_if! {
+        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
+            let alloc_len = len / 2;
+
+            cfg_if! {
+                if #[cfg(target_pointer_width = "16")] {
+                    let heap_buf = BufT::with_capacity(alloc_len);
+                    let scratch = heap_buf.as_uninit_slice_mut();
+                } else {
+                    // For small inputs 4KiB of stack storage suffices, which allows us to avoid
+                    // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
+                    let mut stack_buf = AlignedStorage::<T, 4096>::new();
+                    let stack_scratch = stack_buf.as_uninit_slice_mut();
+                    let mut heap_buf;
+                    let scratch = if stack_scratch.len() >= alloc_len {
+                        stack_scratch
+                    } else {
+                        heap_buf = BufT::with_capacity(alloc_len);
+                        heap_buf.as_uninit_slice_mut()
+                    };
+                }
+            }
 
-    driftsort_main::<T, F, BufT>(v, is_less);
+            tiny::mergesort(v, scratch, is_less);
+        } else {
+            // More advanced sorting methods than insertion sort are faster if called in
+            // a hot loop for small inputs, but for general-purpose code the small
+            // binary size of insertion sort is more important. The instruction cache in
+            // modern processors is very valuable, and for a single sort call in general
+            // purpose code any gains from an advanced method are cancelled by i-cache
            // misses during the sort, and thrashing the i-cache for surrounding code.
+            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+                insertion_sort_shift_left(v, 1, is_less);
+                return;
+            }
+
+            driftsort_main::<T, F, BufT>(v, is_less);
+        }
+    }
 }
 
 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
     // By allocating n elements of memory we can ensure the entire input can
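The stack-or-heap selection above is worth seeing outside of `cfg_if!`. A minimal standalone sketch, using a plain `Vec` in place of the internal `BufT`/`AlignedStorage` types and a deliberately small stack budget (the helper name and sizes are hypothetical):

use std::mem::MaybeUninit;

fn with_scratch<T>(alloc_len: usize, f: impl FnOnce(&mut [MaybeUninit<T>])) {
    const STACK_LEN: usize = 64; // stand-in for the real 4KiB budget
    let mut stack_buf = [const { MaybeUninit::<T>::uninit() }; STACK_LEN];
    let mut heap_buf: Vec<MaybeUninit<T>>;
    let scratch = if alloc_len <= STACK_LEN {
        // Small input: borrow stack storage and skip the allocator entirely.
        &mut stack_buf[..alloc_len]
    } else {
        // Large input: fall back to a heap allocation with alloc_len spare slots.
        heap_buf = Vec::with_capacity(alloc_len);
        &mut heap_buf.spare_capacity_mut()[..alloc_len]
    };
    f(scratch);
}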
library/core/src/slice/sort/stable/tiny.rs

+41

@@ -0,0 +1,41 @@
+//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs.
+
+use crate::mem::MaybeUninit;
+use crate::ptr;
+use crate::slice::sort::stable::merge;
+
+/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
+/// no run detection, etc.
+#[inline(always)]
+pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    scratch: &mut [MaybeUninit<T>],
+    is_less: &mut F,
+) {
+    let len = v.len();
+
+    if len > 2 {
+        let mid = len / 2;
+
+        // SAFETY: mid is in-bounds.
+        unsafe {
+            // Sort the left half recursively.
+            mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
+            // Sort the right half recursively.
+            mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
+        }
+
+        merge::merge(v, scratch, mid, is_less);
+    } else if len == 2 {
+        // SAFETY: We checked the len, the pointers we create are valid and don't overlap.
+        unsafe {
+            let v_base = v.as_mut_ptr();
+            let v_a = v_base;
+            let v_b = v_base.add(1);
+
+            if is_less(&*v_b, &*v_a) {
+                ptr::swap_nonoverlapping(v_a, v_b, 1);
+            }
+        }
+    }
+}
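For reference, the same top-down scheme in safe Rust; `tiny_mergesort` below is a hypothetical illustration, not the commit's code, and clones the left run instead of using `MaybeUninit` scratch. It also shows why `len / 2` scratch elements suffice for the real version: the merge never needs to stash more than the shorter (left) half.

fn tiny_mergesort<T: Clone, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let len = v.len();
    if len > 2 {
        let mid = len / 2;
        tiny_mergesort(&mut v[..mid], is_less);
        tiny_mergesort(&mut v[mid..], is_less);

        // Merge by saving the (shorter) left run; the right run is consumed in place.
        let left = v[..mid].to_vec();
        let (mut i, mut j, mut k) = (0, mid, 0);
        while i < left.len() && j < len {
            if is_less(&v[j], &left[i]) {
                v[k] = v[j].clone();
                j += 1;
            } else {
                v[k] = left[i].clone();
                i += 1;
            }
            k += 1;
        }
        while i < left.len() {
            v[k] = left[i].clone();
            i += 1;
            k += 1;
        }
    } else if len == 2 && is_less(&v[1], &v[0]) {
        v.swap(0, 1);
    }
}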

library/core/src/slice/sort/unstable/heapsort.rs

+19-21
@@ -1,46 +1,46 @@
 //! This module contains a branchless heapsort as fallback for unstable quicksort.
 
-use crate::{intrinsics, ptr};
+use crate::{cmp, intrinsics, ptr};
 
 /// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
 ///
 /// Never inline this, it sits the main hot-loop in `recurse` and is meant as unlikely algorithmic
 /// fallback.
-///
-/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
 #[inline(never)]
-pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
+pub(crate) fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
-    // SAFETY: See function safety.
-    unsafe {
-        intrinsics::assume(v.len() >= 2);
-
-        // Build the heap in linear time.
-        for i in (0..v.len() / 2).rev() {
-            sift_down(v, i, is_less);
-        }
+    let len = v.len();
 
-        // Pop maximal elements from the heap.
-        for i in (1..v.len()).rev() {
+    for i in (0..len + len / 2).rev() {
+        let sift_idx = if i >= len {
+            i - len
+        } else {
             v.swap(0, i);
-            sift_down(&mut v[..i], 0, is_less);
+            0
+        };
+
+        // SAFETY: The above calculation ensures that `sift_idx` is either 0 or
+        // `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`.
+        // This guarantees the required `sift_idx <= len`.
+        unsafe {
+            sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less);
         }
     }
 }
 
 // This binary heap respects the invariant `parent >= child`.
 //
-// SAFETY: The caller has to guarantee that node < `v.len()`.
-#[inline(never)]
+// SAFETY: The caller has to guarantee that `node <= v.len()`.
+#[inline(always)]
 unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
     // SAFETY: See function safety.
     unsafe {
-        intrinsics::assume(node < v.len());
+        intrinsics::assume(node <= v.len());
     }
 
     let len = v.len();
@@ -69,9 +69,7 @@ where
                 break;
             }
 
-            // Swap `node` with the greater child, move one step down, and continue sifting. This
-            // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
-            ptr::swap(v_base.add(node), v_base.add(child));
+            ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1);
         }
 
        node = child;
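The rewrite fuses the classic two-phase heapsort (heapify, then repeatedly pop the max) into a single loop to shave binary size. A safe re-rendering of the same control flow; `heapsort_fused` is illustrative, not the commit's code:

fn heapsort_fused<T: Ord>(v: &mut [T]) {
    fn sift_down<T: Ord>(v: &mut [T], mut node: usize) {
        loop {
            let mut child = 2 * node + 1;
            if child >= v.len() {
                break;
            }
            // Pick the greater of the two children.
            if child + 1 < v.len() && v[child] < v[child + 1] {
                child += 1;
            }
            if v[node] >= v[child] {
                break;
            }
            v.swap(node, child);
            node = child;
        }
    }

    let len = v.len();
    // i >= len is the build phase: sift_idx walks the non-leaf nodes from
    // len / 2 - 1 down to 0. i < len is the pop phase: move the current max to
    // the back, then restore the heap property on the shrunken prefix v[..i].
    for i in (0..len + len / 2).rev() {
        let sift_idx = if i >= len {
            i - len
        } else {
            v.swap(0, i);
            0
        };
        sift_down(&mut v[..i.min(len)], sift_idx);
    }
}

For `len = 6` the fused loop runs `i = 8, 7, 6` (build: sift nodes 2, 1, 0) and then `i = 5 ..= 0` (pop), exactly the two loops the old code spelled out.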

library/core/src/slice/sort/unstable/mod.rs

+21-12
@@ -2,7 +2,9 @@
 
 use crate::intrinsics;
 use crate::mem::SizedTypeProperties;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::find_existing_run;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
 
 pub(crate) mod heapsort;
@@ -28,25 +30,32 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
         return;
     }
 
-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
-    }
+    cfg_if! {
+        if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
+            heapsort::heapsort(v, is_less);
+        } else {
+            // More advanced sorting methods than insertion sort are faster if called in
+            // a hot loop for small inputs, but for general-purpose code the small
+            // binary size of insertion sort is more important. The instruction cache in
+            // modern processors is very valuable, and for a single sort call in general
+            // purpose code any gains from an advanced method are cancelled by i-cache
+            // misses during the sort, and thrashing the i-cache for surrounding code.
+            const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+            if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+                insertion_sort_shift_left(v, 1, is_less);
+                return;
+            }
 
-    ipnsort(v, is_less);
+            ipnsort(v, is_less);
+        }
+    }
 }
 
 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
 where
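Whichever branch is compiled in, the observable behavior of `slice::sort_unstable` is unchanged; only speed and code size differ:

let mut v = [5, 4, 1, 3, 2];
v.sort_unstable(); // heapsort under optimize_for_size, ipnsort otherwise
assert_eq!(v, [1, 2, 3, 4, 5]);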
