Rollup merge of rust-lang#104672 - Voultapher:unify-sort-modules, r=t…

…homcc Unify stable and unstable sort implementations in same core module This moves the stable sort implementation to the core::slice::sort module. By virtue of being in core it can't access `Vec`. The two `Vec` used by merge sort, `buf` and `runs`, are modelled as custom types that implement the very limited required `Vec` interface with the help of provided allocation and free functions. This is done to allow future re-use of functions and logic between stable and unstable sort. Such as `insert_head`. This is in preparation of rust-lang#100856 and rust-lang#104116. It only moves code, it *doesn't* change any of the sort related logic. This unlocks the ability to share `insert_head`, `insert_tail`, `swap_if_less` `merge` and more. Tagging ``@Mark-Simulacrum`` I hope this allows progress on rust-lang#100856, by moving `merge_sort` here I hope future changes will be easier to review.
compiler-errors · Jan 20, 2023 · 4b7bec8 · 4b7bec8
2 parents 993932b + 4b5844f
commit 4b7bec8
Show file tree

Hide file tree

Showing 3 changed files with 560 additions and 310 deletions.
diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs
@@ -19,10 +19,12 @@ use core::cmp::Ordering::{self, Less};
 use core::mem::{self, SizedTypeProperties};
 #[cfg(not(no_global_oom_handling))]
 use core::ptr;
+#[cfg(not(no_global_oom_handling))]
+use core::slice::sort;
 
 use crate::alloc::Allocator;
 #[cfg(not(no_global_oom_handling))]
-use crate::alloc::Global;
+use crate::alloc::{self, Global};
 #[cfg(not(no_global_oom_handling))]
 use crate::borrow::ToOwned;
 use crate::boxed::Box;
@@ -206,7 +208,7 @@ impl<T> [T] {
  where
  T: Ord,
  {
- merge_sort(self, T::lt);
+ stable_sort(self, T::lt);
  }
 
  /// Sorts the slice with a comparator function.
@@ -262,7 +264,7 @@ impl<T> [T] {
  where
  F: FnMut(&T, &T) -> Ordering,
  {
- merge_sort(self, |a, b| compare(a, b) == Less);
+ stable_sort(self, |a, b| compare(a, b) == Less);
  }
 
  /// Sorts the slice with a key extraction function.
@@ -305,7 +307,7 @@ impl<T> [T] {
  F: FnMut(&T) -> K,
  K: Ord,
  {
- merge_sort(self, |a, b| f(a).lt(&f(b)));
+ stable_sort(self, |a, b| f(a).lt(&f(b)));
  }
 
  /// Sorts the slice with a key extraction function.
@@ -812,324 +814,52 @@ impl<T: Clone> ToOwned for [T] {
 // Sorting
 ////////////////////////////////////////////////////////////////////////////////
 
-/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
-///
-/// This is the integral subroutine of insertion sort.
-#[cfg(not(no_global_oom_handling))]
-fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
-where
- F: FnMut(&T, &T) -> bool,
-{
- if v.len() >= 2 && is_less(&v[1], &v[0]) {
- unsafe {
- // There are three ways to implement insertion here:
- //
- // 1. Swap adjacent elements until the first one gets to its final destination.
- // However, this way we copy data around more than is necessary. If elements are big
- // structures (costly to copy), this method will be slow.
- //
- // 2. Iterate until the right place for the first element is found. Then shift the
- // elements succeeding it to make room for it and finally place it into the
- // remaining hole. This is a good method.
- //
- // 3. Copy the first element into a temporary variable. Iterate until the right place
- // for it is found. As we go along, copy every traversed element into the slot
- // preceding it. Finally, copy data from the temporary variable into the remaining
- // hole. This method is very good. Benchmarks demonstrated slightly better
- // performance than with the 2nd method.
- //
- // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
- let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
-
- // Intermediate state of the insertion process is always tracked by `hole`, which
- // serves two purposes:
- // 1. Protects integrity of `v` from panics in `is_less`.
- // 2. Fills the remaining hole in `v` in the end.
- //
- // Panic safety:
- //
- // If `is_less` panics at any point during the process, `hole` will get dropped and
- // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
- // initially held exactly once.
- let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
- ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
-
- for i in 2..v.len() {
- if !is_less(&v[i], &*tmp) {
- break;
- }
- ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
- hole.dest = &mut v[i];
- }
- // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
- }
- }
-
- // When dropped, copies from `src` into `dest`.
- struct InsertionHole<T> {
- src: *const T,
- dest: *mut T,
- }
-
- impl<T> Drop for InsertionHole<T> {
- fn drop(&mut self) {
- unsafe {
- ptr::copy_nonoverlapping(self.src, self.dest, 1);
- }
- }
- }
-}
-
-/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
-/// stores the result into `v[..]`.
-///
-/// # Safety
-///
-/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
-/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
-#[cfg(not(no_global_oom_handling))]
-unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
-where
- F: FnMut(&T, &T) -> bool,
-{
- let len = v.len();
- let v = v.as_mut_ptr();
- let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
-
- // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
- // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
- // copying the lesser (or greater) one into `v`.
- //
- // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
- // consumed first, then we must copy whatever is left of the shorter run into the remaining
- // hole in `v`.
- //
- // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
- // 1. Protects integrity of `v` from panics in `is_less`.
- // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
- //
- // Panic safety:
- //
- // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
- // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
- // object it initially held exactly once.
- let mut hole;
-
- if mid <= len - mid {
- // The left run is shorter.
- unsafe {
- ptr::copy_nonoverlapping(v, buf, mid);
- hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
- }
-
- // Initially, these pointers point to the beginnings of their arrays.
- let left = &mut hole.start;
- let mut right = v_mid;
- let out = &mut hole.dest;
-
- while *left < hole.end && right < v_end {
- // Consume the lesser side.
- // If equal, prefer the left run to maintain stability.
- unsafe {
- let to_copy = if is_less(&*right, &**left) {
- get_and_increment(&mut right)
- } else {
- get_and_increment(left)
- };
- ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
- }
- }
- } else {
- // The right run is shorter.
- unsafe {
- ptr::copy_nonoverlapping(v_mid, buf, len - mid);
- hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
- }
-
- // Initially, these pointers point past the ends of their arrays.
- let left = &mut hole.dest;
- let right = &mut hole.end;
- let mut out = v_end;
-
- while v < *left && buf < *right {
- // Consume the greater side.
- // If equal, prefer the right run to maintain stability.
- unsafe {
- let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) {
- decrement_and_get(left)
- } else {
- decrement_and_get(right)
- };
- ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
- }
- }
- }
- // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
- // it will now be copied into the hole in `v`.
-
- unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
- let old = *ptr;
- *ptr = unsafe { ptr.add(1) };
- old
- }
-
- unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
- *ptr = unsafe { ptr.sub(1) };
- *ptr
- }
-
- // When dropped, copies the range `start..end` into `dest..`.
- struct MergeHole<T> {
- start: *mut T,
- end: *mut T,
- dest: *mut T,
- }
-
- impl<T> Drop for MergeHole<T> {
- fn drop(&mut self) {
- // `T` is not a zero-sized type, and these are pointers into a slice's elements.
- unsafe {
- let len = self.end.sub_ptr(self.start);
- ptr::copy_nonoverlapping(self.start, self.dest, len);
- }
- }
- }
-}
-
-/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
-/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
-///
-/// The algorithm identifies strictly descending and non-descending subsequences, which are called
-/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
-/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
-/// satisfied:
-///
-/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
-/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
-///
-/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
+#[inline]
 #[cfg(not(no_global_oom_handling))]
-fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
+fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
 where
  F: FnMut(&T, &T) -> bool,
 {
- // Slices of up to this length get sorted using insertion sort.
- const MAX_INSERTION: usize = 20;
- // Very short runs are extended using insertion sort to span at least this many elements.
- const MIN_RUN: usize = 10;
-
- // Sorting has no meaningful behavior on zero-sized types.
  if T::IS_ZST {
+ // Sorting has no meaningful behavior on zero-sized types. Do nothing.
  return;
  }
 
- let len = v.len();
-
- // Short arrays get sorted in-place via insertion sort to avoid allocations.
- if len <= MAX_INSERTION {
- if len >= 2 {
- for i in (0..len - 1).rev() {
- insert_head(&mut v[i..], &mut is_less);
- }
- }
- return;
- }
-
- // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
- // shallow copies of the contents of `v` without risking the dtors running on copies if
- // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
- // which will always have length at most `len / 2`.
- let mut buf = Vec::with_capacity(len / 2);
+ let elem_alloc_fn = |len: usize| -> *mut T {
+ // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+ // v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
+ // elements.
+ unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
+ };
 
- // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
- // strange decision, but consider the fact that merges more often go in the opposite direction
- // (forwards). According to benchmarks, merging forwards is slightly faster than merging
- // backwards. To conclude, identifying runs by traversing backwards improves performance.
- let mut runs = vec![];
- let mut end = len;
- while end > 0 {
- // Find the next natural run, and reverse it if it's strictly descending.
- let mut start = end - 1;
- if start > 0 {
- start -= 1;
- unsafe {
- if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
- while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
- start -= 1;
- }
- v[start..end].reverse();
- } else {
- while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
- {
- start -= 1;
- }
- }
- }
- }
-
- // Insert some more elements into the run if it's too short. Insertion sort is faster than
- // merge sort on short sequences, so this significantly improves performance.
- while start > 0 && end - start < MIN_RUN {
- start -= 1;
- insert_head(&mut v[start..end], &mut is_less);
+ let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
+ // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+ // v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+ // len.
+ unsafe {
+ alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
  }
+ };
 
- // Push this run onto the stack.
- runs.push(Run { start, len: end - start });
- end = start;
-
- // Merge some pairs of adjacent runs to satisfy the invariants.
- while let Some(r) = collapse(&runs) {
- let left = runs[r + 1];
- let right = runs[r];
- unsafe {
- merge(
- &mut v[left.start..right.start + right.len],
- left.len,
- buf.as_mut_ptr(),
- &mut is_less,
- );
- }
- runs[r] = Run { start: left.start, len: left.len + right.len };
- runs.remove(r + 1);
+ let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
+ // SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
+ // obscene length or 0.
+ unsafe {
+ alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
+ as *mut sort::TimSortRun
  }
- }
-
- // Finally, exactly one run must remain in the stack.
- debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
+ };
 
- // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
- // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
- // algorithm should continue building a new run instead, `None` is returned.
- //
- // TimSort is infamous for its buggy implementations, as described here:
- // http://envisage-project.eu/timsort-specification-and-verification/
- //
- // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
- // Enforcing them on just top three is not sufficient to ensure that the invariants will still
- // hold for *all* runs in the stack.
- //
- // This function correctly checks invariants for the top four runs. Additionally, if the top
- // run starts at index 0, it will always demand a merge operation until the stack is fully
- // collapsed, in order to complete the sort.
- #[inline]
- fn collapse(runs: &[Run]) -> Option<usize> {
- let n = runs.len();
- if n >= 2
- && (runs[n - 1].start == 0
- || runs[n - 2].len <= runs[n - 1].len
- || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
- || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
- {
- if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
- } else {
- None
+ let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
+ // SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+ // len.
+ unsafe {
+ alloc::dealloc(
+ buf_ptr as *mut u8,
+ alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
+ );
  }
- }
+ };
 
- #[derive(Clone, Copy)]
- struct Run {
- start: usize,
- len: usize,
- }
+ sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
 }