diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index e9886fc571799..fecacc2bb6395 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -19,10 +19,12 @@ use core::cmp::Ordering::{self, Less}; use core::mem::{self, SizedTypeProperties}; #[cfg(not(no_global_oom_handling))] use core::ptr; +#[cfg(not(no_global_oom_handling))] +use core::slice::sort; use crate::alloc::Allocator; #[cfg(not(no_global_oom_handling))] -use crate::alloc::Global; +use crate::alloc::{self, Global}; #[cfg(not(no_global_oom_handling))] use crate::borrow::ToOwned; use crate::boxed::Box; @@ -206,7 +208,7 @@ impl [T] { where T: Ord, { - merge_sort(self, T::lt); + stable_sort(self, T::lt); } /// Sorts the slice with a comparator function. @@ -262,7 +264,7 @@ impl [T] { where F: FnMut(&T, &T) -> Ordering, { - merge_sort(self, |a, b| compare(a, b) == Less); + stable_sort(self, |a, b| compare(a, b) == Less); } /// Sorts the slice with a key extraction function. @@ -305,7 +307,7 @@ impl [T] { F: FnMut(&T) -> K, K: Ord, { - merge_sort(self, |a, b| f(a).lt(&f(b))); + stable_sort(self, |a, b| f(a).lt(&f(b))); } /// Sorts the slice with a key extraction function. @@ -812,324 +814,52 @@ impl ToOwned for [T] { // Sorting //////////////////////////////////////////////////////////////////////////////// -/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted. -/// -/// This is the integral subroutine of insertion sort. -#[cfg(not(no_global_oom_handling))] -fn insert_head(v: &mut [T], is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - if v.len() >= 2 && is_less(&v[1], &v[0]) { - unsafe { - // There are three ways to implement insertion here: - // - // 1. Swap adjacent elements until the first one gets to its final destination. - // However, this way we copy data around more than is necessary. If elements are big - // structures (costly to copy), this method will be slow. - // - // 2. 
Iterate until the right place for the first element is found. Then shift the - // elements succeeding it to make room for it and finally place it into the - // remaining hole. This is a good method. - // - // 3. Copy the first element into a temporary variable. Iterate until the right place - // for it is found. As we go along, copy every traversed element into the slot - // preceding it. Finally, copy data from the temporary variable into the remaining - // hole. This method is very good. Benchmarks demonstrated slightly better - // performance than with the 2nd method. - // - // All methods were benchmarked, and the 3rd showed best results. So we chose that one. - let tmp = mem::ManuallyDrop::new(ptr::read(&v[0])); - - // Intermediate state of the insertion process is always tracked by `hole`, which - // serves two purposes: - // 1. Protects integrity of `v` from panics in `is_less`. - // 2. Fills the remaining hole in `v` in the end. - // - // Panic safety: - // - // If `is_less` panics at any point during the process, `hole` will get dropped and - // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it - // initially held exactly once. - let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] }; - ptr::copy_nonoverlapping(&v[1], &mut v[0], 1); - - for i in 2..v.len() { - if !is_less(&v[i], &*tmp) { - break; - } - ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1); - hole.dest = &mut v[i]; - } - // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`. - } - } - - // When dropped, copies from `src` into `dest`. - struct InsertionHole { - src: *const T, - dest: *mut T, - } - - impl Drop for InsertionHole { - fn drop(&mut self) { - unsafe { - ptr::copy_nonoverlapping(self.src, self.dest, 1); - } - } - } -} - -/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and -/// stores the result into `v[..]`. 
-/// -/// # Safety -/// -/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough -/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type. -#[cfg(not(no_global_oom_handling))] -unsafe fn merge(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - let v = v.as_mut_ptr(); - let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) }; - - // The merge process first copies the shorter run into `buf`. Then it traces the newly copied - // run and the longer run forwards (or backwards), comparing their next unconsumed elements and - // copying the lesser (or greater) one into `v`. - // - // As soon as the shorter run is fully consumed, the process is done. If the longer run gets - // consumed first, then we must copy whatever is left of the shorter run into the remaining - // hole in `v`. - // - // Intermediate state of the process is always tracked by `hole`, which serves two purposes: - // 1. Protects integrity of `v` from panics in `is_less`. - // 2. Fills the remaining hole in `v` if the longer run gets consumed first. - // - // Panic safety: - // - // If `is_less` panics at any point during the process, `hole` will get dropped and fill the - // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every - // object it initially held exactly once. - let mut hole; - - if mid <= len - mid { - // The left run is shorter. - unsafe { - ptr::copy_nonoverlapping(v, buf, mid); - hole = MergeHole { start: buf, end: buf.add(mid), dest: v }; - } - - // Initially, these pointers point to the beginnings of their arrays. - let left = &mut hole.start; - let mut right = v_mid; - let out = &mut hole.dest; - - while *left < hole.end && right < v_end { - // Consume the lesser side. - // If equal, prefer the left run to maintain stability. 
- unsafe { - let to_copy = if is_less(&*right, &**left) { - get_and_increment(&mut right) - } else { - get_and_increment(left) - }; - ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1); - } - } - } else { - // The right run is shorter. - unsafe { - ptr::copy_nonoverlapping(v_mid, buf, len - mid); - hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid }; - } - - // Initially, these pointers point past the ends of their arrays. - let left = &mut hole.dest; - let right = &mut hole.end; - let mut out = v_end; - - while v < *left && buf < *right { - // Consume the greater side. - // If equal, prefer the right run to maintain stability. - unsafe { - let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) { - decrement_and_get(left) - } else { - decrement_and_get(right) - }; - ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1); - } - } - } - // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of - // it will now be copied into the hole in `v`. - - unsafe fn get_and_increment(ptr: &mut *mut T) -> *mut T { - let old = *ptr; - *ptr = unsafe { ptr.add(1) }; - old - } - - unsafe fn decrement_and_get(ptr: &mut *mut T) -> *mut T { - *ptr = unsafe { ptr.sub(1) }; - *ptr - } - - // When dropped, copies the range `start..end` into `dest..`. - struct MergeHole { - start: *mut T, - end: *mut T, - dest: *mut T, - } - - impl Drop for MergeHole { - fn drop(&mut self) { - // `T` is not a zero-sized type, and these are pointers into a slice's elements. - unsafe { - let len = self.end.sub_ptr(self.start); - ptr::copy_nonoverlapping(self.start, self.dest, len); - } - } - } -} - -/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail -/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). -/// -/// The algorithm identifies strictly descending and non-descending subsequences, which are called -/// natural runs. 
There is a stack of pending runs yet to be merged. Each newly found run is pushed -/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are -/// satisfied: -/// -/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len` -/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len` -/// -/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case. +#[inline] #[cfg(not(no_global_oom_handling))] -fn merge_sort(v: &mut [T], mut is_less: F) +fn stable_sort(v: &mut [T], mut is_less: F) where F: FnMut(&T, &T) -> bool, { - // Slices of up to this length get sorted using insertion sort. - const MAX_INSERTION: usize = 20; - // Very short runs are extended using insertion sort to span at least this many elements. - const MIN_RUN: usize = 10; - - // Sorting has no meaningful behavior on zero-sized types. if T::IS_ZST { + // Sorting has no meaningful behavior on zero-sized types. Do nothing. return; } - let len = v.len(); - - // Short arrays get sorted in-place via insertion sort to avoid allocations. - if len <= MAX_INSERTION { - if len >= 2 { - for i in (0..len - 1).rev() { - insert_head(&mut v[i..], &mut is_less); - } - } - return; - } - - // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it - // shallow copies of the contents of `v` without risking the dtors running on copies if - // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run, - // which will always have length at most `len / 2`. - let mut buf = Vec::with_capacity(len / 2); + let elem_alloc_fn = |len: usize| -> *mut T { + // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len > + // v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap + // elements. 
+ unsafe { alloc::alloc(alloc::Layout::array::(len).unwrap_unchecked()) as *mut T } + }; - // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a - // strange decision, but consider the fact that merges more often go in the opposite direction - // (forwards). According to benchmarks, merging forwards is slightly faster than merging - // backwards. To conclude, identifying runs by traversing backwards improves performance. - let mut runs = vec![]; - let mut end = len; - while end > 0 { - // Find the next natural run, and reverse it if it's strictly descending. - let mut start = end - 1; - if start > 0 { - start -= 1; - unsafe { - if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) { - while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) { - start -= 1; - } - v[start..end].reverse(); - } else { - while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) - { - start -= 1; - } - } - } - } - - // Insert some more elements into the run if it's too short. Insertion sort is faster than - // merge sort on short sequences, so this significantly improves performance. - while start > 0 && end - start < MIN_RUN { - start -= 1; - insert_head(&mut v[start..end], &mut is_less); + let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| { + // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len > + // v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same + // len. + unsafe { + alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::(len).unwrap_unchecked()); } + }; - // Push this run onto the stack. - runs.push(Run { start, len: end - start }); - end = start; - - // Merge some pairs of adjacent runs to satisfy the invariants. 
- while let Some(r) = collapse(&runs) { - let left = runs[r + 1]; - let right = runs[r]; - unsafe { - merge( - &mut v[left.start..right.start + right.len], - left.len, - buf.as_mut_ptr(), - &mut is_less, - ); - } - runs[r] = Run { start: left.start, len: left.len + right.len }; - runs.remove(r + 1); + let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun { + // SAFETY: Creating the layout is safe as long as merge_sort never calls this with an + // obscene length or 0. + unsafe { + alloc::alloc(alloc::Layout::array::(len).unwrap_unchecked()) + as *mut sort::TimSortRun } - } - - // Finally, exactly one run must remain in the stack. - debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len); + }; - // Examines the stack of runs and identifies the next pair of runs to merge. More specifically, - // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the - // algorithm should continue building a new run instead, `None` is returned. - // - // TimSort is infamous for its buggy implementations, as described here: - // http://envisage-project.eu/timsort-specification-and-verification/ - // - // The gist of the story is: we must enforce the invariants on the top four runs on the stack. - // Enforcing them on just top three is not sufficient to ensure that the invariants will still - // hold for *all* runs in the stack. - // - // This function correctly checks invariants for the top four runs. Additionally, if the top - // run starts at index 0, it will always demand a merge operation until the stack is fully - // collapsed, in order to complete the sort. 
- #[inline] - fn collapse(runs: &[Run]) -> Option { - let n = runs.len(); - if n >= 2 - && (runs[n - 1].start == 0 - || runs[n - 2].len <= runs[n - 1].len - || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len) - || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len)) - { - if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) } - } else { - None + let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| { + // SAFETY: The caller must ensure that buf_ptr was created by run_alloc_fn with the same + // len. + unsafe { + alloc::dealloc( + buf_ptr as *mut u8, + alloc::Layout::array::(len).unwrap_unchecked(), + ); } - } + }; - #[derive(Clone, Copy)] - struct Run { - start: usize, - len: usize, - } + sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn); } diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs index c32caa144594b..d93a3a57ecd27 100644 --- a/library/core/src/slice/mod.rs +++ b/library/core/src/slice/mod.rs @@ -29,13 +29,19 @@ use crate::slice; /// Pure rust memchr implementation, taken from rust-memchr pub mod memchr; +#[unstable( + feature = "slice_internals", + issue = "none", + reason = "exposed from core to be reused in std;" +)] +pub mod sort; + mod ascii; mod cmp; mod index; mod iter; mod raw; mod rotate; -mod sort; mod specialize; #[stable(feature = "rust1", since = "1.0.0")] diff --git a/library/core/src/slice/sort.rs b/library/core/src/slice/sort.rs index 3ac01d1727513..2181f9a811855 100644 --- a/library/core/src/slice/sort.rs +++ b/library/core/src/slice/sort.rs @@ -5,6 +5,9 @@ //! //! Unstable sorting is compatible with core because it doesn't allocate memory, unlike our //! stable sorting implementation. +//! +//! In addition it also contains the core logic of the stable sort used by `slice::sort` based on +//! TimSort.
use crate::cmp; use crate::mem::{self, MaybeUninit, SizedTypeProperties}; @@ -905,6 +908,7 @@ fn partition_at_index_loop<'a, T, F>( } } +/// Reorder the slice such that the element at `index` is at its final sorted position. pub fn partition_at_index( v: &mut [T], index: usize, @@ -949,3 +953,513 @@ where let pivot = &mut pivot[0]; (left, pivot, right) } + +/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted. +/// +/// This is the integral subroutine of insertion sort. +fn insert_head(v: &mut [T], is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + if v.len() >= 2 && is_less(&v[1], &v[0]) { + // SAFETY: Copy tmp back even if panic, and ensure unique observation. + unsafe { + // There are three ways to implement insertion here: + // + // 1. Swap adjacent elements until the first one gets to its final destination. + // However, this way we copy data around more than is necessary. If elements are big + // structures (costly to copy), this method will be slow. + // + // 2. Iterate until the right place for the first element is found. Then shift the + // elements succeeding it to make room for it and finally place it into the + // remaining hole. This is a good method. + // + // 3. Copy the first element into a temporary variable. Iterate until the right place + // for it is found. As we go along, copy every traversed element into the slot + // preceding it. Finally, copy data from the temporary variable into the remaining + // hole. This method is very good. Benchmarks demonstrated slightly better + // performance than with the 2nd method. + // + // All methods were benchmarked, and the 3rd showed best results. So we chose that one. + let tmp = mem::ManuallyDrop::new(ptr::read(&v[0])); + + // Intermediate state of the insertion process is always tracked by `hole`, which + // serves two purposes: + // 1. Protects integrity of `v` from panics in `is_less`. + // 2. Fills the remaining hole in `v` in the end. 
+ // + // Panic safety: + // + // If `is_less` panics at any point during the process, `hole` will get dropped and + // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it + // initially held exactly once. + let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] }; + ptr::copy_nonoverlapping(&v[1], &mut v[0], 1); + + for i in 2..v.len() { + if !is_less(&v[i], &*tmp) { + break; + } + ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1); + hole.dest = &mut v[i]; + } + // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`. + } + } + + // When dropped, copies from `src` into `dest`. + struct InsertionHole { + src: *const T, + dest: *mut T, + } + + impl Drop for InsertionHole { + fn drop(&mut self) { + // SAFETY: The caller must ensure that src and dest are correctly set. + unsafe { + ptr::copy_nonoverlapping(self.src, self.dest, 1); + } + } + } +} + +/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and +/// stores the result into `v[..]`. +/// +/// # Safety +/// +/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough +/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type. +unsafe fn merge(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + let len = v.len(); + let v = v.as_mut_ptr(); + + // SAFETY: mid and len must be in-bounds of v. + let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) }; + + // The merge process first copies the shorter run into `buf`. Then it traces the newly copied + // run and the longer run forwards (or backwards), comparing their next unconsumed elements and + // copying the lesser (or greater) one into `v`. + // + // As soon as the shorter run is fully consumed, the process is done. If the longer run gets + // consumed first, then we must copy whatever is left of the shorter run into the remaining + // hole in `v`. 
+ // + // Intermediate state of the process is always tracked by `hole`, which serves two purposes: + // 1. Protects integrity of `v` from panics in `is_less`. + // 2. Fills the remaining hole in `v` if the longer run gets consumed first. + // + // Panic safety: + // + // If `is_less` panics at any point during the process, `hole` will get dropped and fill the + // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every + // object it initially held exactly once. + let mut hole; + + if mid <= len - mid { + // The left run is shorter. + + // SAFETY: buf must have enough capacity for `v[..mid]`. + unsafe { + ptr::copy_nonoverlapping(v, buf, mid); + hole = MergeHole { start: buf, end: buf.add(mid), dest: v }; + } + + // Initially, these pointers point to the beginnings of their arrays. + let left = &mut hole.start; + let mut right = v_mid; + let out = &mut hole.dest; + + while *left < hole.end && right < v_end { + // Consume the lesser side. + // If equal, prefer the left run to maintain stability. + + // SAFETY: `left` must be in bounds of `buf`; `right` and `out` must be in bounds of `v`. + unsafe { + let to_copy = if is_less(&*right, &**left) { + get_and_increment(&mut right) + } else { + get_and_increment(left) + }; + ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1); + } + } + } else { + // The right run is shorter. + + // SAFETY: buf must have enough capacity for `v[mid..]`. + unsafe { + ptr::copy_nonoverlapping(v_mid, buf, len - mid); + hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid }; + } + + // Initially, these pointers point past the ends of their arrays. + let left = &mut hole.dest; + let right = &mut hole.end; + let mut out = v_end; + + while v < *left && buf < *right { + // Consume the greater side. + // If equal, prefer the right run to maintain stability. + + // SAFETY: `left` and `out` must be in bounds of `v`; `right` must be in bounds of `buf`.
+ unsafe { + let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) { + decrement_and_get(left) + } else { + decrement_and_get(right) + }; + ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1); + } + } + } + // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of + // it will now be copied into the hole in `v`. + + unsafe fn get_and_increment(ptr: &mut *mut T) -> *mut T { + let old = *ptr; + + // SAFETY: ptr.add(1) must still be a valid pointer and part of `v`. + *ptr = unsafe { ptr.add(1) }; + old + } + + unsafe fn decrement_and_get(ptr: &mut *mut T) -> *mut T { + // SAFETY: ptr.sub(1) must still be a valid pointer and part of `v`. + *ptr = unsafe { ptr.sub(1) }; + *ptr + } + + // When dropped, copies the range `start..end` into `dest..`. + struct MergeHole { + start: *mut T, + end: *mut T, + dest: *mut T, + } + + impl Drop for MergeHole { + fn drop(&mut self) { + // SAFETY: `T` is not a zero-sized type, and these are pointers into a slice's elements. + unsafe { + let len = self.end.sub_ptr(self.start); + ptr::copy_nonoverlapping(self.start, self.dest, len); + } + } + } +} + +/// This merge sort borrows some (but not all) ideas from TimSort, which used to be described in +/// detail [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). However Python +/// has switched to a Powersort based implementation. +/// +/// The algorithm identifies strictly descending and non-descending subsequences, which are called +/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed +/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are +/// satisfied: +/// +/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len` +/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len` +/// +/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case. 
+pub fn merge_sort( + v: &mut [T], + is_less: &mut CmpF, + elem_alloc_fn: ElemAllocF, + elem_dealloc_fn: ElemDeallocF, + run_alloc_fn: RunAllocF, + run_dealloc_fn: RunDeallocF, +) where + CmpF: FnMut(&T, &T) -> bool, + ElemAllocF: Fn(usize) -> *mut T, + ElemDeallocF: Fn(*mut T, usize), + RunAllocF: Fn(usize) -> *mut TimSortRun, + RunDeallocF: Fn(*mut TimSortRun, usize), +{ + // Slices of up to this length get sorted using insertion sort. + const MAX_INSERTION: usize = 20; + // Very short runs are extended using insertion sort to span at least this many elements. + const MIN_RUN: usize = 10; + + // The caller should have already checked that. + debug_assert!(!T::IS_ZST); + + let len = v.len(); + + // Short arrays get sorted in-place via insertion sort to avoid allocations. + if len <= MAX_INSERTION { + if len >= 2 { + for i in (0..len - 1).rev() { + insert_head(&mut v[i..], is_less); + } + } + return; + } + + // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it + // shallow copies of the contents of `v` without risking the dtors running on copies if + // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run, + // which will always have length at most `len / 2`. + let buf = BufGuard::new(len / 2, elem_alloc_fn, elem_dealloc_fn); + let buf_ptr = buf.buf_ptr; + + let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn); + + // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a + // strange decision, but consider the fact that merges more often go in the opposite direction + // (forwards). According to benchmarks, merging forwards is slightly faster than merging + // backwards. To conclude, identifying runs by traversing backwards improves performance. + let mut end = len; + while end > 0 { + // Find the next natural run, and reverse it if it's strictly descending. 
+ let mut start = end - 1; + if start > 0 { + start -= 1; + + // SAFETY: The v.get_unchecked must be fed with correct in-bounds indices. + unsafe { + if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) { + while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) { + start -= 1; + } + v[start..end].reverse(); + } else { + while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) + { + start -= 1; + } + } + } + } + + // Insert some more elements into the run if it's too short. Insertion sort is faster than + // merge sort on short sequences, so this significantly improves performance. + while start > 0 && end - start < MIN_RUN { + start -= 1; + insert_head(&mut v[start..end], is_less); + } + + // Push this run onto the stack. + runs.push(TimSortRun { start, len: end - start }); + end = start; + + // Merge some pairs of adjacent runs to satisfy the invariants. + while let Some(r) = collapse(runs.as_slice()) { + let left = runs[r + 1]; + let right = runs[r]; + // SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and + // neither side may be of length 0. + unsafe { + merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less); + } + runs[r] = TimSortRun { start: left.start, len: left.len + right.len }; + runs.remove(r + 1); + } + } + + // Finally, exactly one run must remain in the stack. + debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len); + + // Examines the stack of runs and identifies the next pair of runs to merge. More specifically, + // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the + // algorithm should continue building a new run instead, `None` is returned.
+ // + // TimSort is infamous for its buggy implementations, as described here: + // http://envisage-project.eu/timsort-specification-and-verification/ + // + // The gist of the story is: we must enforce the invariants on the top four runs on the stack. + // Enforcing them on just top three is not sufficient to ensure that the invariants will still + // hold for *all* runs in the stack. + // + // This function correctly checks invariants for the top four runs. Additionally, if the top + // run starts at index 0, it will always demand a merge operation until the stack is fully + // collapsed, in order to complete the sort. + #[inline] + fn collapse(runs: &[TimSortRun]) -> Option { + let n = runs.len(); + if n >= 2 + && (runs[n - 1].start == 0 + || runs[n - 2].len <= runs[n - 1].len + || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len) + || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len)) + { + if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) } + } else { + None + } + } + + // Extremely basic versions of Vec. + // Their use is super limited and by having the code here, it allows reuse between the sort + // implementations. 
+ struct BufGuard + where + ElemDeallocF: Fn(*mut T, usize), + { + buf_ptr: *mut T, + capacity: usize, + elem_dealloc_fn: ElemDeallocF, + } + + impl BufGuard + where + ElemDeallocF: Fn(*mut T, usize), + { + fn new( + len: usize, + elem_alloc_fn: ElemAllocF, + elem_dealloc_fn: ElemDeallocF, + ) -> Self + where + ElemAllocF: Fn(usize) -> *mut T, + { + Self { buf_ptr: elem_alloc_fn(len), capacity: len, elem_dealloc_fn } + } + } + + impl Drop for BufGuard + where + ElemDeallocF: Fn(*mut T, usize), + { + fn drop(&mut self) { + (self.elem_dealloc_fn)(self.buf_ptr, self.capacity); + } + } + + struct RunVec + where + RunAllocF: Fn(usize) -> *mut TimSortRun, + RunDeallocF: Fn(*mut TimSortRun, usize), + { + buf_ptr: *mut TimSortRun, + capacity: usize, + len: usize, + run_alloc_fn: RunAllocF, + run_dealloc_fn: RunDeallocF, + } + + impl RunVec + where + RunAllocF: Fn(usize) -> *mut TimSortRun, + RunDeallocF: Fn(*mut TimSortRun, usize), + { + fn new(run_alloc_fn: RunAllocF, run_dealloc_fn: RunDeallocF) -> Self { + // Most slices can be sorted with at most 16 runs in-flight. + const START_RUN_CAPACITY: usize = 16; + + Self { + buf_ptr: run_alloc_fn(START_RUN_CAPACITY), + capacity: START_RUN_CAPACITY, + len: 0, + run_alloc_fn, + run_dealloc_fn, + } + } + + fn push(&mut self, val: TimSortRun) { + if self.len == self.capacity { + let old_capacity = self.capacity; + let old_buf_ptr = self.buf_ptr; + + self.capacity = self.capacity * 2; + self.buf_ptr = (self.run_alloc_fn)(self.capacity); + + // SAFETY: buf_ptr new and old were correctly allocated and old_buf_ptr has + // old_capacity valid elements. + unsafe { + ptr::copy_nonoverlapping(old_buf_ptr, self.buf_ptr, old_capacity); + } + + (self.run_dealloc_fn)(old_buf_ptr, old_capacity); + } + + // SAFETY: len < capacity after the growth step above; assumes run_alloc_fn's buffer is valid.
+ unsafe { + self.buf_ptr.add(self.len).write(val); + } + self.len += 1; + } + + fn remove(&mut self, index: usize) { + if index >= self.len { + panic!("Index out of bounds"); + } + + // SAFETY: buf_ptr needs to be valid and len invariant upheld. + unsafe { + // the place we are taking from. + let ptr = self.buf_ptr.add(index); + + // Shift everything down to fill in that spot. + ptr::copy(ptr.add(1), ptr, self.len - index - 1); + } + self.len -= 1; + } + + fn as_slice(&self) -> &[TimSortRun] { + // SAFETY: Safe as long as buf_ptr is valid and len invariant was upheld. + unsafe { &*ptr::slice_from_raw_parts(self.buf_ptr, self.len) } + } + + fn len(&self) -> usize { + self.len + } + } + + impl core::ops::Index for RunVec + where + RunAllocF: Fn(usize) -> *mut TimSortRun, + RunDeallocF: Fn(*mut TimSortRun, usize), + { + type Output = TimSortRun; + + fn index(&self, index: usize) -> &Self::Output { + if index < self.len { + // SAFETY: buf_ptr and len invariant must be upheld. + unsafe { + return &*(self.buf_ptr.add(index)); + } + } + + panic!("Index out of bounds"); + } + } + + impl core::ops::IndexMut for RunVec + where + RunAllocF: Fn(usize) -> *mut TimSortRun, + RunDeallocF: Fn(*mut TimSortRun, usize), + { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + if index < self.len { + // SAFETY: buf_ptr and len invariant must be upheld. + unsafe { + return &mut *(self.buf_ptr.add(index)); + } + } + + panic!("Index out of bounds"); + } + } + + impl Drop for RunVec + where + RunAllocF: Fn(usize) -> *mut TimSortRun, + RunDeallocF: Fn(*mut TimSortRun, usize), + { + fn drop(&mut self) { + // As long as TimSortRun is Copy we don't need to drop them individually but just the + // whole allocation. + (self.run_dealloc_fn)(self.buf_ptr, self.capacity); + } + } +} + +/// Internal type used by merge_sort. +#[derive(Clone, Copy, Debug)] +pub struct TimSortRun { + len: usize, + start: usize, +}