Commit 71f5cfb

MIRI says reverse is UB, so replace it with an implementation that LLVM can vectorize
For small types with padding, the current implementation is UB because it does integer operations on uninit values. But LLVM has gotten smarter since I wrote the previous implementation in 2017, so remove all the manual magic and just write it in such a way that LLVM will vectorize. This code is much simpler (albeit nuanced) and has very little `unsafe`, and is actually faster to boot!
1 parent: 14a2fd6
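To make the padding issue concrete, here is a minimal, hypothetical illustration (not part of this commit): `Padded` is two bytes large, but its second byte is padding and therefore uninitialized, so the old 2-byte specialization's `u32`-at-a-time reads pull uninitialized bytes into integer arithmetic.

```rust
use std::mem;

// Hypothetical element type: size 2, but byte 1 is padding and thus uninitialized.
#[repr(align(2))]
#[derive(Copy, Clone)]
struct Padded(u8);

fn main() {
    assert_eq!(mem::size_of::<Padded>(), 2);

    let mut xs = [Padded(1), Padded(2), Padded(3), Padded(4)];

    // The old size_of::<T>() == 2 path effectively did
    //     ptr::read_unaligned(xs.as_ptr() as *const u32)
    // and rotated the result, i.e. integer operations on bytes that include
    // padding -- the uninitialized read that Miri flags as UB.
    // The new implementation only ever swaps whole `Padded` values.
    xs.reverse();

    let reversed: Vec<u8> = xs.iter().map(|p| p.0).collect();
    assert_eq!(reversed, [4, 3, 2, 1]);
}
```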

2 files changed (+58 -91 lines)

library/core/src/slice/mod.rs (+32 -91)
@@ -621,100 +621,41 @@ impl<T> [T] {
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
     pub fn reverse(&mut self) {
-        let mut i: usize = 0;
-        let ln = self.len();
-
-        // For very small types, all the individual reads in the normal
-        // path perform poorly. We can do better, given efficient unaligned
-        // load/store, by loading a larger chunk and reversing a register.
-
-        // Ideally LLVM would do this for us, as it knows better than we do
-        // whether unaligned reads are efficient (since that changes between
-        // different ARM versions, for example) and what the best chunk size
-        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
-        // the loop, so we need to do this ourselves. (Hypothesis: reverse
-        // is troublesome because the sides can be aligned differently --
-        // will be, when the length is odd -- so there's no way of emitting
-        // pre- and postludes to use fully-aligned SIMD in the middle.)
-
-        let fast_unaligned = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
-
-        if fast_unaligned && mem::size_of::<T>() == 1 {
-            // Use the llvm.bswap intrinsic to reverse u8s in a usize
-            let chunk = mem::size_of::<usize>();
-            while i + chunk - 1 < ln / 2 {
-                // SAFETY: There are several things to check here:
-                //
-                // - Note that `chunk` is either 4 or 8 due to the cfg check
-                //   above. So `chunk - 1` is positive.
-                // - Indexing with index `i` is fine as the loop check guarantees
-                //   `i + chunk - 1 < ln / 2`
-                //   <=> `i < ln / 2 - (chunk - 1) < ln / 2 < ln`.
-                // - Indexing with index `ln - i - chunk = ln - (i + chunk)` is fine:
-                //   - `i + chunk > 0` is trivially true.
-                //   - The loop check guarantees:
-                //     `i + chunk - 1 < ln / 2`
-                //     <=> `i + chunk ≤ ln / 2 ≤ ln`, thus subtraction does not underflow.
-                // - The `read_unaligned` and `write_unaligned` calls are fine:
-                //   - `pa` points to index `i` where `i < ln / 2 - (chunk - 1)`
-                //     (see above) and `pb` points to index `ln - i - chunk`, so
-                //     both are at least `chunk`
-                //     many bytes away from the end of `self`.
-                //   - Any initialized memory is valid `usize`.
-                unsafe {
-                    let ptr = self.as_mut_ptr();
-                    let pa = ptr.add(i);
-                    let pb = ptr.add(ln - i - chunk);
-                    let va = ptr::read_unaligned(pa as *mut usize);
-                    let vb = ptr::read_unaligned(pb as *mut usize);
-                    ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
-                    ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
-                }
-                i += chunk;
-            }
-        }
+        let half_len = self.len() / 2;
+        let Range { start, end } = self.as_mut_ptr_range();
+
+        // These slices will skip the middle item for an odd length,
+        // since that one doesn't need to move.
+        let (front_half, back_half) =
+            // SAFETY: Both are subparts of the original slice, so the memory
+            // range is valid, and they don't overlap because they're each only
+            // half (or less) of the original slice.
+            unsafe {
+                (
+                    slice::from_raw_parts_mut(start, half_len),
+                    slice::from_raw_parts_mut(end.sub(half_len), half_len),
+                )
+            };

-        if fast_unaligned && mem::size_of::<T>() == 2 {
-            // Use rotate-by-16 to reverse u16s in a u32
-            let chunk = mem::size_of::<u32>() / 2;
-            while i + chunk - 1 < ln / 2 {
-                // SAFETY: An unaligned u32 can be read from `i` if `i + 1 < ln`
-                // (and obviously `i < ln`), because each element is 2 bytes and
-                // we're reading 4.
-                //
-                // `i + chunk - 1 < ln / 2` # while condition
-                // `i + 2 - 1 < ln / 2`
-                // `i + 1 < ln / 2`
-                //
-                // Since it's less than the length divided by 2, then it must be
-                // in bounds.
-                //
-                // This also means that the condition `0 < i + chunk <= ln` is
-                // always respected, ensuring the `pb` pointer can be used
-                // safely.
-                unsafe {
-                    let ptr = self.as_mut_ptr();
-                    let pa = ptr.add(i);
-                    let pb = ptr.add(ln - i - chunk);
-                    let va = ptr::read_unaligned(pa as *mut u32);
-                    let vb = ptr::read_unaligned(pb as *mut u32);
-                    ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
-                    ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
-                }
-                i += chunk;
-            }
-        }
+        // Introducing a function boundary here means that the two halves
+        // get `noalias` markers, allowing better optimization as LLVM
+        // knows that they're disjoint, unlike in the original slice.
+        revswap(front_half, back_half, half_len);

-        while i < ln / 2 {
-            // SAFETY: `i` is inferior to half the length of the slice so
-            // accessing `i` and `ln - i - 1` is safe (`i` starts at 0 and
-            // will not go further than `ln / 2 - 1`).
-            // The resulting pointers `pa` and `pb` are therefore valid and
-            // aligned, and can be read from and written to.
-            unsafe {
-                self.swap_unchecked(i, ln - i - 1);
+        #[inline]
+        fn revswap<T>(a: &mut [T], b: &mut [T], n: usize) {
+            debug_assert_eq!(a.len(), n);
+            debug_assert_eq!(b.len(), n);
+
+            // Because this function is first compiled in isolation,
+            // this check tells LLVM that the indexing below is
+            // in-bounds. Then after inlining -- once the actual
+            // lengths of the slices are known -- it's removed.
+            let (a, b) = (&mut a[..n], &mut b[..n]);
+
+            for i in 0..n {
+                mem::swap(&mut a[i], &mut b[n - 1 - i]);
             }
-            i += 1;
         }
     }
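For readers who want to experiment with this pattern outside of `core`, here is a hedged standalone sketch of the same split-then-swap approach. The function name `reverse_vectorizable` and the `main` harness are illustrative additions, not part of the commit; std's `<[T]>::reverse` above is the real implementation.

```rust
use std::mem;
use std::slice;

// Standalone version of the pattern above, usable outside `core`.
pub fn reverse_vectorizable<T>(v: &mut [T]) {
    let half_len = v.len() / 2;
    let range = v.as_mut_ptr_range();

    // SAFETY: both halves lie inside `v`, and since each is at most half the
    // slice (the middle element of an odd-length slice is skipped), they
    // cannot overlap.
    let (front, back) = unsafe {
        (
            slice::from_raw_parts_mut(range.start, half_len),
            slice::from_raw_parts_mut(range.end.sub(half_len), half_len),
        )
    };

    // The separate function gives LLVM `noalias` on the two halves.
    revswap(front, back, half_len);

    #[inline]
    fn revswap<T>(a: &mut [T], b: &mut [T], n: usize) {
        // Re-slicing to `n` lets LLVM prove the indexing below is in bounds.
        let (a, b) = (&mut a[..n], &mut b[..n]);
        for i in 0..n {
            mem::swap(&mut a[i], &mut b[n - 1 - i]);
        }
    }
}

fn main() {
    let mut xs = [1, 2, 3, 4, 5];
    reverse_vectorizable(&mut xs);
    assert_eq!(xs, [5, 4, 3, 2, 1]);
}
```

With optimizations enabled, LLVM can see that the two halves are disjoint and vectorize the swap loop, which is exactly what the codegen test below checks for.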

src/test/codegen/slice-reverse.rs (+26)
@@ -0,0 +1,26 @@
+// compile-flags: -O
+// only-x86_64
+
+#![crate_type = "lib"]
+
+// CHECK-LABEL: @slice_reverse_u8
+#[no_mangle]
+pub fn slice_reverse_u8(slice: &mut [u8]) {
+    // CHECK-NOT: panic_bounds_check
+    // CHECK-NOT: slice_end_index_len_fail
+    // CHECK: shufflevector <{{[0-9]+}} x i8>
+    // CHECK-NOT: panic_bounds_check
+    // CHECK-NOT: slice_end_index_len_fail
+    slice.reverse();
+}
+
+// CHECK-LABEL: @slice_reverse_i32
+#[no_mangle]
+pub fn slice_reverse_i32(slice: &mut [i32]) {
+    // CHECK-NOT: panic_bounds_check
+    // CHECK-NOT: slice_end_index_len_fail
+    // CHECK: shufflevector <{{[0-9]+}} x i32>
+    // CHECK-NOT: panic_bounds_check
+    // CHECK-NOT: slice_end_index_len_fail
+    slice.reverse();
+}
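To see the vectorization without the in-tree test harness, one rough approach (an assumed workflow, not part of the commit) is to compile a tiny crate like the sketch below with `rustc -O --emit=llvm-ir --crate-type=lib` and search the emitted `.ll` file for `shufflevector`:

```rust
// Hypothetical standalone probe; build with:
//     rustc -O --emit=llvm-ir --crate-type=lib probe.rs
// then look for `shufflevector` in the generated probe.ll.
#[no_mangle]
pub fn probe_reverse_u8(slice: &mut [u8]) {
    slice.reverse();
}
```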
