Skip to content

Commit 47fa016

Browse files
committed
Reuse the mem::swap optimizations to speed up slice::rotate
Exposes the swapping logic from PR 40454 as `pub unsafe fn ptr::swap_nonoverlapping` under the `swap_nonoverlapping` feature. This is most helpful for compound types where LLVM didn't vectorize the loop. Highlight: the `slice::rotate_medium_by727_strings` benchmark gets 38% faster.
1 parent 6de26f4 commit 47fa016

File tree

3 files changed

+86
-61
lines changed

3 files changed

+86
-61
lines changed

src/libcore/mem.rs

+1-53
Original file line numberDiff line numberDiff line change
@@ -499,59 +499,7 @@ pub unsafe fn uninitialized<T>() -> T {
499499
#[stable(feature = "rust1", since = "1.0.0")]
500500
pub fn swap<T>(x: &mut T, y: &mut T) {
501501
unsafe {
502-
// The approach here is to utilize simd to swap x & y efficiently. Testing reveals
503-
// that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
504-
// Haswell E processors. LLVM is more able to optimize if we give a struct a
505-
// #[repr(simd)], even if we don't actually use this struct directly.
506-
//
507-
// FIXME repr(simd) broken on emscripten and redox
508-
#[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
509-
struct Block(u64, u64, u64, u64);
510-
struct UnalignedBlock(u64, u64, u64, u64);
511-
512-
let block_size = size_of::<Block>();
513-
514-
// Get raw pointers to the bytes of x & y for easier manipulation
515-
let x = x as *mut T as *mut u8;
516-
let y = y as *mut T as *mut u8;
517-
518-
// Loop through x & y, copying them `Block` at a time
519-
// The optimizer should unroll the loop fully for most types
520-
// N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
521-
let len = size_of::<T>();
522-
let mut i = 0;
523-
while i + block_size <= len {
524-
// Create some uninitialized memory as scratch space
525-
// Declaring `t` here avoids aligning the stack when this loop is unused
526-
let mut t: Block = uninitialized();
527-
let t = &mut t as *mut _ as *mut u8;
528-
let x = x.offset(i as isize);
529-
let y = y.offset(i as isize);
530-
531-
// Swap a block of bytes of x & y, using t as a temporary buffer
532-
// This should be optimized into efficient SIMD operations where available
533-
ptr::copy_nonoverlapping(x, t, block_size);
534-
ptr::copy_nonoverlapping(y, x, block_size);
535-
ptr::copy_nonoverlapping(t, y, block_size);
536-
i += block_size;
537-
}
538-
539-
540-
if i < len {
541-
// Swap any remaining bytes, using aligned types to copy
542-
// where appropriate (this information is lost by conversion
543-
// to *mut u8, so restore it manually here)
544-
let mut t: UnalignedBlock = uninitialized();
545-
let rem = len - i;
546-
547-
let t = &mut t as *mut _ as *mut u8;
548-
let x = x.offset(i as isize);
549-
let y = y.offset(i as isize);
550-
551-
ptr::copy_nonoverlapping(x, t, rem);
552-
ptr::copy_nonoverlapping(y, x, rem);
553-
ptr::copy_nonoverlapping(t, y, rem);
554-
}
502+
ptr::swap_nonoverlapping(x, y, 1);
555503
}
556504
}
557505

src/libcore/ptr.rs

+84
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,90 @@ pub unsafe fn swap<T>(x: *mut T, y: *mut T) {
117117
mem::forget(tmp);
118118
}
119119

120+
/// Swaps a sequence of values at two mutable locations of the same type.
121+
///
122+
/// # Safety
123+
///
124+
/// The two arguments must each point to the beginning of `count` locations
125+
/// of valid memory, and the two memory ranges must not overlap.
126+
///
127+
/// # Examples
128+
///
129+
/// Basic usage:
130+
///
131+
/// ```
132+
/// #![feature(swap_nonoverlapping)]
133+
///
134+
/// use std::ptr;
135+
///
136+
/// let mut x = [1, 2, 3, 4];
137+
/// let mut y = [7, 8, 9];
138+
///
139+
/// unsafe {
140+
/// ptr::swap_nonoverlapping(x.as_mut_ptr(), y.as_mut_ptr(), 2);
141+
/// }
142+
///
143+
/// assert_eq!(x, [7, 8, 3, 4]);
144+
/// assert_eq!(y, [1, 2, 9]);
145+
/// ```
146+
#[inline]
147+
#[unstable(feature = "swap_nonoverlapping", issue = "42818")]
148+
pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
149+
let x = x as *mut u8;
150+
let y = y as *mut u8;
151+
let len = mem::size_of::<T>() * count;
152+
swap_nonoverlapping_bytes(x, y, len)
153+
}
154+
155+
#[inline]
156+
unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
157+
// The approach here is to utilize simd to swap x & y efficiently. Testing reveals
158+
// that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
159+
// Haswell E processors. LLVM is more able to optimize if we give a struct a
160+
// #[repr(simd)], even if we don't actually use this struct directly.
161+
//
162+
// FIXME repr(simd) broken on emscripten and redox
163+
#[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
164+
struct Block(u64, u64, u64, u64);
165+
struct UnalignedBlock(u64, u64, u64, u64);
166+
167+
let block_size = mem::size_of::<Block>();
168+
169+
// Loop through x & y, copying them `Block` at a time
170+
// The optimizer should unroll the loop fully for most types
171+
// N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
172+
let mut i = 0;
173+
while i + block_size <= len {
174+
// Create some uninitialized memory as scratch space
175+
// Declaring `t` here avoids aligning the stack when this loop is unused
176+
let mut t: Block = mem::uninitialized();
177+
let t = &mut t as *mut _ as *mut u8;
178+
let x = x.offset(i as isize);
179+
let y = y.offset(i as isize);
180+
181+
// Swap a block of bytes of x & y, using t as a temporary buffer
182+
// This should be optimized into efficient SIMD operations where available
183+
copy_nonoverlapping(x, t, block_size);
184+
copy_nonoverlapping(y, x, block_size);
185+
copy_nonoverlapping(t, y, block_size);
186+
i += block_size;
187+
}
188+
189+
if i < len {
190+
// Swap any remaining bytes
191+
let mut t: UnalignedBlock = mem::uninitialized();
192+
let rem = len - i;
193+
194+
let t = &mut t as *mut _ as *mut u8;
195+
let x = x.offset(i as isize);
196+
let y = y.offset(i as isize);
197+
198+
copy_nonoverlapping(x, t, rem);
199+
copy_nonoverlapping(y, x, rem);
200+
copy_nonoverlapping(t, y, rem);
201+
}
202+
}
203+
120204
/// Replaces the value at `dest` with `src`, returning the old
121205
/// value, without dropping either.
122206
///

src/libcore/slice/rotate.rs

+1-8
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
7676
break;
7777
}
7878

79-
ptr_swap_n(
79+
ptr::swap_nonoverlapping(
8080
mid.offset(-(left as isize)),
8181
mid.offset((right-delta) as isize),
8282
delta);
@@ -103,10 +103,3 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
103103
ptr::copy_nonoverlapping(buf, mid.offset(-(left as isize)), right);
104104
}
105105
}
106-
107-
unsafe fn ptr_swap_n<T>(a: *mut T, b: *mut T, n: usize) {
108-
for i in 0..n {
109-
// These are nonoverlapping, so use mem::swap instead of ptr::swap
110-
mem::swap(&mut *a.offset(i as isize), &mut *b.offset(i as isize));
111-
}
112-
}

0 commit comments

Comments
 (0)