Reuse the mem::swap optimizations to speed up slice::rotate

scottmcm · scottmcm · commit 47fa016193a7 · 2017-06-21T22:58:45.000-07:00
Exposes the swapping logic from PR #40454 as `pub unsafe fn ptr::swap_nonoverlapping`, gated behind the unstable `swap_nonoverlapping` feature.

This is most helpful for compound types where LLVM didn't vectorize the loop. Highlight: the `slice::rotate_medium_by727_strings` benchmark gets 38% faster.
diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
@@ -499,59 +499,7 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
-        // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
-        // Haswell E processors. LLVM is more able to optimize if we give a struct a
-        // #[repr(simd)], even if we don't actually use this struct directly.
-        //
-        // FIXME repr(simd) broken on emscripten and redox
-        #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
-        struct Block(u64, u64, u64, u64);
-        struct UnalignedBlock(u64, u64, u64, u64);
-
-        let block_size = size_of::<Block>();
-
-        // Get raw pointers to the bytes of x & y for easier manipulation
-        let x = x as *mut T as *mut u8;
-        let y = y as *mut T as *mut u8;
-
-        // Loop through x & y, copying them `Block` at a time
-        // The optimizer should unroll the loop fully for most types
-        // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
-        let len = size_of::<T>();
-        let mut i = 0;
-        while i + block_size <= len {
-            // Create some uninitialized memory as scratch space
-            // Declaring `t` here avoids aligning the stack when this loop is unused
-            let mut t: Block = uninitialized();
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            // Swap a block of bytes of x & y, using t as a temporary buffer
-            // This should be optimized into efficient SIMD operations where available
-            ptr::copy_nonoverlapping(x, t, block_size);
-            ptr::copy_nonoverlapping(y, x, block_size);
-            ptr::copy_nonoverlapping(t, y, block_size);
-            i += block_size;
-        }
-
-
-        if i < len {
-            // Swap any remaining bytes, using aligned types to copy
-            // where appropriate (this information is lost by conversion
-            // to *mut u8, so restore it manually here)
-            let mut t: UnalignedBlock = uninitialized();
-            let rem = len - i;
-
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            ptr::copy_nonoverlapping(x, t, rem);
-            ptr::copy_nonoverlapping(y, x, rem);
-            ptr::copy_nonoverlapping(t, y, rem);
-        }
+        ptr::swap_nonoverlapping(x, y, 1);
     }
 }
 
diff --git a/src/libcore/ptr.rs b/src/libcore/ptr.rs
@@ -117,6 +117,90 @@ pub unsafe fn swap<T>(x: *mut T, y: *mut T) {
     mem::forget(tmp);
 }
 
+/// Swaps a sequence of values at two mutable locations of the same type.
+///
+/// # Safety
+///
+/// The two arguments must each point to the beginning of `count` locations
+/// of valid memory, and the two memory ranges must not overlap.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// #![feature(swap_nonoverlapping)]
+///
+/// use std::ptr;
+///
+/// let mut x = [1, 2, 3, 4];
+/// let mut y = [7, 8, 9];
+///
+/// unsafe {
+///     ptr::swap_nonoverlapping(x.as_mut_ptr(), y.as_mut_ptr(), 2);
+/// }
+///
+/// assert_eq!(x, [7, 8, 3, 4]);
+/// assert_eq!(y, [1, 2, 9]);
+/// ```
+#[inline]
+#[unstable(feature = "swap_nonoverlapping", issue = "42818")]
+pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    let x = x as *mut u8;
+    let y = y as *mut u8;
+    let len = mem::size_of::<T>() * count;
+    swap_nonoverlapping_bytes(x, y, len)
+}
+
+#[inline]
+unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
+    // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+    // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+    // Haswell E processors. LLVM is more able to optimize if we give a struct a
+    // #[repr(simd)], even if we don't actually use this struct directly.
+    //
+    // FIXME repr(simd) broken on emscripten and redox
+    #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
+    struct Block(u64, u64, u64, u64);
+    struct UnalignedBlock(u64, u64, u64, u64);
+
+    let block_size = mem::size_of::<Block>();
+
+    // Loop through x & y, copying them `Block` at a time
+    // The optimizer should unroll the loop fully for most types
+    // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
+    let mut i = 0;
+    while i + block_size <= len {
+        // Create some uninitialized memory as scratch space
+        // Declaring `t` here avoids aligning the stack when this loop is unused
+        let mut t: Block = mem::uninitialized();
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        // Swap a block of bytes of x & y, using t as a temporary buffer
+        // This should be optimized into efficient SIMD operations where available
+        copy_nonoverlapping(x, t, block_size);
+        copy_nonoverlapping(y, x, block_size);
+        copy_nonoverlapping(t, y, block_size);
+        i += block_size;
+    }
+
+    if i < len {
+        // Swap any remaining bytes
+        let mut t: UnalignedBlock = mem::uninitialized();
+        let rem = len - i;
+
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        copy_nonoverlapping(x, t, rem);
+        copy_nonoverlapping(y, x, rem);
+        copy_nonoverlapping(t, y, rem);
+    }
+}
+
 /// Replaces the value at `dest` with `src`, returning the old
 /// value, without dropping either.
 ///
diff --git a/src/libcore/slice/rotate.rs b/src/libcore/slice/rotate.rs
@@ -76,7 +76,7 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
             break;
         }
 
-        ptr_swap_n(
+        ptr::swap_nonoverlapping(
             mid.offset(-(left as isize)),
             mid.offset((right-delta) as isize),
             delta);
@@ -103,10 +103,3 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
         ptr::copy_nonoverlapping(buf, mid.offset(-(left as isize)), right);
     }
 }
-
-unsafe fn ptr_swap_n<T>(a: *mut T, b: *mut T, n: usize) {
-    for i in 0..n {
-        // These are nonoverlapping, so use mem::swap instead of ptr::swap
-        mem::swap(&mut *a.offset(i as isize), &mut *b.offset(i as isize));
-    }
-}

Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {`
`76`	`76`	`break;`
`77`	`77`	`}`
`78`	`78`
`79`		`- ptr_swap_n(`
	`79`	`+ ptr::swap_nonoverlapping(`
`80`	`80`	`mid.offset(-(left as isize)),`
`81`	`81`	`mid.offset((right-delta) as isize),`
`82`	`82`	`delta);`
`@@ -103,10 +103,3 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {`
`103`	`103`	`ptr::copy_nonoverlapping(buf, mid.offset(-(left as isize)), right);`
`104`	`104`	`}`
`105`	`105`	`}`
`106`		`-`
`107`		`-unsafe fn ptr_swap_n<T>(a: *mut T, b: *mut T, n: usize) {`
`108`		`- for i in 0..n {`
`109`		`- // These are nonoverlapping, so use mem::swap instead of ptr::swap`
`110`		`- mem::swap(&mut *a.offset(i as isize), &mut *b.offset(i as isize));`
`111`		`- }`
`112`		`-}`