Reuse the mem::swap optimizations to speed up slice::rotate #42819

Merged 1 commit on Jun 28, 2017
src/libcore/mem.rs (1 addition & 53 deletions)

@@ -499,59 +499,7 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
-        // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
-        // Haswell E processors. LLVM is more able to optimize if we give a struct a
-        // #[repr(simd)], even if we don't actually use this struct directly.
-        //
-        // FIXME repr(simd) broken on emscripten and redox
-        #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
-        struct Block(u64, u64, u64, u64);
-        struct UnalignedBlock(u64, u64, u64, u64);
-
-        let block_size = size_of::<Block>();
-
-        // Get raw pointers to the bytes of x & y for easier manipulation
-        let x = x as *mut T as *mut u8;
-        let y = y as *mut T as *mut u8;
-
-        // Loop through x & y, copying them `Block` at a time
-        // The optimizer should unroll the loop fully for most types
-        // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
-        let len = size_of::<T>();
-        let mut i = 0;
-        while i + block_size <= len {
-            // Create some uninitialized memory as scratch space
-            // Declaring `t` here avoids aligning the stack when this loop is unused
-            let mut t: Block = uninitialized();
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            // Swap a block of bytes of x & y, using t as a temporary buffer
-            // This should be optimized into efficient SIMD operations where available
-            ptr::copy_nonoverlapping(x, t, block_size);
-            ptr::copy_nonoverlapping(y, x, block_size);
-            ptr::copy_nonoverlapping(t, y, block_size);
-            i += block_size;
-        }
-
-
-        if i < len {
-            // Swap any remaining bytes, using aligned types to copy
-            // where appropriate (this information is lost by conversion
-            // to *mut u8, so restore it manually here)
-            let mut t: UnalignedBlock = uninitialized();
-            let rem = len - i;
-
-            let t = &mut t as *mut _ as *mut u8;
-            let x = x.offset(i as isize);
-            let y = y.offset(i as isize);
-
-            ptr::copy_nonoverlapping(x, t, rem);
-            ptr::copy_nonoverlapping(y, x, rem);
-            ptr::copy_nonoverlapping(t, y, rem);
-        }
+        ptr::swap_nonoverlapping(x, y, 1);
     }
 }
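After this change `mem::swap` is a thin wrapper over the new `ptr::swap_nonoverlapping` with `count == 1`. A quick sanity check of the public API (ordinary usage, nothing beyond what the diff shows):

```rust
use std::mem;

fn main() {
    let mut a = [0u8; 64];
    let mut b = [0xffu8; 64];
    // Internally this is now ptr::swap_nonoverlapping(&mut a, &mut b, 1),
    // so the 64 bytes are swapped in Block-sized chunks.
    mem::swap(&mut a, &mut b);
    assert_eq!(a, [0xffu8; 64]);
    assert_eq!(b, [0u8; 64]);
}
```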

src/libcore/ptr.rs (84 additions & 0 deletions)

@@ -117,6 +117,90 @@ pub unsafe fn swap<T>(x: *mut T, y: *mut T) {
     mem::forget(tmp);
 }
+
+/// Swaps a sequence of values at two mutable locations of the same type.
+///
+/// # Safety
+///
+/// The two arguments must each point to the beginning of `count` locations
+/// of valid memory, and the two memory ranges must not overlap.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// #![feature(swap_nonoverlapping)]
+///
+/// use std::ptr;
+///
+/// let mut x = [1, 2, 3, 4];
+/// let mut y = [7, 8, 9];
+///
+/// unsafe {
+///     ptr::swap_nonoverlapping(x.as_mut_ptr(), y.as_mut_ptr(), 2);
+/// }
+///
+/// assert_eq!(x, [7, 8, 3, 4]);
+/// assert_eq!(y, [1, 2, 9]);
+/// ```
+#[inline]
+#[unstable(feature = "swap_nonoverlapping", issue = "42818")]
+pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    let x = x as *mut u8;
+    let y = y as *mut u8;
+    let len = mem::size_of::<T>() * count;
Inline review comment on the line above:

Contributor: Should this use `.checked_mul(count).unwrap()`?

Member (author): I don't think that's necessary, because of the safety requirement that the arguments refer to valid ranges of memory -- if it's valid, its length must fit in a usize.

(See the sketch after this diff for what the checked variant could look like.)
+    swap_nonoverlapping_bytes(x, y, len)
+}
+
+#[inline]
+unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
+    // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+    // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+    // Haswell E processors. LLVM is more able to optimize if we give a struct a
+    // #[repr(simd)], even if we don't actually use this struct directly.
+    //
+    // FIXME repr(simd) broken on emscripten and redox
+    #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
+    struct Block(u64, u64, u64, u64);
+    struct UnalignedBlock(u64, u64, u64, u64);
+
+    let block_size = mem::size_of::<Block>();
+
+    // Loop through x & y, copying them `Block` at a time
+    // The optimizer should unroll the loop fully for most types
+    // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
+    let mut i = 0;
+    while i + block_size <= len {
+        // Create some uninitialized memory as scratch space
+        // Declaring `t` here avoids aligning the stack when this loop is unused
+        let mut t: Block = mem::uninitialized();
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        // Swap a block of bytes of x & y, using t as a temporary buffer
+        // This should be optimized into efficient SIMD operations where available
+        copy_nonoverlapping(x, t, block_size);
+        copy_nonoverlapping(y, x, block_size);
+        copy_nonoverlapping(t, y, block_size);
+        i += block_size;
+    }
+
+    if i < len {
+        // Swap any remaining bytes
+        let mut t: UnalignedBlock = mem::uninitialized();
+        let rem = len - i;
+
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        copy_nonoverlapping(x, t, rem);
+        copy_nonoverlapping(y, x, rem);
+        copy_nonoverlapping(t, y, rem);
+    }
+}

 /// Replaces the value at `dest` with `src`, returning the old
 /// value, without dropping either.
 ///
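For reference, the checked variant the reviewer floated might look like the sketch below. The name `swap_nonoverlapping_checked` and its free-standing form are illustrative, not part of the PR; the PR keeps the plain multiply because, as the author notes, a valid allocation can never exceed `usize::MAX` bytes, so the check would add a branch with no soundness gain.

```rust
use std::{mem, ptr};

// Hypothetical defensive wrapper (not the PR's code): panic on byte-length
// overflow instead of relying solely on the caller-upheld validity rule.
unsafe fn swap_nonoverlapping_checked<T>(x: *mut T, y: *mut T, count: usize) {
    let len = mem::size_of::<T>()
        .checked_mul(count)
        .expect("total byte length overflows usize");
    // Swapping `len` individual bytes is equivalent to swapping
    // `count` values of type T.
    ptr::swap_nonoverlapping(x as *mut u8, y as *mut u8, len);
}
```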
src/libcore/slice/rotate.rs (1 addition & 8 deletions)

@@ -76,7 +76,7 @@ pub unsafe fn ptr_rotate<T>(mut left: usize, mid: *mut T, mut right: usize) {
             break;
         }
 
-        ptr_swap_n(
+        ptr::swap_nonoverlapping(
             mid.offset(-(left as isize)),
             mid.offset((right-delta) as isize),
             delta);
@@ -103,10 +103,3 @@ ptr::copy_nonoverlapping(buf, mid.offset(-(left as isize)), right);
         ptr::copy_nonoverlapping(buf, mid.offset(-(left as isize)), right);
     }
 }
-
-unsafe fn ptr_swap_n<T>(a: *mut T, b: *mut T, n: usize) {
-    for i in 0..n {
-        // These are nonoverlapping, so use mem::swap instead of ptr::swap
-        mem::swap(&mut *a.offset(i as isize), &mut *b.offset(i as isize));
-    }
-}
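The `ptr_rotate` touched above is the engine behind the slice rotation API, which at the time of this PR was the unstable `slice::rotate` and was later stabilized as `rotate_left`/`rotate_right`. A small usage check with the stabilized spelling:

```rust
fn main() {
    let mut v = [1, 2, 3, 4, 5, 6];
    // Moves the first two elements to the back; the bulk of the work happens
    // in ptr_rotate, whose block swaps now go through ptr::swap_nonoverlapping.
    v.rotate_left(2);
    assert_eq!(v, [3, 4, 5, 6, 1, 2]);
}
```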