-
Notifications
You must be signed in to change notification settings - Fork 12.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Stop manually SIMDing in swap_nonoverlapping
Like I previously did for `reverse`, this leaves it to LLVM to pick how to vectorize it, since it can know better the chunk size to use, compared to the "32 bytes always" approach we currently have. It does still need logic to type-erase where appropriate, though, as while LLVM is now smart enough to vectorize over slices of things like `[u8; 4]`, it fails to do so over slices of `[u8; 3]`. As a bonus, this also means one no longer gets the spurious `memcpy`(s?) at the end up swapping a slice of `__m256`s: <https://rust.godbolt.org/z/joofr4v8Y>
- Loading branch information
Showing
6 changed files
with
263 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
// compile-flags: -O | ||
// only-x86_64 | ||
// ignore-debug: the debug assertions get in the way | ||
|
||
#![crate_type = "lib"] | ||
|
||
use std::mem::swap; | ||
use std::ptr::{read, copy_nonoverlapping, write}; | ||
|
||
type KeccakBuffer = [[u64; 5]; 5]; | ||
|
||
// A basic read+copy+write swap implementation ends up copying one of the values | ||
// to stack for large types, which is completely unnecessary as the lack of | ||
// overlap means we can just do whatever fits in registers at a time. | ||
|
||
// CHECK-LABEL: @swap_basic | ||
#[no_mangle] | ||
pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { | ||
// CHECK: alloca [5 x [5 x i64]] | ||
|
||
// SAFETY: exclusive references are always valid to read/write, | ||
// are non-overlapping, and nothing here panics so it's drop-safe. | ||
unsafe { | ||
let z = read(x); | ||
copy_nonoverlapping(y, x, 1); | ||
write(y, z); | ||
} | ||
} | ||
|
||
// This test verifies that the library does something smarter, and thus | ||
// doesn't need any scratch space on the stack. | ||
|
||
// CHECK-LABEL: @swap_std | ||
#[no_mangle] | ||
pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { | ||
// CHECK-NOT: alloca | ||
// CHECK: load <{{[0-9]+}} x i64> | ||
// CHECK: store <{{[0-9]+}} x i64> | ||
swap(x, y) | ||
} | ||
|
||
// CHECK-LABEL: @swap_slice | ||
#[no_mangle] | ||
pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) { | ||
// CHECK-NOT: alloca | ||
// CHECK: load <{{[0-9]+}} x i64> | ||
// CHECK: store <{{[0-9]+}} x i64> | ||
if x.len() == y.len() { | ||
x.swap_with_slice(y); | ||
} | ||
} | ||
|
||
type OneKilobyteBuffer = [u8; 1024]; | ||
|
||
// CHECK-LABEL: @swap_1kb_slices | ||
#[no_mangle] | ||
pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) { | ||
// CHECK-NOT: alloca | ||
// CHECK: load <{{[0-9]+}} x i8> | ||
// CHECK: store <{{[0-9]+}} x i8> | ||
if x.len() == y.len() { | ||
x.swap_with_slice(y); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
// compile-flags: -O -C target-feature=+avx | ||
// only-x86_64 | ||
// ignore-debug: the debug assertions get in the way | ||
|
||
#![crate_type = "lib"] | ||
|
||
use std::mem::swap; | ||
|
||
// SIMD types are highly-aligned already, so make sure the swap code leaves their | ||
// types alone and doesn't pessimize them (such as by swapping them as `usize`s). | ||
extern crate core; | ||
use core::arch::x86_64::__m256; | ||
|
||
// CHECK-LABEL: @swap_single_m256 | ||
#[no_mangle] | ||
pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) { | ||
// CHECK-NOT: alloca | ||
// CHECK: load <8 x float>{{.+}}align 32 | ||
// CHECK: store <8 x float>{{.+}}align 32 | ||
swap(x, y) | ||
} | ||
|
||
// CHECK-LABEL: @swap_m256_slice | ||
#[no_mangle] | ||
pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) { | ||
// CHECK-NOT: alloca | ||
// CHECK: load <8 x float>{{.+}}align 32 | ||
// CHECK: store <8 x float>{{.+}}align 32 | ||
if x.len() == y.len() { | ||
x.swap_with_slice(y); | ||
} | ||
} |
Oops, something went wrong.