forked from rust-lang/compiler-builtins
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use REP MOVSQ/STOSQ on x86_64 (rust-lang#365)
* mem: Move mem* functions to separate directory Signed-off-by: Joe Richey <joerichey@google.com> * memcpy: Create separate memcpy.rs file Signed-off-by: Joe Richey <joerichey@google.com> * benches: Add benchmarks for mem* functions This allows comparing the "normal" implementations to the implementations provided by this crate. Signed-off-by: Joe Richey <joerichey@google.com> * mem: Add REP MOVSB/STOSB implementations The assembly generated seems correct: https://rust.godbolt.org/z/GGnec8 Signed-off-by: Joe Richey <joerichey@google.com> * mem: Add documentations for REP string insturctions Signed-off-by: Joe Richey <joerichey@google.com> * Use quad-word rep string instructions Signed-off-by: Joe Richey <joerichey@google.com> * Prevent panic when compiled in debug mode Signed-off-by: Joe Richey <joerichey@google.com> * Add tests for mem* functions Signed-off-by: Joe Richey <joerichey@google.com> * Add build/test with the "asm" feature Signed-off-by: Joe Richey <joerichey@google.com> * Add byte length to Bencher Signed-off-by: Joe Richey <joerichey@google.com>
- Loading branch information
1 parent
58c23d4
commit e49b2d2
Showing
6 changed files
with
423 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
use super::c_int; | ||
|
||
/// Copies `n` bytes from `src` to `dest`, one byte at a time from the lowest
/// address upwards, and returns `dest`.
///
/// # Safety
///
/// `src` must be valid for reads of `n` bytes and `dest` must be valid for
/// writes of `n` bytes. Because bytes are copied front to back, a `dest` that
/// overlaps the region just above `src` will see already-overwritten data;
/// use `memmove` when the regions may overlap.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    let mut i = 0;
    while i < n {
        // `add` takes the `usize` index directly; no `usize -> isize` cast
        // is needed as it was with `offset`.
        *dest.add(i) = *src.add(i);
        i += 1;
    }
    dest
}
|
||
/// Copies `n` bytes from `src` to `dest`, tolerating overlapping regions, and
/// returns `dest`.
///
/// The copy direction is chosen from the relative position of the two
/// pointers so that every source byte is read before it can be overwritten.
///
/// # Safety
///
/// `src` must be valid for reads of `n` bytes and `dest` must be valid for
/// writes of `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    if src < dest as *const u8 {
        // `dest` lies above `src`: copy from the last byte down, so the tail
        // of `src` is consumed before the writes reach it.
        let mut i = n;
        while i != 0 {
            i -= 1;
            // `add` takes the `usize` index directly (no cast to `isize`).
            *dest.add(i) = *src.add(i);
        }
    } else {
        // `dest` is at or below `src`: a plain front-to-back copy is safe.
        let mut i = 0;
        while i < n {
            *dest.add(i) = *src.add(i);
            i += 1;
        }
    }
    dest
}
|
||
/// Fills the `n` bytes starting at `s` with the low 8 bits of `c` and returns
/// `s`.
///
/// # Safety
///
/// `s` must be valid for writes of `n` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    // Only the low byte of `c` is stored; truncate once instead of per write.
    let byte = c as u8;
    let mut i = 0;
    while i < n {
        // `add` takes the `usize` index directly (no cast to `isize`).
        *s.add(i) = byte;
        i += 1;
    }
    s
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
use super::c_int; | ||
|
||
// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have | ||
// been enhanced to perform better than a simple qword loop, making them ideal | ||
// for implementing memcpy/memset. Note that "rep cmps" has received no such | ||
// enhancement, so it is not used to implement memcmp. | ||
// | ||
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been | ||
// further enhanced to automatically select the best microarchitectural | ||
// implementation based on length and alignment. See the following features from | ||
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual": | ||
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later) | ||
// - FSRM - Fast Short REP MOV (Ice Lake and later) | ||
// - Fast Zero-Length MOVSB (On no current hardware) | ||
// - Fast Short STOSB (On no current hardware) | ||
// However, to avoid run-time feature detection, we don't use these byte-based | ||
// instructions for most of the copying, preferring the qword variants. | ||
|
||
/// Copies `count` bytes from `src` to `dest` using the x86_64 string-move
/// instructions, and returns `dest`.
///
/// The bulk of the data is moved eight bytes at a time with `rep movsq`; the
/// remaining `count % 8` bytes are moved with `rep movsb`.
///
/// # Safety
///
/// `src` must be valid for reads of `count` bytes and `dest` must be valid
/// for writes of `count` bytes. The copy runs from low to high addresses.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
    // Split the length into whole qwords and a 0..=7 byte remainder.
    let whole_qwords = count / 8;
    let tail_bytes = count % 8;
    asm!(
        // rcx = number of qwords; rsi/rdi advance as the copy proceeds.
        "rep movsq [rdi], [rsi]",
        // Reload the counter with the remainder and finish byte-wise.
        "mov ecx, {byte_count:e}",
        "rep movsb [rdi], [rsi]",
        byte_count = in(reg) tail_bytes,
        inout("rsi") src => _,
        inout("rdi") dest => _,
        inout("rcx") whole_qwords => _,
        options(nostack, preserves_flags)
    );
    dest
}
|
||
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)] | ||
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 { | ||
let delta = (dest as usize).wrapping_sub(src as usize); | ||
if delta >= count { | ||
// We can copy forwards because either dest is far enough ahead of src, | ||
// or src is ahead of dest (and delta overflowed). | ||
return self::memcpy(dest, src, count); | ||
} | ||
// copy backwards | ||
let qword_count = count >> 3; | ||
let byte_count = count & 0b111; | ||
asm!( | ||
"std", | ||
"rep movsq [rdi], [rsi]", | ||
"mov ecx, {byte_count:e}", | ||
"add rdi, 7", | ||
"add rsi, 7", | ||
"rep movsb [rdi], [rsi]", | ||
"cld", | ||
byte_count = in(reg) byte_count, | ||
inout("rcx") qword_count => _, | ||
inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _, | ||
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _, | ||
options(nostack) | ||
); | ||
dest | ||
} | ||
|
||
/// Fills `count` bytes starting at `dest` with the low 8 bits of `c` using
/// the x86_64 string-store instructions, and returns `dest`.
///
/// Eight bytes at a time are written with `rep stosq`; the remaining
/// `count % 8` bytes are written with `rep stosb`.
///
/// # Safety
///
/// `dest` must be valid for writes of `count` bytes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
    // Split the length into whole qwords and a 0..=7 byte remainder.
    let whole_qwords = count / 8;
    let tail_bytes = count % 8;
    // Replicate the fill byte into every byte lane of a qword.
    let pattern = u64::from(c as u8) * 0x0101010101010101;
    asm!(
        "rep stosq [rdi], rax",
        // Reload the counter with the remainder and finish byte-wise.
        "mov ecx, {byte_count:e}",
        "rep stosb [rdi], al",
        byte_count = in(reg) tail_bytes,
        in("rax") pattern,
        inout("rdi") dest => _,
        inout("rcx") whole_qwords => _,
        options(nostack, preserves_flags)
    );
    dest
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
#![feature(test)] | ||
|
||
extern crate test; | ||
use test::{black_box, Bencher}; | ||
|
||
extern crate compiler_builtins; | ||
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset}; | ||
|
||
fn memcpy_builtin(b: &mut Bencher, n: usize) { | ||
let v1 = vec![1u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let src: &[u8] = black_box(&v1); | ||
let dst: &mut [u8] = black_box(&mut v2); | ||
dst.copy_from_slice(src); | ||
}) | ||
} | ||
|
||
fn memcpy_rust(b: &mut Bencher, n: usize) { | ||
let v1 = vec![1u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let src: &[u8] = black_box(&v1); | ||
let dst: &mut [u8] = black_box(&mut v2); | ||
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) } | ||
}) | ||
} | ||
|
||
fn memset_builtin(b: &mut Bencher, n: usize) { | ||
let mut v1 = vec![0u8; n]; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let dst: &mut [u8] = black_box(&mut v1); | ||
let val: u8 = black_box(27); | ||
for b in dst { | ||
*b = val; | ||
} | ||
}) | ||
} | ||
|
||
fn memset_rust(b: &mut Bencher, n: usize) { | ||
let mut v1 = vec![0u8; n]; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let dst: &mut [u8] = black_box(&mut v1); | ||
let val = black_box(27); | ||
unsafe { memset(dst.as_mut_ptr(), val, n) } | ||
}) | ||
} | ||
|
||
fn memcmp_builtin(b: &mut Bencher, n: usize) { | ||
let v1 = vec![0u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
v2[n - 1] = 1; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let s1: &[u8] = black_box(&v1); | ||
let s2: &[u8] = black_box(&v2); | ||
s1.cmp(s2) | ||
}) | ||
} | ||
|
||
fn memcmp_rust(b: &mut Bencher, n: usize) { | ||
let v1 = vec![0u8; n]; | ||
let mut v2 = vec![0u8; n]; | ||
v2[n - 1] = 1; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let s1: &[u8] = black_box(&v1); | ||
let s2: &[u8] = black_box(&v2); | ||
unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) } | ||
}) | ||
} | ||
|
||
fn memmove_builtin(b: &mut Bencher, n: usize) { | ||
let mut v = vec![0u8; n + n / 2]; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let s: &mut [u8] = black_box(&mut v); | ||
s.copy_within(0..n, n / 2); | ||
}) | ||
} | ||
|
||
fn memmove_rust(b: &mut Bencher, n: usize) { | ||
let mut v = vec![0u8; n + n / 2]; | ||
b.bytes = n as u64; | ||
b.iter(|| { | ||
let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr(); | ||
let src: *const u8 = black_box(&v).as_ptr(); | ||
unsafe { memmove(dst, src, n) }; | ||
}) | ||
} | ||
|
||
#[bench] | ||
fn memcpy_builtin_4096(b: &mut Bencher) { | ||
memcpy_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memcpy_rust_4096(b: &mut Bencher) { | ||
memcpy_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memcpy_builtin_1048576(b: &mut Bencher) { | ||
memcpy_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memcpy_rust_1048576(b: &mut Bencher) { | ||
memcpy_rust(b, 1048576) | ||
} | ||
|
||
#[bench] | ||
fn memset_builtin_4096(b: &mut Bencher) { | ||
memset_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memset_rust_4096(b: &mut Bencher) { | ||
memset_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memset_builtin_1048576(b: &mut Bencher) { | ||
memset_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memset_rust_1048576(b: &mut Bencher) { | ||
memset_rust(b, 1048576) | ||
} | ||
|
||
#[bench] | ||
fn memcmp_builtin_4096(b: &mut Bencher) { | ||
memcmp_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memcmp_rust_4096(b: &mut Bencher) { | ||
memcmp_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memcmp_builtin_1048576(b: &mut Bencher) { | ||
memcmp_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memcmp_rust_1048576(b: &mut Bencher) { | ||
memcmp_rust(b, 1048576) | ||
} | ||
|
||
#[bench] | ||
fn memmove_builtin_4096(b: &mut Bencher) { | ||
memmove_builtin(b, 4096) | ||
} | ||
#[bench] | ||
fn memmove_rust_4096(b: &mut Bencher) { | ||
memmove_rust(b, 4096) | ||
} | ||
#[bench] | ||
fn memmove_builtin_1048576(b: &mut Bencher) { | ||
memmove_builtin(b, 1048576) | ||
} | ||
#[bench] | ||
fn memmove_rust_1048576(b: &mut Bencher) { | ||
memmove_rust(b, 1048576) | ||
} |
Oops, something went wrong.