Skip to content

Commit

Permalink
Use REP MOVSQ/STOSQ on x86_64 (rust-lang#365)
Browse files Browse the repository at this point in the history
* mem: Move mem* functions to separate directory

Signed-off-by: Joe Richey <joerichey@google.com>

* memcpy: Create separate memcpy.rs file

Signed-off-by: Joe Richey <joerichey@google.com>

* benches: Add benchmarks for mem* functions

This allows comparing the "normal" implementations to the
implementations provided by this crate.

Signed-off-by: Joe Richey <joerichey@google.com>

* mem: Add REP MOVSB/STOSB implementations

The assembly generated seems correct:
    https://rust.godbolt.org/z/GGnec8

Signed-off-by: Joe Richey <joerichey@google.com>

* mem: Add documentation for REP string instructions

Signed-off-by: Joe Richey <joerichey@google.com>

* Use quad-word rep string instructions

Signed-off-by: Joe Richey <joerichey@google.com>

* Prevent panic when compiled in debug mode

Signed-off-by: Joe Richey <joerichey@google.com>

* Add tests for mem* functions

Signed-off-by: Joe Richey <joerichey@google.com>

* Add build/test with the "asm" feature

Signed-off-by: Joe Richey <joerichey@google.com>

* Add byte length to Bencher

Signed-off-by: Joe Richey <joerichey@google.com>
  • Loading branch information
josephlr authored and AaronKutch committed Nov 28, 2020
1 parent 58c23d4 commit e49b2d2
Show file tree
Hide file tree
Showing 6 changed files with 423 additions and 39 deletions.
4 changes: 4 additions & 0 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ else
$run --release
$run --features c
$run --features c --release
$run --features asm
$run --features asm --release
fi

cargo build --target $1
cargo build --target $1 --release
cargo build --target $1 --features c
cargo build --target $1 --release --features c
cargo build --target $1 --features asm
cargo build --target $1 --release --features asm

PREFIX=$(echo $1 | sed -e 's/unknown-//')-
case $1 in
Expand Down
41 changes: 41 additions & 0 deletions src/mem/memcpy.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use super::c_int;

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    // C `memcpy`: copy `n` bytes from `src` to `dest`, front to back, and
    // return `dest`. The regions must not overlap — use `memmove` for that.
    for i in 0..n {
        *dest.add(i) = *src.add(i);
    }
    dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    // C `memmove`: copy `n` bytes between possibly-overlapping regions.
    if (dest as *const u8) > src {
        // dest starts above src: walk backwards so overlapping source bytes
        // are read before they are overwritten.
        let mut idx = n;
        while idx > 0 {
            idx -= 1;
            *dest.add(idx) = *src.add(idx);
        }
    } else {
        // dest at or below src: a plain forward pass is safe.
        for idx in 0..n {
            *dest.add(idx) = *src.add(idx);
        }
    }
    dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    // C `memset`: fill the first `n` bytes at `s` with the low byte of `c`
    // (the int-to-byte truncation matches the C standard) and return `s`.
    let byte = c as u8;
    for i in 0..n {
        *s.add(i) = byte;
    }
    s
}
43 changes: 4 additions & 39 deletions src/mem.rs → src/mem/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,45 +9,10 @@ use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}
use core::mem;
use core::ops::{BitOr, Shl};

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
// C `memcpy`: copy `n` bytes from `src` to `dest`, front to back, returning
// `dest`. Caller must ensure the regions do not overlap (see `memmove`).
let mut i = 0;
while i < n {
*dest.offset(i as isize) = *src.offset(i as isize);
i += 1;
}
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
// C `memmove`: copy `n` bytes between possibly-overlapping regions,
// returning `dest`. Direction is chosen so source bytes are read before
// they are overwritten.
if src < dest as *const u8 {
// copy from end
let mut i = n;
while i != 0 {
i -= 1;
*dest.offset(i as isize) = *src.offset(i as isize);
}
} else {
// copy from beginning
let mut i = 0;
while i < n {
*dest.offset(i as isize) = *src.offset(i as isize);
i += 1;
}
}
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
// C `memset`: fill the first `n` bytes at `s` with the low byte of `c`
// (`c as u8` truncates, matching the C standard) and return `s`.
let mut i = 0;
while i < n {
*s.offset(i as isize) = c as u8;
i += 1;
}
s
}
// memcpy/memmove/memset have optimized implementations on some architectures
// With the "asm" feature on x86_64 the `memcpy` module is sourced from
// x86_64.rs (REP MOVS/STOS implementations); otherwise the portable
// byte-loop versions in memcpy.rs are compiled.
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
mod memcpy;
pub use self::memcpy::*;

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
Expand Down
79 changes: 79 additions & 0 deletions src/mem/x86_64.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
use super::c_int;

// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
// However, to avoid run-time feature detection, we don't use these byte-based
// instructions for most of the copying, preferring the qword variants.

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
// Forward copy: `count / 8` qwords via REP MOVSQ, then the remaining
// `count % 8` bytes via REP MOVSB. The regions must not overlap.
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
"rep movsq [rdi], [rsi]",
// REP leaves RCX == 0, so loading ECX zero-extends the byte remainder
// into all of RCX for the tail copy.
"mov ecx, {byte_count:e}",
"rep movsb [rdi], [rsi]",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
// REP MOVS and MOV do not modify arithmetic flags, and DF is left as-is,
// so `preserves_flags` is sound here.
options(nostack, preserves_flags)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
// Unsigned distance from src to dest; wraps when dest < src.
let delta = (dest as usize).wrapping_sub(src as usize);
if delta >= count {
// We can copy forwards because either dest is far enough ahead of src,
// or src is ahead of dest (and delta overflowed).
return self::memcpy(dest, src, count);
}
// copy backwards
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
// Set DF so the REP string instructions walk downwards.
"std",
// RDI/RSI start at the last qword of each region (dest/src + count - 8).
"rep movsq [rdi], [rsi]",
"mov ecx, {byte_count:e}",
// After the qword pass RDI/RSI sit 8 below the remaining tail; +7 points
// them at the last of the `byte_count` leftover bytes.
"add rdi, 7",
"add rsi, 7",
"rep movsb [rdi], [rsi]",
// Restore DF = 0, as required by the System V ABI on function boundaries.
"cld",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _,
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
// No `preserves_flags`: this block deliberately toggles DF.
options(nostack)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
// Fill `count / 8` qwords via REP STOSQ, then the remaining `count % 8`
// bytes via REP STOSB, using RAX pre-splatted with the fill byte.
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
"rep stosq [rdi], rax",
// RCX is 0 after REP; reload it with the byte remainder for the tail.
"mov ecx, {byte_count:e}",
"rep stosb [rdi], al",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest => _,
// Multiplying the truncated byte by 0x0101_0101_0101_0101 replicates it
// into all eight byte lanes of RAX.
in("rax") (c as u8 as u64) * 0x0101010101010101,
options(nostack, preserves_flags)
);
dest
}
162 changes: 162 additions & 0 deletions testcrate/benches/mem.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#![feature(test)]

extern crate test;
use test::{black_box, Bencher};

extern crate compiler_builtins;
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};

// Baseline: copy via the slice built-in (`copy_from_slice`), which lowers
// to the compiler's own memcpy.
fn memcpy_builtin(b: &mut Bencher, n: usize) {
    let source = vec![1u8; n];
    let mut target = vec![0u8; n];
    b.bytes = n as u64;
    b.iter(|| {
        let from: &[u8] = black_box(&source);
        let to: &mut [u8] = black_box(&mut target);
        to.copy_from_slice(from);
    })
}

// Same workload as `memcpy_builtin`, but through this crate's `memcpy`.
fn memcpy_rust(b: &mut Bencher, n: usize) {
    let source = vec![1u8; n];
    let mut target = vec![0u8; n];
    b.bytes = n as u64;
    b.iter(|| {
        let from: &[u8] = black_box(&source);
        let to: &mut [u8] = black_box(&mut target);
        unsafe { memcpy(to.as_mut_ptr(), from.as_ptr(), n) }
    })
}

// Baseline: byte-fill via a plain loop, which the compiler recognizes as memset.
fn memset_builtin(b: &mut Bencher, n: usize) {
    let mut target = vec![0u8; n];
    b.bytes = n as u64;
    b.iter(|| {
        let out: &mut [u8] = black_box(&mut target);
        let fill: u8 = black_box(27);
        for byte in out {
            *byte = fill;
        }
    })
}

// Same workload as `memset_builtin`, but through this crate's `memset`.
fn memset_rust(b: &mut Bencher, n: usize) {
    let mut target = vec![0u8; n];
    b.bytes = n as u64;
    b.iter(|| {
        let out: &mut [u8] = black_box(&mut target);
        let fill = black_box(27);
        unsafe { memset(out.as_mut_ptr(), fill, n) }
    })
}

// Baseline: slice comparison via `Ord::cmp`. The buffers differ only in the
// final byte, so the comparison must scan the full length.
fn memcmp_builtin(b: &mut Bencher, n: usize) {
    let lhs = vec![0u8; n];
    let mut rhs = vec![0u8; n];
    rhs[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let x: &[u8] = black_box(&lhs);
        let y: &[u8] = black_box(&rhs);
        x.cmp(y)
    })
}

// Same workload as `memcmp_builtin`, but through this crate's `memcmp`.
fn memcmp_rust(b: &mut Bencher, n: usize) {
    let lhs = vec![0u8; n];
    let mut rhs = vec![0u8; n];
    rhs[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let x: &[u8] = black_box(&lhs);
        let y: &[u8] = black_box(&rhs);
        unsafe { memcmp(x.as_ptr(), y.as_ptr(), n) }
    })
}

// Baseline: overlapping copy via `slice::copy_within` (shift by n/2).
fn memmove_builtin(b: &mut Bencher, n: usize) {
    let mut buf = vec![0u8; n + n / 2];
    b.bytes = n as u64;
    b.iter(|| {
        let region: &mut [u8] = black_box(&mut buf);
        region.copy_within(0..n, n / 2);
    })
}

// Same overlapping-copy workload, but through this crate's `memmove`.
fn memmove_rust(b: &mut Bencher, n: usize) {
    let mut buf = vec![0u8; n + n / 2];
    b.bytes = n as u64;
    b.iter(|| {
        let dst: *mut u8 = black_box(&mut buf[n / 2..]).as_mut_ptr();
        let src: *const u8 = black_box(&buf).as_ptr();
        unsafe { memmove(dst, src, n) };
    })
}

// Benchmark entry points: each `*_builtin` run (idiomatic Rust lowered to the
// compiler-provided routine) is paired with the corresponding `*_rust` run
// (this crate's implementation), at 4 KiB and 1 MiB buffer sizes.
#[bench]
fn memcpy_builtin_4096(b: &mut Bencher) {
memcpy_builtin(b, 4096)
}
#[bench]
fn memcpy_rust_4096(b: &mut Bencher) {
memcpy_rust(b, 4096)
}
#[bench]
fn memcpy_builtin_1048576(b: &mut Bencher) {
memcpy_builtin(b, 1048576)
}
#[bench]
fn memcpy_rust_1048576(b: &mut Bencher) {
memcpy_rust(b, 1048576)
}

#[bench]
fn memset_builtin_4096(b: &mut Bencher) {
memset_builtin(b, 4096)
}
#[bench]
fn memset_rust_4096(b: &mut Bencher) {
memset_rust(b, 4096)
}
#[bench]
fn memset_builtin_1048576(b: &mut Bencher) {
memset_builtin(b, 1048576)
}
#[bench]
fn memset_rust_1048576(b: &mut Bencher) {
memset_rust(b, 1048576)
}

#[bench]
fn memcmp_builtin_4096(b: &mut Bencher) {
memcmp_builtin(b, 4096)
}
#[bench]
fn memcmp_rust_4096(b: &mut Bencher) {
memcmp_rust(b, 4096)
}
#[bench]
fn memcmp_builtin_1048576(b: &mut Bencher) {
memcmp_builtin(b, 1048576)
}
#[bench]
fn memcmp_rust_1048576(b: &mut Bencher) {
memcmp_rust(b, 1048576)
}

#[bench]
fn memmove_builtin_4096(b: &mut Bencher) {
memmove_builtin(b, 4096)
}
#[bench]
fn memmove_rust_4096(b: &mut Bencher) {
memmove_rust(b, 4096)
}
#[bench]
fn memmove_builtin_1048576(b: &mut Bencher) {
memmove_builtin(b, 1048576)
}
#[bench]
fn memmove_rust_1048576(b: &mut Bencher) {
memmove_rust(b, 1048576)
}
Loading

0 comments on commit e49b2d2

Please sign in to comment.