diff --git a/build.rs b/build.rs
index d4cfe0e1..fb3dc373 100644
--- a/build.rs
+++ b/build.rs
@@ -33,6 +33,11 @@ fn main() {
         println!("cargo:rustc-cfg=feature=\"mem\"");
     }
 
+    // These targets have hardware unaligned access support.
+    if target.contains("x86_64") || target.contains("i686") || target.contains("aarch64") {
+        println!("cargo:rustc-cfg=feature=\"mem-unaligned\"");
+    }
+
     // NOTE we are going to assume that llvm-target, what determines our codegen option, matches the
     // target triple. This is usually correct for our built-in targets but can break in presence of
     // custom targets, which can have arbitrary names.
diff --git a/src/mem/impls.rs b/src/mem/impls.rs
index b3eef990..65887a33 100644
--- a/src/mem/impls.rs
+++ b/src/mem/impls.rs
@@ -1,27 +1,264 @@
+use core::intrinsics::likely;
+
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+const WORD_MASK: usize = WORD_SIZE - 1;
+
+// If the number of bytes involved exceeds this threshold we opt in to word-wise copy.
+// The value selected here is max(2 * WORD_SIZE, 16):
+// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through
+//   word-wise copy.
+// * The word-wise copy logic needs to perform some checks, so it has a small fixed overhead. The
+//   floor of 16 ensures that even on 32-bit platforms at least 8 bytes are copied word-wise, so
+//   the savings outweigh the fixed overhead.
+const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 {
+    2 * WORD_SIZE
+} else {
+    16
+};
+
+#[cfg(feature = "mem-unaligned")]
+unsafe fn read_usize_unaligned(x: *const usize) -> usize {
+    // Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which
+    // is translated to memcpy in LLVM.
+    let x_read = (x as *const [u8; core::mem::size_of::<usize>()]).read();
+    core::mem::transmute(x_read)
+}
+
 #[inline(always)]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
-    let mut i = 0;
-    while i < n {
-        *dest.add(i) = *src.add(i);
-        i += 1;
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
+    #[inline(always)]
+    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
+        let dest_end = dest.add(n);
+        while dest < dest_end {
+            *dest = *src;
+            dest = dest.add(1);
+            src = src.add(1);
+        }
+    }
+
+    #[inline(always)]
+    unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_end = dest.add(n) as *mut usize;
+
+        while dest_usize < dest_end {
+            *dest_usize = *src_usize;
+            dest_usize = dest_usize.add(1);
+            src_usize = src_usize.add(1);
+        }
+    }
+
+    #[cfg(not(feature = "mem-unaligned"))]
+    #[inline(always)]
+    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let dest_end = dest.add(n) as *mut usize;
+
+        // Calculate the misalignment offset and shift needed to reassemble the value.
+        let offset = src as usize & WORD_MASK;
+        let shift = offset * 8;
+
+        // Realign src
+        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
+        // This will read (but won't use) bytes out of bounds.
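+        // Reading a whole aligned word cannot cross a page boundary, because
+        // `src_aligned` is word-aligned, so the extra bytes are guaranteed to be
+        // readable. The unordered atomic load (rather than a plain dereference)
+        // should keep LLVM from treating the formally out-of-bounds read as UB.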
+        let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);
+
+        while dest_usize < dest_end {
+            src_aligned = src_aligned.add(1);
+            let cur_word = *src_aligned;
+            #[cfg(target_endian = "little")]
+            let reassembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift);
+            #[cfg(target_endian = "big")]
+            let reassembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift);
+            prev_word = cur_word;
+
+            *dest_usize = reassembled;
+            dest_usize = dest_usize.add(1);
+        }
+    }
+
+    #[cfg(feature = "mem-unaligned")]
+    #[inline(always)]
+    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_end = dest.add(n) as *mut usize;
+
+        while dest_usize < dest_end {
+            *dest_usize = read_usize_unaligned(src_usize);
+            dest_usize = dest_usize.add(1);
+            src_usize = src_usize.add(1);
+        }
     }
+
+    if n >= WORD_COPY_THRESHOLD {
+        // Align dest
+        // Because of n >= 2 * WORD_SIZE, dest_misalignment < n
+        let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK;
+        copy_forward_bytes(dest, src, dest_misalignment);
+        dest = dest.add(dest_misalignment);
+        src = src.add(dest_misalignment);
+        n -= dest_misalignment;
+
+        let n_words = n & !WORD_MASK;
+        let src_misalignment = src as usize & WORD_MASK;
+        if likely(src_misalignment == 0) {
+            copy_forward_aligned_words(dest, src, n_words);
+        } else {
+            copy_forward_misaligned_words(dest, src, n_words);
+        }
+        dest = dest.add(n_words);
+        src = src.add(n_words);
+        n -= n_words;
+    }
+    copy_forward_bytes(dest, src, n);
 }
 
 #[inline(always)]
-pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
-    // copy from end
-    let mut i = n;
-    while i != 0 {
-        i -= 1;
-        *dest.add(i) = *src.add(i);
+pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) {
+    // The following backward copy helper functions use pointers past the end
+    // as their inputs, instead of pointers to the start!
+    #[inline(always)]
+    unsafe fn copy_backward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
+        let dest_start = dest.sub(n);
+        while dest_start < dest {
+            dest = dest.sub(1);
+            src = src.sub(1);
+            *dest = *src;
+        }
     }
+
+    #[inline(always)]
+    unsafe fn copy_backward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_start = dest.sub(n) as *mut usize;
+
+        while dest_start < dest_usize {
+            dest_usize = dest_usize.sub(1);
+            src_usize = src_usize.sub(1);
+            *dest_usize = *src_usize;
+        }
+    }
+
+    #[cfg(not(feature = "mem-unaligned"))]
+    #[inline(always)]
+    unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let dest_start = dest.sub(n) as *mut usize;
+
+        // Calculate the misalignment offset and shift needed to reassemble the value.
+        let offset = src as usize & WORD_MASK;
+        let shift = offset * 8;
+
+        // Realign src
+        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
+        // This will read (but won't use) bytes out of bounds.
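+        // As in `copy_forward_misaligned_words`, this aligned load may touch bytes
+        // outside the requested range, but it stays within a single aligned word
+        // and therefore within the same page.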
+        let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);
+
+        while dest_start < dest_usize {
+            src_aligned = src_aligned.sub(1);
+            let cur_word = *src_aligned;
+            #[cfg(target_endian = "little")]
+            let reassembled = prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift;
+            #[cfg(target_endian = "big")]
+            let reassembled = prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift;
+            prev_word = cur_word;
+
+            dest_usize = dest_usize.sub(1);
+            *dest_usize = reassembled;
+        }
+    }
+
+    #[cfg(feature = "mem-unaligned")]
+    #[inline(always)]
+    unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_start = dest.sub(n) as *mut usize;
+
+        while dest_start < dest_usize {
+            dest_usize = dest_usize.sub(1);
+            src_usize = src_usize.sub(1);
+            *dest_usize = read_usize_unaligned(src_usize);
+        }
+    }
+
+    let mut dest = dest.add(n);
+    let mut src = src.add(n);
+
+    if n >= WORD_COPY_THRESHOLD {
+        // Align dest
+        // Because of n >= 2 * WORD_SIZE, dest_misalignment < n
+        let dest_misalignment = dest as usize & WORD_MASK;
+        copy_backward_bytes(dest, src, dest_misalignment);
+        dest = dest.sub(dest_misalignment);
+        src = src.sub(dest_misalignment);
+        n -= dest_misalignment;
+
+        let n_words = n & !WORD_MASK;
+        let src_misalignment = src as usize & WORD_MASK;
+        if likely(src_misalignment == 0) {
+            copy_backward_aligned_words(dest, src, n_words);
+        } else {
+            copy_backward_misaligned_words(dest, src, n_words);
+        }
+        dest = dest.sub(n_words);
+        src = src.sub(n_words);
+        n -= n_words;
+    }
+    copy_backward_bytes(dest, src, n);
 }
 
 #[inline(always)]
-pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
-    let mut i = 0;
-    while i < n {
-        *s.add(i) = c;
-        i += 1;
+pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
+    #[inline(always)]
+    pub unsafe fn set_bytes_bytes(mut s: *mut u8, c: u8, n: usize) {
+        let end = s.add(n);
+        while s < end {
+            *s = c;
+            s = s.add(1);
+        }
+    }
+
+    #[inline(always)]
+    pub unsafe fn set_bytes_words(s: *mut u8, c: u8, n: usize) {
+        let mut broadcast = c as usize;
+        let mut bits = 8;
+        while bits < WORD_SIZE * 8 {
+            broadcast |= broadcast << bits;
+            bits *= 2;
+        }
+
+        let mut s_usize = s as *mut usize;
+        let end = s.add(n) as *mut usize;
+
+        while s_usize < end {
+            *s_usize = broadcast;
+            s_usize = s_usize.add(1);
+        }
+    }
+
+    if likely(n >= WORD_COPY_THRESHOLD) {
+        // Align s
+        // Because of n >= 2 * WORD_SIZE, misalignment < n
+        let misalignment = (s as usize).wrapping_neg() & WORD_MASK;
+        set_bytes_bytes(s, c, misalignment);
+        s = s.add(misalignment);
+        n -= misalignment;
+
+        let n_words = n & !WORD_MASK;
+        set_bytes_words(s, c, n_words);
+        s = s.add(n_words);
+        n -= n_words;
     }
+    set_bytes_bytes(s, c, n);
 }
diff --git a/testcrate/benches/mem.rs b/testcrate/benches/mem.rs
index cee64ae4..b6883a93 100644
--- a/testcrate/benches/mem.rs
+++ b/testcrate/benches/mem.rs
@@ -6,30 +6,67 @@ use test::{black_box, Bencher};
 extern crate compiler_builtins;
 use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
 
-fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
-    let v1 = vec![1u8; n + offset];
-    let mut v2 = vec![0u8; n + offset];
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+
+struct AlignedVec {
+    vec: Vec<usize>,
+    size: usize,
+}
+
+impl AlignedVec {
+    fn new(fill: u8, size: usize) -> Self {
+        let mut broadcast = fill as usize;
+        let mut bits = 8;
+        while bits < WORD_SIZE * 8 {
+            broadcast |= broadcast << bits;
+            bits *= 2;
+        }
+
+        let vec = vec![broadcast; (size + WORD_SIZE - 1) / WORD_SIZE];
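+        // A `Vec<usize>` buffer is word-aligned, so the benchmark offsets measure
+        // misalignment from a known word boundary; the length is rounded up to
+        // whole words so that `size` bytes always fit.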
+        AlignedVec { vec, size }
+    }
+}
+
+impl core::ops::Deref for AlignedVec {
+    type Target = [u8];
+    fn deref(&self) -> &[u8] {
+        unsafe { core::slice::from_raw_parts(self.vec.as_ptr() as *const u8, self.size) }
+    }
+}
+
+impl core::ops::DerefMut for AlignedVec {
+    fn deref_mut(&mut self) -> &mut [u8] {
+        unsafe { core::slice::from_raw_parts_mut(self.vec.as_mut_ptr() as *mut u8, self.size) }
+    }
+}
+
+fn memcpy_builtin(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
+    let v1 = AlignedVec::new(1, n + offset1);
+    let mut v2 = AlignedVec::new(0, n + offset2);
     b.bytes = n as u64;
     b.iter(|| {
-        let src: &[u8] = black_box(&v1[offset..]);
-        let dst: &mut [u8] = black_box(&mut v2[offset..]);
+        let src: &[u8] = black_box(&v1[offset1..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
         dst.copy_from_slice(src);
     })
 }
 
-fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
-    let v1 = vec![1u8; n + offset];
-    let mut v2 = vec![0u8; n + offset];
+fn memcpy_rust(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
+    let v1 = AlignedVec::new(1, n + offset1);
+    let mut v2 = AlignedVec::new(0, n + offset2);
     b.bytes = n as u64;
     b.iter(|| {
-        let src: &[u8] = black_box(&v1[offset..]);
-        let dst: &mut [u8] = black_box(&mut v2[offset..]);
+        let src: &[u8] = black_box(&v1[offset1..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
         unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
     })
 }
 
 fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
-    let mut v1 = vec![0u8; n + offset];
+    let mut v1 = AlignedVec::new(0, n + offset);
     b.bytes = n as u64;
     b.iter(|| {
         let dst: &mut [u8] = black_box(&mut v1[offset..]);
@@ -41,7 +78,7 @@ fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
 }
 
 fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
-    let mut v1 = vec![0u8; n + offset];
+    let mut v1 = AlignedVec::new(0, n + offset);
     b.bytes = n as u64;
     b.iter(|| {
         let dst: &mut [u8] = black_box(&mut v1[offset..]);
@@ -51,8 +88,8 @@ fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
 }
 
 fn memcmp_builtin(b: &mut Bencher, n: usize) {
-    let v1 = vec![0u8; n];
-    let mut v2 = vec![0u8; n];
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
     v2[n - 1] = 1;
     b.bytes = n as u64;
     b.iter(|| {
@@ -63,8 +100,8 @@ fn memcmp_builtin(b: &mut Bencher, n: usize) {
 }
 
 fn memcmp_rust(b: &mut Bencher, n: usize) {
-    let v1 = vec![0u8; n];
-    let mut v2 = vec![0u8; n];
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
     v2[n - 1] = 1;
     b.bytes = n as u64;
     b.iter(|| {
@@ -74,20 +111,23 @@ fn memcmp_rust(b: &mut Bencher, n: usize) {
     })
 }
 
-fn memmove_builtin(b: &mut Bencher, n: usize) {
-    let mut v = vec![0u8; n + n / 2];
+fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v = AlignedVec::new(0, n + n / 2 + offset);
     b.bytes = n as u64;
     b.iter(|| {
         let s: &mut [u8] = black_box(&mut v);
-        s.copy_within(0..n, n / 2);
+        s.copy_within(0..n, n / 2 + offset);
     })
 }
 
-fn memmove_rust(b: &mut Bencher, n: usize) {
-    let mut v = vec![0u8; n + n / 2];
+fn memmove_rust(b: &mut Bencher, n: usize, offset: usize) {
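+    // dst starts n/2 + offset bytes into v while src starts at its beginning, so
+    // the ranges overlap and memmove must take the backward-copy path; a nonzero
+    // `offset` additionally knocks dst off word alignment.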
+    let mut v = AlignedVec::new(0, n + n / 2 + offset);
     b.bytes = n as u64;
     b.iter(|| {
-        let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr();
+        let dst: *mut u8 = black_box(&mut v[n / 2 + offset..]).as_mut_ptr();
         let src: *const u8 = black_box(&v).as_ptr();
         unsafe { memmove(dst, src, n) };
     })
 }
 
@@ -95,35 +135,51 @@
 #[bench]
 fn memcpy_builtin_4096(b: &mut Bencher) {
-    memcpy_builtin(b, 4096, 0)
+    memcpy_builtin(b, 4096, 0, 0)
 }
 #[bench]
 fn memcpy_rust_4096(b: &mut Bencher) {
-    memcpy_rust(b, 4096, 0)
+    memcpy_rust(b, 4096, 0, 0)
 }
 #[bench]
 fn memcpy_builtin_1048576(b: &mut Bencher) {
-    memcpy_builtin(b, 1048576, 0)
+    memcpy_builtin(b, 1048576, 0, 0)
 }
 #[bench]
 fn memcpy_rust_1048576(b: &mut Bencher) {
-    memcpy_rust(b, 1048576, 0)
+    memcpy_rust(b, 1048576, 0, 0)
 }
 #[bench]
 fn memcpy_builtin_4096_offset(b: &mut Bencher) {
-    memcpy_builtin(b, 4096, 65)
+    memcpy_builtin(b, 4096, 65, 65)
 }
 #[bench]
 fn memcpy_rust_4096_offset(b: &mut Bencher) {
-    memcpy_rust(b, 4096, 65)
+    memcpy_rust(b, 4096, 65, 65)
 }
 #[bench]
 fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
-    memcpy_builtin(b, 1048576, 65)
+    memcpy_builtin(b, 1048576, 65, 65)
 }
 #[bench]
 fn memcpy_rust_1048576_offset(b: &mut Bencher) {
-    memcpy_rust(b, 1048576, 65)
+    memcpy_rust(b, 1048576, 65, 65)
+}
+#[bench]
+fn memcpy_builtin_4096_misalign(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 65, 66)
+}
+#[bench]
+fn memcpy_rust_4096_misalign(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 65, 66)
+}
+#[bench]
+fn memcpy_builtin_1048576_misalign(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 65, 66)
+}
+#[bench]
+fn memcpy_rust_1048576_misalign(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 65, 66)
 }
 
 #[bench]
@@ -178,17 +234,33 @@ fn memcmp_rust_1048576(b: &mut Bencher) {
 #[bench]
 fn memmove_builtin_4096(b: &mut Bencher) {
-    memmove_builtin(b, 4096)
+    memmove_builtin(b, 4096, 0)
 }
 #[bench]
 fn memmove_rust_4096(b: &mut Bencher) {
-    memmove_rust(b, 4096)
+    memmove_rust(b, 4096, 0)
 }
 #[bench]
 fn memmove_builtin_1048576(b: &mut Bencher) {
-    memmove_builtin(b, 1048576)
+    memmove_builtin(b, 1048576, 0)
 }
 #[bench]
 fn memmove_rust_1048576(b: &mut Bencher) {
-    memmove_rust(b, 1048576)
+    memmove_rust(b, 1048576, 0)
+}
+#[bench]
+fn memmove_builtin_4096_misalign(b: &mut Bencher) {
+    memmove_builtin(b, 4096, 1)
+}
+#[bench]
+fn memmove_rust_4096_misalign(b: &mut Bencher) {
+    memmove_rust(b, 4096, 1)
+}
+#[bench]
+fn memmove_builtin_1048576_misalign(b: &mut Bencher) {
+    memmove_builtin(b, 1048576, 1)
+}
+#[bench]
+fn memmove_rust_1048576_misalign(b: &mut Bencher) {
+    memmove_rust(b, 1048576, 1)
 }
diff --git a/testcrate/tests/mem.rs b/testcrate/tests/mem.rs
index a5596b28..3f20e72a 100644
--- a/testcrate/tests/mem.rs
+++ b/testcrate/tests/mem.rs
@@ -1,6 +1,8 @@
 extern crate compiler_builtins;
 use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
 
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+
 #[test]
 fn memcpy_3() {
     let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
@@ -131,3 +133,136 @@ fn memcmp_ne() {
         assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 8) > 0);
     }
 }
+
+#[derive(Clone, Copy)]
+struct AlignedStorage<const N: usize>([u8; N], [usize; 0]);
+
+fn gen_arr<const N: usize>() -> AlignedStorage<N> {
+    let mut ret = AlignedStorage::<N>([0; N], []);
+    for i in 0..N {
+        ret.0[i] = i as u8;
+    }
+    ret
+}
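+
+// The tests below exercise each word-wise path: forward and backward copies,
+// word-aligned and misaligned src, and aligned or non-aligned start. n = 17
+// exceeds WORD_COPY_THRESHOLD on both 32- and 64-bit targets and is not a
+// multiple of the word size, so the byte-wise prologue/epilogue and the
+// word-wise body all run.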
+
+#[test]
+fn memmove_forward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(6);
+        let dst = arr.0.as_mut_ptr().offset(3);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(6..6 + 17, 3);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_forward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(6);
+        let dst = arr.0.as_mut_ptr().add(0);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(6..6 + 17, 0);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_forward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().add(3 + WORD_SIZE);
+        let dst = arr.0.as_mut_ptr().add(3);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference
+            .0
+            .copy_within(3 + WORD_SIZE..3 + WORD_SIZE + 17, 3);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_backward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(3);
+        let dst = arr.0.as_mut_ptr().offset(6);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, 6);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_backward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(3);
+        let dst = arr.0.as_mut_ptr().add(WORD_SIZE);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, WORD_SIZE);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_backward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().add(3);
+        let dst = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, 3 + WORD_SIZE);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memset_backward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().offset(6);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(6), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memset_backward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().add(WORD_SIZE);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(WORD_SIZE), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memset_backward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(3 + WORD_SIZE), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
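
A standalone sketch of the shift-and-reassemble step used by
`copy_forward_misaligned_words`, assuming a 64-bit little-endian target (the
helper name and the test values are illustrative, not part of the patch):

    const WORD_SIZE: usize = core::mem::size_of::<usize>();

    // Combine two aligned loads into the word a misaligned load at `offset`
    // bytes past the aligned boundary would have produced. `offset` must be
    // in 1..WORD_SIZE, as in the patch's misaligned path (offset == 0 would
    // make `WORD_SIZE * 8 - shift` an overflowing shift).
    fn reassemble(prev_word: usize, cur_word: usize, offset: usize) -> usize {
        let shift = offset * 8;
        prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)
    }

    fn main() {
        // Bytes 0x00..=0x0F as two aligned little-endian words.
        let prev_word = 0x0706050403020100usize;
        let cur_word = 0x0F0E0D0C0B0A0908usize;
        // A word copied from byte offset 2 must contain bytes 0x02..=0x09.
        assert_eq!(reassemble(prev_word, cur_word, 2), 0x0908070605040302);
    }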