diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index cabcfc9b42b4e..f2b211a200727 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -498,6 +498,23 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             }
         }

+        sym::untyped_swap_nonoverlapping => {
+            // The fallback impl uses memcpy, which leaves around allocas
+            // that don't optimize out for certain widths, so force it to
+            // use SSA registers instead.
+
+            let chunk_ty = fn_args.type_at(0);
+            let layout = self.layout_of(chunk_ty).layout;
+            let integer_ty = self.type_ix(layout.size().bits());
+            let a = args[0].immediate();
+            let b = args[1].immediate();
+            let a_val = self.load(integer_ty, a, layout.align().abi);
+            let b_val = self.load(integer_ty, b, layout.align().abi);
+            self.store(b_val, a, layout.align().abi);
+            self.store(a_val, b, layout.align().abi);
+            return Ok(());
+        }
+
         sym::compare_bytes => {
             // Here we assume that the `memcmp` provided by the target is a NOP for size 0.
             let cmp = self.call_intrinsic("memcmp", &[
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index fd78bf3e8fc60..ce884c4fbf4b5 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -504,6 +504,12 @@ pub fn check_intrinsic_type(
         sym::typed_swap_nonoverlapping => {
             (1, 0, vec![Ty::new_mut_ptr(tcx, param(0)); 2], tcx.types.unit)
         }
+        sym::untyped_swap_nonoverlapping => (
+            1,
+            0,
+            vec![Ty::new_mut_ptr(tcx, Ty::new_maybe_uninit(tcx, param(0))); 2],
+            tcx.types.unit,
+        ),

         sym::discriminant_value => {
             let assoc_items = tcx.associated_item_def_ids(
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index 4ecc4201f89d5..f580a104c28a6 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -2142,6 +2142,7 @@ symbols! {
                     unstable location; did you mean to load this crate \
                     from crates.io via `Cargo.toml` instead?",
         untagged_unions,
+        untyped_swap_nonoverlapping,
         unused_imports,
         unwind,
         unwind_attributes,
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index b5c31d824677d..922116625a0fa 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -66,7 +66,7 @@ use crate::marker::{DiscriminantKind, Tuple};
 use crate::mem::SizedTypeProperties;
-use crate::{ptr, ub_checks};
+use crate::{mem, ptr, ub_checks};

 pub mod fallback;
 pub mod mir;
@@ -4003,7 +4003,37 @@ pub use typed_swap as typed_swap_nonoverlapping;
 pub const unsafe fn typed_swap_nonoverlapping<T>(x: *mut T, y: *mut T) {
     // SAFETY: The caller provided single non-overlapping items behind
     // pointers, so swapping them with `count: 1` is fine.
-    unsafe { ptr::swap_nonoverlapping(x, y, 1) };
+    unsafe { crate::swapping::swap_nonoverlapping(x, y, 1) };
+}
+
+/// Swaps the `N` untyped & non-overlapping bytes behind the two pointers.
+///
+/// Split out from `typed_swap` for the internal swaps in `swap_nonoverlapping`
+/// which would otherwise cause cycles between the fallback implementations on
+/// backends where neither is overridden.
+///
+/// # Safety
+///
+/// `x` and `y` are readable and writable as `MaybeUninit<T>` and non-overlapping.
+#[inline]
+#[rustc_nounwind]
+#[cfg_attr(not(bootstrap), rustc_intrinsic)]
+#[miri::intrinsic_fallback_is_spec]
+#[rustc_const_stable_indirect]
+pub const unsafe fn untyped_swap_nonoverlapping<T>(
+    x: *mut mem::MaybeUninit<T>,
+    y: *mut mem::MaybeUninit<T>,
+) {
+    // This intentionally uses untyped memory copies, not reads/writes,
+    // to avoid any risk of losing padding in things like (u16, u8).
+    let mut temp = mem::MaybeUninit::<T>::uninit();
+    // SAFETY: Caller promised that x and y are non-overlapping & read/writeable,
+    // and our fresh local is always disjoint from anything otherwise readable.
+    unsafe {
+        (&raw mut temp).copy_from_nonoverlapping(x, 1);
+        x.copy_from_nonoverlapping(y, 1);
+        y.copy_from_nonoverlapping(&raw const temp, 1);
+    }
 }

 /// Returns whether we should perform some UB-checking at runtime. This eventually evaluates to
diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
index 0d8a3811eded1..88eac4f4cc21b 100644
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@@ -376,6 +376,7 @@ pub mod alloc;
 // note: does not need to be public
 mod bool;
 mod escape;
+pub(crate) mod swapping;
 mod tuple;
 mod unit;

diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index ac074c097d94c..b285d6a4a78de 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -395,7 +395,6 @@
 #![allow(clippy::not_unsafe_ptr_arg_deref)]
 use crate::cmp::Ordering;
-use crate::intrinsics::const_eval_select;
 use crate::marker::FnPtr;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
 use crate::{fmt, hash, intrinsics, ub_checks};
@@ -1092,84 +1091,8 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
         }
     );

-    const_eval_select!(
-        @capture[T] { x: *mut T, y: *mut T, count: usize }:
-        if const {
-            // At compile-time we want to always copy this in chunks of `T`, to ensure that if there
-            // are pointers inside `T` we will copy them in one go rather than trying to copy a part
-            // of a pointer (which would not work).
-            // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
-        } else {
-            macro_rules! attempt_swap_as_chunks {
-                ($ChunkTy:ty) => {
-                    if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
-                        && mem::size_of::<T>() % mem::size_of::<$ChunkTy>() == 0
-                    {
-                        let x: *mut $ChunkTy = x.cast();
-                        let y: *mut $ChunkTy = y.cast();
-                        let count = count * (mem::size_of::<T>() / mem::size_of::<$ChunkTy>());
-                        // SAFETY: these are the same bytes that the caller promised were
-                        // ok, just typed as `MaybeUninit`s instead of as `T`s.
-                        // The `if` condition above ensures that we're not violating
-                        // alignment requirements, and that the division is exact so
-                        // that we don't lose any bytes off the end.
-                        return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) };
-                    }
-                };
-            }
-
-            // Split up the slice into small power-of-two-sized chunks that LLVM is able
-            // to vectorize (unless it's a special type with more-than-pointer alignment,
-            // because we don't want to pessimize things like slices of SIMD vectors.)
-            if mem::align_of::<T>() <= mem::size_of::<usize>()
-                && (!mem::size_of::<T>().is_power_of_two()
-                    || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
-            {
-                attempt_swap_as_chunks!(usize);
-                attempt_swap_as_chunks!(u8);
-            }
-
-            // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
-        }
-    )
-}
-
-/// Same behavior and safety conditions as [`swap_nonoverlapping`]
-///
-/// LLVM can vectorize this (at least it can for the power-of-two-sized types
-/// `swap_nonoverlapping` tries to use) so no need to manually SIMD it.
-#[inline]
-const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, count: usize) {
-    let x = x.cast::<MaybeUninit<T>>();
-    let y = y.cast::<MaybeUninit<T>>();
-    let mut i = 0;
-    while i < count {
-        // SAFETY: By precondition, `i` is in-bounds because it's below `n`
-        let x = unsafe { x.add(i) };
-        // SAFETY: By precondition, `i` is in-bounds because it's below `n`
-        // and it's distinct from `x` since the ranges are non-overlapping
-        let y = unsafe { y.add(i) };
-
-        // If we end up here, it's because we're using a simple type -- like
-        // a small power-of-two-sized thing -- or a special type with particularly
-        // large alignment, particularly SIMD types.
-        // Thus, we're fine just reading-and-writing it, as either it's small
-        // and that works well anyway or it's special and the type's author
-        // presumably wanted things to be done in the larger chunk.
-
-        // SAFETY: we're only ever given pointers that are valid to read/write,
-        // including being aligned, and nothing here panics so it's drop-safe.
-        unsafe {
-            let a: MaybeUninit<T> = read(x);
-            let b: MaybeUninit<T> = read(y);
-            write(x, b);
-            write(y, a);
-        }
-
-        i += 1;
-    }
+    // SAFETY: Same preconditions as this function
+    unsafe { crate::swapping::swap_nonoverlapping(x, y, count) }
 }

 /// Moves `src` into the pointed `dst`, returning the previous `dst` value.
diff --git a/library/core/src/swapping.rs b/library/core/src/swapping.rs
new file mode 100644
index 0000000000000..25c6b0c6bf5d2
--- /dev/null
+++ b/library/core/src/swapping.rs
@@ -0,0 +1,182 @@
+use crate::{hint, intrinsics, mem, ptr};
+
+//#[rustc_const_stable_indirect]
+//#[rustc_allow_const_fn_unstable(const_eval_select)]
+#[rustc_const_unstable(feature = "const_swap_nonoverlapping", issue = "133668")]
+#[inline]
+pub(crate) const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    intrinsics::const_eval_select!(
+        @capture[T] { x: *mut T, y: *mut T, count: usize }:
+        if const {
+            // At compile-time we want to always copy this in chunks of `T`, to ensure that if there
+            // are pointers inside `T` we will copy them in one go rather than trying to copy a part
+            // of a pointer (which would not work).
+            // SAFETY: Same preconditions as this function
+            unsafe { swap_nonoverlapping_const(x, y, count) }
+        } else {
+            // At runtime we want to make sure not to swap byte-for-byte for types like [u8; 15],
+            // and swapping as `MaybeUninit<T>` doesn't actually work as untyped for things like
+            // T = (u16, u8), so we type-erase to raw bytes and swap that way.
+            // SAFETY: Same preconditions as this function
+            unsafe { swap_nonoverlapping_runtime(x, y, count) }
+        }
+    )
+}
+
+/// Same behavior and safety conditions as [`swap_nonoverlapping`]
+#[rustc_const_stable_indirect]
+#[inline]
+const unsafe fn swap_nonoverlapping_const<T>(x: *mut T, y: *mut T, count: usize) {
+    let x = x.cast::<mem::MaybeUninit<T>>();
+    let y = y.cast::<mem::MaybeUninit<T>>();
+    let mut i = 0;
+    while i < count {
+        // SAFETY: By precondition, `i` is in-bounds because it's below `n`
+        // and because the two input ranges are non-overlapping and read/writeable,
+        // these individual items inside them are too.
+        unsafe {
+            intrinsics::untyped_swap_nonoverlapping::<T>(x.add(i), y.add(i));
+        }
+
+        i += 1;
+    }
+}
+
+// Scale the monomorphizations with the size of the machine, roughly.
+const MAX_ALIGN: usize = align_of::<usize>().pow(2);
+
+/// Same behavior and safety conditions as [`swap_nonoverlapping`]
+#[inline]
+unsafe fn swap_nonoverlapping_runtime<T>(x: *mut T, y: *mut T, count: usize) {
+    let bytes = {
+        let slice = ptr::slice_from_raw_parts(x, count);
+        // SAFETY: Because they both exist in memory and don't overlap, they
+        // must be legal slice sizes (below `isize::MAX` bytes).
+        unsafe { mem::size_of_val_raw(slice) }
+    };
+
+    // Generating *untyped* loops for every type is silly, so we polymorphize away
+    // the actual type, but we want to take advantage of alignment if possible,
+    // so monomorphize for a restricted set of possible alignments.
+    macro_rules! delegate_by_alignment {
+        ($($p:pat => $align:expr,)+) => {{
+            #![allow(unreachable_patterns)]
+            match const { align_of::<T>() } {
+                $(
+                    $p => {
+                        swap_nonoverlapping_bytes::<$align>(x.cast(), y.cast(), bytes);
+                    }
+                )+
+            }
+        }};
+    }
+
+    // SAFETY:
+    unsafe {
+        delegate_by_alignment! {
+            MAX_ALIGN.. => MAX_ALIGN,
+            64.. => 64,
+            32.. => 32,
+            16.. => 16,
+            8.. => 8,
+            4.. => 4,
+            2.. => 2,
+            _ => 1,
+        }
+    }
+}
+
+/// # Safety:
+/// - `x` and `y` must be aligned to `ALIGN`
+/// - `bytes` must be a multiple of `ALIGN`
+/// - They must be readable, writable, and non-overlapping for `bytes` bytes
+#[inline]
+unsafe fn swap_nonoverlapping_bytes<const ALIGN: usize>(
+    x: *mut mem::MaybeUninit<u8>,
+    y: *mut mem::MaybeUninit<u8>,
+    bytes: usize,
+) {
+    // SAFETY: Two legal non-overlapping regions can't be bigger than this.
+    // (And they couldn't have made allocations any bigger either anyway.)
+    // FIXME: Would be nice to have a type for this instead of the assume.
+    unsafe { hint::assert_unchecked(bytes < isize::MAX as usize) };
+
+    let mut i = 0;
+    macro_rules! swap_next_n {
+        ($n:expr) => {{
+            let x: *mut mem::MaybeUninit<[u8; $n]> = x.add(i).cast();
+            let y: *mut mem::MaybeUninit<[u8; $n]> = y.add(i).cast();
+            swap_nonoverlapping_aligned_chunk::<ALIGN, _>(
+                x.as_mut_unchecked(),
+                y.as_mut_unchecked(),
+            );
+            i += $n;
+        }};
+    }
+
+    while bytes - i >= MAX_ALIGN {
+        const { assert!(MAX_ALIGN >= ALIGN) };
+        // SAFETY: the const-assert above confirms we're only ever called with
+        // an alignment equal to or smaller than max align, so this is necessarily
+        // aligned, and the while loop ensures there's enough read/write memory.
+        unsafe {
+            swap_next_n!(MAX_ALIGN);
+        }
+    }
+
+    macro_rules! handle_tail {
+        ($($n:literal)+) => {$(
+            if const { $n % ALIGN == 0 } {
+                // Checking this way simplifies the block end to just add+test,
+                // rather than needing extra math before the check.
+                if (bytes & $n) != 0 {
+                    // SAFETY: The above swaps were bigger, so could not have
+                    // impacted the `$n`-relevant bit, so checking `bytes & $n`
+                    // was equivalent to `bytes - i >= $n`, and thus we have
+                    // enough space left to swap another `$n` bytes.
+                    unsafe {
+                        swap_next_n!($n);
+                    }
+                }
+            }
+        )+};
+    }
+    const { assert!(MAX_ALIGN <= 64) };
+    handle_tail!(32 16 8 4 2 1);
+
+    debug_assert_eq!(i, bytes);
+}
+
+/// Swaps the `C` behind `x` and `y` as untyped memory
+///
+/// # Safety
+///
+/// Both `x` and `y` must be aligned to `ALIGN`, in addition to their normal alignment.
+/// They must be readable and writeable for `sizeof(C)` bytes, as usual for `&mut`s.
+///
+/// (The actual instantiations are usually `C = [u8; _]`, so we get the alignment
+/// information from the loads by `assume`ing the passed-in alignment.)
+// Don't let MIR inline this, because we really want it to keep its noalias metadata
+#[rustc_no_mir_inline]
+#[inline]
+unsafe fn swap_nonoverlapping_aligned_chunk<const ALIGN: usize, C>(
+    x: &mut mem::MaybeUninit<C>,
+    y: &mut mem::MaybeUninit<C>,
+) {
+    assert!(size_of::<C>() % ALIGN == 0);
+
+    let x = ptr::from_mut(x);
+    let y = ptr::from_mut(y);
+
+    // SAFETY: One of our preconditions.
+    unsafe {
+        hint::assert_unchecked(x.is_aligned_to(ALIGN));
+        hint::assert_unchecked(y.is_aligned_to(ALIGN));
+    }
+
+    // SAFETY: The memory is readable and writable because these were passed to
+    // us as mutable references, and the untyped swap doesn't need validity.
+    unsafe {
+        intrinsics::untyped_swap_nonoverlapping::<C>(x, y);
+    }
+}
diff --git a/library/core/tests/ptr.rs b/library/core/tests/ptr.rs
index e6825d8e39e2c..58d023bbaef1a 100644
--- a/library/core/tests/ptr.rs
+++ b/library/core/tests/ptr.rs
@@ -992,3 +992,32 @@ fn test_ptr_metadata_in_const() {
     assert_eq!(SLICE_META, 3);
     assert_eq!(DYN_META.size_of(), 42);
 }
+
+// See
+#[test]
+fn test_ptr_swap_nonoverlapping_swaps_padding() {
+    #[repr(C)]
+    struct Foo(usize, u8);
+
+    let buf1: [usize; 2] = [1000, 2000];
+    let buf2: [usize; 2] = [3000, 4000];
+
+    // Foo and [usize; 2] have the same size and alignment,
+    // so swap_nonoverlapping should treat them the same
+    assert_eq!(size_of::<Foo>(), size_of::<[usize; 2]>());
+    assert_eq!(align_of::<Foo>(), align_of::<[usize; 2]>());
+
+    let mut b1 = buf1;
+    let mut b2 = buf2;
+    // Safety: b1 and b2 are distinct local variables,
+    // with the same size and alignment as Foo.
+    unsafe {
+        std::ptr::swap_nonoverlapping(
+            b1.as_mut_ptr().cast::<Foo>(),
+            b2.as_mut_ptr().cast::<Foo>(),
+            1,
+        );
+    }
+    assert_eq!(b1, buf2);
+    assert_eq!(b2, buf1);
+}
diff --git a/tests/assembly/x86_64-typed-swap.rs b/tests/assembly/x86_64-typed-swap.rs
index 95e87519e6c4b..75c28724e03b7 100644
--- a/tests/assembly/x86_64-typed-swap.rs
+++ b/tests/assembly/x86_64-typed-swap.rs
@@ -51,3 +51,35 @@ pub fn swap_simd(x: &mut __m128, y: &mut __m128) {
     // CHECK: retq
     swap(x, y)
 }
+
+// CHECK-LABEL: swap_string:
+#[no_mangle]
+pub fn swap_string(x: &mut String, y: &mut String) {
+    // CHECK: movups (%[[ARG1]]), %[[T1a:xmm.]]
+    // CHECK: movups (%[[ARG2]]), %[[T2a:xmm.]]
+    // CHECK: movups %[[T2a]], (%[[ARG1]])
+    // CHECK: movups %[[T1a]], (%[[ARG2]])
+    // CHECK: movq 16(%[[ARG1]]), %[[T1b:r.+]]
+    // CHECK: movq 16(%[[ARG2]]), %[[T2b:r.+]]
+    // CHECK: movq %[[T2b]], 16(%[[ARG1]])
+    // CHECK: movq %[[T1b]], 16(%[[ARG2]])
+    // CHECK: retq
+    swap(x, y)
+}
+
+// CHECK-LABEL: swap_44_bytes:
+#[no_mangle]
+pub fn swap_44_bytes(x: &mut [u8; 44], y: &mut [u8; 44]) {
+    // Ensure we do better than a long run of byte copies,
+    // see
+
+    // CHECK-NOT: movb
+    // CHECK-COUNT-8: movups{{.+}}xmm
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movl
+    // CHECK-NOT: movb
+    // CHECK: retq
+    swap(x, y)
+}
diff --git a/tests/codegen/simd/swap-simd-types.rs b/tests/codegen/simd/swap-simd-types.rs
index cd6e84286e1c9..4c72e13df3443 100644
--- a/tests/codegen/simd/swap-simd-types.rs
+++ b/tests/codegen/simd/swap-simd-types.rs
@@ -23,8 +23,19 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
 #[no_mangle]
 pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
     // CHECK-NOT: alloca
-    // CHECK: load <8 x float>{{.+}}align 32
-    // CHECK: store <8 x float>{{.+}}align 32
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
+
+    // CHECK: [[A:%.+]] = load i256{{.+}}align 32
+    // CHECK: [[B:%.+]] = load i256{{.+}}align 32
+    // CHECK: store i256 [[B]]{{.+}}align 32
+    // CHECK: store i256 [[A]]{{.+}}align 32
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -34,7 +45,18 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
 #[no_mangle]
 pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <32 x i8>{{.+}}align 1
-    // CHECK: store <32 x i8>{{.+}}align 1
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
+
+    // CHECK: [[A:%.+]] = load i256{{.+}}align 1
+    // CHECK: [[B:%.+]] = load i256{{.+}}align 1
+    // CHECK: store i256 [[B]]{{.+}}align 1
+    // CHECK: store i256 [[A]]{{.+}}align 1
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
     swap(x, y)
 }
diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs
index 761d48969dad9..a1cc6bd1f8d0c 100644
--- a/tests/codegen/swap-large-types.rs
+++ b/tests/codegen/swap-large-types.rs
@@ -33,35 +33,42 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
 #[no_mangle]
 pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-COUNT-2: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-COUNT-2: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-COUNT-2: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i64{{.+}}align 8
+    // CHECK-COUNT-2: store i64{{.+}}align 8
     swap(x, y)
 }

-// Verify that types with usize alignment are swapped via vectored usizes,
-// not falling back to byte-level code.
-
-// CHECK-LABEL: @swap_slice
-#[no_mangle]
-pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
-    // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
-    if x.len() == y.len() {
-        x.swap_with_slice(y);
-    }
-}
-
-// But for a large align-1 type, vectorized byte copying is what we want.
-
 type OneKilobyteBuffer = [u8; 1024];

 // CHECK-LABEL: @swap_1kb_slices
 #[no_mangle]
 pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // These are so big that there's only the biggest chunk size used.
+
+    // CHECK-NOT: load i256
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: load i8
+
+    // CHECK-COUNT-2: load i512{{.+}}align 1
+    // CHECK-COUNT-2: store i512{{.+}}align 1
+
+    // CHECK-NOT: store i256
+    // CHECK-NOT: store i128
+    // CHECK-NOT: store i64
+    // CHECK-NOT: store i32
+    // CHECK-NOT: store i16
+    // CHECK-NOT: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -81,10 +88,12 @@ pub struct BigButHighlyAligned([u8; 64 * 3]);

 // CHECK-LABEL: @swap_big_aligned
 #[no_mangle]
 pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
-    // CHECK-NOT: call void @llvm.memcpy
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK-NOT: alloca
+    // CHECK-COUNT-2: load i512{{.+}}align 64
+    // CHECK-COUNT-2: store i512{{.+}}align 64
+    // CHECK-COUNT-2: load i512{{.+}}align 64
+    // CHECK-COUNT-2: store i512{{.+}}align 64
+    // CHECK-COUNT-2: load i512{{.+}}align 64
+    // CHECK-COUNT-2: store i512{{.+}}align 64
     swap(x, y)
 }
diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs
index 1a48c63d8139f..30e81fd4ef04e 100644
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@@ -27,13 +27,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
     // CHECK-NOT: alloca

-    // Whether `i8` is the best for this is unclear, but
-    // might as well record what's actually happening right now.
-
-    // CHECK: load i8
-    // CHECK: load i8
-    // CHECK: store i8
-    // CHECK: store i8
+    // Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
+    // and is closer to the assembly it generates anyway.
+
+    // CHECK-NOT: load
+    // CHECK: load i32{{.+}}align 2
+    // CHECK-NEXT: load i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK: load i16{{.+}}align 2
+    // CHECK-NEXT: load i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NOT: store
     swap(x, y)
 }

@@ -54,19 +60,27 @@ pub fn swap_rgba64(x: &mut RGBA64, y: &mut RGBA64) {
 #[no_mangle]
 pub fn swap_vecs(x: &mut Vec<u32>, y: &mut Vec<u32>) {
     // CHECK-NOT: alloca
-    // There are plenty more loads and stores than just these,
-    // but at least one sure better be 64-bit (for size or capacity).
-    // CHECK: load i64
+
+    // CHECK-NOT: load
+    // CHECK: load i128
+    // CHECK-NEXT: load i128
+    // CHECK-NEXT: store i128
+    // CHECK-NEXT: store i128
     // CHECK: load i64
-    // CHECK: store i64
-    // CHECK: store i64
-    // CHECK: ret void
+    // CHECK-NEXT: load i64
+    // CHECK-NEXT: store i64
+    // CHECK-NEXT: store i64
+    // CHECK-NOT: store
     swap(x, y)
 }

 // CHECK-LABEL: @swap_slices
 #[no_mangle]
 pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
+    // Note that separate loads here are fine, as they merge to `movups` anyway
+    // at the assembly level, so staying more obviously typed and as a scalar
+    // pair -- like they're used elsewhere -- is ok, no need to force `i128`.
+
     // CHECK-NOT: alloca
     // CHECK: load ptr
     // CHECK: load i64
@@ -76,45 +90,84 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
     swap(x, y)
 }

-// LLVM doesn't vectorize a loop over 3-byte elements,
-// so we chunk it down to bytes and loop over those instead.
 type RGB24 = [u8; 3];

 // CHECK-LABEL: @swap_rgb24_slices
 #[no_mangle]
 pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // The odd size means we need the full set.
+
+    // CHECK-COUNT-2: load i512{{.+}}align 1
+    // CHECK-NEXT: store i512{{.+}}align 1
+    // CHECK-COUNT-2: load i256{{.+}}align 1
+    // CHECK-NEXT: store i256{{.+}}align 1
+    // CHECK-COUNT-2: load i128{{.+}}align 1
+    // CHECK-NEXT: store i128{{.+}}align 1
+    // CHECK-COUNT-2: load i64{{.+}}align 1
+    // CHECK-NEXT: store i64{{.+}}align 1
+    // CHECK-COUNT-2: load i32{{.+}}align 1
+    // CHECK-NEXT: store i32{{.+}}align 1
+    // CHECK-COUNT-2: load i16{{.+}}align 1
+    // CHECK-NEXT: store i16{{.+}}align 1
+    // CHECK-COUNT-2: load i8{{.+}}align 1
+    // CHECK-NEXT: store i8{{.+}}align 1
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }

-// This one has a power-of-two size, so we iterate over it directly
 type RGBA32 = [u8; 4];

 // CHECK-LABEL: @swap_rgba32_slices
 #[no_mangle]
 pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i32>
-    // CHECK: store <{{[0-9]+}} x i32>
+
+    // Because the size in bytes is a multiple of 4, we can skip the smallest sizes.
+
+    // CHECK-COUNT-2: load i512{{.+}}align 1
+    // CHECK-NEXT: store i512{{.+}}align 1
+    // CHECK-COUNT-2: load i256{{.+}}align 1
+    // CHECK-NEXT: store i256{{.+}}align 1
+    // CHECK-COUNT-2: load i128{{.+}}align 1
+    // CHECK-NEXT: store i128{{.+}}align 1
+    // CHECK-COUNT-2: load i64{{.+}}align 1
+    // CHECK-NEXT: store i64{{.+}}align 1
+    // CHECK-COUNT-2: load i32{{.+}}align 1
+    // CHECK-NEXT: store i32{{.+}}align 1
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }

-// Strings have a non-power-of-two size, but have pointer alignment,
-// so we swap usizes instead of dropping all the way down to bytes.
+// Strings have a non-power-of-two size, but have pointer alignment.
 const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());

 // CHECK-LABEL: @swap_string_slices
 #[no_mangle]
 pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-NEXT: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i256{{.+}}align 8
+    // CHECK-NEXT: store i256{{.+}}align 8
+    // CHECK-COUNT-2: load i128{{.+}}align 8
+    // CHECK-NEXT: store i128{{.+}}align 8
+    // CHECK-COUNT-2: load i64{{.+}}align 8
+    // CHECK-NEXT: store i64{{.+}}align 8
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -130,6 +183,9 @@ pub struct Packed {
 #[no_mangle]
 pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
     // CHECK-NOT: alloca
-    // CHECK: ret void
+    // CHECK-COUNT-2: load i64{{.+}}align 1
+    // CHECK-COUNT-2: store i64{{.+}}align 1
+    // CHECK-COUNT-2: load i8{{.+}}align 1
+    // CHECK-COUNT-2: store i8{{.+}}align 1
     swap(x, y)
 }
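
A note on the padding behaviour that the intrinsic's fallback body and the new test_ptr_swap_nonoverlapping_swaps_padding test are about: the swap is done as untyped copies of whole MaybeUninit<T>s, so padding bytes travel with the value instead of being legally dropped by typed reads and writes. The following standalone sketch is not part of the patch (untyped_swap_demo is a made-up name for illustration); it just mirrors the fallback on current Rust:

    use std::mem::MaybeUninit;

    // Mirrors the intrinsic's fallback: three untyped, non-overlapping copies
    // through a scratch MaybeUninit<T>, so padding bytes are carried along.
    unsafe fn untyped_swap_demo<T>(x: *mut MaybeUninit<T>, y: *mut MaybeUninit<T>) {
        let mut temp = MaybeUninit::<T>::uninit();
        // SAFETY: the caller guarantees x and y are valid, aligned, and
        // non-overlapping; `temp` is a fresh local, so it overlaps neither.
        unsafe {
            (&raw mut temp).copy_from_nonoverlapping(x, 1);
            x.copy_from_nonoverlapping(y, 1);
            y.copy_from_nonoverlapping(&raw const temp, 1);
        }
    }

    fn main() {
        // (u16, u8) has one padding byte, the case called out in the comments above.
        let mut a: (u16, u8) = (1, 2);
        let mut b: (u16, u8) = (3, 4);
        // SAFETY: distinct locals, so the pointers are valid and non-overlapping.
        unsafe {
            untyped_swap_demo(
                (&raw mut a).cast::<MaybeUninit<(u16, u8)>>(),
                (&raw mut b).cast::<MaybeUninit<(u16, u8)>>(),
            );
        }
        assert_eq!((a, b), ((3, 4), (1, 2)));
    }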
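
The delegate_by_alignment! macro in swap_nonoverlapping_runtime exists so the byte-level loop is monomorphized once per alignment class rather than once per T. A rough mirror of that dispatch as ordinary code follows; it is a hypothetical helper, not in the patch, and assumes the 64-bit value of MAX_ALIGN:

    use std::mem::align_of;

    // Maps a type's alignment to the const ALIGN that the byte-swapping loop
    // (swap_nonoverlapping_bytes::<ALIGN> in the patch) would be called with.
    fn chosen_align(type_align: usize) -> usize {
        const MAX_ALIGN: usize = 64; // align_of::<usize>().pow(2) on a 64-bit target
        match type_align {
            a if a >= MAX_ALIGN => MAX_ALIGN,
            a if a >= 32 => 32,
            a if a >= 16 => 16,
            a if a >= 8 => 8,
            a if a >= 4 => 4,
            a if a >= 2 => 2,
            _ => 1,
        }
    }

    fn main() {
        assert_eq!(chosen_align(align_of::<[u8; 3]>()), 1); // RGB24 slices stay align-1
        assert_eq!(chosen_align(align_of::<usize>()), 8);   // String/Vec-sized data uses word alignment
        assert_eq!(chosen_align(64), 64);                   // BigButHighlyAligned caps at MAX_ALIGN
    }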
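
The `while bytes - i >= MAX_ALIGN` loop plus handle_tail! decompose the byte count into a run of MAX_ALIGN-byte chunks followed by a strictly decreasing power-of-two tail, which is exactly what the updated FileCheck patterns count. A small sketch of that arithmetic (hypothetical helper, not in the patch; again assumes MAX_ALIGN == 64):

    // Lists the chunk sizes the runtime path would use for a `bytes`-byte swap,
    // assuming MAX_ALIGN == 64 and that ALIGN divides every chunk emitted.
    fn chunk_sizes(mut bytes: usize) -> Vec<usize> {
        const MAX_ALIGN: usize = 64;
        let mut out = Vec::new();
        while bytes >= MAX_ALIGN {
            out.push(MAX_ALIGN);
            bytes -= MAX_ALIGN;
        }
        // Fewer than 64 bytes remain, so the set bits of the remainder say
        // exactly which power-of-two tail chunks are still needed.
        for n in [32, 16, 8, 4, 2, 1] {
            if bytes & n != 0 {
                out.push(n);
            }
        }
        out
    }

    fn main() {
        assert_eq!(chunk_sizes(44), [32, 8, 4]);    // [u8; 44], as in the swap_44_bytes assembly test
        assert_eq!(chunk_sizes(192), [64, 64, 64]); // BigButHighlyAligned([u8; 64 * 3])
        assert_eq!(chunk_sizes(24), [16, 8]);       // String on 64-bit: movups + movq, as in swap_string
    }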