diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index cabcfc9b42b4e..f2b211a200727 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -498,6 +498,23 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             }
         }

+        sym::untyped_swap_nonoverlapping => {
+            // The fallback impl uses memcpy, which leaves around allocas
+            // that don't optimize out for certain widths, so force it to
+            // use SSA registers instead.
+
+            let chunk_ty = fn_args.type_at(0);
+            let layout = self.layout_of(chunk_ty).layout;
+            let integer_ty = self.type_ix(layout.size().bits());
+            let a = args[0].immediate();
+            let b = args[1].immediate();
+            let a_val = self.load(integer_ty, a, layout.align().abi);
+            let b_val = self.load(integer_ty, b, layout.align().abi);
+            self.store(b_val, a, layout.align().abi);
+            self.store(a_val, b, layout.align().abi);
+            return Ok(());
+        }
+
         sym::compare_bytes => {
             // Here we assume that the `memcmp` provided by the target is a NOP for size 0.
             let cmp = self.call_intrinsic("memcmp", &[
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index fd78bf3e8fc60..ce884c4fbf4b5 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -504,6 +504,12 @@ pub fn check_intrinsic_type(
         sym::typed_swap_nonoverlapping => {
             (1, 0, vec![Ty::new_mut_ptr(tcx, param(0)); 2], tcx.types.unit)
         }
+        sym::untyped_swap_nonoverlapping => (
+            1,
+            0,
+            vec![Ty::new_mut_ptr(tcx, Ty::new_maybe_uninit(tcx, param(0))); 2],
+            tcx.types.unit,
+        ),

         sym::discriminant_value => {
             let assoc_items = tcx.associated_item_def_ids(
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index 4ecc4201f89d5..f580a104c28a6 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -2142,6 +2142,7 @@ symbols! {
                     unstable location; did you mean to load this crate \
                     from crates.io via `Cargo.toml` instead?",
         untagged_unions,
+        untyped_swap_nonoverlapping,
         unused_imports,
         unwind,
         unwind_attributes,
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index b5c31d824677d..922116625a0fa 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -66,7 +66,7 @@ use crate::marker::{DiscriminantKind, Tuple};
 use crate::mem::SizedTypeProperties;
-use crate::{ptr, ub_checks};
+use crate::{mem, ptr, ub_checks};

 pub mod fallback;
 pub mod mir;
@@ -4003,7 +4003,37 @@ pub use typed_swap as typed_swap_nonoverlapping;
 pub const unsafe fn typed_swap_nonoverlapping<T>(x: *mut T, y: *mut T) {
     // SAFETY: The caller provided single non-overlapping items behind
     // pointers, so swapping them with `count: 1` is fine.
-    unsafe { ptr::swap_nonoverlapping(x, y, 1) };
+    unsafe { crate::swapping::swap_nonoverlapping(x, y, 1) };
+}
+
+/// Swaps the `N` untyped & non-overlapping bytes behind the two pointers.
+///
+/// Split out from `typed_swap` for the internal swaps in `swap_nonoverlapping`
+/// which would otherwise cause cycles between the fallback implementations on
+/// backends where neither is overridden.
+///
+/// # Safety
+///
+/// `x` and `y` are readable and writable as `MaybeUninit<T>` and non-overlapping.
+#[inline]
+#[rustc_nounwind]
+#[cfg_attr(not(bootstrap), rustc_intrinsic)]
+#[miri::intrinsic_fallback_is_spec]
+#[rustc_const_stable_indirect]
+pub const unsafe fn untyped_swap_nonoverlapping<T>(
+    x: *mut mem::MaybeUninit<T>,
+    y: *mut mem::MaybeUninit<T>,
+) {
+    // This intentionally uses untyped memory copies, not reads/writes,
+    // to avoid any risk of losing padding in things like (u16, u8).
+    let mut temp = mem::MaybeUninit::<T>::uninit();
+    // SAFETY: Caller promised that x and y are non-overlapping & read/writeable,
+    // and our fresh local is always disjoint from anything otherwise readable.
+    unsafe {
+        (&raw mut temp).copy_from_nonoverlapping(x, 1);
+        x.copy_from_nonoverlapping(y, 1);
+        y.copy_from_nonoverlapping(&raw const temp, 1);
+    }
 }

 /// Returns whether we should perform some UB-checking at runtime. This eventually evaluates to
diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
index 0d8a3811eded1..88eac4f4cc21b 100644
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@@ -376,6 +376,7 @@ pub mod alloc;
 // note: does not need to be public
 mod bool;
 mod escape;
+pub(crate) mod swapping;
 mod tuple;
 mod unit;

diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index ac074c097d94c..b285d6a4a78de 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -395,7 +395,6 @@
 #![allow(clippy::not_unsafe_ptr_arg_deref)]
 use crate::cmp::Ordering;
-use crate::intrinsics::const_eval_select;
 use crate::marker::FnPtr;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
 use crate::{fmt, hash, intrinsics, ub_checks};
@@ -1092,84 +1091,8 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
         }
     );

-    const_eval_select!(
-        @capture[T] { x: *mut T, y: *mut T, count: usize }:
-        if const {
-            // At compile-time we want to always copy this in chunks of `T`, to ensure that if there
-            // are pointers inside `T` we will copy them in one go rather than trying to copy a part
-            // of a pointer (which would not work).
-            // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
-        } else {
-            macro_rules! attempt_swap_as_chunks {
-                ($ChunkTy:ty) => {
-                    if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
-                        && mem::size_of::<T>() % mem::size_of::<$ChunkTy>() == 0
-                    {
-                        let x: *mut $ChunkTy = x.cast();
-                        let y: *mut $ChunkTy = y.cast();
-                        let count = count * (mem::size_of::<T>() / mem::size_of::<$ChunkTy>());
-                        // SAFETY: these are the same bytes that the caller promised were
-                        // ok, just typed as `MaybeUninit`s instead of as `T`s.
-                        // The `if` condition above ensures that we're not violating
-                        // alignment requirements, and that the division is exact so
-                        // that we don't lose any bytes off the end.
-                        return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) };
-                    }
-                };
-            }
-
-            // Split up the slice into small power-of-two-sized chunks that LLVM is able
-            // to vectorize (unless it's a special type with more-than-pointer alignment,
-            // because we don't want to pessimize things like slices of SIMD vectors.)
-            if mem::align_of::<T>() <= mem::size_of::<usize>()
-                && (!mem::size_of::<T>().is_power_of_two()
-                    || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
-            {
-                attempt_swap_as_chunks!(usize);
-                attempt_swap_as_chunks!(u8);
-            }
-
-            // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
-        }
-    )
-}
-
-/// Same behavior and safety conditions as [`swap_nonoverlapping`]
-///
-/// LLVM can vectorize this (at least it can for the power-of-two-sized types
-/// `swap_nonoverlapping` tries to use) so no need to manually SIMD it.
-#[inline]
-const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, count: usize) {
-    let x = x.cast::<MaybeUninit<T>>();
-    let y = y.cast::<MaybeUninit<T>>();
-    let mut i = 0;
-    while i < count {
-        // SAFETY: By precondition, `i` is in-bounds because it's below `n`
-        let x = unsafe { x.add(i) };
-        // SAFETY: By precondition, `i` is in-bounds because it's below `n`
-        // and it's distinct from `x` since the ranges are non-overlapping
-        let y = unsafe { y.add(i) };
-
-        // If we end up here, it's because we're using a simple type -- like
-        // a small power-of-two-sized thing -- or a special type with particularly
-        // large alignment, particularly SIMD types.
-        // Thus, we're fine just reading-and-writing it, as either it's small
-        // and that works well anyway or it's special and the type's author
-        // presumably wanted things to be done in the larger chunk.
-
-        // SAFETY: we're only ever given pointers that are valid to read/write,
-        // including being aligned, and nothing here panics so it's drop-safe.
-        unsafe {
-            let a: MaybeUninit<T> = read(x);
-            let b: MaybeUninit<T> = read(y);
-            write(x, b);
-            write(y, a);
-        }
-
-        i += 1;
-    }
+    // SAFETY: Same preconditions as this function
+    unsafe { crate::swapping::swap_nonoverlapping(x, y, count) }
 }

 /// Moves `src` into the pointed `dst`, returning the previous `dst` value.
diff --git a/library/core/src/swapping.rs b/library/core/src/swapping.rs
new file mode 100644
index 0000000000000..25c6b0c6bf5d2
--- /dev/null
+++ b/library/core/src/swapping.rs
@@ -0,0 +1,182 @@
+use crate::{hint, intrinsics, mem, ptr};
+
+//#[rustc_const_stable_indirect]
+//#[rustc_allow_const_fn_unstable(const_eval_select)]
+#[rustc_const_unstable(feature = "const_swap_nonoverlapping", issue = "133668")]
+#[inline]
+pub(crate) const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    intrinsics::const_eval_select!(
+        @capture[T] { x: *mut T, y: *mut T, count: usize }:
+        if const {
+            // At compile-time we want to always copy this in chunks of `T`, to ensure that if there
+            // are pointers inside `T` we will copy them in one go rather than trying to copy a part
+            // of a pointer (which would not work).
+            // SAFETY: Same preconditions as this function
+            unsafe { swap_nonoverlapping_const(x, y, count) }
+        } else {
+            // At runtime we want to make sure not to swap byte-for-byte for types like [u8; 15],
+            // and swapping as `MaybeUninit<T>` doesn't actually work as untyped for things like
+            // T = (u16, u8), so we type-erase to raw bytes and swap that way.
+            // SAFETY: Same preconditions as this function
+            unsafe { swap_nonoverlapping_runtime(x, y, count) }
+        }
+    )
+}
+
+/// Same behavior and safety conditions as [`swap_nonoverlapping`]
+#[rustc_const_stable_indirect]
+#[inline]
+const unsafe fn swap_nonoverlapping_const<T>(x: *mut T, y: *mut T, count: usize) {
+    let x = x.cast::<mem::MaybeUninit<T>>();
+    let y = y.cast::<mem::MaybeUninit<T>>();
+    let mut i = 0;
+    while i < count {
+        // SAFETY: By precondition, `i` is in-bounds because it's below `n`
+        // and because the two input ranges are non-overlapping and read/writeable,
+        // these individual items inside them are too.
+        unsafe {
+            intrinsics::untyped_swap_nonoverlapping::<T>(x.add(i), y.add(i));
+        }
+
+        i += 1;
+    }
+}
+
+// Scale the monomorphizations with the size of the machine, roughly.
+const MAX_ALIGN: usize = align_of::<usize>().pow(2);
+
+/// Same behavior and safety conditions as [`swap_nonoverlapping`]
+#[inline]
+unsafe fn swap_nonoverlapping_runtime<T>(x: *mut T, y: *mut T, count: usize) {
+    let bytes = {
+        let slice = ptr::slice_from_raw_parts(x, count);
+        // SAFETY: Because they both exist in memory and don't overlap, they
+        // must be legal slice sizes (below `isize::MAX` bytes).
+        unsafe { mem::size_of_val_raw(slice) }
+    };
+
+    // Generating *untyped* loops for every type is silly, so we polymorphize away
+    // the actual type, but we want to take advantage of alignment if possible,
+    // so monomorphize for a restricted set of possible alignments.
+    macro_rules! delegate_by_alignment {
+        ($($p:pat => $align:expr,)+) => {{
+            #![allow(unreachable_patterns)]
+            match const { align_of::<T>() } {
+                $(
+                    $p => {
+                        swap_nonoverlapping_bytes::<$align>(x.cast(), y.cast(), bytes);
+                    }
+                )+
+            }
+        }};
+    }
+
+    // SAFETY:
+    unsafe {
+        delegate_by_alignment! {
+            MAX_ALIGN.. => MAX_ALIGN,
+            64.. => 64,
+            32.. => 32,
+            16.. => 16,
+            8.. => 8,
+            4.. => 4,
+            2.. => 2,
+            _ => 1,
+        }
+    }
+}
+
+/// # Safety:
+/// - `x` and `y` must be aligned to `ALIGN`
+/// - `bytes` must be a multiple of `ALIGN`
+/// - They must be readable, writable, and non-overlapping for `bytes` bytes
+#[inline]
+unsafe fn swap_nonoverlapping_bytes<const ALIGN: usize>(
+    x: *mut mem::MaybeUninit<u8>,
+    y: *mut mem::MaybeUninit<u8>,
+    bytes: usize,
+) {
+    // SAFETY: Two legal non-overlapping regions can't be bigger than this.
+    // (And they couldn't have made allocations any bigger either anyway.)
+    // FIXME: Would be nice to have a type for this instead of the assume.
+    unsafe { hint::assert_unchecked(bytes < isize::MAX as usize) };
+
+    let mut i = 0;
+    macro_rules! swap_next_n {
+        ($n:expr) => {{
+            let x: *mut mem::MaybeUninit<[u8; $n]> = x.add(i).cast();
+            let y: *mut mem::MaybeUninit<[u8; $n]> = y.add(i).cast();
+            swap_nonoverlapping_aligned_chunk::<ALIGN, _>(
+                x.as_mut_unchecked(),
+                y.as_mut_unchecked(),
+            );
+            i += $n;
+        }};
+    }
+
+    while bytes - i >= MAX_ALIGN {
+        const { assert!(MAX_ALIGN >= ALIGN) };
+        // SAFETY: the const-assert above confirms we're only ever called with
+        // an alignment equal to or smaller than max align, so this is necessarily
+        // aligned, and the while loop ensures there's enough read/write memory.
+        unsafe {
+            swap_next_n!(MAX_ALIGN);
+        }
+    }
+
+    macro_rules! handle_tail {
+        ($($n:literal)+) => {$(
+            if const { $n % ALIGN == 0 } {
+                // Checking this way simplifies the block end to just add+test,
+                // rather than needing extra math before the check.
+                if (bytes & $n) != 0 {
+                    // SAFETY: The above swaps were bigger, so could not have
+                    // impacted the `$n`-relevant bit, so checking `bytes & $n`
+                    // was equivalent to `bytes - i >= $n`, and thus we have
+                    // enough space left to swap another `$n` bytes.
+                    unsafe {
+                        swap_next_n!($n);
+                    }
+                }
+            }
+        )+};
+    }
+    const { assert!(MAX_ALIGN <= 64) };
+    handle_tail!(32 16 8 4 2 1);
+
+    debug_assert_eq!(i, bytes);
+}
+
+/// Swaps the `C` behind `x` and `y` as untyped memory
+///
+/// # Safety
+///
+/// Both `x` and `y` must be aligned to `ALIGN`, in addition to their normal alignment.
+/// They must be readable and writeable for `sizeof(C)` bytes, as usual for `&mut`s.
+///
+/// (The actual instantiations are usually `C = [u8; _]`, so we get the alignment
+/// information from the loads by `assume`ing the passed-in alignment.)
+// Don't let MIR inline this, because we really want it to keep its noalias metadata
+#[rustc_no_mir_inline]
+#[inline]
+unsafe fn swap_nonoverlapping_aligned_chunk<const ALIGN: usize, C>(
+    x: &mut mem::MaybeUninit<C>,
+    y: &mut mem::MaybeUninit<C>,
+) {
+    assert!(size_of::<C>() % ALIGN == 0);
+
+    let x = ptr::from_mut(x);
+    let y = ptr::from_mut(y);
+
+    // SAFETY: One of our preconditions.
+    unsafe {
+        hint::assert_unchecked(x.is_aligned_to(ALIGN));
+        hint::assert_unchecked(y.is_aligned_to(ALIGN));
+    }
+
+    // SAFETY: The memory is readable and writable because these were passed to
+    // us as mutable references, and the untyped swap doesn't need validity.
+    unsafe {
+        intrinsics::untyped_swap_nonoverlapping::<C>(x, y);
+    }
+}
diff --git a/library/core/tests/ptr.rs b/library/core/tests/ptr.rs
index e6825d8e39e2c..58d023bbaef1a 100644
--- a/library/core/tests/ptr.rs
+++ b/library/core/tests/ptr.rs
@@ -992,3 +992,32 @@ fn test_ptr_metadata_in_const() {
     assert_eq!(SLICE_META, 3);
     assert_eq!(DYN_META.size_of(), 42);
 }
+
+// See
+#[test]
+fn test_ptr_swap_nonoverlapping_swaps_padding() {
+    #[repr(C)]
+    struct Foo(usize, u8);
+
+    let buf1: [usize; 2] = [1000, 2000];
+    let buf2: [usize; 2] = [3000, 4000];
+
+    // Foo and [usize; 2] have the same size and alignment,
+    // so swap_nonoverlapping should treat them the same
+    assert_eq!(size_of::<Foo>(), size_of::<[usize; 2]>());
+    assert_eq!(align_of::<Foo>(), align_of::<[usize; 2]>());
+
+    let mut b1 = buf1;
+    let mut b2 = buf2;
+    // Safety: b1 and b2 are distinct local variables,
+    // with the same size and alignment as Foo.
+    unsafe {
+        std::ptr::swap_nonoverlapping(
+            b1.as_mut_ptr().cast::<Foo>(),
+            b2.as_mut_ptr().cast::<Foo>(),
+            1,
+        );
+    }
+    assert_eq!(b1, buf2);
+    assert_eq!(b2, buf1);
+}
diff --git a/tests/assembly/x86_64-typed-swap.rs b/tests/assembly/x86_64-typed-swap.rs
index 95e87519e6c4b..75c28724e03b7 100644
--- a/tests/assembly/x86_64-typed-swap.rs
+++ b/tests/assembly/x86_64-typed-swap.rs
@@ -51,3 +51,35 @@ pub fn swap_simd(x: &mut __m128, y: &mut __m128) {
     // CHECK: retq
     swap(x, y)
 }
+
+// CHECK-LABEL: swap_string:
+#[no_mangle]
+pub fn swap_string(x: &mut String, y: &mut String) {
+    // CHECK: movups (%[[ARG1]]), %[[T1a:xmm.]]
+    // CHECK: movups (%[[ARG2]]), %[[T2a:xmm.]]
+    // CHECK: movups %[[T2a]], (%[[ARG1]])
+    // CHECK: movups %[[T1a]], (%[[ARG2]])
+    // CHECK: movq 16(%[[ARG1]]), %[[T1b:r.+]]
+    // CHECK: movq 16(%[[ARG2]]), %[[T2b:r.+]]
+    // CHECK: movq %[[T2b]], 16(%[[ARG1]])
+    // CHECK: movq %[[T1b]], 16(%[[ARG2]])
+    // CHECK: retq
+    swap(x, y)
+}
+
+// CHECK-LABEL: swap_44_bytes:
+#[no_mangle]
+pub fn swap_44_bytes(x: &mut [u8; 44], y: &mut [u8; 44]) {
+    // Ensure we do better than a long run of byte copies,
+    // see
+
+    // CHECK-NOT: movb
+    // CHECK-COUNT-8: movups{{.+}}xmm
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movl
+    // CHECK-NOT: movb
+    // CHECK: retq
+    swap(x, y)
+}
diff --git a/tests/codegen/simd/swap-simd-types.rs b/tests/codegen/simd/swap-simd-types.rs
index cd6e84286e1c9..4c72e13df3443 100644
--- a/tests/codegen/simd/swap-simd-types.rs
+++ b/tests/codegen/simd/swap-simd-types.rs
@@ -23,8 +23,19 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
 #[no_mangle]
 pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
     // CHECK-NOT: alloca
-    // CHECK: load <8 x float>{{.+}}align 32
-    // CHECK: store <8 x float>{{.+}}align 32
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
+
+    // CHECK: [[A:%.+]] = load i256{{.+}}align 32
+    // CHECK: [[B:%.+]] = load i256{{.+}}align 32
+    // CHECK: store i256 [[B]]{{.+}}align 32
+    // CHECK: store i256 [[A]]{{.+}}align 32
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -34,7 +45,18 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
 #[no_mangle]
 pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <32 x i8>{{.+}}align 1
-    // CHECK: store <32 x i8>{{.+}}align 1
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
+
+    // CHECK: [[A:%.+]] = load i256{{.+}}align 1
+    // CHECK: [[B:%.+]] = load i256{{.+}}align 1
+    // CHECK: store i256 [[B]]{{.+}}align 1
+    // CHECK: store i256 [[A]]{{.+}}align 1
+
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
     swap(x, y)
 }
diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs
index 761d48969dad9..a1cc6bd1f8d0c 100644
--- a/tests/codegen/swap-large-types.rs
+++ b/tests/codegen/swap-large-types.rs
@@ -33,35 +33,42 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
 #[no_mangle]
 pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-COUNT-2: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-COUNT-2: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-COUNT-2: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i64{{.+}}align 8
+    // CHECK-COUNT-2: store i64{{.+}}align 8
     swap(x, y)
 }

-// Verify that types with usize alignment are swapped via vectored usizes,
-// not falling back to byte-level code.
-
-// CHECK-LABEL: @swap_slice
-#[no_mangle]
-pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
-    // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
-    if x.len() == y.len() {
-        x.swap_with_slice(y);
-    }
-}
-
-// But for a large align-1 type, vectorized byte copying is what we want.
-
 type OneKilobyteBuffer = [u8; 1024];

 // CHECK-LABEL: @swap_1kb_slices
 #[no_mangle]
 pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // These are so big that there's only the biggest chunk size used.
+
+    // CHECK-NOT: load i256
+    // CHECK-NOT: load i128
+    // CHECK-NOT: load i64
+    // CHECK-NOT: load i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: load i8
+
+    // CHECK-COUNT-2: load i512{{.+}}align 1
+    // CHECK-COUNT-2: store i512{{.+}}align 1
+
+    // CHECK-NOT: store i256
+    // CHECK-NOT: store i128
+    // CHECK-NOT: store i64
+    // CHECK-NOT: store i32
+    // CHECK-NOT: store i16
+    // CHECK-NOT: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -81,10 +88,12 @@ pub struct BigButHighlyAligned([u8; 64 * 3]);

 // CHECK-LABEL: @swap_big_aligned
 #[no_mangle]
 pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
-    // CHECK-NOT: call void @llvm.memcpy
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK-NOT: alloca
+    // CHECK-COUNT-2: load i512{{.+}}align 64
+    // CHECK-COUNT-2: store i512{{.+}}align 64
+    // CHECK-COUNT-2: load i512{{.+}}align 64
+    // CHECK-COUNT-2: store i512{{.+}}align 64
+    // CHECK-COUNT-2: load i512{{.+}}align 64
+    // CHECK-COUNT-2: store i512{{.+}}align 64
     swap(x, y)
 }
diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs
index 1a48c63d8139f..30e81fd4ef04e 100644
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@@ -27,13 +27,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
     // CHECK-NOT: alloca

-    // Whether `i8` is the best for this is unclear, but
-    // might as well record what's actually happening right now.
-
-    // CHECK: load i8
-    // CHECK: load i8
-    // CHECK: store i8
-    // CHECK: store i8
+    // Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
+    // and is closer to the assembly it generates anyway.
+
+    // CHECK-NOT: load
+    // CHECK: load i32{{.+}}align 2
+    // CHECK-NEXT: load i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK: load i16{{.+}}align 2
+    // CHECK-NEXT: load i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NOT: store
     swap(x, y)
 }

@@ -54,19 +60,27 @@ pub fn swap_rgba64(x: &mut RGBA64, y: &mut RGBA64) {
 #[no_mangle]
 pub fn swap_vecs(x: &mut Vec<u32>, y: &mut Vec<u32>) {
     // CHECK-NOT: alloca
-    // There are plenty more loads and stores than just these,
-    // but at least one sure better be 64-bit (for size or capacity).
-    // CHECK: load i64
+
+    // CHECK-NOT: load
+    // CHECK: load i128
+    // CHECK-NEXT: load i128
+    // CHECK-NEXT: store i128
+    // CHECK-NEXT: store i128
     // CHECK: load i64
-    // CHECK: store i64
-    // CHECK: store i64
-    // CHECK: ret void
+    // CHECK-NEXT: load i64
+    // CHECK-NEXT: store i64
+    // CHECK-NEXT: store i64
+    // CHECK-NOT: store
     swap(x, y)
 }

 // CHECK-LABEL: @swap_slices
 #[no_mangle]
 pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
+    // Note that separate loads here are fine, as they merge to `movups` anyway
+    // at the assembly level, so staying more obviously typed and as a scalar
+    // pair -- like they're used elsewhere -- is ok, no need to force `i128`.
+
     // CHECK-NOT: alloca
     // CHECK: load ptr
     // CHECK: load i64
@@ -76,45 +90,84 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
     swap(x, y)
 }

-// LLVM doesn't vectorize a loop over 3-byte elements,
-// so we chunk it down to bytes and loop over those instead.
 type RGB24 = [u8; 3];

 // CHECK-LABEL: @swap_rgb24_slices
 #[no_mangle]
 pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // The odd size means we need the full set.
+
+    // CHECK-COUNT-2: load i512{{.+}}align 1
+    // CHECK-NEXT: store i512{{.+}}align 1
+    // CHECK-COUNT-2: load i256{{.+}}align 1
+    // CHECK-NEXT: store i256{{.+}}align 1
+    // CHECK-COUNT-2: load i128{{.+}}align 1
+    // CHECK-NEXT: store i128{{.+}}align 1
+    // CHECK-COUNT-2: load i64{{.+}}align 1
+    // CHECK-NEXT: store i64{{.+}}align 1
+    // CHECK-COUNT-2: load i32{{.+}}align 1
+    // CHECK-NEXT: store i32{{.+}}align 1
+    // CHECK-COUNT-2: load i16{{.+}}align 1
+    // CHECK-NEXT: store i16{{.+}}align 1
+    // CHECK-COUNT-2: load i8{{.+}}align 1
+    // CHECK-NEXT: store i8{{.+}}align 1
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }

-// This one has a power-of-two size, so we iterate over it directly
 type RGBA32 = [u8; 4];

 // CHECK-LABEL: @swap_rgba32_slices
 #[no_mangle]
 pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i32>
-    // CHECK: store <{{[0-9]+}} x i32>
+
+    // Because the size in bytes is a multiple of 4, we can skip the smallest sizes.
+
+    // CHECK-COUNT-2: load i512{{.+}}align 1
+    // CHECK-NEXT: store i512{{.+}}align 1
+    // CHECK-COUNT-2: load i256{{.+}}align 1
+    // CHECK-NEXT: store i256{{.+}}align 1
+    // CHECK-COUNT-2: load i128{{.+}}align 1
+    // CHECK-NEXT: store i128{{.+}}align 1
+    // CHECK-COUNT-2: load i64{{.+}}align 1
+    // CHECK-NEXT: store i64{{.+}}align 1
+    // CHECK-COUNT-2: load i32{{.+}}align 1
+    // CHECK-NEXT: store i32{{.+}}align 1
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }

-// Strings have a non-power-of-two size, but have pointer alignment,
-// so we swap usizes instead of dropping all the way down to bytes.
+// Strings have a non-power-of-two size, but have pointer alignment.
 const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());

 // CHECK-LABEL: @swap_string_slices
 #[no_mangle]
 pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i512{{.+}}align 8
+    // CHECK-NEXT: store i512{{.+}}align 8
+    // CHECK-COUNT-2: load i256{{.+}}align 8
+    // CHECK-NEXT: store i256{{.+}}align 8
+    // CHECK-COUNT-2: load i128{{.+}}align 8
+    // CHECK-NEXT: store i128{{.+}}align 8
+    // CHECK-COUNT-2: load i64{{.+}}align 8
+    // CHECK-NEXT: store i64{{.+}}align 8
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -130,6 +183,9 @@ pub struct Packed {
 #[no_mangle]
 pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
     // CHECK-NOT: alloca
-    // CHECK: ret void
+    // CHECK-COUNT-2: load i64{{.+}}align 1
+    // CHECK-COUNT-2: store i64{{.+}}align 1
+    // CHECK-COUNT-2: load i8{{.+}}align 1
+    // CHECK-COUNT-2: store i8{{.+}}align 1
     swap(x, y)
 }
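
A note on the padding behaviour that the intrinsic's fallback body and the new test_ptr_swap_nonoverlapping_swaps_padding test are about: the swap is done as untyped copies of whole MaybeUninit<T>s, so padding bytes travel with the value instead of being legally dropped by typed reads and writes. The following standalone sketch is not part of the patch (untyped_swap_demo is a made-up name for illustration); it just mirrors the fallback on current Rust:

    use std::mem::MaybeUninit;

    // Mirrors the intrinsic's fallback: three untyped, non-overlapping copies
    // through a scratch MaybeUninit<T>, so padding bytes are carried along.
    unsafe fn untyped_swap_demo<T>(x: *mut MaybeUninit<T>, y: *mut MaybeUninit<T>) {
        let mut temp = MaybeUninit::<T>::uninit();
        // SAFETY: the caller guarantees x and y are valid, aligned, and
        // non-overlapping; `temp` is a fresh local, so it overlaps neither.
        unsafe {
            (&raw mut temp).copy_from_nonoverlapping(x, 1);
            x.copy_from_nonoverlapping(y, 1);
            y.copy_from_nonoverlapping(&raw const temp, 1);
        }
    }

    fn main() {
        // (u16, u8) has one padding byte, the case called out in the comments above.
        let mut a: (u16, u8) = (1, 2);
        let mut b: (u16, u8) = (3, 4);
        // SAFETY: distinct locals, so the pointers are valid and non-overlapping.
        unsafe {
            untyped_swap_demo(
                (&raw mut a).cast::<MaybeUninit<(u16, u8)>>(),
                (&raw mut b).cast::<MaybeUninit<(u16, u8)>>(),
            );
        }
        assert_eq!((a, b), ((3, 4), (1, 2)));
    }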
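
The delegate_by_alignment! macro in swap_nonoverlapping_runtime exists so the byte-level loop is monomorphized once per alignment class rather than once per T. A rough mirror of that dispatch as ordinary code follows; it is a hypothetical helper, not in the patch, and assumes the 64-bit value of MAX_ALIGN:

    use std::mem::align_of;

    // Maps a type's alignment to the const ALIGN that the byte-swapping loop
    // (swap_nonoverlapping_bytes::<ALIGN> in the patch) would be called with.
    fn chosen_align(type_align: usize) -> usize {
        const MAX_ALIGN: usize = 64; // align_of::<usize>().pow(2) on a 64-bit target
        match type_align {
            a if a >= MAX_ALIGN => MAX_ALIGN,
            a if a >= 32 => 32,
            a if a >= 16 => 16,
            a if a >= 8 => 8,
            a if a >= 4 => 4,
            a if a >= 2 => 2,
            _ => 1,
        }
    }

    fn main() {
        assert_eq!(chosen_align(align_of::<[u8; 3]>()), 1); // RGB24 slices stay align-1
        assert_eq!(chosen_align(align_of::<usize>()), 8);   // String/Vec-sized data uses word alignment
        assert_eq!(chosen_align(64), 64);                   // BigButHighlyAligned caps at MAX_ALIGN
    }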
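
The `while bytes - i >= MAX_ALIGN` loop plus handle_tail! decompose the byte count into a run of MAX_ALIGN-byte chunks followed by a strictly decreasing power-of-two tail, which is exactly what the updated FileCheck patterns count. A small sketch of that arithmetic (hypothetical helper, not in the patch; again assumes MAX_ALIGN == 64):

    // Lists the chunk sizes the runtime path would use for a `bytes`-byte swap,
    // assuming MAX_ALIGN == 64 and that ALIGN divides every chunk emitted.
    fn chunk_sizes(mut bytes: usize) -> Vec<usize> {
        const MAX_ALIGN: usize = 64;
        let mut out = Vec::new();
        while bytes >= MAX_ALIGN {
            out.push(MAX_ALIGN);
            bytes -= MAX_ALIGN;
        }
        // Fewer than 64 bytes remain, so the set bits of the remainder say
        // exactly which power-of-two tail chunks are still needed.
        for n in [32, 16, 8, 4, 2, 1] {
            if bytes & n != 0 {
                out.push(n);
            }
        }
        out
    }

    fn main() {
        assert_eq!(chunk_sizes(44), [32, 8, 4]);    // [u8; 44], as in the swap_44_bytes assembly test
        assert_eq!(chunk_sizes(192), [64, 64, 64]); // BigButHighlyAligned([u8; 64 * 3])
        assert_eq!(chunk_sizes(24), [16, 8]);       // String on 64-bit: movups + movq, as in swap_string
    }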