folkertdev · folkertdev · Mar 3, 2025 · Mar 5, 2025 · Mar 15, 2025 · Mar 3, 2025
diff --git a/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile
@@ -8,8 +8,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
   wget \
   xz-utils
 
-RUN wget https://downloadmirror.intel.com/843185/sde-external-9.48.0-2024-11-25-lin.tar.xz
-RUN tar -xJf sde-external-9.48.0-2024-11-25-lin.tar.xz
-ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-9.48.0-2024-11-25-lin/sde64 \
+RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.48.0-2024-11-25-lin.tar.xz -O sde.tar.xz
+RUN mkdir intel-sde
+RUN tar -xJf sde.tar.xz --strip-components=1 -C intel-sde
+ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/intel-sde/sde64 \
             -cpuid-in /checkout/ci/docker/x86_64-unknown-linux-gnu-emulated/cpuid.def \
             -rtm-mode full -tsx --"
diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs
@@ -38,7 +38,8 @@
     x86_amx_intrinsics,
     f16,
     keylocker_x86,
-    aarch64_unstable_target_feature
+    aarch64_unstable_target_feature,
+    bigint_helper_methods
 )]
 #![cfg_attr(test, feature(test, abi_vectorcall, stdarch_internal))]
 #![deny(clippy::missing_inline_in_public_items)]

diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs
@@ -132,7 +132,7 @@ macro_rules! types {
         impl crate::fmt::Debug for $name {
             #[inline]
             fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result {
-                crate::core_arch::simd::debug_simd_finish(f, stringify!($name), self.0)
+                crate::core_arch::simd::debug_simd_finish(f, stringify!($name), self.as_array())
             }
         }
     )*);

diff --git a/crates/core_arch/src/s390x/macros.rs b/crates/core_arch/src/s390x/macros.rs
@@ -250,6 +250,19 @@ macro_rules! l_t_t {
         u8
     };
 
+    (vector_bool_long_long ) => {
+        u64
+    };
+    (vector_bool_int ) => {
+        u32
+    };
+    (vector_bool_short ) => {
+        u16
+    };
+    (vector_bool_char ) => {
+        u8
+    };
+
     (vector_float) => {
         f32
     };
@@ -338,6 +351,9 @@ macro_rules! t_u {
     (vector_bool_int) => {
         vector_unsigned_int
     };
+    (vector_bool_long_long) => {
+        vector_unsigned_long_long
+    };
     (vector_unsigned_char) => {
         vector_unsigned_char
     };
@@ -380,6 +396,9 @@ macro_rules! t_b {
     (vector_bool_int) => {
         vector_bool_int
     };
+    (vector_bool_long_long) => {
+        vector_bool_long_long
+    };
     (vector_signed_char) => {
         vector_bool_char
     };

diff --git a/crates/core_arch/src/s390x/vector.rs b/crates/core_arch/src/s390x/vector.rs
diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs
@@ -5,7 +5,7 @@
 macro_rules! simd_ty {
     ($id:ident [$elem_type:ty ; $len:literal]: $($param_name:ident),*) => {
         #[repr(simd)]
-        #[derive(Copy, Clone, Debug, PartialEq)]
+        #[derive(Copy, Clone)]
         pub(crate) struct $id([$elem_type; $len]);
 
         #[allow(clippy::use_self)]
@@ -38,13 +38,32 @@ macro_rules! simd_ty {
             /// Use for testing only.
             // FIXME: Workaround rust@60637
             #[inline(always)]
-            pub(crate) fn extract(self, index: usize) -> $elem_type {
-                assert!(index < $len);
-                // Now that we know this is in-bounds, use pointer arithmetic to access the right element.
-                let self_ptr = &self as *const Self as *const $elem_type;
-                unsafe {
-                    self_ptr.add(index).read()
-                }
+            pub(crate) fn extract(&self, index: usize) -> $elem_type {
+                self.as_array()[index]
+            }
+
+            #[inline]
+            pub(crate) fn as_array(&self) -> &[$elem_type; $len] {
+                let simd_ptr: *const Self = self;
+                let array_ptr: *const [$elem_type; $len] = simd_ptr.cast();
+                // SAFETY: We can always read the prefix of a simd type as an array.
+                // There might be more padding afterwards for some widths, but
+                // that's not a problem for reading less than that.
+                unsafe { &*array_ptr }
+            }
+        }
+
+        impl core::cmp::PartialEq for $id {
+            #[inline]
+            fn eq(&self, other: &Self) -> bool {
+                self.as_array() == other.as_array()
+            }
+        }
+
+        impl core::fmt::Debug for $id {
+            #[inline]
+            fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+                debug_simd_finish(f, stringify!($id), self.as_array())
             }
         }
     }
@@ -53,7 +72,7 @@ macro_rules! simd_ty {
 macro_rules! simd_m_ty {
     ($id:ident [$elem_type:ident ; $len:literal]: $($param_name:ident),*) => {
         #[repr(simd)]
-        #[derive(Copy, Clone, Debug, PartialEq)]
+        #[derive(Copy, Clone)]
         pub(crate) struct $id([$elem_type; $len]);
 
         #[allow(clippy::use_self)]
@@ -79,6 +98,30 @@ macro_rules! simd_m_ty {
                 // a simd type with exactly one element.
                 unsafe { simd_shuffle!(one, one, [0; $len]) }
             }
+
+            #[inline]
+            pub(crate) fn as_array(&self) -> &[$elem_type; $len] {
+                let simd_ptr: *const Self = self;
+                let array_ptr: *const [$elem_type; $len] = simd_ptr.cast();
+                // SAFETY: We can always read the prefix of a simd type as an array.
+                // There might be more padding afterwards for some widths, but
+                // that's not a problem for reading less than that.
+                unsafe { &*array_ptr }
+            }
+        }
+
+        impl core::cmp::PartialEq for $id {
+            #[inline]
+            fn eq(&self, other: &Self) -> bool {
+                self.as_array() == other.as_array()
+            }
+        }
+
+        impl core::fmt::Debug for $id {
+            #[inline]
+            fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+                debug_simd_finish(f, stringify!($id), self.as_array())
+            }
         }
     }
 }
@@ -968,7 +1011,7 @@ simd_ty!(
 pub(crate) fn debug_simd_finish<T: crate::fmt::Debug, const N: usize>(
     formatter: &mut crate::fmt::Formatter<'_>,
     type_name: &str,
-    array: [T; N],
+    array: &[T; N],
 ) -> crate::fmt::Result {
     crate::fmt::Formatter::debug_tuple_fields_finish(
         formatter,

diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs
@@ -2318,7 +2318,25 @@ pub fn u8x16_narrow_i16x8(a: v128, b: v128) -> v128 {
 #[doc(alias("i8x16.shl"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i8x16_shl(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shl(a.as_i8x16(), simd::i8x16::splat(amt as i8)).v128() }
+    // SAFETY: `simd_shl` is only sound when the shift amount of every lane is
+    // smaller than the number of bits in that lane. The input here has 8-bit
+    // lanes but `amt` is a `u32`, so `amt` is masked with `0x7` to discard
+    // all of its upper bits and ensure that this safety condition is always
+    // met.
+    //
+    // The native WebAssembly instruction is defined to perform this mask as
+    // well, but relying on that alone is not enough: the shift must have
+    // defined semantics in LLVM too, not just in WebAssembly, so the mask is
+    // written out explicitly here.
+    //
+    // The explicit mask is not actually emitted into the final binary: LLVM
+    // understands that the wasm operation implicitly masks, so it knows that
+    // this mask operation is redundant.
+    //
+    // In short, the mask is a bridge from the documented semantics through
+    // LLVM back out to WebAssembly: both ends already have the documented
+    // semantics, and it is LLVM in the middle that requires the mask.
+    unsafe { simd_shl(a.as_i8x16(), simd::i8x16::splat((amt & 0x7) as i8)).v128() }
 }
 
 #[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -2335,7 +2353,9 @@ pub use i8x16_shl as u8x16_shl;
 #[doc(alias("i8x16.shr_s"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i8x16_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_i8x16(), simd::i8x16::splat(amt as i8)).v128() }
+    // SAFETY: the shift amount must be smaller than the 8-bit lane width, which
+    // the `0x7` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_i8x16(), simd::i8x16::splat((amt & 0x7) as i8)).v128() }
 }
 
 /// Shifts each lane to the right by the specified number of bits, shifting in
@@ -2349,7 +2369,9 @@ pub fn i8x16_shr(a: v128, amt: u32) -> v128 {
 #[doc(alias("i8x16.shr_u"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn u8x16_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_u8x16(), simd::u8x16::splat(amt as u8)).v128() }
+    // SAFETY: the shift amount must be smaller than the 8-bit lane width, which
+    // the `0x7` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_u8x16(), simd::u8x16::splat((amt & 0x7) as u8)).v128() }
 }
 
 /// Adds two 128-bit vectors as if they were two packed sixteen 8-bit integers.
@@ -2686,7 +2708,9 @@ pub use i16x8_extend_high_u8x16 as u16x8_extend_high_u8x16;
 #[doc(alias("i16x8.shl"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i16x8_shl(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shl(a.as_i16x8(), simd::i16x8::splat(amt as i16)).v128() }
+    // SAFETY: the shift amount must be smaller than the 16-bit lane width, which
+    // the `0xf` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shl(a.as_i16x8(), simd::i16x8::splat((amt & 0xf) as i16)).v128() }
 }
 
 #[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -2703,7 +2727,9 @@ pub use i16x8_shl as u16x8_shl;
 #[doc(alias("i16x8.shr_s"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i16x8_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_i16x8(), simd::i16x8::splat(amt as i16)).v128() }
+    // SAFETY: the shift amount must be smaller than the 16-bit lane width, which
+    // the `0xf` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_i16x8(), simd::i16x8::splat((amt & 0xf) as i16)).v128() }
 }
 
 /// Shifts each lane to the right by the specified number of bits, shifting in
@@ -2717,7 +2743,9 @@ pub fn i16x8_shr(a: v128, amt: u32) -> v128 {
 #[doc(alias("i16x8.shr_u"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn u16x8_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_u16x8(), simd::u16x8::splat(amt as u16)).v128() }
+    // SAFETY: the shift amount must be smaller than the 16-bit lane width, which
+    // the `0xf` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_u16x8(), simd::u16x8::splat((amt & 0xf) as u16)).v128() }
 }
 
 /// Adds two 128-bit vectors as if they were two packed eight 16-bit integers.
@@ -3136,7 +3164,9 @@ pub use i32x4_extend_high_u16x8 as u32x4_extend_high_u16x8;
 #[doc(alias("i32x4.shl"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i32x4_shl(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shl(a.as_i32x4(), simd::i32x4::splat(amt as i32)).v128() }
+    // SAFETY: the shift amount must be smaller than the 32-bit lane width, which
+    // the `0x1f` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shl(a.as_i32x4(), simd::i32x4::splat((amt & 0x1f) as i32)).v128() }
 }
 
 #[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -3153,7 +3183,9 @@ pub use i32x4_shl as u32x4_shl;
 #[doc(alias("i32x4.shr_s"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i32x4_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_i32x4(), simd::i32x4::splat(amt as i32)).v128() }
+    // SAFETY: the shift amount must be smaller than the 32-bit lane width, which
+    // the `0x1f` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_i32x4(), simd::i32x4::splat((amt & 0x1f) as i32)).v128() }
 }
 
 /// Shifts each lane to the right by the specified number of bits, shifting in
@@ -3167,7 +3199,9 @@ pub fn i32x4_shr(a: v128, amt: u32) -> v128 {
 #[doc(alias("i32x4.shr_u"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn u32x4_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_u32x4(), simd::u32x4::splat(amt)).v128() }
+    // SAFETY: the shift amount must be smaller than the 32-bit lane width, which
+    // the `0x1f` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_u32x4(), simd::u32x4::splat(amt & 0x1f)).v128() }
 }
 
 /// Adds two 128-bit vectors as if they were two packed four 32-bit integers.
@@ -3502,7 +3536,9 @@ pub use i64x2_extend_high_u32x4 as u64x2_extend_high_u32x4;
 #[doc(alias("i64x2.shl"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i64x2_shl(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shl(a.as_i64x2(), simd::i64x2::splat(amt as i64)).v128() }
+    // SAFETY: the shift amount must be smaller than the 64-bit lane width, which
+    // the `0x3f` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shl(a.as_i64x2(), simd::i64x2::splat((amt & 0x3f) as i64)).v128() }
 }
 
 #[stable(feature = "wasm_simd", since = "1.54.0")]
@@ -3519,7 +3555,9 @@ pub use i64x2_shl as u64x2_shl;
 #[doc(alias("i64x2.shr_s"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i64x2_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_i64x2(), simd::i64x2::splat(amt as i64)).v128() }
+    // SAFETY: the shift amount must be smaller than the 64-bit lane width, which
+    // the `0x3f` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_i64x2(), simd::i64x2::splat((amt & 0x3f) as i64)).v128() }
 }
 
 /// Shifts each lane to the right by the specified number of bits, shifting in
@@ -3533,7 +3571,9 @@ pub fn i64x2_shr(a: v128, amt: u32) -> v128 {
 #[doc(alias("i64x2.shr_u"))]
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn u64x2_shr(a: v128, amt: u32) -> v128 {
-    unsafe { simd_shr(a.as_u64x2(), simd::u64x2::splat(amt as u64)).v128() }
+    // SAFETY: the shift amount must be smaller than the 64-bit lane width, which
+    // the `0x3f` mask guarantees; see `i8x16_shl` for the full explanation.
+    unsafe { simd_shr(a.as_u64x2(), simd::u64x2::splat((amt & 0x3f) as u64)).v128() }
 }
 
 /// Adds two 128-bit vectors as if they were two packed two 64-bit integers.

diff --git a/crates/intrinsic-test/missing_aarch64.txt b/crates/intrinsic-test/missing_aarch64.txt
@@ -30,6 +30,32 @@ vrnd32x_f64
 vrnd32z_f64
 vrnd64x_f64
 vrnd64z_f64
+vluti2_lane_p16
+vluti2_lane_p8
+vluti2_lane_s16
+vluti2_lane_s8
+vluti2_lane_u16
+vluti2_lane_u8
+vluti2q_lane_p16
+vluti2q_lane_p8
+vluti2q_lane_s16
+vluti2q_lane_s8
+vluti2q_lane_u16
+vluti2q_lane_u8
+vluti4q_lane_f16_x2
+vluti4q_lane_p16_x2
+vluti4q_lane_p8
+vluti4q_lane_s16_x2
+vluti4q_lane_s8
+vluti4q_lane_u16_x2
+vluti4q_lane_u8
+vluti4q_laneq_f16_x2
+vluti4q_laneq_p16_x2
+vluti4q_laneq_p8
+vluti4q_laneq_s16_x2
+vluti4q_laneq_s8
+vluti4q_laneq_u16_x2
+vluti4q_laneq_u8
 
 # Broken in Clang
 vcvth_s16_f16

diff --git a/crates/intrinsic-test/src/main.rs b/crates/intrinsic-test/src/main.rs
@@ -240,7 +240,7 @@ fn compile_c(
     let arch_flags = if target.contains("v7") {
         "-march=armv8.6-a+crypto+crc+dotprod+fp16"
     } else {
-        "-march=armv8.6-a+crypto+sha3+crc+dotprod+fp16+faminmax"
+        "-march=armv8.6-a+crypto+sha3+crc+dotprod+fp16+faminmax+lut"
     };
 
     let intrinsic_name = &intrinsic.name;