From ff238f3135246fe1ed49f1cfc2bd79012ec5b877 Mon Sep 17 00:00:00 2001
From: Amanieu d'Antras <amanieu@gmail.com>
Date: Fri, 18 Feb 2022 17:45:35 +0000
Subject: [PATCH 1/2] Stabilize vget_low_s8 on AArch64

This was missed when most of the AArch64 intrinsics were stabilized.
---
 crates/core_arch/src/arm_shared/neon/mod.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs
index 7d170a00d1..952d1ca2e3 100644
--- a/crates/core_arch/src/arm_shared/neon/mod.rs
+++ b/crates/core_arch/src/arm_shared/neon/mod.rs
@@ -4989,6 +4989,10 @@ pub unsafe fn vget_high_f32(a: float32x4_t) -> float32x2_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(test, assert_instr(nop))]
+#[cfg_attr(
+    target_arch = "aarch64",
+    stable(feature = "vget_low_s8", since = "1.60.0")
+)]
 pub unsafe fn vget_low_s8(a: int8x16_t) -> int8x8_t {
     simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
 }

From 49f501648a59782b112cca028d8154efd55c5e04 Mon Sep 17 00:00:00 2001
From: Amanieu d'Antras <amanieu@gmail.com>
Date: Sat, 19 Feb 2022 18:54:52 +0000
Subject: [PATCH 2/2] Updates for LLVM 14 on nightly

---
 .../core_arch/src/aarch64/neon/generated.rs   | 84 ++++++++++++-------
 .../src/arm_shared/neon/generated.rs          |  8 +-
 crates/intrinsic-test/missing_aarch64.txt     | 23 -----
 crates/stdarch-gen/neon.spec                  | 11 +--
 4 files changed, 66 insertions(+), 60 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index e5df1b72c0..010c5de5dc 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -1564,7 +1564,7 @@ pub unsafe fn vclezd_f64(a: f64) -> u64 {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltz_s8(a: int8x8_t) -> uint8x8_t {
     let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);
@@ -1574,7 +1574,7 @@ pub unsafe fn vcltz_s8(a: int8x8_t) -> uint8x8_t {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltzq_s8(a: int8x16_t) -> uint8x16_t {
     let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
@@ -1584,7 +1584,7 @@ pub unsafe fn vcltzq_s8(a: int8x16_t) -> uint8x16_t {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltz_s16(a: int16x4_t) -> uint16x4_t {
     let b: i16x4 = i16x4::new(0, 0, 0, 0);
@@ -1594,7 +1594,7 @@ pub unsafe fn vcltz_s16(a: int16x4_t) -> uint16x4_t {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltzq_s16(a: int16x8_t) -> uint16x8_t {
     let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
@@ -1604,7 +1604,7 @@ pub unsafe fn vcltzq_s16(a: int16x8_t) -> uint16x8_t {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltz_s32(a: int32x2_t) -> uint32x2_t {
     let b: i32x2 = i32x2::new(0, 0);
@@ -1614,7 +1614,7 @@ pub unsafe fn vcltz_s32(a: int32x2_t) -> uint32x2_t {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltzq_s32(a: int32x4_t) -> uint32x4_t {
     let b: i32x4 = i32x4::new(0, 0, 0, 0);
@@ -1624,7 +1624,7 @@ pub unsafe fn vcltzq_s32(a: int32x4_t) -> uint32x4_t {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltz_s64(a: int64x1_t) -> uint64x1_t {
     let b: i64x1 = i64x1::new(0);
@@ -1634,7 +1634,7 @@ pub unsafe fn vcltz_s64(a: int64x1_t) -> uint64x1_t {
 /// Compare signed less than zero
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sshr))]
+#[cfg_attr(test, assert_instr(cmlt))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcltzq_s64(a: int64x2_t) -> uint64x2_t {
     let b: i64x2 = i64x2::new(0, 0);
@@ -2714,7 +2714,7 @@ pub unsafe fn vcopyq_lane_p16<const LANE1: i32, const LANE2: i32>(a: poly16x8_t,
 /// Insert vector element from another vector element
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(zip1, LANE1 = 1, LANE2 = 0))]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
 #[rustc_legacy_const_generics(1, 3)]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcopyq_lane_s64<const LANE1: i32, const LANE2: i32>(a: int64x2_t, b: int64x1_t) -> int64x2_t {
@@ -2731,7 +2731,7 @@ pub unsafe fn vcopyq_lane_s64<const LANE1: i32, const LANE2: i32>(a: int64x2_t,
 /// Insert vector element from another vector element
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(zip1, LANE1 = 1, LANE2 = 0))]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
 #[rustc_legacy_const_generics(1, 3)]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcopyq_lane_u64<const LANE1: i32, const LANE2: i32>(a: uint64x2_t, b: uint64x1_t) -> uint64x2_t {
@@ -2748,7 +2748,7 @@ pub unsafe fn vcopyq_lane_u64<const LANE1: i32, const LANE2: i32>(a: uint64x2_t,
 /// Insert vector element from another vector element
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(zip1, LANE1 = 1, LANE2 = 0))]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
 #[rustc_legacy_const_generics(1, 3)]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcopyq_lane_p64<const LANE1: i32, const LANE2: i32>(a: poly64x2_t, b: poly64x1_t) -> poly64x2_t {
@@ -2784,7 +2784,7 @@ pub unsafe fn vcopyq_lane_f32<const LANE1: i32, const LANE2: i32>(a: float32x4_t
 /// Insert vector element from another vector element
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(zip1, LANE1 = 1, LANE2 = 0))]
+#[cfg_attr(test, assert_instr(mov, LANE1 = 1, LANE2 = 0))]
 #[rustc_legacy_const_generics(1, 3)]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vcopyq_lane_f64<const LANE1: i32, const LANE2: i32>(a: float64x2_t, b: float64x1_t) -> float64x2_t {
@@ -9183,7 +9183,7 @@ pub unsafe fn vmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     vmaxq_f64_(a, b)
 }
 
-/// Floating-point Maximun Number (vector)
+/// Floating-point Maximum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fmaxnm))]
@@ -9197,7 +9197,7 @@ pub unsafe fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
     vmaxnm_f64_(a, b)
 }
 
-/// Floating-point Maximun Number (vector)
+/// Floating-point Maximum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fmaxnm))]
@@ -9379,7 +9379,7 @@ pub unsafe fn vminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     vminq_f64_(a, b)
 }
 
-/// Floating-point Minimun Number (vector)
+/// Floating-point Minimum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fminnm))]
@@ -9393,7 +9393,7 @@ pub unsafe fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
     vminnm_f64_(a, b)
 }
 
-/// Floating-point Minimun Number (vector)
+/// Floating-point Minimum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fminnm))]
@@ -10535,7 +10535,12 @@ pub unsafe fn vqrdmulhs_laneq_s32<const LANE: i32>(a: i32, b: int32x4_t) -> i32
 #[target_feature(enable = "rdm")]
 #[cfg_attr(test, assert_instr(sqrdmlah))]
 pub unsafe fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
-    vqadd_s16(a, vqrdmulh_s16(b, c))
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v4i16")]
+        fn vqrdmlah_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
+    }
+    vqrdmlah_s16_(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10543,7 +10548,12 @@ pub unsafe fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_
 #[target_feature(enable = "rdm")]
 #[cfg_attr(test, assert_instr(sqrdmlah))]
 pub unsafe fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
-    vqaddq_s16(a, vqrdmulhq_s16(b, c))
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v8i16")]
+        fn vqrdmlahq_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
+    }
+    vqrdmlahq_s16_(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10551,7 +10561,12 @@ pub unsafe fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8
 #[target_feature(enable = "rdm")]
 #[cfg_attr(test, assert_instr(sqrdmlah))]
 pub unsafe fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
-    vqadd_s32(a, vqrdmulh_s32(b, c))
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v2i32")]
+        fn vqrdmlah_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
+    }
+    vqrdmlah_s32_(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10559,7 +10574,12 @@ pub unsafe fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_
 #[target_feature(enable = "rdm")]
 #[cfg_attr(test, assert_instr(sqrdmlah))]
 pub unsafe fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
-    vqaddq_s32(a, vqrdmulhq_s32(b, c))
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrdmlah.v4i32")]
+        fn vqrdmlahq_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
+    }
+    vqrdmlahq_s32_(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10591,7 +10611,8 @@ pub unsafe fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 {
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlah_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
     static_assert_imm2!(LANE);
-    vqadd_s16(a, vqrdmulh_lane_s16::<LANE>(b, c))
+    let c: int16x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vqrdmlah_s16(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10601,7 +10622,8 @@ pub unsafe fn vqrdmlah_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlah_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
     static_assert_imm3!(LANE);
-    vqadd_s16(a, vqrdmulh_laneq_s16::<LANE>(b, c))
+    let c: int16x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vqrdmlah_s16(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10611,7 +10633,8 @@ pub unsafe fn vqrdmlah_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlahq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
     static_assert_imm2!(LANE);
-    vqaddq_s16(a, vqrdmulhq_lane_s16::<LANE>(b, c))
+    let c: int16x8_t = simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vqrdmlahq_s16(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10621,7 +10644,8 @@ pub unsafe fn vqrdmlahq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlahq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
     static_assert_imm3!(LANE);
-    vqaddq_s16(a, vqrdmulhq_laneq_s16::<LANE>(b, c))
+    let c: int16x8_t = simd_shuffle8!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vqrdmlahq_s16(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10631,7 +10655,8 @@ pub unsafe fn vqrdmlahq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlah_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
     static_assert_imm1!(LANE);
-    vqadd_s32(a, vqrdmulh_lane_s32::<LANE>(b, c))
+    let c: int32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]);
+    vqrdmlah_s32(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10641,7 +10666,8 @@ pub unsafe fn vqrdmlah_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlah_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
     static_assert_imm2!(LANE);
-    vqadd_s32(a, vqrdmulh_laneq_s32::<LANE>(b, c))
+    let c: int32x2_t = simd_shuffle2!(c, c, <const LANE: i32> [LANE as u32, LANE as u32]);
+    vqrdmlah_s32(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10651,7 +10677,8 @@ pub unsafe fn vqrdmlah_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlahq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
     static_assert_imm1!(LANE);
-    vqaddq_s32(a, vqrdmulhq_lane_s32::<LANE>(b, c))
+    let c: int32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vqrdmlahq_s32(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
@@ -10661,7 +10688,8 @@ pub unsafe fn vqrdmlahq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c:
 #[rustc_legacy_const_generics(3)]
 pub unsafe fn vqrdmlahq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
     static_assert_imm2!(LANE);
-    vqaddq_s32(a, vqrdmulhq_laneq_s32::<LANE>(b, c))
+    let c: int32x4_t = simd_shuffle4!(c, c, <const LANE: i32> [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vqrdmlahq_s32(a, b, c)
 }
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 7837307e44..1f7e41975b 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -16535,7 +16535,7 @@ pub unsafe fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 vmaxq_f32_(a, b)
 }
 
-/// Floating-point Maximun Number (vector)
+/// Floating-point Maximum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
@@ -16552,7 +16552,7 @@ pub unsafe fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 vmaxnm_f32_(a, b)
 }
 
-/// Floating-point Maximun Number (vector)
+/// Floating-point Maximum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
@@ -16807,7 +16807,7 @@ pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 vminq_f32_(a, b)
 }
 
-/// Floating-point Minimun Number (vector)
+/// Floating-point Minimum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
@@ -16824,7 +16824,7 @@ pub unsafe fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
 vminnm_f32_(a, b)
 }
 
-/// Floating-point Minimun Number (vector)
+/// Floating-point Minimum Number (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
diff --git a/crates/intrinsic-test/missing_aarch64.txt b/crates/intrinsic-test/missing_aarch64.txt
index b6ba2eab05..56ec274b5a 100644
--- a/crates/intrinsic-test/missing_aarch64.txt
+++ b/crates/intrinsic-test/missing_aarch64.txt
@@ -108,26 +108,3 @@ vqshluq_n_s32
 vqshluq_n_s64
 vqshluq_n_s8
 vqshlus_n_s32
-
-# These tests produce a different result from C but only in debug builds of
-# stdarch. This likely both a bug in stdarch (expanding to a different LLVM
-# intrinsic) and a bug in LLVM (incorrect optimization changing the behavior of
-# integer operations).
-vqrdmlah_lane_s16
-vqrdmlah_lane_s32
-vqrdmlah_laneq_s16
-vqrdmlah_laneq_s32
-vqrdmlah_s16
-vqrdmlah_s32
-vqrdmlahh_lane_s16
-vqrdmlahh_laneq_s16
-vqrdmlahh_s16
-vqrdmlahq_lane_s16
-vqrdmlahq_lane_s32
-vqrdmlahq_laneq_s16
-vqrdmlahq_laneq_s32
-vqrdmlahq_s16
-vqrdmlahq_s32
-vqrdmlahs_lane_s32
-vqrdmlahs_laneq_s32
-vqrdmlahs_s32
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index c4a720aa35..b3eb07e997 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -846,7 +846,7 @@ a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0
 fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
 
-aarch64 = sshr
+aarch64 = cmlt
 generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
 
 /// Floating-point compare less than zero
@@ -1069,7 +1069,7 @@ b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 n = 1:0
 validate 1, MAX
 
-aarch64 = zip1
+aarch64 = mov
 generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t
 
 /// Insert vector element from another vector element
@@ -1087,7 +1087,7 @@ validate 1., 0.5, 3., 4.
 
 aarch64 = mov
 generate float32x4_t:float32x2_t:float32x4_t
-aarch64 = zip1
+aarch64 = mov
 generate float64x2_t:float64x1_t:float64x2_t
 
 /// Insert vector element from another vector element
@@ -5527,13 +5527,13 @@ generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:
 
 /// Signed saturating rounding doubling multiply accumulate returning high half
 name = vqrdmlah
-multi_fn = vqadd-out-noext, a, {vqrdmulh-out-noext, b, c}
 a = 1, 1, 1, 1, 1, 1, 1, 1
 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
 c = 2, 2, 2, 2, 2, 2, 2, 2
 validate 3, 3, 3, 3, 3, 3, 3, 3
 
 aarch64 = sqrdmlah
+link-aarch64 = sqrdmlah._EXT_
 target = rdm
 generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
 
@@ -5557,7 +5557,8 @@ name = vqrdmlah
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c}
+multi_fn = simd_shuffle-out_len-!, c:out_t, c, c, {dup-out_len-LANE as u32}
+multi_fn = vqrdmlah-out-noext, a, b, c
 a = 1, 1, 1, 1, 1, 1, 1, 1
 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
 c = 0, 2, 0, 0, 0, 0, 0, 0