diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md
index 8da6074cac..11ad3f04c1 100644
--- a/crates/core_arch/missing-x86.md
+++ b/crates/core_arch/missing-x86.md
@@ -149,21 +149,10 @@
["AVX512F"]

- * [ ] [`_cvtmask16_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask16_u32) - * [ ] [`_cvtu32_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtu32_mask16) - * [ ] [`_kortest_mask16_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortest_mask16_u8) - * [ ] [`_kortestc_mask16_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestc_mask16_u8) - * [ ] [`_kortestz_mask16_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestz_mask16_u8) - * [ ] [`_kshiftli_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftli_mask16) - * [ ] [`_kshiftri_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftri_mask16) - * [ ] [`_load_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_load_mask16) - * [ ] [`_mm512_cvtsd_f64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsd_f64) - * [ ] [`_mm512_cvtss_f32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtss_f32) * [ ] [`_mm512_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64) * [ ] [`_mm512_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd) * [ ] [`_mm512_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64) * [ ] [`_mm512_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd) - * [ ] [`_mm512_kortestz`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kortestz) * [ ] [`_mm512_mask_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64) * [ ] [`_mm512_mask_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd) * [ ] [`_mm512_mask_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64) @@ -175,7 +164,6 @@ * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss) * [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd) * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss) - * [ ] [`_store_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_store_mask16)

@@ -204,9 +192,6 @@ * [ ] [`_mm256_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64) * [ ] [`_mm256_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd) * [ ] [`_mm256_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps) - * [ ] [`_mm256_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt14_pd) - * [ ] [`_mm256_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt14_ps) - * [ ] [`_mm_abs_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi64) * [ ] [`_mm_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32) * [ ] [`_mm_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64) * [ ] [`_mm_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd) @@ -215,7 +200,6 @@ * [ ] [`_mm_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64) * [ ] [`_mm_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd) * [ ] [`_mm_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps) - * [ ] [`_mm_mask_abs_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_abs_epi64) * [ ] [`_mm_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32) * [ ] [`_mm_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64) * [ ] [`_mm_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd) @@ -224,10 +208,6 @@ * [ ] [`_mm_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64) * [ ] [`_mm_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd) * [ ] [`_mm_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps) - * [ ] [`_mm_mask_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epi64) - * [ ] [`_mm_maskz_abs_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_abs_epi64) - * [ ] [`_mm_maskz_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epi64) - * [ ] [`_mm_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi64) * [ ] [`_mm_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32) * [ ] [`_mm_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64) * [ ] [`_mm_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd) @@ -236,8 +216,6 @@ * [ ] [`_mm_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64) * [ ] [`_mm_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd) * [ ] 
[`_mm_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps)
- * [ ] [`_mm_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_pd)
- * [ ] [`_mm_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ps)

diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e5f78a2ffb..8c88d3aa2f 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2,6 +2,7 @@ use crate::{ arch::asm, core_arch::{simd::*, x86::*}, intrinsics::simd::*, + intrinsics::{fmaf32, fmaf64}, mem, ptr, }; @@ -48,11 +49,9 @@ use stdarch_test::assert_instr; #[cfg_attr(test, assert_instr(vpabsd))] pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i { let a = a.as_i32x16(); - // all-0 is a properly initialized i32x16 - let zero: i32x16 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i32x16 = simd_gt(a, zero); - transmute(simd_select(cmp, a, sub)) + let zero = i32x16::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) } /// Computes the absolute value of packed 32-bit integers in `a`, and store the @@ -143,11 +142,9 @@ pub unsafe fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpabsq))] pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i { let a = a.as_i64x8(); - // all-0 is a properly initialized i64x8 - let zero: i64x8 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i64x8 = simd_gt(a, zero); - transmute(simd_select(cmp, a, sub)) + let zero = i64x8::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -184,11 +181,9 @@ pub unsafe fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { #[cfg_attr(test, assert_instr(vpabsq))] pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i { let a = a.as_i64x4(); - // all-0 is a properly initialized i64x4 - let zero: i64x4 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i64x4 = simd_gt(a, zero); - transmute(simd_select(cmp, a, sub)) + let zero = i64x4::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -205,7 +200,7 @@ pub unsafe fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __ /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi64&expand=45) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi64) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] @@ -216,18 +211,54 @@ pub unsafe fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { transmute(simd_select_bitmask(k, abs, zero)) } +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub unsafe fn _mm_abs_epi64(a: __m128i) -> __m128i { + let a = a.as_i64x2(); + let zero = i64x2::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub unsafe fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, src.as_i64x2())) +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub unsafe fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { + let abs = _mm_abs_epi64(a).as_i64x2(); + let zero = i64x2::splat(0); + transmute(simd_select_bitmask(k, abs, zero)) +} + /// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_ps&expand=65) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vpandq))] +#[cfg_attr(test, assert_instr(vpandd))] pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 { - let a = _mm512_set1_epi32(0x7FFFFFFF); // from LLVM code - let b = transmute::(v2.as_f32x16()); - let abs = _mm512_and_epi32(a, b); - transmute(abs) + simd_fabs(v2) } /// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -238,8 +269,7 @@ pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 { - let abs = _mm512_abs_ps(v2).as_f32x16(); - transmute(simd_select_bitmask(k, abs, src.as_f32x16())) + simd_select_bitmask(k, simd_fabs(v2), src) } /// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst. 
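All of the integer `abs` rewrites above use the same select-on-sign pattern: negate a lane when it compares below zero, otherwise keep it. A minimal per-lane model in plain Rust (standalone and illustrative, not part of the patch):

    // Per-lane model of simd_select(simd_lt(a, zero), simd_neg(a), a).
    // Like vpabsq, it wraps on i64::MIN because simd_neg is a wrapping negate.
    fn abs_lane(a: i64) -> i64 {
        if a < 0 { a.wrapping_neg() } else { a }
    }

    fn main() {
        assert_eq!(abs_lane(-7), 7);
        assert_eq!(abs_lane(5), 5);
        assert_eq!(abs_lane(i64::MIN), i64::MIN); // wraps, matching the vector instruction
    }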
@@ -250,10 +280,7 @@ pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d { - let a = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF); // from LLVM code - let b = transmute::(v2.as_f64x8()); - let abs = _mm512_and_epi64(a, b); - transmute(abs) + simd_fabs(v2) } /// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -264,8 +291,7 @@ pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d { - let abs = _mm512_abs_pd(v2).as_f64x8(); - transmute(simd_select_bitmask(k, abs, src.as_f64x8())) + simd_select_bitmask(k, simd_fabs(v2), src) } /// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1264,7 +1290,9 @@ pub unsafe fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] pub unsafe fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmuldq(a.as_i32x16(), b.as_i32x16())) + let a = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(a.as_i64x8())); + let b = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(b.as_i64x8())); + transmute(simd_mul(a, b)) } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1476,7 +1504,10 @@ pub unsafe fn _mm512_mask_mullox_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] pub unsafe fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmuludq(a.as_u32x16(), b.as_u32x16())) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let mask = u64x8::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1906,7 +1937,9 @@ pub unsafe fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsd))] pub unsafe fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxsd(a.as_i32x16(), b.as_i32x16())) + let a = a.as_i32x16(); + let b = b.as_i32x16(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
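The `vpmuldq`/`vpmuludq` replacements lean on the fact that a widening 32 x 32 to 64-bit multiply is the same as a full 64-bit multiply of the sign-extended (or, for the unsigned form, masked) low half of each lane. A scalar sketch of the per-lane arithmetic (illustrative only):

    // Model of _mm512_mul_epi32 per 64-bit lane: sign-extend the low 32 bits,
    // then multiply in 64 bits.
    fn mul_epi32_lane(a: u64, b: u64) -> i64 {
        let lo_a = a as u32 as i32 as i64;
        let lo_b = b as u32 as i32 as i64;
        lo_a.wrapping_mul(lo_b)
    }

    // Model of _mm512_mul_epu32 per 64-bit lane: zero-extend via masking,
    // then multiply as unsigned 64-bit values.
    fn mul_epu32_lane(a: u64, b: u64) -> u64 {
        (a & 0xFFFF_FFFF).wrapping_mul(b & 0xFFFF_FFFF)
    }

    fn main() {
        assert_eq!(mul_epi32_lane(0xFFFF_FFFF, 2), -2); // low half is -1
        assert_eq!(mul_epu32_lane(0xFFFF_FFFF, 2), 0x1_FFFF_FFFE);
    }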
@@ -1992,7 +2025,9 @@ pub unsafe fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] pub unsafe fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxsq(a.as_i64x8(), b.as_i64x8())) + let a = a.as_i64x8(); + let b = b.as_i64x8(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2028,7 +2063,9 @@ pub unsafe fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] pub unsafe fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpmaxsq256(a.as_i64x4(), b.as_i64x4())) + let a = a.as_i64x4(); + let b = b.as_i64x4(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2064,7 +2101,9 @@ pub unsafe fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] pub unsafe fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i { - transmute(vpmaxsq128(a.as_i64x2(), b.as_i64x2())) + let a = a.as_i64x2(); + let b = b.as_i64x2(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2276,7 +2315,9 @@ pub unsafe fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxud))] pub unsafe fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxud(a.as_u32x16(), b.as_u32x16())) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2362,7 +2403,9 @@ pub unsafe fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuq))] pub unsafe fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxuq(a.as_u64x8(), b.as_u64x8())) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
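Every min/max rewrite in this patch reduces to `simd_select` over a `simd_gt` (or `simd_lt`) comparison, and the signed/unsigned distinction comes entirely from the lane type the comparison runs on (`i64x8` versus `u64x8`). A scalar illustration of why the two cases have to stay separate (not part of the patch):

    // The same bit pattern orders differently as i64 and as u64, which is why
    // _mm512_max_epi64 and _mm512_max_epu64 compare on different lane types.
    fn main() {
        let a: u64 = 0xFFFF_FFFF_FFFF_FFFF; // -1 when reinterpreted as i64
        let b: u64 = 1;
        assert_eq!(std::cmp::max(a as i64, b as i64), 1);
        assert_eq!(std::cmp::max(a, b), u64::MAX);
    }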
@@ -2398,7 +2441,9 @@ pub unsafe fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuq))] pub unsafe fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpmaxuq256(a.as_u64x4(), b.as_u64x4())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2434,7 +2479,9 @@ pub unsafe fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuq))] pub unsafe fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i { - transmute(vpmaxuq128(a.as_u64x2(), b.as_u64x2())) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2470,7 +2517,9 @@ pub unsafe fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsd))] pub unsafe fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminsd(a.as_i32x16(), b.as_i32x16())) + let a = a.as_i32x16(); + let b = b.as_i32x16(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2556,7 +2605,9 @@ pub unsafe fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsq))] pub unsafe fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminsq(a.as_i64x8(), b.as_i64x8())) + let a = a.as_i64x8(); + let b = b.as_i64x8(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2592,7 +2643,9 @@ pub unsafe fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsq))] pub unsafe fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpminsq256(a.as_i64x4(), b.as_i64x4())) + let a = a.as_i64x4(); + let b = b.as_i64x4(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2620,6 +2673,44 @@ pub unsafe fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m transmute(simd_select_bitmask(k, min, zero)) } +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub unsafe fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i { + let a = a.as_i64x2(); + let b = b.as_i64x2(); + transmute(simd_select::(simd_lt(a, b), a, b)) +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub unsafe fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let min = _mm_min_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, min, src.as_i64x2())) +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub unsafe fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let min = _mm_min_epi64(a, b).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, min, zero)) +} + /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_ps&expand=3769) @@ -2806,7 +2897,9 @@ pub unsafe fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminud))] pub unsafe fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminud(a.as_u32x16(), b.as_u32x16())) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2892,7 +2985,9 @@ pub unsafe fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] pub unsafe fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminuq(a.as_u64x8(), b.as_u64x8())) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
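The new `_mm_min_epi64`, `_mm_mask_min_epi64` and `_mm_maskz_min_epi64` follow the usual plain/writemask/zeromask triple, where the masked forms are just `simd_select_bitmask` over the unmasked result. A per-lane scalar model of that selection (illustrative; `mask_lane` is a hypothetical helper, not part of the patch):

    // Lane i takes the computed result when bit i of k is set; otherwise it
    // keeps src (writemask form) or becomes zero (zeromask form).
    fn mask_lane(k: u8, i: u32, result: i64, src_or_zero: i64) -> i64 {
        if (k >> i) & 1 != 0 { result } else { src_or_zero }
    }

    fn main() {
        let min = (-5i64).min(3);
        assert_eq!(mask_lane(0b01, 0, min, 99), -5); // bit set: minimum written
        assert_eq!(mask_lane(0b01, 1, min, 99), 99); // writemask: copied from src
        assert_eq!(mask_lane(0b01, 1, min, 0), 0);   // zeromask: zeroed
    }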
@@ -2928,7 +3023,9 @@ pub unsafe fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] pub unsafe fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpminuq256(a.as_u64x4(), b.as_u64x4())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2964,7 +3061,9 @@ pub unsafe fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] pub unsafe fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i { - transmute(vpminuq128(a.as_u64x2(), b.as_u64x2())) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3000,7 +3099,7 @@ pub unsafe fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 { - transmute(vsqrtps(a.as_f32x16(), _MM_FROUND_CUR_DIRECTION)) + simd_fsqrt(a) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3011,8 +3110,7 @@ pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - let sqrt = _mm512_sqrt_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, sqrt, src.as_f32x16())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3023,9 +3121,7 @@ pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { - let sqrt = _mm512_sqrt_ps(a).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_ps()) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -3036,8 +3132,7 @@ pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - let sqrt = _mm256_sqrt_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, sqrt, src.as_f32x8())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3048,9 +3143,7 @@ pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { - let sqrt = _mm256_sqrt_ps(a).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_ps()) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3061,8 +3154,7 @@ pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - let sqrt = _mm_sqrt_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, sqrt, src.as_f32x4())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3073,9 +3165,7 @@ pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { - let sqrt = _mm_sqrt_ps(a).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_ps()) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. @@ -3086,7 +3176,7 @@ pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d { - transmute(vsqrtpd(a.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) + simd_fsqrt(a) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -3097,8 +3187,7 @@ pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - let sqrt = _mm512_sqrt_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, sqrt, src.as_f64x8())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3109,9 +3198,7 @@ pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m5 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { - let sqrt = _mm512_sqrt_pd(a).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_pd()) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3122,8 +3209,7 @@ pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - let sqrt = _mm256_sqrt_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, sqrt, src.as_f64x4())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3134,9 +3220,7 @@ pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { - let sqrt = _mm256_sqrt_pd(a).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_pd()) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3147,8 +3231,7 @@ pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - let sqrt = _mm_sqrt_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, sqrt, src.as_f64x2())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3159,9 +3242,7 @@ pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { - let sqrt = _mm_sqrt_pd(a).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. @@ -3172,7 +3253,7 @@ pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16())) + simd_fma(a, b, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3183,8 +3264,7 @@ pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmadd, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3195,9 +3275,7 @@ pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
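The fmadd family now goes through `simd_fma`, which is a genuinely fused operation: the product is not rounded before the addition. That this differs from a separate multiply and add can be checked in stable Rust with `f32::mul_add` as the scalar analogue (illustrative only):

    fn main() {
        let eps = f32::EPSILON;        // 2^-23
        let a = 1.0f32 + eps;
        let b = 1.0f32 - eps;
        // Unfused: a * b rounds to 1.0 first, so the subtraction cancels exactly.
        assert_eq!(a * b - 1.0, 0.0);
        // Fused: a single rounding at the end keeps the residual -eps^2.
        assert!(a.mul_add(b, -1.0) < 0.0);
    }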
@@ -3208,8 +3286,7 @@ pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmadd, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3220,8 +3297,7 @@ pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmadd, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3232,9 +3308,7 @@ pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3245,8 +3319,7 @@ pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmadd, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3257,8 +3330,7 @@ pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmadd, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3269,9 +3341,7 @@ pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3282,8 +3352,7 @@ pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmadd, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. @@ -3294,7 +3363,7 @@ pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8())) + simd_fma(a, b, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3305,8 +3374,7 @@ pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmadd, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3317,9 +3385,7 @@ pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3330,8 +3396,7 @@ pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmadd, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3342,8 +3407,7 @@ pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmadd, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3354,9 +3418,7 @@ pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3367,8 +3429,7 @@ pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmadd, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3379,8 +3440,7 @@ pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmadd, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3391,9 +3451,7 @@ pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3404,8 +3462,7 @@ pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmadd, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. @@ -3414,11 +3471,9 @@ pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub)) + simd_fma(a, b, simd_neg(c)) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3427,10 +3482,9 @@ pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsub, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3439,11 +3493,9 @@ pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3452,10 +3504,9 @@ pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsub, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3466,8 +3517,7 @@ pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsub, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3478,9 +3528,7 @@ pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3491,8 +3539,7 @@ pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsub, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3503,8 +3550,7 @@ pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsub, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3515,9 +3561,7 @@ pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
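The fmsub variants are expressed as `simd_fma(a, b, simd_neg(c))`, using the identity fmsub(a, b, c) = fma(a, b, -c). A scalar spot-check with `f64::mul_add` (illustrative only):

    fn main() {
        let (a, b, c) = (1.5f64, -2.0, 0.25);
        // a*b - c with a single rounding is the same as fma(a, b, -c);
        // here everything is exact, so both routes give -3.25.
        assert_eq!(a.mul_add(b, -c), a * b - c);
        assert_eq!(a.mul_add(b, -c), -3.25);
    }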
@@ -3528,8 +3572,7 @@ pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsub, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. @@ -3538,11 +3581,9 @@ pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub)) + simd_fma(a, b, simd_neg(c)) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3551,10 +3592,9 @@ pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsub, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3563,11 +3603,9 @@ pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3576,10 +3614,9 @@ pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsub, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3590,8 +3627,7 @@ pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsub, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3602,9 +3638,7 @@ pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3615,8 +3649,7 @@ pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsub, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3627,8 +3660,7 @@ pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsub, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3639,9 +3671,7 @@ pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3652,8 +3682,7 @@ pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsub, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. 
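The `_mm512_fmsub_pd` hunk above drops the `vfmadd132pd` call with a manually zero-subtracted `c` in favour of `simd_fma(a, b, simd_neg(c))`. A minimal scalar sketch of the per-lane math, using `f64::mul_add` as a stand-in for the fused operation (`fmsub_lane` is illustrative only, not part of the patch):

```rust
// Per-lane model of the fmsub rewrite: a single fused multiply-add with a
// negated addend computes a*b - c with one rounding step.
fn fmsub_lane(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(b, -c) // fused, unlike (a * b) - c which rounds twice
}
```

Keeping it as one fused call preserves the single-rounding behaviour of the vfmsub instruction family.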
@@ -3664,12 +3693,13 @@ pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - transmute(vfmaddsub213ps( - a.as_f32x16(), - b.as_f32x16(), - c.as_f32x16(), - _MM_FROUND_CUR_DIRECTION, - )) + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15] + ) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3680,8 +3710,7 @@ pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3692,9 +3721,7 @@ pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3705,8 +3732,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
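The `_mm512_fmaddsub_ps` hunk above builds both `a*b + c` and `a*b - c` and interleaves them with `simd_shuffle!`: indices `0..=15` name lanes of `add` and `16..=31` name lanes of `sub`, so the pattern `[16, 1, 18, 3, ...]` takes the subtracted value on even lanes and the added value on odd lanes. A scalar reference model of that behaviour (illustrative helper, not part of the patch):

```rust
// Scalar reference for fmaddsub: subtract c on even lanes, add it on odd lanes.
fn fmaddsub_ref(a: [f32; 16], b: [f32; 16], c: [f32; 16]) -> [f32; 16] {
    core::array::from_fn(|i| {
        if i % 2 == 0 {
            a[i].mul_add(b[i], -c[i]) // even lane: a*b - c (picked from `sub`)
        } else {
            a[i].mul_add(b[i], c[i]) // odd lane: a*b + c (picked from `add`)
        }
    })
}
```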
@@ -3717,8 +3743,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3729,9 +3754,7 @@ pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3742,8 +3765,7 @@ pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3754,8 +3776,7 @@ pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3766,9 +3787,7 @@ pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3779,8 +3798,7 @@ pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. @@ -3791,12 +3809,9 @@ pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - transmute(vfmaddsub213pd( - a.as_f64x8(), - b.as_f64x8(), - c.as_f64x8(), - _MM_FROUND_CUR_DIRECTION, - )) + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3807,8 +3822,7 @@ pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3819,9 +3833,7 @@ pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3832,8 +3844,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3844,8 +3855,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3856,9 +3866,7 @@ pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3869,8 +3877,7 @@ pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3881,8 +3888,7 @@ pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3893,9 +3899,7 @@ pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3906,8 +3910,7 @@ pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. 
@@ -3916,16 +3919,15 @@ pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - transmute(vfmaddsub213ps( - a.as_f32x16(), - b.as_f32x16(), + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, sub, - _MM_FROUND_CUR_DIRECTION, - )) + [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31] + ) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3934,10 +3936,9 @@ pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3946,11 +3947,9 @@ pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
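`_mm512_fmsubadd_ps` above is the mirror image: the shuffle pattern `[0, 17, 2, 19, ...]` keeps the added value on even lanes and takes the subtracted value on odd lanes. A hedged scalar sketch (the helper name is made up for illustration):

```rust
// Scalar reference for fmsubadd: add c on even lanes, subtract it on odd lanes.
fn fmsubadd_ref(a: [f32; 16], b: [f32; 16], c: [f32; 16]) -> [f32; 16] {
    core::array::from_fn(|i| {
        if i % 2 == 0 {
            a[i].mul_add(b[i], c[i]) // even lane: a*b + c (picked from `add`)
        } else {
            a[i].mul_add(b[i], -c[i]) // odd lane: a*b - c (picked from `sub`)
        }
    })
}
```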
@@ -3959,10 +3958,9 @@ pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3973,8 +3971,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3985,9 +3982,7 @@ pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3998,8 +3993,7 @@ pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4010,8 +4004,7 @@ pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4022,9 +4015,7 @@ pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4035,8 +4026,7 @@ pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. @@ -4045,16 +4035,11 @@ pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - transmute(vfmaddsub213pd( - a.as_f64x8(), - b.as_f64x8(), - sub, - _MM_FROUND_CUR_DIRECTION, - )) + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4063,10 +4048,9 @@ pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4075,11 +4059,9 @@ pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4088,10 +4070,9 @@ pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4102,8 +4083,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4114,9 +4094,7 @@ pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4127,8 +4105,7 @@ pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4139,8 +4116,7 @@ pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4151,9 +4127,7 @@ pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4164,8 +4138,7 @@ pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. @@ -4174,11 +4147,9 @@ pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - transmute(vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16())) + simd_fma(simd_neg(a), b, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4187,10 +4158,9 @@ pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
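For the `_mm512_fnmadd_ps` hunk above, negating one multiplicand instead of subtracting `a` from a zero vector keeps the whole computation as a single fused operation, `-(a*b) + c`. A scalar sketch of the per-lane result (illustrative only, not part of the patch):

```rust
// Per-lane model of the fnmadd rewrite: negating `a` yields -(a*b) + c
// as one fused multiply-add, with a single rounding step.
fn fnmadd_lane(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, c)
}
```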
@@ -4199,11 +4169,9 @@ pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4212,10 +4180,9 @@ pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4226,8 +4193,7 @@ pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4238,9 +4204,7 @@ pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -4251,8 +4215,7 @@ pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4263,8 +4226,7 @@ pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f32x4())) + simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4275,9 +4237,7 @@ pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4288,8 +4248,7 @@ pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f32x4())) + simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. 
@@ -4298,11 +4257,9 @@ pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - transmute(vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8())) + simd_fma(simd_neg(a), b, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4311,10 +4268,9 @@ pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4323,11 +4279,9 @@ pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -4336,10 +4290,9 @@ pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4350,8 +4303,7 @@ pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4362,9 +4314,7 @@ pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4375,8 +4325,7 @@ pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4387,8 +4336,7 @@ pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f64x2())) + simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4399,9 +4347,7 @@ pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4412,8 +4358,7 @@ pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f64x2())) + simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. @@ -4422,12 +4367,9 @@ pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); - transmute(vfmadd132ps(suba, b.as_f32x16(), subc)) + simd_fma(simd_neg(a), b, simd_neg(c)) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
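FNMSUB negates both the product and the addend, so the new bodies use the identity -(a * b) - c == fma(-a, b, -c), again with a single rounding. A per-lane sketch (illustrative; `fnmsub_lane` is a made-up name):

// Per-lane reference for FNMSUB: -(a * b) - c with one rounding,
// mirroring simd_fma(simd_neg(a), b, simd_neg(c)).
fn fnmsub_lane(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, -c)
}

fn main() {
    // -(2.0 * 3.0) - 1.0 == -7.0
    assert_eq!(fnmsub_lane(2.0, 3.0, 1.0), -7.0);
}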
@@ -4436,10 +4378,9 @@ pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4448,11 +4389,9 @@ pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4461,10 +4400,9 @@ pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
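These fnmadd/fnmsub intrinsics are fused operations: the product is never rounded separately before the add or subtract. A small f32 demonstration of why that single rounding matters (the values are chosen only so the fused and unfused results differ; this is not taken from the patch):

fn main() {
    let a = 1.0f32 + f32::EPSILON; // 1 + 2^-23
    let b = 1.0f32 - f32::EPSILON; // 1 - 2^-23
    let c = -1.0f32;

    // The exact product is 1 - 2^-46. Rounding it on its own gives 1.0,
    // so the unfused form loses the low-order bits entirely.
    let unfused = a * b + c;     // (a*b) rounds to 1.0, then 1.0 - 1.0 = 0.0
    let fused = a.mul_add(b, c); // one rounding: exactly -(2^-46)

    assert_eq!(unfused, 0.0);
    assert!(fused < 0.0);
}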
@@ -4475,8 +4413,7 @@ pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4487,9 +4424,7 @@ pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4500,8 +4435,7 @@ pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4512,8 +4446,7 @@ pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f32x4())) + simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4524,9 +4457,7 @@ pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4537,8 +4468,7 @@ pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f32x4())) + simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. @@ -4547,12 +4477,9 @@ pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); - transmute(vfmadd132pd(suba, b.as_f64x8(), subc)) + simd_fma(simd_neg(a), b, simd_neg(c)) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4561,10 +4488,9 @@ pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
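A usage sketch contrasting the writemask and zeromask forms of the 128-bit single-precision variants defined above (assumes a nightly toolchain with the stdarch_x86_avx512 feature and a CPU with AVX512F + AVX512VL; the input values are arbitrary):

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo() -> (__m128, __m128) {
    let a = _mm_set_ps(8.0, 6.0, 4.0, 2.0); // element 0 (low lane) = 2.0
    let b = _mm_set1_ps(3.0);
    let c = _mm_set1_ps(1.0);
    // k = 0b0011: lanes 0 and 1 are computed as -(a*b) - c, lanes 2 and 3 are not.
    let merged = _mm_mask_fnmsub_ps(a, 0b0011, b, c);  // untouched lanes keep a's values
    let zeroed = _mm_maskz_fnmsub_ps(0b0011, a, b, c); // untouched lanes become 0.0
    (merged, zeroed)
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        let (merged, zeroed) = unsafe { demo() };
        // The low two lanes match between the two results (-7.0 and -13.0);
        // the high lanes differ (6.0/8.0 for the writemask form, 0.0 for the zeromask form).
        println!("{:?}\n{:?}", merged, zeroed);
    }
}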
@@ -4573,11 +4499,9 @@ pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4586,10 +4510,9 @@ pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4600,8 +4523,7 @@ pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4612,9 +4534,7 @@ pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -4625,8 +4545,7 @@ pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4637,8 +4556,7 @@ pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f64x2())) + simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4649,9 +4567,7 @@ pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4662,8 +4578,7 @@ pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f64x2())) + simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), c) } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4929,6 +4844,21 @@ pub unsafe fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { )) } +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub unsafe fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { + transmute(vrsqrt14ps256( + a.as_f32x8(), + _mm256_setzero_ps().as_f32x8(), + 0b11111111, + )) +} + /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_ps&expand=4815) @@ -4955,6 +4885,21 @@ pub unsafe fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { )) } +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub unsafe fn _mm_rsqrt14_ps(a: __m128) -> __m128 { + transmute(vrsqrt14ps128( + a.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b00001111, + )) +} + /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_ps&expand=4813) @@ -5014,6 +4959,21 @@ pub unsafe fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { transmute(vrsqrt14pd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k)) } +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub unsafe fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { + transmute(vrsqrt14pd256( + a.as_f64x4(), + _mm256_setzero_pd().as_f64x4(), + 0b00001111, + )) +} + /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_pd&expand=4808) @@ -5040,6 +5000,21 @@ pub unsafe fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { )) } +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub unsafe fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { + transmute(vrsqrt14pd128( + a.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b00000011, + )) +} + /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_pd&expand=4806) @@ -8153,11 +8128,7 @@ pub unsafe fn _mm512_fmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - transmute(r) + vfmadd132psround(a, b, c, ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8182,11 +8153,7 @@ pub unsafe fn _mm512_mask_fmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in a using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8211,12 +8178,7 @@ pub unsafe fn _mm512_maskz_fmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8241,11 +8203,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmadd132psround(a, b, c, 
ROUNDING), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ @@ -8269,11 +8227,7 @@ pub unsafe fn _mm512_fmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - transmute(r) + vfmadd132pdround(a, b, c, ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8298,11 +8252,7 @@ pub unsafe fn _mm512_mask_fmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8327,12 +8277,7 @@ pub unsafe fn _mm512_maskz_fmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8357,11 +8302,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ @@ -8377,7 +8318,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_ps( a: __m512, @@ -8385,12 +8326,7 @@ pub unsafe fn _mm512_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, ROUNDING); - transmute(r) + vfmadd132psround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract 
packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8406,7 +8342,7 @@ pub unsafe fn _mm512_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_ps( a: __m512, @@ -8415,12 +8351,8 @@ pub unsafe fn _mm512_mask_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8436,7 +8368,7 @@ pub unsafe fn _mm512_mask_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_ps( k: __mmask16, @@ -8445,12 +8377,8 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8466,7 +8394,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_ps( a: __m512, @@ -8475,13 +8403,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let c = c.as_f32x16(); - let sub = simd_sub(zero, c); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, 
ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ @@ -8497,7 +8420,7 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_pd( a: __m512d, @@ -8505,12 +8428,7 @@ pub unsafe fn _mm512_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(r) + vfmadd132pdround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8526,7 +8444,7 @@ pub unsafe fn _mm512_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_pd( a: __m512d, @@ -8535,12 +8453,8 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8556,7 +8470,7 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_pd( k: __mmask8, @@ -8565,12 +8479,8 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8586,7 +8496,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_pd( a: __m512d, @@ -8595,13 +8505,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let c = c.as_f64x8(); - let sub = simd_sub(zero, c); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ @@ -8625,11 +8530,7 @@ pub unsafe fn _mm512_fmaddsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - transmute(r) + vfmaddsubpsround(a, b, c, ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8654,11 +8555,7 @@ pub unsafe fn _mm512_mask_fmaddsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8683,12 +8580,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - let zero = 
_mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8713,11 +8605,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ @@ -8741,11 +8629,7 @@ pub unsafe fn _mm512_fmaddsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - transmute(r) + vfmaddsubpdround(a, b, c, ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8770,11 +8654,7 @@ pub unsafe fn _mm512_mask_fmaddsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8799,12 +8679,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), _mm512_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8829,11 +8704,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ @@ -8849,7 +8720,7 @@ pub unsafe fn 
_mm512_mask3_fmaddsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_ps( a: __m512, @@ -8857,12 +8728,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(r) + vfmaddsubpsround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8878,7 +8744,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_ps( a: __m512, @@ -8887,12 +8753,8 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8908,7 +8770,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_ps( k: __mmask16, @@ -8917,12 +8779,8 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8938,7 
+8796,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_ps( a: __m512, @@ -8947,13 +8805,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let c = c.as_f32x16(); - let sub = simd_sub(zero, c); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ @@ -8969,7 +8822,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_pd( a: __m512d, @@ -8977,12 +8830,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(r) + vfmaddsubpdround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8998,7 +8846,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_pd( a: __m512d, @@ -9007,12 +8855,8 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9028,7 +8872,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( #[inline] 
#[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_pd( k: __mmask8, @@ -9037,12 +8881,8 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9058,7 +8898,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_pd( a: __m512d, @@ -9067,13 +8907,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let c = c.as_f64x8(); - let sub = simd_sub(zero, c); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ @@ -9089,7 +8924,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmadd_round_ps( a: __m512, @@ -9097,12 +8932,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(r) + vfmadd132psround(simd_neg(a), b, c, ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9118,7 +8948,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] 
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmadd_round_ps( a: __m512, @@ -9127,12 +8957,8 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a.as_f32x16())) + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9148,7 +8974,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmadd_round_ps( k: __mmask16, @@ -9157,12 +8983,8 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9178,7 +9000,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmadd_round_ps( a: __m512, @@ -9187,12 +9009,8 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ @@ -9208,7 +9026,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or 
vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmadd_round_pd( a: __m512d, @@ -9216,12 +9034,7 @@ pub unsafe fn _mm512_fnmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(r) + vfmadd132pdround(simd_neg(a), b, c, ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9237,7 +9050,7 @@ pub unsafe fn _mm512_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmadd_round_pd( a: __m512d, @@ -9246,13 +9059,8 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let a = a.as_f64x8(); - let sub = simd_sub(zero, a); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9268,7 +9076,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmadd_round_pd( k: __mmask8, @@ -9277,12 +9085,8 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9298,7 +9102,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd 
or vfnmadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmadd_round_pd( a: __m512d, @@ -9307,12 +9111,8 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ @@ -9328,7 +9128,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmsub_round_ps( a: __m512, @@ -9336,12 +9136,7 @@ pub unsafe fn _mm512_fnmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(r) + vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9357,7 +9152,7 @@ pub unsafe fn _mm512_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmsub_round_ps( a: __m512, @@ -9366,13 +9161,8 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let a = a.as_f32x16(); - let suba = simd_sub(zero, a); - let subc = simd_sub(zero, c.as_f32x16()); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9388,7 +9178,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmsub_round_ps( k: __mmask16, @@ -9397,12 
+9187,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9418,7 +9204,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmsub_round_ps( a: __m512, @@ -9427,13 +9213,8 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let c = c.as_f32x16(); - let subc = simd_sub(zero, c); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ @@ -9449,7 +9230,7 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmsub_round_pd( a: __m512d, @@ -9457,12 +9238,7 @@ pub unsafe fn _mm512_fnmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(r) + vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9478,7 +9254,7 @@ pub unsafe fn _mm512_fnmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmsub_round_pd( a: __m512d, @@ -9487,13 +9263,8 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd( c: 
__m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let a = a.as_f64x8(); - let suba = simd_sub(zero, a); - let subc = simd_sub(zero, c.as_f64x8()); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9509,7 +9280,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmsub_round_pd( k: __mmask8, @@ -9518,12 +9289,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9539,7 +9306,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmsub_round_pd( a: __m512d, @@ -9548,13 +9315,8 @@ pub unsafe fn _mm512_mask3_fnmsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let c = c.as_f64x8(); - let subc = simd_sub(zero, c); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\ @@ -15947,7 +15709,7 @@ pub unsafe fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_pd() -> __m512d { // All-0 is a properly initialized __m512d - mem::zeroed() + const { mem::zeroed() } } /// Returns vector of type `__m512` with all elements set to zero. 
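The fnmadd/fnmsub hunks above all follow one pattern: instead of building a negated operand with `simd_sub(zero, x)`, they negate with `simd_neg` and feed the result straight into the rounding fused-multiply-add helper, relying on the identities fnmadd(a, b, c) = -(a*b) + c = fma(-a, b, c) and fnmsub(a, b, c) = -(a*b) - c = fma(-a, b, -c). A minimal scalar sketch of those identities (plain `f64`, no intrinsics, purely illustrative):

```rust
// Scalar model of the identities the refactor relies on.
fn fnmadd(a: f64, b: f64, c: f64) -> f64 {
    // -(a*b) + c expressed as a single fused multiply-add of the negated a.
    (-a).mul_add(b, c)
}

fn fnmsub(a: f64, b: f64, c: f64) -> f64 {
    // -(a*b) - c: negate both a and c, then reuse the same fused form.
    (-a).mul_add(b, -c)
}

fn main() {
    // Values chosen so the products are exact; fused and unfused results agree.
    let (a, b, c) = (1.5, -2.0, 0.25);
    assert_eq!(fnmadd(a, b, c), -(a * b) + c);
    assert_eq!(fnmsub(a, b, c), -(a * b) - c);
}
```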
@@ -15959,7 +15721,7 @@ pub unsafe fn _mm512_setzero_pd() -> __m512d { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_ps() -> __m512 { // All-0 is a properly initialized __m512 - mem::zeroed() + const { mem::zeroed() } } /// Return vector of type `__m512` with all elements set to zero. @@ -15971,7 +15733,7 @@ pub unsafe fn _mm512_setzero_ps() -> __m512 { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero() -> __m512 { // All-0 is a properly initialized __m512 - mem::zeroed() + const { mem::zeroed() } } /// Returns vector of type `__m512i` with all elements set to zero. @@ -15983,7 +15745,7 @@ pub unsafe fn _mm512_setzero() -> __m512 { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_si512() -> __m512i { // All-0 is a properly initialized __m512i - mem::zeroed() + const { mem::zeroed() } } /// Return vector of type `__m512i` with all elements set to zero. @@ -15995,7 +15757,7 @@ pub unsafe fn _mm512_setzero_si512() -> __m512i { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_epi32() -> __m512i { // All-0 is a properly initialized __m512i - mem::zeroed() + const { mem::zeroed() } } /// Sets packed 32-bit integers in `dst` with the supplied values in reverse @@ -25556,8 +25318,27 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(vmovd))] pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { - let extract: i32 = simd_extract!(a.as_i32x16(), 0); - extract + simd_extract!(a.as_i32x16(), 0) +} + +/// Copy the lower single-precision (32-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtss_f32) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_cvtss_f32(a: __m512) -> f32 { + simd_extract!(a, 0) +} + +/// Copy the lower double-precision (64-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsd_f64) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_cvtsd_f64(a: __m512d) -> f64 { + simd_extract!(a, 0) } /// Broadcast the low packed 32-bit integer from a to all elements of dst. @@ -27493,6 +27274,26 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) } +/// Convert 16-bit mask a into an integer value, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask16_u32) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _cvtmask16_u32(a: __mmask16) -> u32 { + a as u32 +} + +/// Convert 32-bit integer value a to an 16-bit mask and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _cvtu32_mask16(a: u32) -> __mmask16 { + a as __mmask16 +} + /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. 
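The newly added `_mm512_cvtss_f32`/`_mm512_cvtsd_f64` simply extract lane 0, and `_cvtmask16_u32`/`_cvtu32_mask16` are plain integer casts. A hypothetical usage sketch, assuming a nightly toolchain (the intrinsics are still gated behind `stdarch_x86_avx512`), this patch applied, and an AVX-512F capable CPU:

```rust
// Nightly-only: the AVX-512 intrinsics used here are unstable.
#![feature(stdarch_x86_avx512)]

#[cfg(target_arch = "x86_64")]
fn lane0_and_mask_demo() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") {
        return; // nothing to demonstrate without AVX-512F
    }
    unsafe {
        // _mm512_cvtss_f32 just copies the lowest single-precision lane.
        let v = _mm512_set1_ps(3.5);
        assert_eq!(_mm512_cvtss_f32(v), 3.5);

        // The mask <-> integer conversions are plain casts.
        let k: __mmask16 = _cvtu32_mask16(0b1010);
        assert_eq!(_cvtmask16_u32(k), 0b1010);
    }
}

fn main() {
    #[cfg(target_arch = "x86_64")]
    lane0_and_mask_demo();
}
```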
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kand_mask16&expand=3212) @@ -27623,6 +27424,83 @@ pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 { _mm512_knot(_mm512_kxor(a, b)) } +/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask16_u8) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortest_mask16_u8(a: __mmask16, b: __mmask16, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask16(a, b); + *all_ones = (tmp == 0xffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 16-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask16_u8) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { + (_kor_mask16(a, b) == 0xffff) as u8 +} + +/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask16_u8) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { + (_kor_mask16(a, b) == 0) as u8 +} + +/// Shift 16-bit mask a left by count bits while shifting in zeros, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kshiftli_mask16(a: __mmask16) -> __mmask16 { + a << COUNT +} + +/// Shift 16-bit mask a right by count bits while shifting in zeros, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kshiftri_mask16(a: __mmask16) -> __mmask16 { + a >> COUNT +} + +/// Load 16-bit mask from memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _load_mask16(mem_addr: *const __mmask16) -> __mmask16 { + *mem_addr +} + +/// Store 16-bit mask to memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _store_mask16(mem_addr: *mut __mmask16, a: __mmask16) { + *mem_addr = a; +} + /// Copy 16-bit mask a to k. 
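The new `_kortest*` helpers mirror the KORTESTW flag semantics: the ZF-style result tracks whether the OR of the two masks is all zeros, the CF-style result whether it is all ones (`_kortest_mask16_u8` returns the former and writes the latter through `all_ones`). A scalar reference model using ordinary `u16` values, not the intrinsics themselves:

```rust
// Returns (all_zeros, all_ones) for the OR of two 16-bit masks.
fn kortest(a: u16, b: u16) -> (u8, u8) {
    let or = a | b;
    ((or == 0) as u8, (or == 0xffff) as u8)
}

fn main() {
    assert_eq!(kortest(0, 0), (1, 0));           // OR is all zeros
    assert_eq!(kortest(0xff00, 0x00ff), (0, 1)); // OR is all ones
    assert_eq!(kortest(0x0001, 0), (0, 0));      // neither
}
```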
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_kmov&expand=3228) @@ -27674,12 +27552,20 @@ pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(cmp))] // generate normal and code instead of kortestw pub unsafe fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { - let r = a | b; - if r == 0b11111111_11111111 { - 1 - } else { - 0 - } + let r = (a | b) == 0b11111111_11111111; + r as i32 +} + +/// Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=512_kortestz) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kortestw +pub unsafe fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { + let r = (a | b) == 0; + r as i32 } /// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. @@ -29208,7 +29094,7 @@ pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu32_mask&expand=1057) #[inline] @@ -29216,7 +29102,7 @@ pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmplt_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29230,7 +29116,7 @@ pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_u32x8(), b.as_u32x8())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu32_mask&expand=1055) #[inline] @@ -29238,7 +29124,7 @@ pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29252,7 +29138,7 @@ pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_u32x4(), b.as_u32x4())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu32_mask&expand=1053) #[inline] @@ -29260,7 +29146,7 @@ pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29282,7 +29168,7 @@ pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpgt_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29304,7 +29190,7 @@ pub unsafe fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29326,7 +29212,7 @@ pub unsafe fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
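All of the `_mm*_mask_cmp*` rewrites in this region preserve the same observable behaviour: a masked compare produces a 1 bit only in lanes where both the `k1` bit is set and the predicate holds, which is exactly the old `cmp(a, b) & k1`; delegating to `_mm*_mask_cmp_*_mask::<_MM_CMPINT_*>` just lets the masked `vpcmp` form be emitted directly. A scalar sketch of that contract for the 16-lane unsigned less-than case (hypothetical helper, not the intrinsic):

```rust
// Per-lane model: result bit i = k1 bit i AND (a[i] < b[i]).
fn mask_cmplt_epu32(k1: u16, a: [u32; 16], b: [u32; 16]) -> u16 {
    let mut k = 0u16;
    for i in 0..16 {
        if (k1 >> i) & 1 == 1 && a[i] < b[i] {
            k |= 1 << i;
        }
    }
    k
}

fn main() {
    let a = [0u32; 16];
    let mut b = [0u32; 16];
    b[0] = 1;
    b[5] = 7;
    // Lanes 0 and 5 satisfy a < b, but only lane 0 survives the write mask.
    assert_eq!(mask_cmplt_epu32(0b0000_0000_0000_0001, a, b), 0b1);
}
```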
@@ -29340,7 +29226,7 @@ pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_le(a.as_u32x16(), b.as_u32x16())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu32_mask&expand=996) #[inline] @@ -29348,7 +29234,7 @@ pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmple_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29362,7 +29248,7 @@ pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_le(a.as_u32x8(), b.as_u32x8())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu32_mask&expand=994) #[inline] @@ -29370,7 +29256,7 @@ pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29384,7 +29270,7 @@ pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_le(a.as_u32x4(), b.as_u32x4())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu32_mask&expand=992) #[inline] @@ -29392,7 +29278,7 @@ pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29414,7 +29300,7 @@ pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpge_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29436,7 +29322,7 @@ pub unsafe fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29458,7 +29344,7 @@ pub unsafe fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -29480,7 +29366,7 @@ pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpeq_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -29502,7 +29388,7 @@ pub unsafe fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. 
@@ -29524,7 +29410,7 @@ pub unsafe fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -29546,7 +29432,7 @@ pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpneq_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -29568,7 +29454,7 @@ pub unsafe fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -29590,7 +29476,7 @@ pub unsafe fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29606,11 +29492,19 @@ pub unsafe fn _mm512_cmp_epu32_mask( b: __m512i, ) -> __mmask16 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let r = vpcmpud(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29627,10 +29521,20 @@ pub unsafe fn _mm512_mask_cmp_epu32_mask( b: __m512i, ) -> __mmask16 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let r = vpcmpud(a, b, IMM3, k1 as i16); - transmute(r) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29646,11 +29550,19 @@ pub unsafe fn _mm256_cmp_epu32_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r = vpcmpud256(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u32x8(); + let b = b.as_u32x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29667,10 +29579,20 @@ pub unsafe fn _mm256_mask_cmp_epu32_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r = vpcmpud256(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u32x8(); + let b = b.as_u32x8(); + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29683,11 +29605,19 @@ pub unsafe fn _mm256_mask_cmp_epu32_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r = vpcmpud128(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
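The new `match IMM3` bodies above implement the eight encodings of the 3-bit `_MM_CMPINT_*` operand directly with portable lane-wise comparisons (0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT, 6 = NLE, 7 = TRUE) instead of calling the `vpcmpud` LLVM intrinsics. A scalar reference model of that encoding for unsigned 32-bit lanes, purely for illustration:

```rust
// Scalar model of the 3-bit comparison predicate used by the IMM3 match.
// 0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT (>=), 6 = NLE (>), 7 = TRUE.
fn cmp_lane(imm3: u8, a: u32, b: u32) -> bool {
    match imm3 {
        0 => a == b,
        1 => a < b,
        2 => a <= b,
        3 => false,
        4 => a != b,
        5 => a >= b,
        6 => a > b,
        _ => true,
    }
}

// One mask bit per lane, analogous to simd_bitmask over the lane-wise result.
fn cmp_epu32_mask(imm3: u8, a: [u32; 16], b: [u32; 16]) -> u16 {
    let mut k = 0u16;
    for i in 0..16 {
        if cmp_lane(imm3, a[i], b[i]) {
            k |= 1 << i;
        }
    }
    k
}

fn main() {
    let a = [1u32; 16];
    let b = [2u32; 16];
    assert_eq!(cmp_epu32_mask(1, a, b), 0xffff); // LT holds in every lane
    assert_eq!(cmp_epu32_mask(3, a, b), 0x0000); // FALSE never holds
    assert_eq!(cmp_epu32_mask(7, a, b), 0xffff); // TRUE always holds
}
```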
@@ -29704,10 +29634,20 @@ pub unsafe fn _mm_mask_cmp_epu32_mask( b: __m128i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r = vpcmpud128(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29721,7 +29661,7 @@ pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) } -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi32_mask&expand=1031) #[inline] @@ -29729,7 +29669,7 @@ pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmplt_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29743,7 +29683,7 @@ pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_i32x8(), b.as_i32x8())) } -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi32_mask&expand=1028) #[inline] @@ -29751,7 +29691,7 @@ pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. 
@@ -29765,7 +29705,7 @@ pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_i32x4(), b.as_i32x4())) } -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi32_mask&expand=1026) #[inline] @@ -29773,7 +29713,7 @@ pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29795,7 +29735,7 @@ pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpgt_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29817,7 +29757,7 @@ pub unsafe fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29839,7 +29779,7 @@ pub unsafe fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29853,7 +29793,7 @@ pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_le(a.as_i32x16(), b.as_i32x16())) } -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi32_mask&expand=972) #[inline] @@ -29861,7 +29801,7 @@ pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmple_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29875,7 +29815,7 @@ pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_le(a.as_i32x8(), b.as_i32x8())) } -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi32_mask&expand=970) #[inline] @@ -29883,7 +29823,7 @@ pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29897,7 +29837,7 @@ pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_le(a.as_i32x4(), b.as_i32x4())) } -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi32_mask&expand=968) #[inline] @@ -29905,7 +29845,7 @@ pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
@@ -29927,7 +29867,7 @@ pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpge_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29949,7 +29889,7 @@ pub unsafe fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29971,7 +29911,7 @@ pub unsafe fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -29993,7 +29933,7 @@ pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpeq_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -30015,7 +29955,7 @@ pub unsafe fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -30037,7 +29977,7 @@ pub unsafe fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -30059,7 +29999,7 @@ pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpneq_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30081,7 +30021,7 @@ pub unsafe fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30103,7 +30043,7 @@ pub unsafe fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30119,11 +30059,19 @@ pub unsafe fn _mm512_cmp_epi32_mask( b: __m512i, ) -> __mmask16 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i32x16(); let b = b.as_i32x16(); - let r = vpcmpd(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30142,8 +30090,18 @@ pub unsafe fn _mm512_mask_cmp_epi32_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i32x16(); let b = b.as_i32x16(); - let r = vpcmpd(a, b, IMM3, k1 as i16); - transmute(r) + let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
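The signed `_mm*_cmp_epi32_mask` bodies are structurally identical to the unsigned ones; the only difference is that the operands stay as `i32` lanes, which matters because the same bit pattern orders differently under signed and unsigned comparison:

```rust
fn main() {
    let x: i32 = -1; // bit pattern 0xFFFF_FFFF
    let y: i32 = 1;
    assert!(x < y);                   // signed order, as in the epi32 family
    assert!((x as u32) > (y as u32)); // unsigned order, as in the epu32 family
}
```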
@@ -30159,11 +30117,19 @@ pub unsafe fn _mm256_cmp_epi32_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i32x8(); let b = b.as_i32x8(); - let r = vpcmpd256(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30182,8 +30148,18 @@ pub unsafe fn _mm256_mask_cmp_epi32_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i32x8(); let b = b.as_i32x8(); - let r = vpcmpd256(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30196,11 +30172,19 @@ pub unsafe fn _mm256_mask_cmp_epi32_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i32x4(); let b = b.as_i32x4(); - let r = vpcmpd128(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30219,8 +30203,18 @@ pub unsafe fn _mm_mask_cmp_epi32_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i32x4(); let b = b.as_i32x4(); - let r = vpcmpd128(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30242,7 +30236,7 @@ pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmplt_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. 
@@ -30264,7 +30258,7 @@ pub unsafe fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30286,7 +30280,7 @@ pub unsafe fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30308,7 +30302,7 @@ pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpgt_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30330,7 +30324,7 @@ pub unsafe fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30352,7 +30346,7 @@ pub unsafe fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30374,7 +30368,7 @@ pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmple_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
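Each named unsigned comparison is now a thin wrapper over the generic compare, so a masked greater-than is a single vpcmpuq rather than a full compare followed by a scalar AND with k1. A hedged usage sketch, assuming a nightly toolchain with #![feature(stdarch_x86_avx512)] and an AVX-512F target, showing that the two spellings agree (NLE, "not less-or-equal", is greater-than):

use core::arch::x86_64::*;

// Sketch: the named wrapper and the generic predicate produce the same mask.
#[target_feature(enable = "avx512f")]
unsafe fn gt_two_ways(k: __mmask8, a: __m512i, b: __m512i) -> (__mmask8, __mmask8) {
    let named = _mm512_mask_cmpgt_epu64_mask(k, a, b);
    let generic = _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k, a, b);
    (named, generic) // equal for every k, a, b
}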
@@ -30396,7 +30390,7 @@ pub unsafe fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30418,7 +30412,7 @@ pub unsafe fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30440,7 +30434,7 @@ pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpge_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30462,7 +30456,7 @@ pub unsafe fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30484,7 +30478,7 @@ pub unsafe fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -30506,7 +30500,7 @@ pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpeq_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. 
@@ -30528,7 +30522,7 @@ pub unsafe fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -30550,7 +30544,7 @@ pub unsafe fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30572,7 +30566,7 @@ pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpneq_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30594,7 +30588,7 @@ pub unsafe fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30616,7 +30610,7 @@ pub unsafe fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30632,11 +30626,19 @@ pub unsafe fn _mm512_cmp_epu64_mask( b: __m512i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r = vpcmpuq(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
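Two of the eight predicates ignore the operands entirely, which is why their arms splat a constant instead of comparing: _MM_CMPINT_FALSE (3) always produces an empty mask and _MM_CMPINT_TRUE (7) a full one. For the unmasked 512-bit unsigned form, under the same nightly/feature assumptions as the sketch above:

use core::arch::x86_64::*;

// Sketch: data-independent predicates of _mm512_cmp_epu64_mask.
#[target_feature(enable = "avx512f")]
unsafe fn degenerate_predicates(a: __m512i, b: __m512i) -> (__mmask8, __mmask8) {
    let none = _mm512_cmp_epu64_mask::<_MM_CMPINT_FALSE>(a, b); // always 0
    let all = _mm512_cmp_epu64_mask::<_MM_CMPINT_TRUE>(a, b); // always 0xff
    (none, all)
}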
@@ -30653,10 +30655,20 @@ pub unsafe fn _mm512_mask_cmp_epu64_mask( b: __m512i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r = vpcmpuq(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30672,11 +30684,19 @@ pub unsafe fn _mm256_cmp_epu64_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r = vpcmpuq256(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30693,10 +30713,20 @@ pub unsafe fn _mm256_mask_cmp_epu64_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r = vpcmpuq256(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30709,11 +30739,19 @@ pub unsafe fn _mm256_mask_cmp_epu64_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let r = vpcmpuq128(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
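Note the switch from as_i64x8() to as_u64x8() throughout the unsigned family: simd_lt, simd_le, simd_gt and simd_ge take their ordering from the element type, so reinterpreting the lanes as u64 is what makes these compares unsigned, while the epi64 versions below stay on i64. A one-lane illustration of why the reinterpretation matters:

// Sketch: the same bit pattern orders differently under signed and unsigned comparison.
fn signedness_matters() {
    let a: i64 = -1; // bit pattern 0xFFFF_FFFF_FFFF_FFFF
    let b: i64 = 1;
    assert!(a < b); // signed: -1 < 1
    assert!((a as u64) > (b as u64)); // unsigned: u64::MAX > 1
}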
@@ -30730,10 +30768,20 @@ pub unsafe fn _mm_mask_cmp_epu64_mask( b: __m128i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let r = vpcmpuq128(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30755,7 +30803,7 @@ pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmplt_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30777,7 +30825,7 @@ pub unsafe fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30799,7 +30847,7 @@ pub unsafe fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30821,7 +30869,7 @@ pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpgt_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30843,7 +30891,7 @@ pub unsafe fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. 
@@ -30865,7 +30913,7 @@ pub unsafe fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30887,7 +30935,7 @@ pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmple_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30909,7 +30957,7 @@ pub unsafe fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30931,7 +30979,7 @@ pub unsafe fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30953,7 +31001,7 @@ pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpge_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30975,7 +31023,7 @@ pub unsafe fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
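The returned __mmask8 maps bit i to lane i; for vectors narrower than eight lanes the upper bits stay zero. A hedged sketch (nightly, stdarch_x86_avx512, and avx512f plus avx512vl for the 128-bit form) with values chosen so only lane 0 passes:

use core::arch::x86_64::*;

// Sketch: bit 0 of the mask corresponds to lane 0 of the 128-bit operands.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn gt_lane_bits() -> __mmask8 {
    let a = _mm_set_epi64x(0, 5); // lane 1 = 0, lane 0 = 5
    let b = _mm_set_epi64x(7, 2); // lane 1 = 7, lane 0 = 2
    _mm_cmpgt_epi64_mask(a, b) // only lane 0 has a > b, so this is 0b01
}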
@@ -30997,7 +31045,7 @@ pub unsafe fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -31019,7 +31067,7 @@ pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpeq_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -31041,7 +31089,7 @@ pub unsafe fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -31063,7 +31111,7 @@ pub unsafe fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -31085,7 +31133,7 @@ pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpneq_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -31107,7 +31155,7 @@ pub unsafe fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -31129,7 +31177,7 @@ pub unsafe fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -31145,11 +31193,19 @@ pub unsafe fn _mm512_cmp_epi64_mask( b: __m512i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i64x8(); let b = b.as_i64x8(); - let r = vpcmpq(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31168,8 +31224,18 @@ pub unsafe fn _mm512_mask_cmp_epi64_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i64x8(); let b = b.as_i64x8(); - let r = vpcmpq(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -31185,11 +31251,19 @@ pub unsafe fn _mm256_cmp_epi64_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i64x4(); let b = b.as_i64x4(); - let r = vpcmpq256(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31208,8 +31282,18 @@ pub unsafe fn _mm256_mask_cmp_epi64_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i64x4(); let b = b.as_i64x4(); - let r = vpcmpq256(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
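IMM3 is a const generic, and static_assert_uimm_bits!(IMM3, 3) rejects any value wider than three bits when the call is instantiated, so the predicate has to be a compile-time constant; the _MM_CMPINT_* names are the intended arguments. A usage sketch under the same nightly/feature assumptions as above:

use core::arch::x86_64::*;

// Sketch: the predicate is a const generic, checked when the call is compiled.
#[target_feature(enable = "avx512f")]
unsafe fn le_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // _mm512_cmp_epi64_mask::<9>(a, b) would be rejected: 9 does not fit in 3 bits.
    _mm512_cmp_epi64_mask::<_MM_CMPINT_LE>(a, b)
}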
@@ -31222,11 +31306,19 @@ pub unsafe fn _mm256_mask_cmp_epi64_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i64x2(); let b = b.as_i64x2(); - let r = vpcmpq128(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31245,8 +31337,18 @@ pub unsafe fn _mm_mask_cmp_epi64_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i64x2(); let b = b.as_i64x2(); - let r = vpcmpq128(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a. @@ -35212,13 +35314,7 @@ pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss))] pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35229,13 +35325,7 @@ pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss))] pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtss(a, b, _mm_setzero_ps(), k, _MM_FROUND_CUR_DIRECTION) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
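The scalar sqrt changes above replace the hard-coded _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC with _MM_FROUND_CUR_DIRECTION and only ever touch lane 0: merge-masking takes that lane from src when bit 0 of k is clear, zero-masking writes 0.0 instead, and the upper lanes always come from a. The fused multiply-add helpers further down follow the same single-lane pattern with an explicit "if (k & 0b00000001) != 0" test. A plain-f32 model of the two masking modes (a sketch of the semantics, not the intrinsics; src0 and b0 stand in for lane 0 of the vectors):

// Sketch: lane-0 masking shared by _mm_mask_sqrt_ss and _mm_maskz_sqrt_ss.
fn mask_sqrt_lane0(src0: f32, k: u8, b0: f32) -> f32 {
    if k & 1 != 0 { b0.sqrt() } else { src0 } // merge: keep lane 0 of src
}

fn maskz_sqrt_lane0(k: u8, b0: f32) -> f32 {
    if k & 1 != 0 { b0.sqrt() } else { 0.0 } // zero: clear lane 0
}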
@@ -35246,13 +35336,7 @@ pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd))] pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35263,13 +35347,7 @@ pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd))] pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtsd(a, b, _mm_setzero_pd(), k, _MM_FROUND_CUR_DIRECTION) } /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. @@ -36014,13 +36092,13 @@ pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf32(fmadd, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36031,14 +36109,14 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fmadd: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36049,13 +36127,13 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); - fmadd = 
vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf32(extracta, extractb, fmadd); } simd_insert!(c, 0, fmadd) } @@ -36066,13 +36144,13 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf64(fmadd, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36083,14 +36161,14 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fmadd: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36101,13 +36179,13 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); - fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf64(extracta, extractb, fmadd); } simd_insert!(c, 0, fmadd) } @@ -36118,14 +36196,14 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf32(fmsub, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36136,7 +36214,7 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fmsub: f32 = 0.; if (k & 0b00000001) != 0 { @@ -36144,7 +36222,7 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - let 
extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36155,14 +36233,14 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(c, 0, fmsub) } @@ -36173,14 +36251,14 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf64(fmsub, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36191,7 +36269,7 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fmsub: f64 = 0.; if (k & 0b00000001) != 0 { @@ -36199,7 +36277,7 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36210,14 +36288,14 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(c, 0, fmsub) } @@ -36228,14 +36306,14 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] +#[cfg_attr(test, 
assert_instr(vfnmadd))] pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fnmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmadd) } @@ -36246,7 +36324,7 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fnmadd: f32 = 0.; if (k & 0b00000001) != 0 { @@ -36254,7 +36332,7 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmadd) } @@ -36265,14 +36343,14 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fnmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); - fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf32(extracta, extractb, fnmadd); } simd_insert!(c, 0, fnmadd) } @@ -36283,14 +36361,14 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fnmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fnmadd) } @@ -36301,7 +36379,7 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fnmadd: f64 = 0.; if (k & 0b00000001) != 0 { @@ -36309,7 +36387,7 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, 
fnmadd) } @@ -36320,14 +36398,14 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fnmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); - fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf64(extracta, extractb, fnmadd); } simd_insert!(c, 0, fnmadd) } @@ -36338,7 +36416,7 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fnmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { @@ -36346,7 +36424,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36357,7 +36435,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fnmsub: f32 = 0.; if (k & 0b00000001) != 0 { @@ -36366,7 +36444,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36377,7 +36455,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fnmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { @@ -36385,7 +36463,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(c, 0, fnmsub) } @@ -36396,7 +36474,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn 
_mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fnmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { @@ -36404,7 +36482,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36415,7 +36493,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fnmsub: f64 = 0.; if (k & 0b00000001) != 0 { @@ -36424,7 +36502,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36435,7 +36513,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fnmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { @@ -36443,7 +36521,7 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(c, 0, fnmsub) } @@ -37357,11 +37435,7 @@ pub unsafe fn _mm_maskz_min_round_sd( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, 0b1, ROUNDING); - transmute(r) + vsqrtss(a, b, _mm_setzero_ps(), 0b1, ROUNDING) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37386,11 +37460,7 @@ pub unsafe fn _mm_mask_sqrt_round_ss( b: __m128, ) -> __m128 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vsqrtss(a, b, src, k, ROUNDING); - transmute(r) + vsqrtss(a, b, src, k, ROUNDING) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37414,11 +37484,7 @@ pub unsafe fn _mm_maskz_sqrt_round_ss( b: __m128, ) -> __m128 { 
static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, k, ROUNDING); - transmute(r) + vsqrtss(a, b, _mm_setzero_ps(), k, ROUNDING) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -37438,11 +37504,7 @@ pub unsafe fn _mm_maskz_sqrt_round_ss( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); - transmute(r) + vsqrtsd(a, b, _mm_setzero_pd(), 0b1, ROUNDING) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37467,11 +37529,7 @@ pub unsafe fn _mm_mask_sqrt_round_sd( b: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vsqrtsd(a, b, src, k, ROUNDING); - transmute(r) + vsqrtsd(a, b, src, k, ROUNDING) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37495,11 +37553,7 @@ pub unsafe fn _mm_maskz_sqrt_round_sd( b: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, k, ROUNDING); - transmute(r) + vsqrtsd(a, b, _mm_setzero_pd(), k, ROUNDING) } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -38198,14 +38252,14 @@ pub unsafe fn _mm_maskz_scalef_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let r = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, r) } @@ -38222,7 +38276,7 @@ pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmadd_round_ss( a: __m128, @@ -38235,7 +38289,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss( if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING); + fmadd = vfmaddssround(fmadd, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38253,7 +38307,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmadd_round_ss( k: __mmask8, @@ -38267,7 +38321,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38285,7 +38339,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmadd_round_ss( a: __m128, @@ -38298,7 +38352,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); - fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING); + fmadd = vfmaddssround(extracta, extractb, fmadd, ROUNDING); } simd_insert!(c, 0, fmadd) } @@ -38316,7 +38370,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmadd_round_sd( a: __m128d, @@ -38327,7 +38381,7 @@ pub unsafe fn _mm_fmadd_round_sd( let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let 
fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fmadd) } @@ -38344,7 +38398,7 @@ pub unsafe fn _mm_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmadd_round_sd( a: __m128d, @@ -38357,7 +38411,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd( if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING); + fmadd = vfmaddsdround(fmadd, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38375,7 +38429,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmadd_round_sd( k: __mmask8, @@ -38389,7 +38443,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38407,7 +38461,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmadd_round_sd( a: __m128d, @@ -38420,7 +38474,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); - fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING); + fmadd = vfmaddsdround(extracta, extractb, fmadd, ROUNDING); } simd_insert!(c, 0, fmadd) } @@ -38438,7 +38492,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); @@ -38446,7 +38500,7 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: _ let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fmsub) } @@ -38463,7 +38517,7 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmsub_round_ss( a: __m128, @@ -38477,7 +38531,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = 
-extractc; - fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING); + fmsub = vfmaddssround(fmsub, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38495,7 +38549,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmsub_round_ss( k: __mmask8, @@ -38510,7 +38564,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38528,7 +38582,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmsub_round_ss( a: __m128, @@ -38542,7 +38596,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fmsub) } @@ -38560,7 +38614,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmsub_round_sd( a: __m128d, @@ -38572,7 +38626,7 @@ pub unsafe fn _mm_fmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fmsub) } @@ -38589,7 +38643,7 @@ pub unsafe fn _mm_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmsub_round_sd( a: __m128d, @@ -38603,7 +38657,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING); + fmsub = vfmaddsdround(fmsub, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38621,7 +38675,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmsub_round_sd( k: __mmask8, @@ -38636,7 +38690,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(extracta, extractb, 
extractc, ROUNDING); + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38654,7 +38708,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmsub_round_sd( a: __m128d, @@ -38668,7 +38722,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fmsub) } @@ -38686,7 +38740,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); @@ -38694,7 +38748,7 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmadd) } @@ -38711,7 +38765,7 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmadd_round_ss( a: __m128, @@ -38725,7 +38779,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( let extracta = -fnmadd; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38743,7 +38797,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmadd_round_ss( k: __mmask8, @@ -38758,7 +38812,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38776,7 +38830,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmadd_round_ss( a: __m128, @@ -38790,7 +38844,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( let extracta: f32 = simd_extract!(a, 0); let extracta = 
-extracta; let extractb: f32 = simd_extract!(b, 0); - fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING); + fnmadd = vfmaddssround(extracta, extractb, fnmadd, ROUNDING); } simd_insert!(c, 0, fnmadd) } @@ -38808,7 +38862,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmadd_round_sd( a: __m128d, @@ -38820,7 +38874,7 @@ pub unsafe fn _mm_fnmadd_round_sd( let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmadd) } @@ -38837,7 +38891,7 @@ pub unsafe fn _mm_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmadd_round_sd( a: __m128d, @@ -38851,7 +38905,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( let extracta = -fnmadd; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38869,7 +38923,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmadd_round_sd( k: __mmask8, @@ -38884,7 +38938,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38902,7 +38956,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmadd_round_sd( a: __m128d, @@ -38916,7 +38970,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); - fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING); + fnmadd = vfmaddsdround(extracta, extractb, fnmadd, ROUNDING); } simd_insert!(c, 0, fnmadd) } @@ -38934,7 +38988,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); @@ -38943,7 +38997,7 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: 
__m128, c: let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmsub) } @@ -38960,7 +39014,7 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmsub_round_ss( a: __m128, @@ -38975,7 +39029,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -38993,7 +39047,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmsub_round_ss( k: __mmask8, @@ -39009,7 +39063,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -39027,7 +39081,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmsub_round_ss( a: __m128, @@ -39042,7 +39096,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fnmsub) } @@ -39060,7 +39114,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmsub_round_sd( a: __m128d, @@ -39073,7 +39127,7 @@ pub unsafe fn _mm_fnmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmsub) } @@ -39090,7 +39144,7 @@ pub unsafe fn _mm_fnmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmsub_round_sd( a: __m128d, @@ -39105,7 +39159,7 @@ pub unsafe fn 
_mm_mask_fnmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -39123,7 +39177,7 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmsub_round_sd( k: __mmask8, @@ -39139,7 +39193,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -39157,7 +39211,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmsub_round_sd( a: __m128d, @@ -39172,7 +39226,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd( let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fnmsub) } @@ -39512,7 +39566,7 @@ pub unsafe fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) b.as_f64x2(), src.as_f32x4(), k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + _MM_FROUND_CUR_DIRECTION, )) } @@ -39529,7 +39583,7 @@ pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { b.as_f64x2(), _mm_setzero_ps().as_f32x4(), k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + _MM_FROUND_CUR_DIRECTION, )) } @@ -40682,70 +40736,20 @@ pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF; #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.avx512.pmul.dq.512"] - fn vpmuldq(a: i32x16, b: i32x16) -> i64x8; - #[link_name = "llvm.x86.avx512.pmulu.dq.512"] - fn vpmuludq(a: u32x16, b: u32x16) -> u64x8; - - #[link_name = "llvm.x86.avx512.mask.pmaxs.d.512"] - fn vpmaxsd(a: i32x16, b: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.mask.pmaxs.q.512"] - fn vpmaxsq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.pmaxs.q.256"] - fn vpmaxsq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.pmaxs.q.128"] - fn vpmaxsq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.pmins.d.512"] - fn vpminsd(a: i32x16, b: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.mask.pmins.q.512"] - fn vpminsq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.pmins.q.256"] - fn vpminsq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.pmins.q.128"] - fn vpminsq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.pmaxu.d.512"] - fn vpmaxud(a: u32x16, b: u32x16) -> u32x16; - - #[link_name = "llvm.x86.avx512.mask.pmaxu.q.512"] - fn vpmaxuq(a: u64x8, b: u64x8) -> u64x8; - #[link_name = "llvm.x86.avx512.mask.pmaxu.q.256"] - fn vpmaxuq256(a: u64x4, b: u64x4) -> u64x4; - #[link_name = 
"llvm.x86.avx512.mask.pmaxu.q.128"] - fn vpmaxuq128(a: u64x2, b: u64x2) -> u64x2; - - #[link_name = "llvm.x86.avx512.mask.pminu.d.512"] - fn vpminud(a: u32x16, b: u32x16) -> u32x16; - - #[link_name = "llvm.x86.avx512.mask.pminu.q.512"] - fn vpminuq(a: u64x8, b: u64x8) -> u64x8; - #[link_name = "llvm.x86.avx512.mask.pminu.q.256"] - fn vpminuq256(a: u64x4, b: u64x4) -> u64x4; - #[link_name = "llvm.x86.avx512.mask.pminu.q.128"] - fn vpminuq128(a: u64x2, b: u64x2) -> u64x2; - #[link_name = "llvm.x86.avx512.sqrt.ps.512"] fn vsqrtps(a: f32x16, rounding: i32) -> f32x16; #[link_name = "llvm.x86.avx512.sqrt.pd.512"] fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8; - #[link_name = "llvm.fma.v16f32"] - fn vfmadd132ps(a: f32x16, b: f32x16, c: f32x16) -> f32x16; - #[link_name = "llvm.fma.v8f64"] - fn vfmadd132pd(a: f64x8, b: f64x8, c: f64x8) -> f64x8; - #[link_name = "llvm.x86.avx512.vfmadd.ps.512"] - fn vfmadd132psround(a: f32x16, b: f32x16, c: f32x16, rounding: i32) -> f32x16; + fn vfmadd132psround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; #[link_name = "llvm.x86.avx512.vfmadd.pd.512"] - fn vfmadd132pdround(a: f64x8, b: f64x8, c: f64x8, rounding: i32) -> f64x8; + fn vfmadd132pdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"] - fn vfmaddsub213ps(a: f32x16, b: f32x16, c: f32x16, d: i32) -> f32x16; //from clang + fn vfmaddsubpsround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; //from clang #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"] - fn vfmaddsub213pd(a: f64x8, b: f64x8, c: f64x8, d: i32) -> f64x8; //from clang + fn vfmaddsubpdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; //from clang #[link_name = "llvm.x86.avx512.add.ps.512"] fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; @@ -41220,34 +41224,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"] fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] - fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.q.256"] - fn vpcmpuq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.q.128"] - fn vpcmpuq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] - fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.q.256"] - fn vpcmpq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.q.128"] - fn vpcmpq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.ucmp.d.512"] - fn vpcmpud(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; - #[link_name = "llvm.x86.avx512.mask.ucmp.d.256"] - fn vpcmpud256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.d.128"] - fn vpcmpud128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.cmp.d.512"] - fn vpcmpd(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; - #[link_name = "llvm.x86.avx512.mask.cmp.d.256"] - fn vpcmpd256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.d.128"] - fn vpcmpd128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.prol.d.512"] fn vprold(a: i32x16, i8: i32) -> i32x16; #[link_name = "llvm.x86.avx512.mask.prol.d.256"] @@ -41500,9 +41476,9 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.min.sd.round"] fn vminsd(a: f64x2, b: f64x2, src: f64x2, 
mask: u8, sae: i32) -> f64x2; #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] - fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + fn vsqrtss(a: __m128, b: __m128, src: __m128, mask: u8, rounding: i32) -> __m128; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] - fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + fn vsqrtsd(a: __m128d, b: __m128d, src: __m128d, mask: u8, rounding: i32) -> __m128d; #[link_name = "llvm.x86.avx512.mask.getexp.ss"] fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.getexp.sd"] @@ -41531,9 +41507,9 @@ extern "C" { fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; #[link_name = "llvm.x86.avx512.vfmadd.f32"] - fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32; + fn vfmaddssround(a: f32, b: f32, c: f32, rounding: i32) -> f32; #[link_name = "llvm.x86.avx512.vfmadd.f64"] - fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64; + fn vfmaddsdround(a: f64, b: f64, c: f64, rounding: i32) -> f64; #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"] fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; @@ -44152,6 +44128,14 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_rsqrt14_ps(a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rsqrt14_ps() { let a = _mm256_set1_ps(3.); @@ -44172,6 +44156,14 @@ mod tests { assert_eq_m256(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_rsqrt14_ps(a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rsqrt14_ps() { let a = _mm_set1_ps(3.); @@ -54192,6 +54184,22 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_cvtmask16_u32() { + let a: __mmask16 = 0b11001100_00110011; + let r = _cvtmask16_u32(a); + let e: u32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_cvtu32_mask16() { + let a: u32 = 0b11001100_00110011; + let r = _cvtu32_mask16(a); + let e: __mmask16 = 0b11001100_00110011; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_kand() { let a: u16 = 0b11001100_00110011; @@ -54298,6 +54306,65 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortest_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask16_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestc_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = _kortestc_mask16_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestz_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = _kortestz_mask16_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftli_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _kshiftli_mask16::<3>(a); + let e: __mmask16 = 0b1011011000011000; + assert_eq!(r, e); + } + + 
#[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftri_mask16() { + let a: __mmask16 = 0b0110100100111100; + let r = _kshiftri_mask16::<3>(a); + let e: __mmask16 = 0b0000110100100111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_load_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _load_mask16(&a); + let e: __mmask16 = 0b1001011011000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_store_mask16() { + let a: __mmask16 = 0b0110100100111100; + let mut r = 0; + _store_mask16(&mut r, a); + let e: __mmask16 = 0b0110100100111100; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_kmov() { let a: u16 = 0b11001100_00110011; @@ -54342,6 +54409,16 @@ mod tests { assert_eq!(r, 1); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kortestz() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kortestz(a, b); + assert_eq!(r, 0); + let r = _mm512_kortestz(0, 0); + assert_eq!(r, 1); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_test_epi32_mask() { let a = _mm512_set1_epi32(1 << 0); @@ -58410,6 +58487,20 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtss_f32() { + let a = _mm512_setr_ps( + 312.0134, 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq!(_mm512_cvtss_f32(a), 312.0134); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsd_f64() { + let r = _mm512_cvtsd_f64(_mm512_setr_pd(-1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8)); + assert_eq!(r, -1.1); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_pd() { let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 359a668582..a2b2496caf 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -574,6 +574,46 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_abs_epi64(a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_abs_epi64(a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_mask_abs_epi64(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_mask_abs_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_maskz_abs_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi64(0b00000011, a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_maskz_abs_epi64(0b00000011, a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_abs_pd() { let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); @@ -1673,6 +1713,42 @@ mod 
tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_min_epi64(a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(1, 0); + let r = _mm_min_epi64(a, b); + let e = _mm_set_epi64x(1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_mask_min_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_maskz_min_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_min_pd() { let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); @@ -2745,6 +2821,14 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rsqrt14_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_rsqrt14_pd(a); + let e = _mm256_set1_pd(0.5773391723632813); + assert_eq_m256d(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rsqrt14_pd() { let a = _mm256_set1_pd(3.); @@ -2765,6 +2849,14 @@ mod tests { assert_eq_m256d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rsqrt14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_rsqrt14_pd(a); + let e = _mm_set1_pd(0.5773391723632813); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rsqrt14_pd() { let a = _mm_set1_pd(3.);