diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md
index 4c70c1b435..16f6c58cbb 100644
--- a/crates/core_arch/missing-x86.md
+++ b/crates/core_arch/missing-x86.md
@@ -147,102 +147,6 @@
-["AVX512F"]
-
- * [ ] [`_mm512_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64)
- * [ ] [`_mm512_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd)
- * [ ] [`_mm512_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64)
- * [ ] [`_mm512_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd)
- * [ ] [`_mm512_mask_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64)
- * [ ] [`_mm512_mask_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
- * [ ] [`_mm512_mask_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
- * [ ] [`_mm512_mask_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd)
- * [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512)
- * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd)
- * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss)
- * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd)
- * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss)
- * [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd)
- * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss)
-
-
-
-["AVX512F", "AVX512VL"]
-
- * [ ] [`_mm256_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32)
- * [ ] [`_mm256_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd)
- * [ ] [`_mm256_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps)
- * [ ] [`_mm256_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32)
- * [ ] [`_mm256_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64)
- * [ ] [`_mm256_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd)
- * [ ] [`_mm256_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps)
- * [ ] [`_mm256_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32)
- * [ ] [`_mm256_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64)
- * [ ] [`_mm256_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd)
- * [ ] [`_mm256_mask_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps)
- * [ ] [`_mm256_mask_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32)
- * [ ] [`_mm256_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64)
- * [ ] [`_mm256_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd)
- * [ ] [`_mm256_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps)
- * [ ] [`_mm256_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32)
- * [ ] [`_mm256_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64)
- * [ ] [`_mm256_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd)
- * [ ] [`_mm256_mmask_i32gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps)
- * [ ] [`_mm256_mmask_i64gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32)
- * [ ] [`_mm256_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64)
- * [ ] [`_mm256_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd)
- * [ ] [`_mm256_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps)
- * [ ] [`_mm_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32)
- * [ ] [`_mm_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64)
- * [ ] [`_mm_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd)
- * [ ] [`_mm_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps)
- * [ ] [`_mm_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32)
- * [ ] [`_mm_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64)
- * [ ] [`_mm_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd)
- * [ ] [`_mm_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps)
- * [ ] [`_mm_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32)
- * [ ] [`_mm_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64)
- * [ ] [`_mm_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd)
- * [ ] [`_mm_mask_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps)
- * [ ] [`_mm_mask_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32)
- * [ ] [`_mm_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64)
- * [ ] [`_mm_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd)
- * [ ] [`_mm_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps)
- * [ ] [`_mm_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32)
- * [ ] [`_mm_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64)
- * [ ] [`_mm_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd)
- * [ ] [`_mm_mmask_i32gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps)
- * [ ] [`_mm_mmask_i64gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32)
- * [ ] [`_mm_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64)
- * [ ] [`_mm_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd)
- * [ ] [`_mm_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps)
-
-
-
-["AVX512_BF16", "AVX512F"]
-
- * [ ] [`_mm512_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
- * [ ] [`_mm512_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
- * [ ] [`_mm512_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
- * [ ] [`_mm_cvtsbh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
-
-
-
-["AVX512_BF16", "AVX512VL"]
-
- * [ ] [`_mm256_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
- * [ ] [`_mm256_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
- * [ ] [`_mm256_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
- * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
- * [ ] [`_mm_cvtness_sbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
- * [ ] [`_mm_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
- * [ ] [`_mm_mask_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
- * [ ] [`_mm_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
- * [ ] [`_mm_maskz_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
- * [ ] [`_mm_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
-
-
-
["AVX512_FP16"]
* [ ] [`_mm256_castpd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
@@ -1207,79 +1111,14 @@
-["AVX_IFMA"]
-
- * [ ] [`_mm256_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52hi_avx_epu64)
- * [ ] [`_mm256_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52lo_avx_epu64)
- * [ ] [`_mm_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52hi_avx_epu64)
- * [ ] [`_mm_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52lo_avx_epu64)
-
-
-
["AVX_NE_CONVERT"]
- * [ ] [`_mm256_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
* [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
- * [ ] [`_mm256_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
* [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
- * [ ] [`_mm256_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
* [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
- * [ ] [`_mm256_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
- * [ ] [`_mm_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
* [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
- * [ ] [`_mm_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
* [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
- * [ ] [`_mm_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
* [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
- * [ ] [`_mm_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
- * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
-
-
-
-["AVX_VNNI"]
-
- * [ ] [`_mm256_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_avx_epi32)
- * [ ] [`_mm256_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_avx_epi32)
- * [ ] [`_mm256_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_avx_epi32)
- * [ ] [`_mm256_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_avx_epi32)
- * [ ] [`_mm_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_avx_epi32)
- * [ ] [`_mm_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_avx_epi32)
- * [ ] [`_mm_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_avx_epi32)
- * [ ] [`_mm_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_avx_epi32)
-
-
-
-["AVX_VNNI_INT16"]
-
- * [ ] [`_mm256_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsud_epi32)
- * [ ] [`_mm256_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsuds_epi32)
- * [ ] [`_mm256_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusd_epi32)
- * [ ] [`_mm256_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusds_epi32)
- * [ ] [`_mm256_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuud_epi32)
- * [ ] [`_mm256_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuuds_epi32)
- * [ ] [`_mm_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsud_epi32)
- * [ ] [`_mm_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsuds_epi32)
- * [ ] [`_mm_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusd_epi32)
- * [ ] [`_mm_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusds_epi32)
- * [ ] [`_mm_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuud_epi32)
- * [ ] [`_mm_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuuds_epi32)
-
-
-
-["AVX_VNNI_INT8"]
-
- * [ ] [`_mm256_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssd_epi32)
- * [ ] [`_mm256_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssds_epi32)
- * [ ] [`_mm256_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsud_epi32)
- * [ ] [`_mm256_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsuds_epi32)
- * [ ] [`_mm256_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuud_epi32)
- * [ ] [`_mm256_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuuds_epi32)
- * [ ] [`_mm_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssd_epi32)
- * [ ] [`_mm_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssds_epi32)
- * [ ] [`_mm_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsud_epi32)
- * [ ] [`_mm_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsuds_epi32)
- * [ ] [`_mm_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuud_epi32)
- * [ ] [`_mm_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuuds_epi32)
diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs
index aa5a5d8c18..7726a188f2 100644
--- a/crates/core_arch/src/x86/avx.rs
+++ b/crates/core_arch/src/x86/avx.rs
@@ -1738,8 +1738,8 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
crate::arch::asm!(
- "vmovntdq [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntdq", ",{a}"),
+ p = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
@@ -1766,8 +1766,8 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
crate::arch::asm!(
- "vmovntpd [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntpd", ",{a}"),
+ p = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
@@ -1795,8 +1795,8 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
crate::arch::asm!(
- "vmovntps [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntps", ",{a}"),
+ p = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index 0343416a92..fa32c7fcc4 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -3149,9 +3149,9 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
let dst: __m256i;
crate::arch::asm!(
- "vmovntdqa {a}, [{mem_addr}]",
+ vpl!("vmovntdqa {a}"),
a = out(ymm_reg) dst,
- mem_addr = in(reg) mem_addr,
+ p = in(reg) mem_addr,
options(pure, readonly, nostack, preserves_flags),
);
dst
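
Note for reviewers: the hard-coded `[{mem_addr}]` operands in avx.rs and avx2.rs are replaced with the shared `vps!`/`vpl!` helpers whose definitions are removed from avx512f.rs further down (presumably re-exported from a common module not shown in this section). A minimal sketch of what they expand to, taken from those removed definitions:

```rust
// Sketch only (based on the macro definitions removed later in this diff).
// On 32-bit targets the `{p:e}` placeholder makes asm! print the 32-bit
// register name for the pointer operand; on 64-bit targets plain `{p}` is used.
#[cfg(target_pointer_width = "64")]
macro_rules! vps {
    ($inst1:expr, $inst2:expr) => {
        concat!($inst1, " [{p}]", $inst2)
    };
}
// vps!("vmovntdq", ",{a}")  =>  "vmovntdq [{p}],{a}"
// vpl!("vmovntdqa {a}")     =>  "vmovntdqa {a}, [{p}]"
```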
diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs
index dbd8b4a6f3..7d99809353 100644
--- a/crates/core_arch/src/x86/avx512bf16.rs
+++ b/crates/core_arch/src/x86/avx512bf16.rs
@@ -2,6 +2,7 @@
//!
//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
+use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;
@@ -365,9 +366,211 @@ pub unsafe fn _mm512_maskz_dpbf16_ps(
transmute(simd_select_bitmask(k, rst, zero))
}
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
+ _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a))))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 {
+ let cvt = _mm512_cvtpbh_ps(a);
+ transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
+/// when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
+ let cvt = _mm512_cvtpbh_ps(a);
+ let zero = _mm512_setzero_ps();
+ transmute(simd_select_bitmask(k, cvt.as_f32x16(), zero.as_f32x16()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
+ _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a))))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 {
+ let cvt = _mm256_cvtpbh_ps(a);
+ transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
+/// when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
+ let cvt = _mm256_cvtpbh_ps(a);
+ let zero = _mm256_setzero_ps();
+ transmute(simd_select_bitmask(k, cvt.as_f32x8(), zero.as_f32x8()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
+/// elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
+ _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a))))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
+/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 {
+ let cvt = _mm_cvtpbh_ps(a);
+ transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
+/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
+ let cvt = _mm_cvtpbh_ps(a);
+ let zero = _mm_setzero_ps();
+ transmute(simd_select_bitmask(k, cvt.as_f32x4(), zero.as_f32x4()))
+}
+
+/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
+/// element, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 {
+ f32::from_bits((a.to_bits() as u32) << 16)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl,sse")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "vcvtneps2bf16 {dst}, {src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
+ let mut dst = src;
+ asm!(
+ "vcvtneps2bf16 {dst}{{{k}}},{src}",
+ dst = inlateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ k = in(kreg) k,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
+/// when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ k = in(kreg) k,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point
+/// element, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm_cvtness_sbh(a: f32) -> bf16 {
+ let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0);
+ bf16::from_bits(value)
+}
+
#[cfg(test)]
mod tests {
- use crate::{core_arch::x86::*, mem::transmute};
+ use crate::core_arch::simd::u16x4;
+ use crate::{
+ core_arch::x86::*,
+ mem::{transmute, transmute_copy},
+ };
use stdarch_test::simd_test;
#[simd_test(enable = "avx512bf16,avx512vl")]
@@ -1592,4 +1795,156 @@ mod tests {
];
assert_eq!(result, expected_result);
}
+
+ const BF16_ONE: u16 = 0b0_01111111_0000000;
+ const BF16_TWO: u16 = 0b0_10000000_0000000;
+ const BF16_THREE: u16 = 0b0_10000000_1000000;
+ const BF16_FOUR: u16 = 0b0_10000001_0000000;
+ const BF16_FIVE: u16 = 0b0_10000001_0100000;
+ const BF16_SIX: u16 = 0b0_10000001_1000000;
+ const BF16_SEVEN: u16 = 0b0_10000001_1100000;
+ const BF16_EIGHT: u16 = 0b0_10000010_0000000;
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm512_cvtpbh_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm512_cvtpbh_ps(a);
+ let e = _mm512_setr_ps(
+ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm512_mask_cvtpbh_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let src = _mm512_setr_ps(
+ 9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let k = 0b1010_1010_1010_1010;
+ let r = _mm512_mask_cvtpbh_ps(src, k, a);
+ let e = _mm512_setr_ps(
+ 9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm512_maskz_cvtpbh_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let k = 0b1010_1010_1010_1010;
+ let r = _mm512_maskz_cvtpbh_ps(k, a);
+ let e = _mm512_setr_ps(
+ 0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_cvtpbh_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm256_cvtpbh_ps(a);
+ let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_mask_cvtpbh_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let k = 0b1010_1010;
+ let r = _mm256_mask_cvtpbh_ps(src, k, a);
+ let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtpbh_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let k = 0b1010_1010;
+ let r = _mm256_maskz_cvtpbh_ps(k, a);
+ let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_cvtpbh_ps() {
+ let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0);
+ let r = _mm_cvtpbh_ps(a);
+ let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_mask_cvtpbh_ps() {
+ let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0);
+ let src = _mm_setr_ps(9., 10., 11., 12.);
+ let k = 0b1010;
+ let r = _mm_mask_cvtpbh_ps(src, k, a);
+ let e = _mm_setr_ps(9., 2., 11., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_maskz_cvtpbh_ps() {
+ let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0);
+ let k = 0b1010;
+ let r = _mm_maskz_cvtpbh_ps(k, a);
+ let e = _mm_setr_ps(0., 2., 0., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm_cvtsbh_ss() {
+ let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE));
+ assert_eq!(r, 1.);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_cvtneps_pbh() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a));
+ let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_mask_cvtneps_pbh() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let src = __m128bh(5, 6, 7, 8, !0, !0, !0, !0);
+ let k = 0b1010;
+ let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a));
+ let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_maskz_cvtneps_pbh() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let k = 0b1010;
+ let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a));
+ let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_cvtness_sbh() {
+ let r = _mm_cvtness_sbh(1.);
+ assert_eq!(r.to_bits(), BF16_ONE);
+ }
}
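
The widening intrinsics above rely on the fact that a BF16 value is the upper half of an f32 bit pattern, so converting to f32 is exact; only the narrowing direction (vcvtneps2bf16) rounds. A scalar sketch of the widening, mirroring what `_mm_cvtsbh_ss` does (not part of the patch):

```rust
// Scalar sketch: shifting the 16-bit BF16 pattern into the high half of a
// u32 reproduces the original value exactly, so the widening never rounds.
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}

fn main() {
    // Same bit pattern as BF16_THREE in the tests above: 1.5 * 2^1 = 3.0.
    assert_eq!(bf16_bits_to_f32(0b0_10000000_1000000), 3.0);
}
```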
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index 1f786d01f0..dd74d11786 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -8,8 +8,6 @@ use crate::{
#[cfg(test)]
use stdarch_test::assert_instr;
-use super::avx512f::{vpl, vps};
-
/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 3fe919abc8..8a5a529b08 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -6,37 +6,6 @@ use crate::{
mem, ptr,
};
-// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
-// register name (e.g. rax). We have to explicitly override the placeholder to
-// use the 32-bit register name in that case.
-
-#[cfg(target_pointer_width = "32")]
-macro_rules! vpl {
- ($inst:expr) => {
- concat!($inst, ", [{p:e}]")
- };
-}
-#[cfg(target_pointer_width = "64")]
-macro_rules! vpl {
- ($inst:expr) => {
- concat!($inst, ", [{p}]")
- };
-}
-#[cfg(target_pointer_width = "32")]
-macro_rules! vps {
- ($inst1:expr, $inst2:expr) => {
- concat!($inst1, " [{p:e}]", $inst2)
- };
-}
-#[cfg(target_pointer_width = "64")]
-macro_rules! vps {
- ($inst1:expr, $inst2:expr) => {
- concat!($inst1, " [{p}]", $inst2)
- };
-}
-
-pub(crate) use {vpl, vps};
-
#[cfg(test)]
use stdarch_test::assert_instr;
@@ -16565,27 +16534,6 @@ pub unsafe fn _mm512_mask_i32scatter_epi64(
vpscatterdq(slice, mask, offsets, src, SCALE);
}
-/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm256_i32scatter_epi64<const SCALE: i32>(
- slice: *mut u8,
- offsets: __m128i,
- src: __m256i,
-) {
- static_assert_imm8_scale!(SCALE);
- let src = src.as_i64x4();
- let neg_one = -1;
- let slice = slice as *mut i8;
- let offsets = offsets.as_i32x4();
- vpscatterdq256(slice, neg_one, offsets, src, SCALE);
-}
-
/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116)
@@ -16715,6 +16663,1153 @@ pub unsafe fn _mm512_mask_i64scatter_epi32(
vpscatterqd(slice, mask, offsets, src, SCALE);
}
+/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale and stores them in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32logather_epi64<const SCALE: i32>(
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512i {
+ _mm512_i32gather_epi64::<SCALE>(_mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale and stores them in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32logather_epi64<const SCALE: i32>(
+ src: __m512i,
+ k: __mmask8,
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512i {
+ _mm512_mask_i32gather_epi64::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32logather_pd<const SCALE: i32>(
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512d {
+ _mm512_i32gather_pd::<SCALE>(_mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst
+/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32logather_pd<const SCALE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512d {
+ _mm512_mask_i32gather_pd::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32loscatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m512i,
+ a: __m512i,
+) {
+ _mm512_i32scatter_epi64::<SCALE>(base_addr as _, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32loscatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m512i,
+ a: __m512i,
+) {
+ _mm512_mask_i32scatter_epi64::<SCALE>(base_addr as _, k, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32loscatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m512i,
+ a: __m512d,
+) {
+ _mm512_i32scatter_pd::<SCALE>(base_addr as _, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale using writemask k
+/// (elements whose corresponding mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32loscatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m512i,
+ a: __m512d,
+) {
+ _mm512_mask_i32scatter_pd::<SCALE>(base_addr as _, k, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_i32x8(), SCALE)
+}
+
+/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_256(base_addr as _, k, vindex.as_i32x8(), a.as_i32x8(), SCALE)
+}
+
+/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_i32scatter_epi64<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m128i,
+ src: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x4();
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x4();
+ vpscatterdq_256(slice, 0xff, offsets, src, SCALE);
+}
+
+/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdq_256(base_addr as _, k, vindex.as_i32x4(), a.as_i64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i32scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_256(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_256(base_addr as _, k, vindex.as_i32x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i32scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_f32x8(), SCALE)
+}
+
+/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_256(base_addr as _, k, vindex.as_i32x8(), a.as_f32x8(), SCALE)
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_256(base_addr as _, k, vindex.as_i64x4(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i64x4(), SCALE)
+}
+
+/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_256(base_addr as _, k, vindex.as_i64x4(), a.as_i64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_256(base_addr as _, k, vindex.as_i64x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_256(base_addr as _, k, vindex.as_i64x4(), a.as_f32x4(), SCALE)
+}
+
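
As an aside, a hypothetical usage sketch for the `_mm256_i32scatter_epi32` intrinsic added above (not part of the patch; assumes a nightly toolchain with the `stdarch_x86_avx512` feature and an AVX-512F/VL-capable CPU; the buffer, indices, and values are invented for illustration):

```rust
// Hypothetical sketch: scatter eight i32 values to the even slots of a buffer.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn scatter_example() {
    use core::arch::x86_64::*;
    let mut buf = [0i32; 16];
    let vindex = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14); // element indices
    let vals = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    // SCALE = 4: each index is multiplied by the size of an i32.
    _mm256_i32scatter_epi32::<4>(buf.as_mut_ptr() as *mut u8, vindex, vals);
    assert_eq!(buf[2], 2);
}
```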
+/// Loads 8 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_epi32<const SCALE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdd_256(
+ src.as_i32x8(),
+ base_addr as _,
+ vindex.as_i32x8(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdq_256(
+ src.as_i64x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_pd<const SCALE: i32>(
+ src: __m256d,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdpd_256(
+ src.as_f64x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 8 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_ps<const SCALE: i32>(
+ src: __m256,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdps_256(
+ src.as_f32x8(),
+ base_addr as _,
+ vindex.as_i32x8(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqd_256(
+ src.as_i32x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqq_256(
+ src.as_i64x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_pd<const SCALE: i32>(
+ src: __m256d,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqpd_256(
+ src.as_f64x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqps_256(
+ src.as_f32x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_128(base_addr as _, k, vindex.as_i32x4(), a.as_i32x4(), SCALE)
+}
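+
+// Illustrative reference, kept as a comment so it is not compiled: the masked
+// scatters above behave like the scalar loop below; the helper name is
+// hypothetical. Elements whose mask bit is clear produce no store at all.
+//
+// unsafe fn mask_i32scatter_epi32_ref(
+//     base_addr: *mut u8,
+//     k: u8,
+//     vindex: [i32; 4],
+//     a: [i32; 4],
+//     scale: isize,
+// ) {
+//     for i in 0..4 {
+//         if (k >> i) & 1 != 0 {
+//             let p = base_addr.offset(vindex[i] as isize * scale);
+//             (p as *mut i32).write_unaligned(a[i]);
+//         }
+//     }
+// }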
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdq_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdq_128(base_addr as _, k, vindex.as_i32x4(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_pd<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128d) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_128(base_addr as _, k, vindex.as_i32x4(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_ps<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_128(base_addr as _, k, vindex.as_i32x4(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_128(base_addr as _, k, vindex.as_i64x2(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_128(base_addr as _, k, vindex.as_i64x2(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_pd<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128d) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_128(base_addr as _, k, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_ps<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_128(base_addr as _, k, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdd_128(
+ src.as_i32x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdq_128(
+ src.as_i64x2(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_pd<const SCALE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdpd_128(
+ src.as_f64x2(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_ps<const SCALE: i32>(
+ src: __m128,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdps_128(
+ src.as_f32x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqd_128(
+ src.as_i32x4(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqq_128(
+ src.as_i64x2(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_pd<const SCALE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqpd_128(
+ src.as_f64x2(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqps_128(
+ src.as_f32x4(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi32&expand=1198)
@@ -27899,8 +28994,8 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
crate::arch::asm!(
- "vmovntps [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntps", ",{a}"),
+ p = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
@@ -27925,8 +29020,8 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
crate::arch::asm!(
- "vmovntpd [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntpd", ",{a}"),
+ p = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
@@ -27951,13 +29046,32 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) {
crate::arch::asm!(
- "vmovntdq [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntdq", ",{a}"),
+ p = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
}
+/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
+/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si512)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i {
+ let dst: __m512i;
+ crate::arch::asm!(
+ vpl!("vmovntdqa {a}"),
+ a = out(zmm_reg) dst,
+ p = in(reg) mem_addr,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
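+
+// Illustrative usage, kept as a comment so it is not compiled: a minimal sketch
+// (the wrapper type and function are hypothetical) showing how a caller could
+// guarantee the 64-byte alignment this intrinsic requires.
+//
+// #[repr(align(64))]
+// struct Aligned64([i32; 16]);
+//
+// unsafe fn load_nt(block: &Aligned64) -> __m512i {
+//     // Sound only because `Aligned64` guarantees 64-byte alignment.
+//     _mm512_stream_load_si512(block.0.as_ptr() as *const __m512i)
+// }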
+
/// Sets packed 32-bit integers in `dst` with the supplied values.
///
/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931)
@@ -33856,6 +34970,94 @@ pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
dst
}
+/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst
+/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper
+/// 3 packed elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovss))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_load_ss(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128 = src;
+ asm!(
+ vpl!("vmovss {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
+/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst
+/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper 3 packed
+/// elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovss))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_load_ss(k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128;
+ asm!(
+ vpl!("vmovss {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
+/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst
+/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper
+/// element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovsd))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_load_sd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d = src;
+ asm!(
+ vpl!("vmovsd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
+/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst
+/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element
+/// of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception
+/// may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovsd))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d;
+ asm!(
+ vpl!("vmovsd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
/// Store packed 32-bit integers from a into memory using writemask k.
/// mem_addr does not need to be aligned on any particular boundary.
///
@@ -34276,6 +35478,42 @@ pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d)
);
}
+/// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr
+/// must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovss))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_store_ss(mem_addr: *mut f32, k: __mmask8, a: __m128) {
+ asm!(
+ vps!("vmovss", "{{{k}}}, {a}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ a = in(xmm_reg) a,
+ options(nostack, preserves_flags),
+ );
+}
+
+/// Store a double-precision (64-bit) floating-point element from a into memory using writemask k. mem_addr
+/// must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovsd))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_store_sd(mem_addr: *mut f64, k: __mmask8, a: __m128d) {
+ asm!(
+ vps!("vmovsd", "{{{k}}}, {a}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ a = in(xmm_reg) a,
+ options(nostack, preserves_flags),
+ );
+}
+
/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi32)
@@ -41174,8 +42412,6 @@ extern "C" {
fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
#[link_name = "llvm.x86.avx512.scatter.dpq.512"]
fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);
- #[link_name = "llvm.x86.avx512.scattersiv4.di"]
- fn vpscatterdq256(slice: *mut i8, mask: i8, offsets: i32x4, src: i64x4, scale: i32);
#[link_name = "llvm.x86.avx512.scatter.dpi.512"]
fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32);
@@ -41184,6 +42420,74 @@ extern "C" {
#[link_name = "llvm.x86.avx512.scatter.qpi.512"]
fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.si"]
+ fn vpscatterdd_128(slice: *mut i8, k: u8, offsets: i32x4, src: i32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv2.di"]
+ fn vpscatterdq_128(slice: *mut i8, k: u8, offsets: i32x4, src: i64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv2.df"]
+ fn vscatterdpd_128(slice: *mut i8, k: u8, offsets: i32x4, src: f64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.sf"]
+ fn vscatterdps_128(slice: *mut i8, k: u8, offsets: i32x4, src: f32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.si"]
+ fn vpscatterqd_128(slice: *mut i8, k: u8, offsets: i64x2, src: i32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv2.di"]
+ fn vpscatterqq_128(slice: *mut i8, k: u8, offsets: i64x2, src: i64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv2.df"]
+ fn vscatterqpd_128(slice: *mut i8, k: u8, offsets: i64x2, src: f64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.sf"]
+ fn vscatterqps_128(slice: *mut i8, k: u8, offsets: i64x2, src: f32x4, scale: i32);
+
+ #[link_name = "llvm.x86.avx512.scattersiv8.si"]
+ fn vpscatterdd_256(slice: *mut i8, k: u8, offsets: i32x8, src: i32x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.di"]
+ fn vpscatterdq_256(slice: *mut i8, k: u8, offsets: i32x4, src: i64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.df"]
+ fn vscatterdpd_256(slice: *mut i8, k: u8, offsets: i32x4, src: f64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv8.sf"]
+ fn vscatterdps_256(slice: *mut i8, k: u8, offsets: i32x8, src: f32x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv8.si"]
+ fn vpscatterqd_256(slice: *mut i8, k: u8, offsets: i64x4, src: i32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.di"]
+ fn vpscatterqq_256(slice: *mut i8, k: u8, offsets: i64x4, src: i64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.df"]
+ fn vscatterqpd_256(slice: *mut i8, k: u8, offsets: i64x4, src: f64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv8.sf"]
+ fn vscatterqps_256(slice: *mut i8, k: u8, offsets: i64x4, src: f32x4, scale: i32);
+
+ #[link_name = "llvm.x86.avx512.gather3siv4.si"]
+ fn vpgatherdd_128(src: i32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i32x4;
+ #[link_name = "llvm.x86.avx512.gather3siv2.di"]
+ fn vpgatherdq_128(src: i64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x2;
+ #[link_name = "llvm.x86.avx512.gather3siv2.df"]
+ fn vgatherdpd_128(src: f64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.gather3siv4.sf"]
+ fn vgatherdps_128(src: f32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.gather3div4.si"]
+ fn vpgatherqd_128(src: i32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i32x4;
+ #[link_name = "llvm.x86.avx512.gather3div2.di"]
+ fn vpgatherqq_128(src: i64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i64x2;
+ #[link_name = "llvm.x86.avx512.gather3div2.df"]
+ fn vgatherqpd_128(src: f64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.gather3div4.sf"]
+ fn vgatherqps_128(src: f32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.gather3siv8.si"]
+ fn vpgatherdd_256(src: i32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx512.gather3siv4.di"]
+ fn vpgatherdq_256(src: i64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.gather3siv4.df"]
+ fn vgatherdpd_256(src: f64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x4;
+ #[link_name = "llvm.x86.avx512.gather3siv8.sf"]
+ fn vgatherdps_256(src: f32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> f32x8;
+ #[link_name = "llvm.x86.avx512.gather3div8.si"]
+ fn vpgatherqd_256(src: i32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i32x4;
+ #[link_name = "llvm.x86.avx512.gather3div4.di"]
+ fn vpgatherqq_256(src: i64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.gather3div4.df"]
+ fn vgatherqpd_256(src: f64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f64x4;
+ #[link_name = "llvm.x86.avx512.gather3div8.sf"]
+ fn vgatherqps_256(src: f32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f32x4;
+
#[link_name = "llvm.x86.avx512.mask.cmp.ss"]
fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8;
#[link_name = "llvm.x86.avx512.mask.cmp.sd"]
@@ -50265,6 +51569,60 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_load_ss() {
+ #[repr(align(16))]
+ struct Align {
+ data: f32,
+ }
+ let src = _mm_set_ss(2.0);
+ let mem = Align { data: 1.0 };
+ let r = _mm_mask_load_ss(src, 0b1, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(1.0));
+ let r = _mm_mask_load_ss(src, 0b0, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(2.0));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_load_ss() {
+ #[repr(align(16))]
+ struct Align {
+ data: f32,
+ }
+ let mem = Align { data: 1.0 };
+ let r = _mm_maskz_load_ss(0b1, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(1.0));
+ let r = _mm_maskz_load_ss(0b0, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(0.0));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_load_sd() {
+ #[repr(align(16))]
+ struct Align {
+ data: f64,
+ }
+ let src = _mm_set_sd(2.0);
+ let mem = Align { data: 1.0 };
+ let r = _mm_mask_load_sd(src, 0b1, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(1.0));
+ let r = _mm_mask_load_sd(src, 0b0, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(2.0));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_load_sd() {
+ #[repr(align(16))]
+ struct Align {
+ data: f64,
+ }
+ let mem = Align { data: 1.0 };
+ let r = _mm_maskz_load_sd(0b1, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(1.0));
+ let r = _mm_maskz_load_sd(0b0, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(0.0));
+ }
+
#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_storeu_pd() {
let mut r = [42_f64; 2];
@@ -50289,6 +51647,34 @@ mod tests {
assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_store_ss() {
+ #[repr(align(16))]
+ struct Align {
+ data: f32,
+ }
+ let a = _mm_set_ss(2.0);
+ let mut mem = Align { data: 1.0 };
+ _mm_mask_store_ss(&mut mem.data, 0b1, a);
+ assert_eq!(mem.data, 2.0);
+ _mm_mask_store_ss(&mut mem.data, 0b0, a);
+ assert_eq!(mem.data, 2.0);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_store_sd() {
+ #[repr(align(16))]
+ struct Align {
+ data: f64,
+ }
+ let a = _mm_set_sd(2.0);
+ let mut mem = Align { data: 1.0 };
+ _mm_mask_store_sd(&mut mem.data, 0b1, a);
+ assert_eq!(mem.data, 2.0);
+ _mm_mask_store_sd(&mut mem.data, 0b0, a);
+ assert_eq!(mem.data, 2.0);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_setr_pd() {
let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
@@ -54566,6 +55952,13 @@ mod tests {
}
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_load_si512() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _);
+ assert_eq_m512i(a, r);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_reduce_add_epi32() {
let a = _mm512_set1_epi32(1);
diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs
index 01bb704ae7..3bf9958e3d 100644
--- a/crates/core_arch/src/x86/avx512ifma.rs
+++ b/crates/core_arch/src/x86/avx512ifma.rs
@@ -114,6 +114,24 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64(
simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52huq)
+)]
+pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ vpmadd52huq_256(a, b, c)
+}
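+
+// Illustrative per-lane reference, kept as a comment so it is not compiled: a
+// sketch of the madd52hi computation above; the helper name is hypothetical.
+//
+// fn madd52hi_lane_ref(a: u64, b: u64, c: u64) -> u64 {
+//     const LOW52: u64 = (1 << 52) - 1;
+//     // 52-bit x 52-bit multiply gives a 104-bit intermediate product.
+//     let prod = (b & LOW52) as u128 * (c & LOW52) as u128;
+//     // Add the high 52 bits of the product to the 64-bit accumulator.
+//     a.wrapping_add((prod >> 52) as u64)
+// }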
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
@@ -169,6 +187,24 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64(
simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52luq)
+)]
+pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ vpmadd52luq_256(a, b, c)
+}
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
@@ -224,6 +260,24 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64(
simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52huq)
+)]
+pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ vpmadd52huq_128(a, b, c)
+}
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
@@ -269,6 +323,24 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _
simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52luq)
+)]
+pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ vpmadd52luq_128(a, b, c)
+}
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
@@ -427,6 +499,20 @@ mod tests {
assert_eq_m512i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm256_madd52hi_avx_epu64() {
+ let a = _mm256_set1_epi64x(10 << 40);
+ let b = _mm256_set1_epi64x((11 << 40) + 4);
+ let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm256_madd52hi_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+ let expected = _mm256_set1_epi64x(11030549757952);
+
+ assert_eq_m256i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52hi_epu64() {
let a = _mm256_set1_epi64x(10 << 40);
@@ -471,6 +557,20 @@ mod tests {
assert_eq_m256i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm256_madd52lo_avx_epu64() {
+ let a = _mm256_set1_epi64x(10 << 40);
+ let b = _mm256_set1_epi64x((11 << 40) + 4);
+ let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm256_madd52lo_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+ let expected = _mm256_set1_epi64x(100055558127628);
+
+ assert_eq_m256i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52lo_epu64() {
let a = _mm256_set1_epi64x(10 << 40);
@@ -515,6 +615,20 @@ mod tests {
assert_eq_m256i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm_madd52hi_avx_epu64() {
+ let a = _mm_set1_epi64x(10 << 40);
+ let b = _mm_set1_epi64x((11 << 40) + 4);
+ let c = _mm_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm_madd52hi_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+ let expected = _mm_set1_epi64x(11030549757952);
+
+ assert_eq_m128i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52hi_epu64() {
let a = _mm_set1_epi64x(10 << 40);
@@ -559,6 +673,20 @@ mod tests {
assert_eq_m128i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm_madd52lo_avx_epu64() {
+ let a = _mm_set1_epi64x(10 << 40);
+ let b = _mm_set1_epi64x((11 << 40) + 4);
+ let c = _mm_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm_madd52lo_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+ let expected = _mm_set1_epi64x(100055558127628);
+
+ assert_eq_m128i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52lo_epu64() {
let a = _mm_set1_epi64x(10 << 40);
diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs
index 67a626b7ed..2ed800d295 100644
--- a/crates/core_arch/src/x86/avx512vnni.rs
+++ b/crates/core_arch/src/x86/avx512vnni.rs
@@ -50,6 +50,20 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssd)
+)]
+pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
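+
+// Illustrative per-lane reference, kept as a comment so it is not compiled: a
+// sketch of the vpdpwssd computation above; the helper name is hypothetical.
+//
+// fn dpwssd_lane_ref(src: i32, a: i32, b: i32) -> i32 {
+//     let (a0, a1) = (a as i16 as i32, (a >> 16) as i16 as i32);
+//     let (b0, b1) = (b as i16 as i32, (b >> 16) as i16 as i32);
+//     // Two 16-bit x 16-bit products, accumulated into the 32-bit lane
+//     // without saturation (wrapping on overflow).
+//     src.wrapping_add(a0 * b0).wrapping_add(a1 * b1)
+// }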
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216)
@@ -96,6 +110,20 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssd)
+)]
+pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213)
@@ -178,6 +206,20 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssds)
+)]
+pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225)
@@ -224,6 +266,20 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssds)
+)]
+pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222)
@@ -311,6 +367,20 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusd)
+)]
+pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198)
@@ -357,6 +427,20 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusd)
+)]
+pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
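+
+// Illustrative per-lane reference, kept as a comment so it is not compiled: a
+// sketch of the vpdpbusd computation above (unsigned bytes from `a`, signed
+// bytes from `b`); the helper name is hypothetical.
+//
+// fn dpbusd_lane_ref(src: i32, a: u32, b: u32) -> i32 {
+//     let mut sum = src;
+//     for i in 0..4 {
+//         let ua = ((a >> (8 * i)) & 0xff) as i32;             // zero-extended
+//         let sb = ((b >> (8 * i)) & 0xff) as u8 as i8 as i32; // sign-extended
+//         sum = sum.wrapping_add(ua * sb);
+//     }
+//     sum
+// }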
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195)
@@ -439,6 +523,20 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusds)
+)]
+pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207)
@@ -485,6 +583,20 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusds)
+)]
+pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204)
@@ -526,6 +638,390 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
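The AVX-VNNI-INT8 `ss` intrinsics added here differ from `vpdpbusd` only in interpreting both inputs as signed bytes. A scalar sketch, assuming the same lane layout as above:

```rust
// Scalar sketch of one 32-bit lane of vpdpbssd (illustrative helper only).
fn dpbssd_lane(src: i32, a: u32, b: u32) -> i32 {
    (0..4).fold(src, |acc, i| {
        let ax = ((a >> (8 * i)) & 0xFF) as u8 as i8 as i32; // signed byte of a
        let bx = ((b >> (8 * i)) & 0xFF) as u8 as i8 as i32; // signed byte of b
        acc.wrapping_add(ax * bx)
    })
}

fn main() {
    // 0xFF is -1 as a signed byte, so (-1) * (-1) contributes +1 per byte pair.
    assert_eq!(dpbssd_lane(0, 0xFFFF_FFFF, 0xFFFF_FFFF), 4);
}
```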
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
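For the AVX-VNNI-INT16 mixed-sign forms, the letter order in the mnemonic says which operand is signed: `wsud` sign-extends the words of `a` and zero-extends the words of `b`, while `wusd` (below) swaps the roles. A scalar sketch of one `wsud` lane (illustrative helper, not from this patch):

```rust
// Scalar sketch of one 32-bit lane of vpdpwsud (illustrative helper only).
fn dpwsud_lane(src: i32, a: u32, b: u32) -> i32 {
    let (a0, a1) = (a as u16 as i16 as i64, (a >> 16) as u16 as i16 as i64); // signed words
    let (b0, b1) = (b as u16 as i64, (b >> 16) as u16 as i64);               // unsigned words
    // The non-saturating form wraps on overflow, which the truncating cast models.
    (src as i64 + a0 * b0 + a1 * b1) as i32
}

fn main() {
    // 0xFFFF is -1 as a signed word but 65535 as an unsigned one.
    assert_eq!(dpwsud_lane(0, 0xFFFF_FFFF, 0x0001_0001), -2);
}
```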
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx512.vpdpwssd.512"]
@@ -555,6 +1051,66 @@ extern "C" {
fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx512.vpdpbusds.128"]
fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx2.vpdpbssd.128"]
+ fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbssd.256"]
+ fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbssds.128"]
+ fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbssds.256"]
+ fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbsud.128"]
+ fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbsud.256"]
+ fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbsuds.128"]
+ fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbsuds.256"]
+ fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbuud.128"]
+ fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbuud.256"]
+ fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbuuds.128"]
+ fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbuuds.256"]
+ fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwsud.128"]
+ fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwsud.256"]
+ fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwsuds.128"]
+ fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwsuds.256"]
+ fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwusd.128"]
+ fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwusd.256"]
+ fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwusds.128"]
+ fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwusds.256"]
+ fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwuud.128"]
+ fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwuud.256"]
+ fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwuuds.128"]
+ fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwuuds.256"]
+ fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
}
#[cfg(test)]
@@ -597,6 +1153,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpwssd_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwssd_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpwssd_epi32() {
let src = _mm256_set1_epi32(1);
@@ -631,6 +1197,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpwssd_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwssd_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpwssd_epi32() {
let src = _mm_set1_epi32(1);
@@ -699,6 +1275,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpwssds_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwssds_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpwssds_epi32() {
let src = _mm256_set1_epi32(1);
@@ -733,6 +1319,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpwssds_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwssds_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpwssds_epi32() {
let src = _mm_set1_epi32(1);
@@ -801,6 +1397,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpbusd_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbusd_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpbusd_epi32() {
let src = _mm256_set1_epi32(1);
@@ -835,6 +1441,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpbusd_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbusd_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpbusd_epi32() {
let src = _mm_set1_epi32(1);
@@ -903,6 +1519,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpbusds_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbusds_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpbusds_epi32() {
let src = _mm256_set1_epi32(1);
@@ -937,6 +1563,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpbusds_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbusds_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpbusds_epi32() {
let src = _mm_set1_epi32(1);
@@ -970,4 +1606,244 @@ mod tests {
let e = _mm_set1_epi32(5);
assert_eq_m128i(r, e);
}
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbssd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbssd_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbssd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbssd_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbssds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbssds_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbssds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbssds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbsud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbsud_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbsud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbsud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbsuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbsuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbsuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbsuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbuud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbuud_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbuud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbuud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbuuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbuuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbuuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbuuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwsud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwsud_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwsud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwsud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwsuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwsuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwsuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwsuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwusd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwusd_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwusd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwusd_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwusds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwusds_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwusds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwusds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwuud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwuud_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwuud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwuud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwuuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwuuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwuuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwuuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
}
diff --git a/crates/core_arch/src/x86/avxneconvert.rs b/crates/core_arch/src/x86/avxneconvert.rs
new file mode 100644
index 0000000000..4eb1a9b30c
--- /dev/null
+++ b/crates/core_arch/src/x86/avxneconvert.rs
@@ -0,0 +1,253 @@
+use crate::arch::asm;
+use crate::core_arch::x86::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Convert the scalar BF16 (16-bit) floating-point element stored at memory location a to a single
+/// precision (32-bit) floating-point element, broadcast it to all packed single precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vbcstnebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 {
+ bcstnebf162ps_128(a)
+}
+
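BF16 is the upper half of an IEEE-754 `f32`, so the widening conversion these intrinsics perform can be modelled by a 16-bit left shift. A scalar sketch (not the intrinsic itself):

```rust
// Widen a raw bf16 bit pattern to f32 by placing it in the high half (illustrative).
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}

fn main() {
    // 0b0_01111111_0000000 is sign 0, exponent 127, mantissa 0, i.e. 1.0.
    assert_eq!(bf16_bits_to_f32(0b0_01111111_0000000), 1.0);
    assert_eq!(bf16_bits_to_f32(0b0_10000000_1000000), 3.0);
}
```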
+/// Convert the scalar BF16 (16-bit) floating-point element stored at memory location a to a single
+/// precision (32-bit) floating-point element, broadcast it to all packed single precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vbcstnebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 {
+ bcstnebf162ps_256(a)
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
+ transmute(cvtneebf162ps_128(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
+ transmute(cvtneebf162ps_256(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneobf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
+ transmute(cvtneobf162ps_128(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneobf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
+ transmute(cvtneobf162ps_256(a))
+}
+
+/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
+/// elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_bf16)
+#[inline]
+#[target_feature(enable = "avxneconvert,sse")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneps2bf16)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "{{vex}}vcvtneps2bf16 {dst},{src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
+/// elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_bf16)
+#[inline]
+#[target_feature(enable = "avxneconvert,sse,avx")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneps2bf16)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "{{vex}}vcvtneps2bf16 {dst},{src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(ymm_reg) a,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
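The narrowing direction used by `vcvtneps2bf16` keeps the top 16 bits of the `f32` encoding and rounds the discarded half to nearest-even; the sketch below models that assumption in scalar code (NaN and exception handling omitted):

```rust
// Scalar sketch of f32 -> bf16 with round-to-nearest-even (illustrative only).
fn f32_to_bf16_bits(x: f32) -> u16 {
    let bits = x.to_bits();
    let round_bias = 0x7FFF + ((bits >> 16) & 1); // ties round to the even result
    ((bits + round_bias) >> 16) as u16
}

fn main() {
    assert_eq!(f32_to_bf16_bits(1.0), 0b0_01111111_0000000);
    assert_eq!(f32_to_bf16_bits(3.0), 0b0_10000000_1000000);
}
```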
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.vbcstnebf162ps128"]
+ fn bcstnebf162ps_128(a: *const bf16) -> __m128;
+ #[link_name = "llvm.x86.vbcstnebf162ps256"]
+ fn bcstnebf162ps_256(a: *const bf16) -> __m256;
+
+ #[link_name = "llvm.x86.vcvtneebf162ps128"]
+ fn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
+ #[link_name = "llvm.x86.vcvtneebf162ps256"]
+ fn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
+
+ #[link_name = "llvm.x86.vcvtneobf162ps128"]
+ fn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
+ #[link_name = "llvm.x86.vcvtneobf162ps256"]
+ fn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::simd::{u16x4, u16x8};
+ use crate::core_arch::x86::*;
+ use crate::mem::transmute_copy;
+ use std::ptr::addr_of;
+ use stdarch_test::simd_test;
+
+ const BF16_ONE: u16 = 0b0_01111111_0000000;
+ const BF16_TWO: u16 = 0b0_10000000_0000000;
+ const BF16_THREE: u16 = 0b0_10000000_1000000;
+ const BF16_FOUR: u16 = 0b0_10000001_0000000;
+ const BF16_FIVE: u16 = 0b0_10000001_0100000;
+ const BF16_SIX: u16 = 0b0_10000001_1000000;
+ const BF16_SEVEN: u16 = 0b0_10000001_1100000;
+ const BF16_EIGHT: u16 = 0b0_10000010_0000000;
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_bcstnebf16_ps() {
+ let a = bf16::from_bits(BF16_ONE);
+ let r = _mm_bcstnebf16_ps(addr_of!(a));
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_bcstnebf16_ps() {
+ let a = bf16::from_bits(BF16_ONE);
+ let r = _mm256_bcstnebf16_ps(addr_of!(a));
+ let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_cvtneebf16_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm_cvtneebf16_ps(addr_of!(a));
+ let e = _mm_setr_ps(1., 3., 5., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_cvtneebf16_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm256_cvtneebf16_ps(addr_of!(a));
+ let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_cvtneobf16_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm_cvtneobf16_ps(addr_of!(a));
+ let e = _mm_setr_ps(2., 4., 6., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_cvtneobf16_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm256_cvtneobf16_ps(addr_of!(a));
+ let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_cvtneps_avx_pbh() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a));
+ let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_cvtneps_avx_pbh() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a));
+ let e = u16x8::new(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ assert_eq!(r, e);
+ }
+}
diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs
index 17d64f5bbf..ddf38aa506 100644
--- a/crates/core_arch/src/x86/macros.rs
+++ b/crates/core_arch/src/x86/macros.rs
@@ -57,3 +57,33 @@ macro_rules! assert_approx_eq {
);
}};
}
+
+// Targets with 32-bit pointers want a 32-bit address size, but asm! may otherwise
+// render the operand with its full 64-bit register name (e.g. rax on the x32 ABI).
+// We have to explicitly override the placeholder to use the 32-bit register name
+// whenever target_pointer_width = "32".
+
+#[cfg(target_pointer_width = "32")]
+macro_rules! vpl {
+ ($inst:expr) => {
+ concat!($inst, ", [{p:e}]")
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! vpl {
+ ($inst:expr) => {
+ concat!($inst, ", [{p}]")
+ };
+}
+
+#[cfg(target_pointer_width = "32")]
+macro_rules! vps {
+ ($inst1:expr, $inst2:expr) => {
+ concat!($inst1, " [{p:e}]", $inst2)
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! vps {
+ ($inst1:expr, $inst2:expr) => {
+ concat!($inst1, " [{p}]", $inst2)
+ };
+}
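To make the templates that reach `asm!` visible, here is a standalone re-creation of the 64-bit rules; `vpl_demo`/`vps_demo` are illustrative names, not part of the crate:

```rust
// Stand-alone copies of the 64-bit expansion rules, for illustration.
macro_rules! vpl_demo {
    ($inst:expr) => { concat!($inst, ", [{p}]") };
}
macro_rules! vps_demo {
    ($inst1:expr, $inst2:expr) => { concat!($inst1, " [{p}]", $inst2) };
}

fn main() {
    // What the _mm_stream_ps and _mm_stream_load_si128 rewrites below hand to asm!.
    assert_eq!(vps_demo!("movntps", ",{a}"), "movntps [{p}],{a}");
    assert_eq!(vpl_demo!("movntdqa {a}"), "movntdqa {a}, [{p}]");
    // On 32-bit-pointer targets the placeholder becomes {p:e} instead.
}
```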
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index 8b1d3bbbb6..9365fe10a2 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -337,6 +337,31 @@ types! {
);
}
+/// The BFloat16 type used in AVX-512 intrinsics.
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug)]
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub struct bf16(u16);
+
+impl bf16 {
+ /// Raw transmutation from `u16`
+ #[inline]
+ #[must_use]
+ #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+ pub const fn from_bits(bits: u16) -> bf16 {
+ bf16(bits)
+ }
+
+ /// Raw transmutation to `u16`
+ #[inline]
+ #[must_use = "this returns the result of the operation, without modifying the original"]
+ #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+ pub const fn to_bits(self) -> u16 {
+ self.0
+ }
+}
+
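A short usage sketch for the new type on nightly; the feature gate is the one named in the attributes above, and the widening shift mirrors the bf16 layout described earlier:

```rust
#![feature(stdarch_x86_avx512_bf16)]
use core::arch::x86_64::bf16;

fn main() {
    // 0x3F80 is the upper half of f32's encoding of 1.0.
    let one = bf16::from_bits(0x3F80);
    assert_eq!(one.to_bits(), 0x3F80);
    assert_eq!(f32::from_bits((one.to_bits() as u32) << 16), 1.0);
}
```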
/// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer
#[allow(non_camel_case_types)]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -894,6 +919,9 @@ mod f16c;
pub use self::f16c::*;
mod avx512bf16;
-
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub use self::avx512bf16::*;
+
+mod avxneconvert;
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub use self::avxneconvert::*;
diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs
index a4602301de..ea6e685acb 100644
--- a/crates/core_arch/src/x86/sse.rs
+++ b/crates/core_arch/src/x86/sse.rs
@@ -1992,8 +1992,8 @@ extern "C" {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
crate::arch::asm!(
- "movntps [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("movntps", ",{a}"),
+ p = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs
index 289d41a0ff..0dee597410 100644
--- a/crates/core_arch/src/x86/sse2.rs
+++ b/crates/core_arch/src/x86/sse2.rs
@@ -1312,8 +1312,8 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
crate::arch::asm!(
- "movntdq [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("movntdq", ",{a}"),
+ p = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
@@ -1339,8 +1339,8 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
crate::arch::asm!(
- "movnti [{mem_addr}], {a:e}", // `:e` for 32bit value
- mem_addr = in(reg) mem_addr,
+ vps!("movnti", ",{a:e}"), // `:e` for 32bit value
+ p = in(reg) mem_addr,
a = in(reg) a,
options(nostack, preserves_flags),
);
@@ -2542,8 +2542,8 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
crate::arch::asm!(
- "movntpd [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("movntpd", ",{a}"),
+ p = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs
index c8b260bec8..daf89bc3fd 100644
--- a/crates/core_arch/src/x86/sse41.rs
+++ b/crates/core_arch/src/x86/sse41.rs
@@ -1154,9 +1154,9 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
let dst: __m128i;
crate::arch::asm!(
- "movntdqa {a}, [{mem_addr}]",
+ vpl!("movntdqa {a}"),
a = out(xmm_reg) dst,
- mem_addr = in(reg) mem_addr,
+ p = in(reg) mem_addr,
options(pure, readonly, nostack, preserves_flags),
);
dst
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index a2b2496caf..5ea6dcc026 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -7649,20 +7649,6 @@ mod tests {
assert_eq!(&arr[..], &expected[..],);
}
- #[simd_test(enable = "avx512f,avx512vl")]
- unsafe fn test_mm256_i32scatter_epi64() {
- let mut arr = [0i64; 64];
- let index = _mm_setr_epi32(0, 16, 32, 48);
- let src = _mm256_setr_epi64x(1, 2, 3, 4);
- // A multiplier of 8 is word-addressing
- _mm256_i32scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, index, src);
- let mut expected = [0i64; 64];
- for i in 0..4 {
- expected[i * 16] = (i + 1) as i64;
- }
- assert_eq!(&arr[..], &expected[..],);
- }
-
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_i64scatter_epi64() {
let mut arr = [0i64; 128];
@@ -7721,6 +7707,566 @@ mod tests {
assert_eq!(&arr[..], &expected[..],);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32logather_epi64() {
+ let base_addr: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_i32logather_epi64::<8>(vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1);
+ assert_eq_m512i(expected, r);
+ }
+
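A scalar model of the addressing in this test may help: only the low 8 of the 16 32-bit indices participate, and each selected element is read from `base_addr` plus `index * SCALE` bytes. The helper below is illustrative, not the intrinsic:

```rust
// Scalar sketch of _mm512_i32logather_epi64::<8> over an i64 slice (illustrative).
fn i32logather_epi64(base_addr: &[i64], vindex: &[i32; 16]) -> [i64; 8] {
    let mut dst = [0i64; 8];
    for j in 0..8 {
        // With SCALE = 8 the byte offset index * 8 is exactly one i64 element per step.
        dst[j] = base_addr[vindex[j] as usize];
    }
    dst
}

fn main() {
    let base_addr = [1i64, 2, 3, 4, 5, 6, 7, 8];
    let vindex = [1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1];
    assert_eq!(i32logather_epi64(&base_addr, &vindex), [2, 3, 4, 5, 6, 7, 8, 1]);
}
```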
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32logather_epi64() {
+ let base_addr: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let src = _mm512_setr_epi64(9, 10, 11, 12, 13, 14, 15, 16);
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r =
+ _mm512_mask_i32logather_epi64::<8>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_epi64(2, 10, 4, 12, 6, 14, 8, 16);
+ assert_eq_m512i(expected, r);
+ }
+
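The masked variants follow the usual AVX-512 merge rule: mask bit j picks between the gathered element and lane j of `src`. A scalar sketch of that selection:

```rust
// Merge gathered lanes into src under a mask, AVX-512 style (illustrative helper).
fn mask_merge(src: &[i64; 8], gathered: &[i64; 8], k: u8) -> [i64; 8] {
    let mut dst = *src;
    for j in 0..8 {
        if (k >> j) & 1 == 1 {
            dst[j] = gathered[j];
        }
    }
    dst
}

fn main() {
    let src = [9i64, 10, 11, 12, 13, 14, 15, 16];
    let gathered = [2i64, 3, 4, 5, 6, 7, 8, 1];
    // 0b01010101 keeps the gathered value in even lanes and src in odd lanes.
    assert_eq!(mask_merge(&src, &gathered, 0b0101_0101), [2, 10, 4, 12, 6, 14, 8, 16]);
}
```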
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32logather_pd() {
+ let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_i32logather_pd::<8>(vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.);
+ assert_eq_m512d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32logather_pd() {
+ let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let src = _mm512_setr_pd(9., 10., 11., 12., 13., 14., 15., 16.);
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_mask_i32logather_pd::<8>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_pd(2., 10., 4., 12., 6., 14., 8., 16.);
+ assert_eq_m512d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32loscatter_epi64() {
+ let mut base_addr: [i64; 8] = [0; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm512_i32loscatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4, 5, 6, 7, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32loscatter_epi64() {
+ let mut base_addr: [i64; 8] = [0; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm512_mask_i32loscatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0, 2, 0, 4, 0, 6, 0, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32loscatter_pd() {
+ let mut base_addr: [f64; 8] = [0.; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm512_i32loscatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4., 5., 6., 7., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32loscatter_pd() {
+ let mut base_addr: [f64; 8] = [0.; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm512_mask_i32loscatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0., 2., 0., 4., 0., 6., 0., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_epi32() {
+ let base_addr: [i32; 4] = [1, 2, 3, 4];
+ let src = _mm_setr_epi32(5, 6, 7, 8);
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let r = _mm_mmask_i32gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi32(2, 6, 4, 8);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_epi64() {
+ let base_addr: [i64; 2] = [1, 2];
+ let src = _mm_setr_epi64x(5, 6);
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let r = _mm_mmask_i32gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi64x(2, 6);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_pd() {
+ let base_addr: [f64; 2] = [1., 2.];
+ let src = _mm_setr_pd(5., 6.);
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let r = _mm_mmask_i32gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_pd(2., 6.);
+ assert_eq_m128d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_ps() {
+ let base_addr: [f32; 4] = [1., 2., 3., 4.];
+ let src = _mm_setr_ps(5., 6., 7., 8.);
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let r = _mm_mmask_i32gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_ps(2., 6., 4., 8.);
+ assert_eq_m128(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_epi32() {
+ let base_addr: [i32; 2] = [1, 2];
+ let src = _mm_setr_epi32(5, 6, 7, 8);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_epi32::<4>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi32(2, 6, 0, 0);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_epi64() {
+ let base_addr: [i64; 2] = [1, 2];
+ let src = _mm_setr_epi64x(5, 6);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi64x(2, 6);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_pd() {
+ let base_addr: [f64; 2] = [1., 2.];
+ let src = _mm_setr_pd(5., 6.);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_pd(2., 6.);
+ assert_eq_m128d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_ps() {
+ let base_addr: [f32; 2] = [1., 2.];
+ let src = _mm_setr_ps(5., 6., 7., 8.);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_ps::<4>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_ps(2., 6., 0., 0.);
+ assert_eq_m128(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_epi32() {
+ let base_addr: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let src = _mm256_setr_epi32(9, 10, 11, 12, 13, 14, 15, 16);
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let r =
+ _mm256_mmask_i32gather_epi32::<4>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_epi32(2, 10, 4, 12, 6, 14, 8, 16);
+ assert_eq_m256i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_epi64() {
+ let base_addr: [i64; 4] = [1, 2, 3, 4];
+ let src = _mm256_setr_epi64x(9, 10, 11, 12);
+ let vindex = _mm_setr_epi32(1, 2, 3, 4);
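+ // Lane 3's index (4) is out of bounds for `base_addr`, but its mask bit is 0,
+ // so that lane is never loaded and copies from `src` instead.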
+ let r = _mm256_mmask_i32gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_epi64x(2, 10, 4, 12);
+ assert_eq_m256i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_pd() {
+ let base_addr: [f64; 4] = [1., 2., 3., 4.];
+ let src = _mm256_setr_pd(9., 10., 11., 12.);
+ let vindex = _mm_setr_epi32(1, 2, 3, 4);
+ let r = _mm256_mmask_i32gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_pd(2., 10., 4., 12.);
+ assert_eq_m256d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_ps() {
+ let base_addr: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let r = _mm256_mmask_i32gather_ps::<4>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_ps(2., 10., 4., 12., 6., 14., 8., 16.);
+ assert_eq_m256(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_epi32() {
+ let base_addr: [i32; 4] = [1, 2, 3, 4];
+ let src = _mm_setr_epi32(9, 10, 11, 12);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi32(2, 10, 4, 12);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_epi64() {
+ let base_addr: [i64; 4] = [1, 2, 3, 4];
+ let src = _mm256_setr_epi64x(9, 10, 11, 12);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_epi64x(2, 10, 4, 12);
+ assert_eq_m256i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_pd() {
+ let base_addr: [f64; 4] = [1., 2., 3., 4.];
+ let src = _mm256_setr_pd(9., 10., 11., 12.);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_pd(2., 10., 4., 12.);
+ assert_eq_m256d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_ps() {
+ let base_addr: [f32; 4] = [1., 2., 3., 4.];
+ let src = _mm_setr_ps(9., 10., 11., 12.);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_ps(2., 10., 4., 12.);
+ assert_eq_m128(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
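+ // Unmasked scatter: every lane of `src` is stored at `base_addr[vindex[j]]`;
+ // scale 4 matches `size_of::<i32>()`.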
+ _mm_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
+ _mm_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_epi32() {
+ let mut base_addr: [i32; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi32(2, 1, -1, -1);
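+ // Two 64-bit indices, so only the low two 32-bit lanes of `src` are stored.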
+ _mm_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_epi32() {
+ let mut base_addr: [i32; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi32(2, 1, -1, -1);
+ _mm_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_ps() {
+ let mut base_addr: [f32; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_ps(2., 1., -1., -1.);
+ _mm_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_ps() {
+ let mut base_addr: [f32; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_ps(2., 1., -1., -1.);
+ _mm_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_epi32() {
+ let mut base_addr: [i32; 8] = [0; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm256_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4, 5, 6, 7, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_epi32() {
+ let mut base_addr: [i32; 8] = [0; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm256_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0, 2, 0, 4, 0, 6, 0, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_ps() {
+ let mut base_addr: [f32; 8] = [0.; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm256_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4., 5., 6., 7., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_ps() {
+ let mut base_addr: [f32; 8] = [0.; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm256_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0., 2., 0., 4., 0., 6., 0., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
+ _mm256_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
+ _mm256_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm256_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm256_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_rol_epi64() {
#[rustfmt::skip]
diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs
index e5069058cd..8f85d4e282 100644
--- a/crates/core_arch/src/x86_64/sse2.rs
+++ b/crates/core_arch/src/x86_64/sse2.rs
@@ -79,8 +79,8 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
crate::arch::asm!(
- "movnti [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ "movnti [{p}], {a}",
+ p = in(reg) mem_addr,
a = in(reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index a8c2d36e11..a2835e3b0c 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -84,7 +84,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
// 2. It is a mark, indicating that the instruction will be
// compiled into other instructions - mainly because of llvm
// optimization.
- let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected));
+ let found = expected == "nop" || instrs.iter().any(|s| s.contains(expected));
// Look for subroutine call instructions in the disassembly to detect whether
// inlining failed: all intrinsics are `#[inline(always)]`, so calling one
diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs
index ff31c31c89..94569dfd0c 100644
--- a/crates/stdarch-verify/src/lib.rs
+++ b/crates/stdarch-verify/src/lib.rs
@@ -197,6 +197,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
"_MM_MANTISSA_SIGN_ENUM" => quote! { &MM_MANTISSA_SIGN_ENUM },
"_MM_PERM_ENUM" => quote! { &MM_PERM_ENUM },
"bool" => quote! { &BOOL },
+ "bf16" => quote! { &BF16 },
"f32" => quote! { &F32 },
"f64" => quote! { &F64 },
"i16" => quote! { &I16 },
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index 15d2454f43..8de2c88b81 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -22,6 +22,7 @@ struct Function {
has_test: bool,
}
+static BF16: Type = Type::BFloat16;
static F32: Type = Type::PrimFloat(32);
static F64: Type = Type::PrimFloat(64);
static I8: Type = Type::PrimSigned(8);
@@ -65,6 +66,7 @@ enum Type {
PrimFloat(u8),
PrimSigned(u8),
PrimUnsigned(u8),
+ BFloat16,
MutPtr(&'static Type),
ConstPtr(&'static Type),
M128,
@@ -700,6 +702,7 @@ fn equate(
(&Type::PrimSigned(64), "__int64" | "long long") => {}
(&Type::PrimUnsigned(8), "unsigned char") => {}
(&Type::PrimUnsigned(16), "unsigned short") => {}
+ (&Type::BFloat16, "__bfloat16") => {}
(
&Type::PrimUnsigned(32),
"unsigned __int32" | "unsigned int" | "unsigned long" | "const unsigned int",
@@ -761,6 +764,7 @@ fn equate(
(&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {}
(&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {}
(&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {}
+ (&Type::ConstPtr(&Type::BFloat16), "__bf16 const*") => {}
(&Type::ConstPtr(&Type::M128), "__m128 const*") => {}
(&Type::ConstPtr(&Type::M128BH), "__m128bh const*") => {}