diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md
index 4c70c1b435..16f6c58cbb 100644
--- a/crates/core_arch/missing-x86.md
+++ b/crates/core_arch/missing-x86.md
@@ -147,102 +147,6 @@
-["AVX512F"]
-
- * [ ] [`_mm512_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64)
- * [ ] [`_mm512_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd)
- * [ ] [`_mm512_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64)
- * [ ] [`_mm512_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd)
- * [ ] [`_mm512_mask_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64)
- * [ ] [`_mm512_mask_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
- * [ ] [`_mm512_mask_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
- * [ ] [`_mm512_mask_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd)
- * [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512)
- * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd)
- * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss)
- * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd)
- * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss)
- * [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd)
- * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss)
-
-
-
-["AVX512F", "AVX512VL"]
-
- * [ ] [`_mm256_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32)
- * [ ] [`_mm256_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd)
- * [ ] [`_mm256_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps)
- * [ ] [`_mm256_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32)
- * [ ] [`_mm256_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64)
- * [ ] [`_mm256_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd)
- * [ ] [`_mm256_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps)
- * [ ] [`_mm256_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32)
- * [ ] [`_mm256_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64)
- * [ ] [`_mm256_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd)
- * [ ] [`_mm256_mask_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps)
- * [ ] [`_mm256_mask_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32)
- * [ ] [`_mm256_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64)
- * [ ] [`_mm256_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd)
- * [ ] [`_mm256_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps)
- * [ ] [`_mm256_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32)
- * [ ] [`_mm256_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64)
- * [ ] [`_mm256_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd)
- * [ ] [`_mm256_mmask_i32gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps)
- * [ ] [`_mm256_mmask_i64gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32)
- * [ ] [`_mm256_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64)
- * [ ] [`_mm256_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd)
- * [ ] [`_mm256_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps)
- * [ ] [`_mm_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32)
- * [ ] [`_mm_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64)
- * [ ] [`_mm_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd)
- * [ ] [`_mm_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps)
- * [ ] [`_mm_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32)
- * [ ] [`_mm_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64)
- * [ ] [`_mm_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd)
- * [ ] [`_mm_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps)
- * [ ] [`_mm_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32)
- * [ ] [`_mm_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64)
- * [ ] [`_mm_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd)
- * [ ] [`_mm_mask_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps)
- * [ ] [`_mm_mask_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32)
- * [ ] [`_mm_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64)
- * [ ] [`_mm_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd)
- * [ ] [`_mm_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps)
- * [ ] [`_mm_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32)
- * [ ] [`_mm_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64)
- * [ ] [`_mm_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd)
- * [ ] [`_mm_mmask_i32gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps)
- * [ ] [`_mm_mmask_i64gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32)
- * [ ] [`_mm_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64)
- * [ ] [`_mm_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd)
- * [ ] [`_mm_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps)
-
-
-
-["AVX512_BF16", "AVX512F"]
-
- * [ ] [`_mm512_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
- * [ ] [`_mm512_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
- * [ ] [`_mm512_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
- * [ ] [`_mm_cvtsbh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
-
-
-
-["AVX512_BF16", "AVX512VL"]
-
- * [ ] [`_mm256_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
- * [ ] [`_mm256_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
- * [ ] [`_mm256_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
- * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
- * [ ] [`_mm_cvtness_sbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
- * [ ] [`_mm_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
- * [ ] [`_mm_mask_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
- * [ ] [`_mm_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
- * [ ] [`_mm_maskz_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
- * [ ] [`_mm_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
-
-
-
["AVX512_FP16"]
* [ ] [`_mm256_castpd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
@@ -1207,79 +1111,14 @@
-["AVX_IFMA"]
-
- * [ ] [`_mm256_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52hi_avx_epu64)
- * [ ] [`_mm256_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52lo_avx_epu64)
- * [ ] [`_mm_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52hi_avx_epu64)
- * [ ] [`_mm_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52lo_avx_epu64)
-
-
-
["AVX_NE_CONVERT"]
- * [ ] [`_mm256_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
* [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
- * [ ] [`_mm256_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
* [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
- * [ ] [`_mm256_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
* [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
- * [ ] [`_mm256_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
- * [ ] [`_mm_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
* [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
- * [ ] [`_mm_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
* [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
- * [ ] [`_mm_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
* [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
- * [ ] [`_mm_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
- * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
-
-
-
-["AVX_VNNI"]
-
- * [ ] [`_mm256_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_avx_epi32)
- * [ ] [`_mm256_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_avx_epi32)
- * [ ] [`_mm256_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_avx_epi32)
- * [ ] [`_mm256_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_avx_epi32)
- * [ ] [`_mm_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_avx_epi32)
- * [ ] [`_mm_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_avx_epi32)
- * [ ] [`_mm_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_avx_epi32)
- * [ ] [`_mm_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_avx_epi32)
-
-
-
-["AVX_VNNI_INT16"]
-
- * [ ] [`_mm256_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsud_epi32)
- * [ ] [`_mm256_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsuds_epi32)
- * [ ] [`_mm256_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusd_epi32)
- * [ ] [`_mm256_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusds_epi32)
- * [ ] [`_mm256_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuud_epi32)
- * [ ] [`_mm256_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuuds_epi32)
- * [ ] [`_mm_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsud_epi32)
- * [ ] [`_mm_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsuds_epi32)
- * [ ] [`_mm_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusd_epi32)
- * [ ] [`_mm_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusds_epi32)
- * [ ] [`_mm_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuud_epi32)
- * [ ] [`_mm_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuuds_epi32)
-
-
-
-["AVX_VNNI_INT8"]
-
- * [ ] [`_mm256_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssd_epi32)
- * [ ] [`_mm256_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssds_epi32)
- * [ ] [`_mm256_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsud_epi32)
- * [ ] [`_mm256_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsuds_epi32)
- * [ ] [`_mm256_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuud_epi32)
- * [ ] [`_mm256_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuuds_epi32)
- * [ ] [`_mm_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssd_epi32)
- * [ ] [`_mm_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssds_epi32)
- * [ ] [`_mm_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsud_epi32)
- * [ ] [`_mm_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsuds_epi32)
- * [ ] [`_mm_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuud_epi32)
- * [ ] [`_mm_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuuds_epi32)
diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs
index aa5a5d8c18..7726a188f2 100644
--- a/crates/core_arch/src/x86/avx.rs
+++ b/crates/core_arch/src/x86/avx.rs
@@ -1738,8 +1738,8 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
crate::arch::asm!(
- "vmovntdq [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntdq", ",{a}"),
+ p = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
@@ -1766,8 +1766,8 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
crate::arch::asm!(
- "vmovntpd [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntpd", ",{a}"),
+ p = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
@@ -1795,8 +1795,8 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
crate::arch::asm!(
- "vmovntps [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntps", ",{a}"),
+ p = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index 0343416a92..fa32c7fcc4 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -3149,9 +3149,9 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
let dst: __m256i;
crate::arch::asm!(
- "vmovntdqa {a}, [{mem_addr}]",
+ vpl!("vmovntdqa {a}"),
a = out(ymm_reg) dst,
- mem_addr = in(reg) mem_addr,
+ p = in(reg) mem_addr,
options(pure, readonly, nostack, preserves_flags),
);
dst
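
Note for reviewers: the hard-coded `[{mem_addr}]` operands in avx.rs and avx2.rs are replaced with the shared `vps!`/`vpl!` helpers whose definitions are removed from avx512f.rs further down (presumably re-exported from a common module not shown in this section). A minimal sketch of what they expand to, taken from those removed definitions:

```rust
// Sketch only (based on the macro definitions removed later in this diff).
// On 32-bit targets the `{p:e}` placeholder makes asm! print the 32-bit
// register name for the pointer operand; on 64-bit targets plain `{p}` is used.
#[cfg(target_pointer_width = "64")]
macro_rules! vps {
    ($inst1:expr, $inst2:expr) => {
        concat!($inst1, " [{p}]", $inst2)
    };
}
// vps!("vmovntdq", ",{a}")  =>  "vmovntdq [{p}],{a}"
// vpl!("vmovntdqa {a}")     =>  "vmovntdqa {a}, [{p}]"
```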
diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs
index dbd8b4a6f3..7d99809353 100644
--- a/crates/core_arch/src/x86/avx512bf16.rs
+++ b/crates/core_arch/src/x86/avx512bf16.rs
@@ -2,6 +2,7 @@
//!
//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
+use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;
@@ -365,9 +366,211 @@ pub unsafe fn _mm512_maskz_dpbf16_ps(
transmute(simd_select_bitmask(k, rst, zero))
}
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
+ _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a))))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 {
+ let cvt = _mm512_cvtpbh_ps(a);
+ transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
+/// when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
+ let cvt = _mm512_cvtpbh_ps(a);
+ let zero = _mm512_setzero_ps();
+ transmute(simd_select_bitmask(k, cvt.as_f32x16(), zero.as_f32x16()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
+ _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a))))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 {
+ let cvt = _mm256_cvtpbh_ps(a);
+ transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
+/// when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
+ let cvt = _mm256_cvtpbh_ps(a);
+ let zero = _mm256_setzero_ps();
+ transmute(simd_select_bitmask(k, cvt.as_f32x8(), zero.as_f32x8()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
+/// elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
+ _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a))))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
+/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 {
+ let cvt = _mm_cvtpbh_ps(a);
+ transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4()))
+}
+
+/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
+/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
+ let cvt = _mm_cvtpbh_ps(a);
+ let zero = _mm_setzero_ps();
+ transmute(simd_select_bitmask(k, cvt.as_f32x4(), zero.as_f32x4()))
+}
+
+/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
+/// element, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 {
+ f32::from_bits((a.to_bits() as u32) << 16)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl,sse")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "vcvtneps2bf16 {dst}, {src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
+ let mut dst = src;
+ asm!(
+ "vcvtneps2bf16 {dst}{{{k}}},{src}",
+ dst = inlateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ k = in(kreg) k,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
+/// when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")]
+#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ k = in(kreg) k,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point
+/// element, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
+#[inline]
+#[target_feature(enable = "avx512bf16,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm_cvtness_sbh(a: f32) -> bf16 {
+ let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0);
+ bf16::from_bits(value)
+}
+
#[cfg(test)]
mod tests {
- use crate::{core_arch::x86::*, mem::transmute};
+ use crate::core_arch::simd::u16x4;
+ use crate::{
+ core_arch::x86::*,
+ mem::{transmute, transmute_copy},
+ };
use stdarch_test::simd_test;
#[simd_test(enable = "avx512bf16,avx512vl")]
@@ -1592,4 +1795,156 @@ mod tests {
];
assert_eq!(result, expected_result);
}
+
+ const BF16_ONE: u16 = 0b0_01111111_0000000;
+ const BF16_TWO: u16 = 0b0_10000000_0000000;
+ const BF16_THREE: u16 = 0b0_10000000_1000000;
+ const BF16_FOUR: u16 = 0b0_10000001_0000000;
+ const BF16_FIVE: u16 = 0b0_10000001_0100000;
+ const BF16_SIX: u16 = 0b0_10000001_1000000;
+ const BF16_SEVEN: u16 = 0b0_10000001_1100000;
+ const BF16_EIGHT: u16 = 0b0_10000010_0000000;
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm512_cvtpbh_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm512_cvtpbh_ps(a);
+ let e = _mm512_setr_ps(
+ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm512_mask_cvtpbh_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let src = _mm512_setr_ps(
+ 9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16.,
+ );
+ let k = 0b1010_1010_1010_1010;
+ let r = _mm512_mask_cvtpbh_ps(src, k, a);
+ let e = _mm512_setr_ps(
+ 9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm512_maskz_cvtpbh_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let k = 0b1010_1010_1010_1010;
+ let r = _mm512_maskz_cvtpbh_ps(k, a);
+ let e = _mm512_setr_ps(
+ 0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_cvtpbh_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm256_cvtpbh_ps(a);
+ let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_mask_cvtpbh_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let k = 0b1010_1010;
+ let r = _mm256_mask_cvtpbh_ps(src, k, a);
+ let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm256_maskz_cvtpbh_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let k = 0b1010_1010;
+ let r = _mm256_maskz_cvtpbh_ps(k, a);
+ let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_cvtpbh_ps() {
+ let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0);
+ let r = _mm_cvtpbh_ps(a);
+ let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_mask_cvtpbh_ps() {
+ let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0);
+ let src = _mm_setr_ps(9., 10., 11., 12.);
+ let k = 0b1010;
+ let r = _mm_mask_cvtpbh_ps(src, k, a);
+ let e = _mm_setr_ps(9., 2., 11., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_maskz_cvtpbh_ps() {
+ let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0);
+ let k = 0b1010;
+ let r = _mm_maskz_cvtpbh_ps(k, a);
+ let e = _mm_setr_ps(0., 2., 0., 4.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16")]
+ unsafe fn test_mm_cvtsbh_ss() {
+ let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE));
+ assert_eq!(r, 1.);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_cvtneps_pbh() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a));
+ let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_mask_cvtneps_pbh() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let src = __m128bh(5, 6, 7, 8, !0, !0, !0, !0);
+ let k = 0b1010;
+ let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a));
+ let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_maskz_cvtneps_pbh() {
+ let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
+ let k = 0b1010;
+ let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a));
+ let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512bf16,avx512vl")]
+ unsafe fn test_mm_cvtness_sbh() {
+ let r = _mm_cvtness_sbh(1.);
+ assert_eq!(r.to_bits(), BF16_ONE);
+ }
}
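
The widening intrinsics above rely on the fact that a BF16 value is the upper half of an f32 bit pattern, so converting to f32 is exact; only the narrowing direction (vcvtneps2bf16) rounds. A scalar sketch of the widening, mirroring what `_mm_cvtsbh_ss` does (not part of the patch):

```rust
// Scalar sketch: shifting the 16-bit BF16 pattern into the high half of a
// u32 reproduces the original value exactly, so the widening never rounds.
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}

fn main() {
    // Same bit pattern as BF16_THREE in the tests above: 1.5 * 2^1 = 3.0.
    assert_eq!(bf16_bits_to_f32(0b0_10000000_1000000), 3.0);
}
```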
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index 1f786d01f0..dd74d11786 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -8,8 +8,6 @@ use crate::{
#[cfg(test)]
use stdarch_test::assert_instr;
-use super::avx512f::{vpl, vps};
-
/// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 3fe919abc8..8a5a529b08 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -6,37 +6,6 @@ use crate::{
mem, ptr,
};
-// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
-// register name (e.g. rax). We have to explicitly override the placeholder to
-// use the 32-bit register name in that case.
-
-#[cfg(target_pointer_width = "32")]
-macro_rules! vpl {
- ($inst:expr) => {
- concat!($inst, ", [{p:e}]")
- };
-}
-#[cfg(target_pointer_width = "64")]
-macro_rules! vpl {
- ($inst:expr) => {
- concat!($inst, ", [{p}]")
- };
-}
-#[cfg(target_pointer_width = "32")]
-macro_rules! vps {
- ($inst1:expr, $inst2:expr) => {
- concat!($inst1, " [{p:e}]", $inst2)
- };
-}
-#[cfg(target_pointer_width = "64")]
-macro_rules! vps {
- ($inst1:expr, $inst2:expr) => {
- concat!($inst1, " [{p}]", $inst2)
- };
-}
-
-pub(crate) use {vpl, vps};
-
#[cfg(test)]
use stdarch_test::assert_instr;
@@ -16565,27 +16534,6 @@ pub unsafe fn _mm512_mask_i32scatter_epi64(
vpscatterdq(slice, mask, offsets, src, SCALE);
}
-/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
-///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099)
-#[inline]
-#[target_feature(enable = "avx512f,avx512vl")]
-#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm256_i32scatter_epi64<const SCALE: i32>(
- slice: *mut u8,
- offsets: __m128i,
- src: __m256i,
-) {
- static_assert_imm8_scale!(SCALE);
- let src = src.as_i64x4();
- let neg_one = -1;
- let slice = slice as *mut i8;
- let offsets = offsets.as_i32x4();
- vpscatterdq256(slice, neg_one, offsets, src, SCALE);
-}
-
/// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116)
@@ -16715,6 +16663,1153 @@ pub unsafe fn _mm512_mask_i64scatter_epi32(
vpscatterqd(slice, mask, offsets, src, SCALE);
}
+/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale and stores them in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32logather_epi64<const SCALE: i32>(
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512i {
+ _mm512_i32gather_epi64::<SCALE>(_mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale and stores them in dst using writemask k
+/// (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32logather_epi64<const SCALE: i32>(
+ src: __m512i,
+ k: __mmask8,
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512i {
+ _mm512_mask_i32gather_epi64::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32logather_pd<const SCALE: i32>(
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512d {
+ _mm512_i32gather_pd::<SCALE>(_mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst
+/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32logather_pd<const SCALE: i32>(
+ src: __m512d,
+ k: __mmask8,
+ vindex: __m512i,
+ base_addr: *const u8,
+) -> __m512d {
+ _mm512_mask_i32gather_pd::<SCALE>(src, k, _mm512_castsi512_si256(vindex), base_addr as _)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32loscatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m512i,
+ a: __m512i,
+) {
+ _mm512_i32scatter_epi64::<SCALE>(base_addr as _, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in the lower half of vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32loscatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m512i,
+ a: __m512i,
+) {
+ _mm512_mask_i32scatter_epi64::<SCALE>(base_addr as _, k, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_i32loscatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m512i,
+ a: __m512d,
+) {
+ _mm512_i32scatter_pd::<SCALE>(base_addr as _, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale using writemask k
+/// (elements whose corresponding mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_mask_i32loscatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m512i,
+ a: __m512d,
+) {
+ _mm512_mask_i32scatter_pd::<SCALE>(base_addr as _, k, _mm512_castsi512_si256(vindex), a)
+}
+
+/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_i32x8(), SCALE)
+}
+
+/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_256(base_addr as _, k, vindex.as_i32x8(), a.as_i32x8(), SCALE)
+}
+
+/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm256_i32scatter_epi64<const SCALE: i32>(
+ slice: *mut u8,
+ offsets: __m128i,
+ src: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x4();
+ let slice = slice as *mut i8;
+ let offsets = offsets.as_i32x4();
+ vpscatterdq_256(slice, 0xff, offsets, src, SCALE);
+}
+
+/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdq_256(base_addr as _, k, vindex.as_i32x4(), a.as_i64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i32scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_256(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_256(base_addr as _, k, vindex.as_i32x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i32scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_f32x8(), SCALE)
+}
+
+/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i32scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_256(base_addr as _, k, vindex.as_i32x8(), a.as_f32x8(), SCALE)
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_256(base_addr as _, k, vindex.as_i64x4(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i64x4(), SCALE)
+}
+
+/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_256(base_addr as _, k, vindex.as_i64x4(), a.as_i64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m256d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_256(base_addr as _, k, vindex.as_i64x4(), a.as_f64x4(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_i64scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m256i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_i64scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m256i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_256(base_addr as _, k, vindex.as_i64x4(), a.as_f32x4(), SCALE)
+}
+
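
As an aside, a hypothetical usage sketch for the `_mm256_i32scatter_epi32` intrinsic added above (not part of the patch; assumes a nightly toolchain with the `stdarch_x86_avx512` feature and an AVX-512F/VL-capable CPU; the buffer, indices, and values are invented for illustration):

```rust
// Hypothetical sketch: scatter eight i32 values to the even slots of a buffer.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn scatter_example() {
    use core::arch::x86_64::*;
    let mut buf = [0i32; 16];
    let vindex = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14); // element indices
    let vals = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    // SCALE = 4: each index is multiplied by the size of an i32.
    _mm256_i32scatter_epi32::<4>(buf.as_mut_ptr() as *mut u8, vindex, vals);
    assert_eq!(buf[2], 2);
}
```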
+/// Loads 8 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_epi32<const SCALE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdd_256(
+ src.as_i32x8(),
+ base_addr as _,
+ vindex.as_i32x8(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdq_256(
+ src.as_i64x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_pd<const SCALE: i32>(
+ src: __m256d,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdpd_256(
+ src.as_f64x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 8 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i32gather_ps<const SCALE: i32>(
+ src: __m256,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdps_256(
+ src.as_f32x8(),
+ base_addr as _,
+ vindex.as_i32x8(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqd_256(
+ src.as_i32x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqq_256(
+ src.as_i64x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_pd<const SCALE: i32>(
+ src: __m256d,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqpd_256(
+ src.as_f64x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mmask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ k: __mmask8,
+ vindex: __m256i,
+ base_addr: *const u8,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqps_256(
+ src.as_f32x4(),
+ base_addr as _,
+ vindex.as_i64x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdd_128(base_addr as _, k, vindex.as_i32x4(), a.as_i32x4(), SCALE)
+}
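+
+// Illustrative reference, kept as a comment so it is not compiled: the masked
+// scatters above behave like the scalar loop below; the helper name is
+// hypothetical. Elements whose mask bit is clear produce no store at all.
+//
+// unsafe fn mask_i32scatter_epi32_ref(
+//     base_addr: *mut u8,
+//     k: u8,
+//     vindex: [i32; 4],
+//     a: [i32; 4],
+//     scale: isize,
+// ) {
+//     for i in 0..4 {
+//         if (k >> i) & 1 != 0 {
+//             let p = base_addr.offset(vindex[i] as isize * scale);
+//             (p as *mut i32).write_unaligned(a[i]);
+//         }
+//     }
+// }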
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdq_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterdq_128(base_addr as _, k, vindex.as_i32x4(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_pd<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128d) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdpd_128(base_addr as _, k, vindex.as_i32x4(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i32scatter_ps<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i32scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterdps_128(base_addr as _, k, vindex.as_i32x4(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_epi32<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqd_128(base_addr as _, k, vindex.as_i64x2(), a.as_i32x4(), SCALE)
+}
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set
+/// are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_epi64<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128i,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vpscatterqq_128(base_addr as _, k, vindex.as_i64x2(), a.as_i64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_pd<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128d) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_pd<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128d,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqpd_128(base_addr as _, k, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_ps<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_ps<const SCALE: i32>(
+ base_addr: *mut u8,
+ k: __mmask8,
+ vindex: __m128i,
+ a: __m128,
+) {
+ static_assert_imm8_scale!(SCALE);
+ vscatterqps_128(base_addr as _, k, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdd_128(
+ src.as_i32x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherdq_128(
+ src.as_i64x2(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_pd<const SCALE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdpd_128(
+ src.as_f64x2(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_ps<const SCALE: i32>(
+ src: __m128,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherdps_128(
+ src.as_f32x4(),
+ base_addr as _,
+ vindex.as_i32x4(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqd_128(
+ src.as_i32x4(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vpgatherqq_128(
+ src.as_i64x2(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_pd<const SCALE: i32>(
+ src: __m128d,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqpd_128(
+ src.as_f64x2(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
+/// Loads 2 single-precision (32-bit) floating-point elements from memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied
+/// from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ k: __mmask8,
+ vindex: __m128i,
+ base_addr: *const u8,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ transmute(vgatherqps_128(
+ src.as_f32x4(),
+ base_addr as _,
+ vindex.as_i64x2(),
+ k,
+ SCALE,
+ ))
+}
+
/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi32&expand=1198)
@@ -27899,8 +28994,8 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
crate::arch::asm!(
- "vmovntps [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntps", ",{a}"),
+ p = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
@@ -27925,8 +29020,8 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
crate::arch::asm!(
- "vmovntpd [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntpd", ",{a}"),
+ p = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
@@ -27951,13 +29046,32 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) {
crate::arch::asm!(
- "vmovntdq [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("vmovntdq", ",{a}"),
+ p = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
}
+/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
+/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si512)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i {
+ let dst: __m512i;
+ crate::arch::asm!(
+ vpl!("vmovntdqa {a}"),
+ a = out(zmm_reg) dst,
+ p = in(reg) mem_addr,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
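+
+// Illustrative usage, kept as a comment so it is not compiled: a minimal sketch
+// (the wrapper type and function are hypothetical) showing how a caller could
+// guarantee the 64-byte alignment this intrinsic requires.
+//
+// #[repr(align(64))]
+// struct Aligned64([i32; 16]);
+//
+// unsafe fn load_nt(block: &Aligned64) -> __m512i {
+//     // Sound only because `Aligned64` guarantees 64-byte alignment.
+//     _mm512_stream_load_si512(block.0.as_ptr() as *const __m512i)
+// }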
+
/// Sets packed 32-bit integers in `dst` with the supplied values.
///
/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931)
@@ -33856,6 +34970,94 @@ pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
dst
}
+/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst
+/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper
+/// 3 packed elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovss))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_load_ss(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128 = src;
+ asm!(
+ vpl!("vmovss {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
+/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst
+/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper 3 packed
+/// elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovss))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_load_ss(k: __mmask8, mem_addr: *const f32) -> __m128 {
+ let mut dst: __m128;
+ asm!(
+ vpl!("vmovss {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
+/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst
+/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper
+/// element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovsd))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_load_sd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d = src;
+ asm!(
+ vpl!("vmovsd {dst}{{{k}}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = inout(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
+/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst
+/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element
+/// of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception
+/// may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovsd))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d {
+ let mut dst: __m128d;
+ asm!(
+ vpl!("vmovsd {dst}{{{k}}} {{z}}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ dst = out(xmm_reg) dst,
+ options(pure, readonly, nostack, preserves_flags),
+ );
+ dst
+}
+
/// Store packed 32-bit integers from a into memory using writemask k.
/// mem_addr does not need to be aligned on any particular boundary.
///
@@ -34276,6 +35478,42 @@ pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d)
);
}
+/// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr
+/// must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovss))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_store_ss(mem_addr: *mut f32, k: __mmask8, a: __m128) {
+ asm!(
+ vps!("vmovss", "{{{k}}}, {a}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ a = in(xmm_reg) a,
+ options(nostack, preserves_flags),
+ );
+}
+
+/// Store a double-precision (64-bit) floating-point element from a into memory using writemask k. mem_addr
+/// must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd)
+#[inline]
+#[cfg_attr(test, assert_instr(vmovsd))]
+#[target_feature(enable = "sse,avx512f")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_store_sd(mem_addr: *mut f64, k: __mmask8, a: __m128d) {
+ asm!(
+ vps!("vmovsd", "{{{k}}}, {a}"),
+ p = in(reg) mem_addr,
+ k = in(kreg) k,
+ a = in(xmm_reg) a,
+ options(nostack, preserves_flags),
+ );
+}
+
/// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi32)
@@ -41174,8 +42412,6 @@ extern "C" {
fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
#[link_name = "llvm.x86.avx512.scatter.dpq.512"]
fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);
- #[link_name = "llvm.x86.avx512.scattersiv4.di"]
- fn vpscatterdq256(slice: *mut i8, mask: i8, offsets: i32x4, src: i64x4, scale: i32);
#[link_name = "llvm.x86.avx512.scatter.dpi.512"]
fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32);
@@ -41184,6 +42420,74 @@ extern "C" {
#[link_name = "llvm.x86.avx512.scatter.qpi.512"]
fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.si"]
+ fn vpscatterdd_128(slice: *mut i8, k: u8, offsets: i32x4, src: i32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv2.di"]
+ fn vpscatterdq_128(slice: *mut i8, k: u8, offsets: i32x4, src: i64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv2.df"]
+ fn vscatterdpd_128(slice: *mut i8, k: u8, offsets: i32x4, src: f64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.sf"]
+ fn vscatterdps_128(slice: *mut i8, k: u8, offsets: i32x4, src: f32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.si"]
+ fn vpscatterqd_128(slice: *mut i8, k: u8, offsets: i64x2, src: i32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv2.di"]
+ fn vpscatterqq_128(slice: *mut i8, k: u8, offsets: i64x2, src: i64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv2.df"]
+ fn vscatterqpd_128(slice: *mut i8, k: u8, offsets: i64x2, src: f64x2, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.sf"]
+ fn vscatterqps_128(slice: *mut i8, k: u8, offsets: i64x2, src: f32x4, scale: i32);
+
+ #[link_name = "llvm.x86.avx512.scattersiv8.si"]
+ fn vpscatterdd_256(slice: *mut i8, k: u8, offsets: i32x8, src: i32x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.di"]
+ fn vpscatterdq_256(slice: *mut i8, k: u8, offsets: i32x4, src: i64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv4.df"]
+ fn vscatterdpd_256(slice: *mut i8, k: u8, offsets: i32x4, src: f64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scattersiv8.sf"]
+ fn vscatterdps_256(slice: *mut i8, k: u8, offsets: i32x8, src: f32x8, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv8.si"]
+ fn vpscatterqd_256(slice: *mut i8, k: u8, offsets: i64x4, src: i32x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.di"]
+ fn vpscatterqq_256(slice: *mut i8, k: u8, offsets: i64x4, src: i64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv4.df"]
+ fn vscatterqpd_256(slice: *mut i8, k: u8, offsets: i64x4, src: f64x4, scale: i32);
+ #[link_name = "llvm.x86.avx512.scatterdiv8.sf"]
+ fn vscatterqps_256(slice: *mut i8, k: u8, offsets: i64x4, src: f32x4, scale: i32);
+
+ #[link_name = "llvm.x86.avx512.gather3siv4.si"]
+ fn vpgatherdd_128(src: i32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i32x4;
+ #[link_name = "llvm.x86.avx512.gather3siv2.di"]
+ fn vpgatherdq_128(src: i64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x2;
+ #[link_name = "llvm.x86.avx512.gather3siv2.df"]
+ fn vgatherdpd_128(src: f64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.gather3siv4.sf"]
+ fn vgatherdps_128(src: f32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.gather3div4.si"]
+ fn vpgatherqd_128(src: i32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i32x4;
+ #[link_name = "llvm.x86.avx512.gather3div2.di"]
+ fn vpgatherqq_128(src: i64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i64x2;
+ #[link_name = "llvm.x86.avx512.gather3div2.df"]
+ fn vgatherqpd_128(src: f64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.gather3div4.sf"]
+ fn vgatherqps_128(src: f32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f32x4;
+
+ #[link_name = "llvm.x86.avx512.gather3siv8.si"]
+ fn vpgatherdd_256(src: i32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx512.gather3siv4.di"]
+ fn vpgatherdq_256(src: i64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.gather3siv4.df"]
+ fn vgatherdpd_256(src: f64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x4;
+ #[link_name = "llvm.x86.avx512.gather3siv8.sf"]
+ fn vgatherdps_256(src: f32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> f32x8;
+ #[link_name = "llvm.x86.avx512.gather3div8.si"]
+ fn vpgatherqd_256(src: i32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i32x4;
+ #[link_name = "llvm.x86.avx512.gather3div4.di"]
+ fn vpgatherqq_256(src: i64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx512.gather3div4.df"]
+ fn vgatherqpd_256(src: f64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f64x4;
+ #[link_name = "llvm.x86.avx512.gather3div8.sf"]
+ fn vgatherqps_256(src: f32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f32x4;
+
#[link_name = "llvm.x86.avx512.mask.cmp.ss"]
fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8;
#[link_name = "llvm.x86.avx512.mask.cmp.sd"]
@@ -50265,6 +51569,60 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_load_ss() {
+ #[repr(align(16))]
+ struct Align {
+ data: f32,
+ }
+ let src = _mm_set_ss(2.0);
+ let mem = Align { data: 1.0 };
+ let r = _mm_mask_load_ss(src, 0b1, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(1.0));
+ let r = _mm_mask_load_ss(src, 0b0, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(2.0));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_load_ss() {
+ #[repr(align(16))]
+ struct Align {
+ data: f32,
+ }
+ let mem = Align { data: 1.0 };
+ let r = _mm_maskz_load_ss(0b1, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(1.0));
+ let r = _mm_maskz_load_ss(0b0, &mem.data);
+ assert_eq_m128(r, _mm_set_ss(0.0));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_load_sd() {
+ #[repr(align(16))]
+ struct Align {
+ data: f64,
+ }
+ let src = _mm_set_sd(2.0);
+ let mem = Align { data: 1.0 };
+ let r = _mm_mask_load_sd(src, 0b1, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(1.0));
+ let r = _mm_mask_load_sd(src, 0b0, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(2.0));
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_load_sd() {
+ #[repr(align(16))]
+ struct Align {
+ data: f64,
+ }
+ let mem = Align { data: 1.0 };
+ let r = _mm_maskz_load_sd(0b1, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(1.0));
+ let r = _mm_maskz_load_sd(0b0, &mem.data);
+ assert_eq_m128d(r, _mm_set_sd(0.0));
+ }
+
#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_storeu_pd() {
let mut r = [42_f64; 2];
@@ -50289,6 +51647,34 @@ mod tests {
assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_store_ss() {
+ #[repr(align(16))]
+ struct Align {
+ data: f32,
+ }
+ let a = _mm_set_ss(2.0);
+ let mut mem = Align { data: 1.0 };
+ _mm_mask_store_ss(&mut mem.data, 0b1, a);
+ assert_eq!(mem.data, 2.0);
+ _mm_mask_store_ss(&mut mem.data, 0b0, a);
+ assert_eq!(mem.data, 2.0);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_store_sd() {
+ #[repr(align(16))]
+ struct Align {
+ data: f64,
+ }
+ let a = _mm_set_sd(2.0);
+ let mut mem = Align { data: 1.0 };
+ _mm_mask_store_sd(&mut mem.data, 0b1, a);
+ assert_eq!(mem.data, 2.0);
+ _mm_mask_store_sd(&mut mem.data, 0b0, a);
+ assert_eq!(mem.data, 2.0);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_setr_pd() {
let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
@@ -54566,6 +55952,13 @@ mod tests {
}
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_load_si512() {
+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _);
+ assert_eq_m512i(a, r);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_reduce_add_epi32() {
let a = _mm512_set1_epi32(1);
diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs
index 01bb704ae7..3bf9958e3d 100644
--- a/crates/core_arch/src/x86/avx512ifma.rs
+++ b/crates/core_arch/src/x86/avx512ifma.rs
@@ -114,6 +114,24 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64(
simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52huq)
+)]
+pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ vpmadd52huq_256(a, b, c)
+}
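+
+// Illustrative per-lane reference, kept as a comment so it is not compiled: a
+// sketch of the madd52hi computation above; the helper name is hypothetical.
+//
+// fn madd52hi_lane_ref(a: u64, b: u64, c: u64) -> u64 {
+//     const LOW52: u64 = (1 << 52) - 1;
+//     // 52-bit x 52-bit multiply gives a 104-bit intermediate product.
+//     let prod = (b & LOW52) as u128 * (c & LOW52) as u128;
+//     // Add the high 52 bits of the product to the 64-bit accumulator.
+//     a.wrapping_add((prod >> 52) as u64)
+// }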
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
@@ -169,6 +187,24 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64(
simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52luq)
+)]
+pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+ vpmadd52luq_256(a, b, c)
+}
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
@@ -224,6 +260,24 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64(
simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52huq)
+)]
+pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ vpmadd52huq_128(a, b, c)
+}
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
@@ -269,6 +323,24 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _
simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128())
}
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpmadd52luq)
+)]
+pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+ vpmadd52luq_128(a, b, c)
+}
+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
@@ -427,6 +499,20 @@ mod tests {
assert_eq_m512i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm256_madd52hi_avx_epu64() {
+ let a = _mm256_set1_epi64x(10 << 40);
+ let b = _mm256_set1_epi64x((11 << 40) + 4);
+ let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm256_madd52hi_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+ let expected = _mm256_set1_epi64x(11030549757952);
+
+ assert_eq_m256i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52hi_epu64() {
let a = _mm256_set1_epi64x(10 << 40);
@@ -471,6 +557,20 @@ mod tests {
assert_eq_m256i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm256_madd52lo_avx_epu64() {
+ let a = _mm256_set1_epi64x(10 << 40);
+ let b = _mm256_set1_epi64x((11 << 40) + 4);
+ let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm256_madd52lo_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+ let expected = _mm256_set1_epi64x(100055558127628);
+
+ assert_eq_m256i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm256_madd52lo_epu64() {
let a = _mm256_set1_epi64x(10 << 40);
@@ -515,6 +615,20 @@ mod tests {
assert_eq_m256i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm_madd52hi_avx_epu64() {
+ let a = _mm_set1_epi64x(10 << 40);
+ let b = _mm_set1_epi64x((11 << 40) + 4);
+ let c = _mm_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm_madd52hi_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+ let expected = _mm_set1_epi64x(11030549757952);
+
+ assert_eq_m128i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52hi_epu64() {
let a = _mm_set1_epi64x(10 << 40);
@@ -559,6 +673,20 @@ mod tests {
assert_eq_m128i(expected, actual);
}
+ #[simd_test(enable = "avxifma")]
+ unsafe fn test_mm_madd52lo_avx_epu64() {
+ let a = _mm_set1_epi64x(10 << 40);
+ let b = _mm_set1_epi64x((11 << 40) + 4);
+ let c = _mm_set1_epi64x((12 << 40) + 3);
+
+ let actual = _mm_madd52lo_avx_epu64(a, b, c);
+
+ // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+ let expected = _mm_set1_epi64x(100055558127628);
+
+ assert_eq_m128i(expected, actual);
+ }
+
#[simd_test(enable = "avx512ifma,avx512vl")]
unsafe fn test_mm_madd52lo_epu64() {
let a = _mm_set1_epi64x(10 << 40);
diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs
index 67a626b7ed..2ed800d295 100644
--- a/crates/core_arch/src/x86/avx512vnni.rs
+++ b/crates/core_arch/src/x86/avx512vnni.rs
@@ -50,6 +50,20 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssd)
+)]
+pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
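+
+// Illustrative per-lane reference, kept as a comment so it is not compiled: a
+// sketch of the vpdpwssd computation above; the helper name is hypothetical.
+//
+// fn dpwssd_lane_ref(src: i32, a: i32, b: i32) -> i32 {
+//     let (a0, a1) = (a as i16 as i32, (a >> 16) as i16 as i32);
+//     let (b0, b1) = (b as i16 as i32, (b >> 16) as i16 as i32);
+//     // Two 16-bit x 16-bit products, accumulated into the 32-bit lane
+//     // without saturation (wrapping on overflow).
+//     src.wrapping_add(a0 * b0).wrapping_add(a1 * b1)
+// }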
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216)
@@ -96,6 +110,20 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssd)
+)]
+pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213)
@@ -178,6 +206,20 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssds)
+)]
+pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225)
@@ -224,6 +266,20 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwssds)
+)]
+pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222)
@@ -311,6 +367,20 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusd)
+)]
+pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198)
@@ -357,6 +427,20 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusd)
+)]
+pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
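+
+// Illustrative per-lane reference, kept as a comment so it is not compiled: a
+// sketch of the vpdpbusd computation above (unsigned bytes from `a`, signed
+// bytes from `b`); the helper name is hypothetical.
+//
+// fn dpbusd_lane_ref(src: i32, a: u32, b: u32) -> i32 {
+//     let mut sum = src;
+//     for i in 0..4 {
+//         let ua = ((a >> (8 * i)) & 0xff) as i32;             // zero-extended
+//         let sb = ((b >> (8 * i)) & 0xff) as u8 as i8 as i32; // sign-extended
+//         sum = sum.wrapping_add(ua * sb);
+//     }
+//     sum
+// }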
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195)
@@ -439,6 +523,20 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusds)
+)]
+pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207)
@@ -485,6 +583,20 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbusds)
+)]
+pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204)
@@ -526,6 +638,390 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
transmute(simd_select_bitmask(k, r, zero))
}
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbssds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
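The AVX-VNNI-INT8 `ss` intrinsics added here differ from `vpdpbusd` only in interpreting both inputs as signed bytes. A scalar sketch, assuming the same lane layout as above:

```rust
// Scalar sketch of one 32-bit lane of vpdpbssd (illustrative helper only).
fn dpbssd_lane(src: i32, a: u32, b: u32) -> i32 {
    (0..4).fold(src, |acc, i| {
        let ax = ((a >> (8 * i)) & 0xFF) as u8 as i8 as i32; // signed byte of a
        let bx = ((b >> (8 * i)) & 0xFF) as u8 as i8 as i32; // signed byte of b
        acc.wrapping_add(ax * bx)
    })
}

fn main() {
    // 0xFF is -1 as a signed byte, so (-1) * (-1) contributes +1 per byte pair.
    assert_eq!(dpbssd_lane(0, 0xFFFF_FFFF, 0xFFFF_FFFF), 4);
}
```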
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpbuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
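For the AVX-VNNI-INT16 mixed-sign forms, the letter order in the mnemonic says which operand is signed: `wsud` sign-extends the words of `a` and zero-extends the words of `b`, while `wusd` (below) swaps the roles. A scalar sketch of one `wsud` lane (illustrative helper, not from this patch):

```rust
// Scalar sketch of one 32-bit lane of vpdpwsud (illustrative helper only).
fn dpwsud_lane(src: i32, a: u32, b: u32) -> i32 {
    let (a0, a1) = (a as u16 as i16 as i64, (a >> 16) as u16 as i16 as i64); // signed words
    let (b0, b1) = (b as u16 as i64, (b >> 16) as u16 as i64);               // unsigned words
    // The non-saturating form wraps on overflow, which the truncating cast models.
    (src as i64 + a0 * b0 + a1 * b1) as i32
}

fn main() {
    // 0xFFFF is -1 as a signed word but 65535 as an unsigned one.
    assert_eq!(dpwsud_lane(0, 0xFFFF_FFFF, 0x0001_0001), -2);
}
```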
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwusds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+ transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vpdpwuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+ transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.avx512.vpdpwssd.512"]
@@ -555,6 +1051,66 @@ extern "C" {
fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx512.vpdpbusds.128"]
fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+ #[link_name = "llvm.x86.avx2.vpdpbssd.128"]
+ fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbssd.256"]
+ fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbssds.128"]
+ fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbssds.256"]
+ fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbsud.128"]
+ fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbsud.256"]
+ fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbsuds.128"]
+ fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbsuds.256"]
+ fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbuud.128"]
+ fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbuud.256"]
+ fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpbuuds.128"]
+ fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpbuuds.256"]
+ fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwsud.128"]
+ fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwsud.256"]
+ fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwsuds.128"]
+ fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwsuds.256"]
+ fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwusd.128"]
+ fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwusd.256"]
+ fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwusds.128"]
+ fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwusds.256"]
+ fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwuud.128"]
+ fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwuud.256"]
+ fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+ #[link_name = "llvm.x86.avx2.vpdpwuuds.128"]
+ fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.vpdpwuuds.256"]
+ fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
}
#[cfg(test)]
@@ -597,6 +1153,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpwssd_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwssd_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpwssd_epi32() {
let src = _mm256_set1_epi32(1);
@@ -631,6 +1197,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpwssd_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwssd_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpwssd_epi32() {
let src = _mm_set1_epi32(1);
@@ -699,6 +1275,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpwssds_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwssds_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpwssds_epi32() {
let src = _mm256_set1_epi32(1);
@@ -733,6 +1319,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpwssds_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwssds_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpwssds_epi32() {
let src = _mm_set1_epi32(1);
@@ -801,6 +1397,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpbusd_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbusd_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpbusd_epi32() {
let src = _mm256_set1_epi32(1);
@@ -835,6 +1441,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpbusd_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbusd_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpbusd_epi32() {
let src = _mm_set1_epi32(1);
@@ -903,6 +1519,16 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm256_dpbusds_avx_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbusds_avx_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm256_dpbusds_epi32() {
let src = _mm256_set1_epi32(1);
@@ -937,6 +1563,16 @@ mod tests {
assert_eq_m256i(r, e);
}
+ #[simd_test(enable = "avxvnni")]
+ unsafe fn test_mm_dpbusds_avx_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbusds_avx_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
#[simd_test(enable = "avx512vnni,avx512vl")]
unsafe fn test_mm_dpbusds_epi32() {
let src = _mm_set1_epi32(1);
@@ -970,4 +1606,244 @@ mod tests {
let e = _mm_set1_epi32(5);
assert_eq_m128i(r, e);
}
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbssd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbssd_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbssd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbssd_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbssds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbssds_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbssds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbssds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbsud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbsud_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbsud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbsud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbsuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbsuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbsuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbsuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbuud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbuud_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbuud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbuud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm_dpbuuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm_dpbuuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(5);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint8")]
+ unsafe fn test_mm256_dpbuuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+ let r = _mm256_dpbuuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(5);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwsud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwsud_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwsud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwsud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwsuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwsuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwsuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwsuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwusd_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwusd_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwusd_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwusd_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwusds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwusds_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwusds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwusds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwuud_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwuud_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwuud_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwuud_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm_dpwuuds_epi32() {
+ let src = _mm_set1_epi32(1);
+ let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm_dpwuuds_epi32(src, a, b);
+ let e = _mm_set1_epi32(3);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avxvnniint16")]
+ unsafe fn test_mm256_dpwuuds_epi32() {
+ let src = _mm256_set1_epi32(1);
+ let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+ let r = _mm256_dpwuuds_epi32(src, a, b);
+ let e = _mm256_set1_epi32(3);
+ assert_eq_m256i(r, e);
+ }
}
diff --git a/crates/core_arch/src/x86/avxneconvert.rs b/crates/core_arch/src/x86/avxneconvert.rs
new file mode 100644
index 0000000000..4eb1a9b30c
--- /dev/null
+++ b/crates/core_arch/src/x86/avxneconvert.rs
@@ -0,0 +1,253 @@
+use crate::arch::asm;
+use crate::core_arch::x86::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Convert the scalar BF16 (16-bit) floating-point element stored at memory location a to a single
+/// precision (32-bit) floating-point element, broadcast it to all packed single precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vbcstnebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 {
+ bcstnebf162ps_128(a)
+}
+
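BF16 is the upper half of an IEEE-754 `f32`, so the widening conversion these intrinsics perform can be modelled by a 16-bit left shift. A scalar sketch (not the intrinsic itself):

```rust
// Widen a raw bf16 bit pattern to f32 by placing it in the high half (illustrative).
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}

fn main() {
    // 0b0_01111111_0000000 is sign 0, exponent 127, mantissa 0, i.e. 1.0.
    assert_eq!(bf16_bits_to_f32(0b0_01111111_0000000), 1.0);
    assert_eq!(bf16_bits_to_f32(0b0_10000000_1000000), 3.0);
}
```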
+/// Convert the scalar BF16 (16-bit) floating-point element stored at memory location a to a single
+/// precision (32-bit) floating-point element, broadcast it to all packed single precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vbcstnebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 {
+ bcstnebf162ps_256(a)
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
+ transmute(cvtneebf162ps_128(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
+ transmute(cvtneebf162ps_256(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneobf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
+ transmute(cvtneobf162ps_128(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneobf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
+ transmute(cvtneobf162ps_256(a))
+}
+
+/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
+/// elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_bf16)
+#[inline]
+#[target_feature(enable = "avxneconvert,sse")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneps2bf16)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "{{vex}}vcvtneps2bf16 {dst},{src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(xmm_reg) a,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
+/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
+/// elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_bf16)
+#[inline]
+#[target_feature(enable = "avxneconvert,sse,avx")]
+#[cfg_attr(
+ all(test, any(target_os = "linux", target_env = "msvc")),
+ assert_instr(vcvtneps2bf16)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
+ let mut dst: __m128bh;
+ asm!(
+ "{{vex}}vcvtneps2bf16 {dst},{src}",
+ dst = lateout(xmm_reg) dst,
+ src = in(ymm_reg) a,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ dst
+}
+
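The narrowing direction used by `vcvtneps2bf16` keeps the top 16 bits of the `f32` encoding and rounds the discarded half to nearest-even; the sketch below models that assumption in scalar code (NaN and exception handling omitted):

```rust
// Scalar sketch of f32 -> bf16 with round-to-nearest-even (illustrative only).
fn f32_to_bf16_bits(x: f32) -> u16 {
    let bits = x.to_bits();
    let round_bias = 0x7FFF + ((bits >> 16) & 1); // ties round to the even result
    ((bits + round_bias) >> 16) as u16
}

fn main() {
    assert_eq!(f32_to_bf16_bits(1.0), 0b0_01111111_0000000);
    assert_eq!(f32_to_bf16_bits(3.0), 0b0_10000000_1000000);
}
```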
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.vbcstnebf162ps128"]
+ fn bcstnebf162ps_128(a: *const bf16) -> __m128;
+ #[link_name = "llvm.x86.vbcstnebf162ps256"]
+ fn bcstnebf162ps_256(a: *const bf16) -> __m256;
+
+ #[link_name = "llvm.x86.vcvtneebf162ps128"]
+ fn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
+ #[link_name = "llvm.x86.vcvtneebf162ps256"]
+ fn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
+
+ #[link_name = "llvm.x86.vcvtneobf162ps128"]
+ fn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
+ #[link_name = "llvm.x86.vcvtneobf162ps256"]
+ fn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::simd::{u16x4, u16x8};
+ use crate::core_arch::x86::*;
+ use crate::mem::transmute_copy;
+ use std::ptr::addr_of;
+ use stdarch_test::simd_test;
+
+ const BF16_ONE: u16 = 0b0_01111111_0000000;
+ const BF16_TWO: u16 = 0b0_10000000_0000000;
+ const BF16_THREE: u16 = 0b0_10000000_1000000;
+ const BF16_FOUR: u16 = 0b0_10000001_0000000;
+ const BF16_FIVE: u16 = 0b0_10000001_0100000;
+ const BF16_SIX: u16 = 0b0_10000001_1000000;
+ const BF16_SEVEN: u16 = 0b0_10000001_1100000;
+ const BF16_EIGHT: u16 = 0b0_10000010_0000000;
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_bcstnebf16_ps() {
+ let a = bf16::from_bits(BF16_ONE);
+ let r = _mm_bcstnebf16_ps(addr_of!(a));
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_bcstnebf16_ps() {
+ let a = bf16::from_bits(BF16_ONE);
+ let r = _mm256_bcstnebf16_ps(addr_of!(a));
+ let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_cvtneebf16_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm_cvtneebf16_ps(addr_of!(a));
+ let e = _mm_setr_ps(1., 3., 5., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_cvtneebf16_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm256_cvtneebf16_ps(addr_of!(a));
+ let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_cvtneobf16_ps() {
+ let a = __m128bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm_cvtneobf16_ps(addr_of!(a));
+ let e = _mm_setr_ps(2., 4., 6., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_cvtneobf16_ps() {
+ let a = __m256bh(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ let r = _mm256_cvtneobf16_ps(addr_of!(a));
+ let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm_cvtneps_avx_pbh() {
+ let a = _mm_setr_ps(1., 2., 3., 4.);
+ let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a));
+ let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avxneconvert")]
+ unsafe fn test_mm256_cvtneps_avx_pbh() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a));
+ let e = u16x8::new(
+ BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+ );
+ assert_eq!(r, e);
+ }
+}
diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs
index 17d64f5bbf..ddf38aa506 100644
--- a/crates/core_arch/src/x86/macros.rs
+++ b/crates/core_arch/src/x86/macros.rs
@@ -57,3 +57,33 @@ macro_rules! assert_approx_eq {
);
}};
}
+
+// Targets with 32-bit pointers want a 32-bit address size, but asm! may otherwise
+// render the operand with its full 64-bit register name (e.g. rax on the x32 ABI).
+// We have to explicitly override the placeholder to use the 32-bit register name
+// whenever target_pointer_width = "32".
+
+#[cfg(target_pointer_width = "32")]
+macro_rules! vpl {
+ ($inst:expr) => {
+ concat!($inst, ", [{p:e}]")
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! vpl {
+ ($inst:expr) => {
+ concat!($inst, ", [{p}]")
+ };
+}
+
+#[cfg(target_pointer_width = "32")]
+macro_rules! vps {
+ ($inst1:expr, $inst2:expr) => {
+ concat!($inst1, " [{p:e}]", $inst2)
+ };
+}
+#[cfg(target_pointer_width = "64")]
+macro_rules! vps {
+ ($inst1:expr, $inst2:expr) => {
+ concat!($inst1, " [{p}]", $inst2)
+ };
+}
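To make the templates that reach `asm!` visible, here is a standalone re-creation of the 64-bit rules; `vpl_demo`/`vps_demo` are illustrative names, not part of the crate:

```rust
// Stand-alone copies of the 64-bit expansion rules, for illustration.
macro_rules! vpl_demo {
    ($inst:expr) => { concat!($inst, ", [{p}]") };
}
macro_rules! vps_demo {
    ($inst1:expr, $inst2:expr) => { concat!($inst1, " [{p}]", $inst2) };
}

fn main() {
    // What the _mm_stream_ps and _mm_stream_load_si128 rewrites below hand to asm!.
    assert_eq!(vps_demo!("movntps", ",{a}"), "movntps [{p}],{a}");
    assert_eq!(vpl_demo!("movntdqa {a}"), "movntdqa {a}, [{p}]");
    // On 32-bit-pointer targets the placeholder becomes {p:e} instead.
}
```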
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index 8b1d3bbbb6..9365fe10a2 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -337,6 +337,31 @@ types! {
);
}
+/// The BFloat16 type used in AVX-512 intrinsics.
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug)]
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+pub struct bf16(u16);
+
+impl bf16 {
+ /// Raw transmutation from `u16`
+ #[inline]
+ #[must_use]
+ #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+ pub const fn from_bits(bits: u16) -> bf16 {
+ bf16(bits)
+ }
+
+ /// Raw transmutation to `u16`
+ #[inline]
+ #[must_use = "this returns the result of the operation, without modifying the original"]
+ #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
+ pub const fn to_bits(self) -> u16 {
+ self.0
+ }
+}
+
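A short usage sketch for the new type on nightly; the feature gate is the one named in the attributes above, and the widening shift mirrors the bf16 layout described earlier:

```rust
#![feature(stdarch_x86_avx512_bf16)]
use core::arch::x86_64::bf16;

fn main() {
    // 0x3F80 is the upper half of f32's encoding of 1.0.
    let one = bf16::from_bits(0x3F80);
    assert_eq!(one.to_bits(), 0x3F80);
    assert_eq!(f32::from_bits((one.to_bits() as u32) << 16), 1.0);
}
```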
/// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer
#[allow(non_camel_case_types)]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -894,6 +919,9 @@ mod f16c;
pub use self::f16c::*;
mod avx512bf16;
-
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub use self::avx512bf16::*;
+
+mod avxneconvert;
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub use self::avxneconvert::*;
diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs
index a4602301de..ea6e685acb 100644
--- a/crates/core_arch/src/x86/sse.rs
+++ b/crates/core_arch/src/x86/sse.rs
@@ -1992,8 +1992,8 @@ extern "C" {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
crate::arch::asm!(
- "movntps [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("movntps", ",{a}"),
+ p = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs
index 289d41a0ff..0dee597410 100644
--- a/crates/core_arch/src/x86/sse2.rs
+++ b/crates/core_arch/src/x86/sse2.rs
@@ -1312,8 +1312,8 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
crate::arch::asm!(
- "movntdq [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("movntdq", ",{a}"),
+ p = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
@@ -1339,8 +1339,8 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
crate::arch::asm!(
- "movnti [{mem_addr}], {a:e}", // `:e` for 32bit value
- mem_addr = in(reg) mem_addr,
+ vps!("movnti", ",{a:e}"), // `:e` for 32bit value
+ p = in(reg) mem_addr,
a = in(reg) a,
options(nostack, preserves_flags),
);
@@ -2542,8 +2542,8 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
crate::arch::asm!(
- "movntpd [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ vps!("movntpd", ",{a}"),
+ p = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs
index c8b260bec8..daf89bc3fd 100644
--- a/crates/core_arch/src/x86/sse41.rs
+++ b/crates/core_arch/src/x86/sse41.rs
@@ -1154,9 +1154,9 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
let dst: __m128i;
crate::arch::asm!(
- "movntdqa {a}, [{mem_addr}]",
+ vpl!("movntdqa {a}"),
a = out(xmm_reg) dst,
- mem_addr = in(reg) mem_addr,
+ p = in(reg) mem_addr,
options(pure, readonly, nostack, preserves_flags),
);
dst
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index a2b2496caf..5ea6dcc026 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -7649,20 +7649,6 @@ mod tests {
assert_eq!(&arr[..], &expected[..],);
}
- #[simd_test(enable = "avx512f,avx512vl")]
- unsafe fn test_mm256_i32scatter_epi64() {
- let mut arr = [0i64; 64];
- let index = _mm_setr_epi32(0, 16, 32, 48);
- let src = _mm256_setr_epi64x(1, 2, 3, 4);
- // A multiplier of 8 is word-addressing
- _mm256_i32scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, index, src);
- let mut expected = [0i64; 64];
- for i in 0..4 {
- expected[i * 16] = (i + 1) as i64;
- }
- assert_eq!(&arr[..], &expected[..],);
- }
-
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_i64scatter_epi64() {
let mut arr = [0i64; 128];
@@ -7721,6 +7707,566 @@ mod tests {
assert_eq!(&arr[..], &expected[..],);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32logather_epi64() {
+ let base_addr: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_i32logather_epi64::<8>(vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1);
+ assert_eq_m512i(expected, r);
+ }
+
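A scalar model of the addressing in this test may help: only the low 8 of the 16 32-bit indices participate, and each selected element is read from `base_addr` plus `index * SCALE` bytes. The helper below is illustrative, not the intrinsic:

```rust
// Scalar sketch of _mm512_i32logather_epi64::<8> over an i64 slice (illustrative).
fn i32logather_epi64(base_addr: &[i64], vindex: &[i32; 16]) -> [i64; 8] {
    let mut dst = [0i64; 8];
    for j in 0..8 {
        // With SCALE = 8 the byte offset index * 8 is exactly one i64 element per step.
        dst[j] = base_addr[vindex[j] as usize];
    }
    dst
}

fn main() {
    let base_addr = [1i64, 2, 3, 4, 5, 6, 7, 8];
    let vindex = [1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1];
    assert_eq!(i32logather_epi64(&base_addr, &vindex), [2, 3, 4, 5, 6, 7, 8, 1]);
}
```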
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32logather_epi64() {
+ let base_addr: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let src = _mm512_setr_epi64(9, 10, 11, 12, 13, 14, 15, 16);
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r =
+ _mm512_mask_i32logather_epi64::<8>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_epi64(2, 10, 4, 12, 6, 14, 8, 16);
+ assert_eq_m512i(expected, r);
+ }
+
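The masked variants follow the usual AVX-512 merge rule: mask bit j picks between the gathered element and lane j of `src`. A scalar sketch of that selection:

```rust
// Merge gathered lanes into src under a mask, AVX-512 style (illustrative helper).
fn mask_merge(src: &[i64; 8], gathered: &[i64; 8], k: u8) -> [i64; 8] {
    let mut dst = *src;
    for j in 0..8 {
        if (k >> j) & 1 == 1 {
            dst[j] = gathered[j];
        }
    }
    dst
}

fn main() {
    let src = [9i64, 10, 11, 12, 13, 14, 15, 16];
    let gathered = [2i64, 3, 4, 5, 6, 7, 8, 1];
    // 0b01010101 keeps the gathered value in even lanes and src in odd lanes.
    assert_eq!(mask_merge(&src, &gathered, 0b0101_0101), [2, 10, 4, 12, 6, 14, 8, 16]);
}
```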
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32logather_pd() {
+ let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_i32logather_pd::<8>(vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.);
+ assert_eq_m512d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32logather_pd() {
+ let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let src = _mm512_setr_pd(9., 10., 11., 12., 13., 14., 15., 16.);
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let r = _mm512_mask_i32logather_pd::<8>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm512_setr_pd(2., 10., 4., 12., 6., 14., 8., 16.);
+ assert_eq_m512d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32loscatter_epi64() {
+ let mut base_addr: [i64; 8] = [0; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm512_i32loscatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4, 5, 6, 7, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32loscatter_epi64() {
+ let mut base_addr: [i64; 8] = [0; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm512_mask_i32loscatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0, 2, 0, 4, 0, 6, 0, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_i32loscatter_pd() {
+ let mut base_addr: [f64; 8] = [0.; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm512_i32loscatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4., 5., 6., 7., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_i32loscatter_pd() {
+ let mut base_addr: [f64; 8] = [0.; 8];
+ let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1);
+ let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm512_mask_i32loscatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0., 2., 0., 4., 0., 6., 0., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_epi32() {
+ let base_addr: [i32; 4] = [1, 2, 3, 4];
+ let src = _mm_setr_epi32(5, 6, 7, 8);
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let r = _mm_mmask_i32gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi32(2, 6, 4, 8);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_epi64() {
+ let base_addr: [i64; 2] = [1, 2];
+ let src = _mm_setr_epi64x(5, 6);
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let r = _mm_mmask_i32gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi64x(2, 6);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_pd() {
+ let base_addr: [f64; 2] = [1., 2.];
+ let src = _mm_setr_pd(5., 6.);
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let r = _mm_mmask_i32gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_pd(2., 6.);
+ assert_eq_m128d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i32gather_ps() {
+ let base_addr: [f32; 4] = [1., 2., 3., 4.];
+ let src = _mm_setr_ps(5., 6., 7., 8.);
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let r = _mm_mmask_i32gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_ps(2., 6., 4., 8.);
+ assert_eq_m128(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_epi32() {
+ let base_addr: [i32; 2] = [1, 2];
+ let src = _mm_setr_epi32(5, 6, 7, 8);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_epi32::<4>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi32(2, 6, 0, 0);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_epi64() {
+ let base_addr: [i64; 2] = [1, 2];
+ let src = _mm_setr_epi64x(5, 6);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi64x(2, 6);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_pd() {
+ let base_addr: [f64; 2] = [1., 2.];
+ let src = _mm_setr_pd(5., 6.);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_pd(2., 6.);
+ assert_eq_m128d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mmask_i64gather_ps() {
+ let base_addr: [f32; 2] = [1., 2.];
+ let src = _mm_setr_ps(5., 6., 7., 8.);
+ let vindex = _mm_setr_epi64x(1, 0);
+ let r = _mm_mmask_i64gather_ps::<4>(src, 0b01, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_ps(2., 6., 0., 0.);
+ assert_eq_m128(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_epi32() {
+ let base_addr: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+ let src = _mm256_setr_epi32(9, 10, 11, 12, 13, 14, 15, 16);
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let r =
+ _mm256_mmask_i32gather_epi32::<4>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_epi32(2, 10, 4, 12, 6, 14, 8, 16);
+ assert_eq_m256i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_epi64() {
+ let base_addr: [i64; 4] = [1, 2, 3, 4];
+ let src = _mm256_setr_epi64x(9, 10, 11, 12);
+ let vindex = _mm_setr_epi32(1, 2, 3, 4);
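+ // Lane 3's index (4) is out of bounds for `base_addr`, but its mask bit is 0,
+ // so that lane is never loaded and copies from `src` instead.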
+ let r = _mm256_mmask_i32gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_epi64x(2, 10, 4, 12);
+ assert_eq_m256i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_pd() {
+ let base_addr: [f64; 4] = [1., 2., 3., 4.];
+ let src = _mm256_setr_pd(9., 10., 11., 12.);
+ let vindex = _mm_setr_epi32(1, 2, 3, 4);
+ let r = _mm256_mmask_i32gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_pd(2., 10., 4., 12.);
+ assert_eq_m256d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i32gather_ps() {
+ let base_addr: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
+ let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let r = _mm256_mmask_i32gather_ps::<4>(src, 0b01010101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_ps(2., 10., 4., 12., 6., 14., 8., 16.);
+ assert_eq_m256(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_epi32() {
+ let base_addr: [i32; 4] = [1, 2, 3, 4];
+ let src = _mm_setr_epi32(9, 10, 11, 12);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_epi32(2, 10, 4, 12);
+ assert_eq_m128i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_epi64() {
+ let base_addr: [i64; 4] = [1, 2, 3, 4];
+ let src = _mm256_setr_epi64x(9, 10, 11, 12);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_epi64x(2, 10, 4, 12);
+ assert_eq_m256i(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_pd() {
+ let base_addr: [f64; 4] = [1., 2., 3., 4.];
+ let src = _mm256_setr_pd(9., 10., 11., 12.);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm256_setr_pd(2., 10., 4., 12.);
+ assert_eq_m256d(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mmask_i64gather_ps() {
+ let base_addr: [f32; 4] = [1., 2., 3., 4.];
+ let src = _mm_setr_ps(9., 10., 11., 12.);
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let r = _mm256_mmask_i64gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast());
+ let expected = _mm_setr_ps(2., 10., 4., 12.);
+ assert_eq_m128(expected, r);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
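+ // Unmasked scatter: every lane of `src` is stored at `base_addr[vindex[j]]`;
+ // scale 4 matches `size_of::<i32>()`.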
+ _mm_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
+ _mm_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi32(1, 0, -1, -1);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i32scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i32scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_epi32() {
+ let mut base_addr: [i32; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi32(2, 1, -1, -1);
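+ // Two 64-bit indices, so only the low two 32-bit lanes of `src` are stored.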
+ _mm_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_epi32() {
+ let mut base_addr: [i32; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi32(2, 1, -1, -1);
+ _mm_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_epi64() {
+ let mut base_addr: [i64; 2] = [0; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_epi64x(2, 1);
+ _mm_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0, 2];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_pd() {
+ let mut base_addr: [f64; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_pd(2., 1.);
+ _mm_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_i64scatter_ps() {
+ let mut base_addr: [f32; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_ps(2., 1., -1., -1.);
+ _mm_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm_mask_i64scatter_ps() {
+ let mut base_addr: [f32; 2] = [0.; 2];
+ let vindex = _mm_setr_epi64x(1, 0);
+ let src = _mm_setr_ps(2., 1., -1., -1.);
+ _mm_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src);
+ let expected = [0., 2.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_epi32() {
+ let mut base_addr: [i32; 8] = [0; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm256_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4, 5, 6, 7, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_epi32() {
+ let mut base_addr: [i32; 8] = [0; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1);
+ _mm256_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0, 2, 0, 4, 0, 6, 0, 8];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm_setr_epi32(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i32scatter_ps() {
+ let mut base_addr: [f32; 8] = [0.; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm256_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4., 5., 6., 7., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i32scatter_ps() {
+ let mut base_addr: [f32; 8] = [0.; 8];
+ let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
+ let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.);
+ _mm256_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src);
+ let expected = [0., 2., 0., 4., 0., 6., 0., 8.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
+ _mm256_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_epi32() {
+ let mut base_addr: [i32; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_epi32(2, 3, 4, 1);
+ _mm256_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1, 2, 3, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_epi64() {
+ let mut base_addr: [i64; 4] = [0; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_epi64x(2, 3, 4, 1);
+ _mm256_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0, 2, 0, 4];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_pd() {
+ let mut base_addr: [f64; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm256_setr_pd(2., 3., 4., 1.);
+ _mm256_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_i64scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm256_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src);
+ let expected = [1., 2., 3., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
+ #[simd_test(enable = "avx512f,avx512vl")]
+ unsafe fn test_mm256_mask_i64scatter_ps() {
+ let mut base_addr: [f32; 4] = [0.; 4];
+ let vindex = _mm256_setr_epi64x(1, 2, 3, 0);
+ let src = _mm_setr_ps(2., 3., 4., 1.);
+ _mm256_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src);
+ let expected = [0., 2., 0., 4.];
+ assert_eq!(expected, base_addr);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_rol_epi64() {
#[rustfmt::skip]
diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs
index e5069058cd..8f85d4e282 100644
--- a/crates/core_arch/src/x86_64/sse2.rs
+++ b/crates/core_arch/src/x86_64/sse2.rs
@@ -79,8 +79,8 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
crate::arch::asm!(
- "movnti [{mem_addr}], {a}",
- mem_addr = in(reg) mem_addr,
+ "movnti [{p}], {a}",
+ p = in(reg) mem_addr,
a = in(reg) a,
options(nostack, preserves_flags),
);
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index a8c2d36e11..a2835e3b0c 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -84,7 +84,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
// 2. It is a mark, indicating that the instruction will be
// compiled into other instructions - mainly because of llvm
// optimization.
- let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected));
+ let found = expected == "nop" || instrs.iter().any(|s| s.contains(expected));
// Look for subroutine call instructions in the disassembly to detect whether
// inlining failed: all intrinsics are `#[inline(always)]`, so calling one
diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs
index ff31c31c89..94569dfd0c 100644
--- a/crates/stdarch-verify/src/lib.rs
+++ b/crates/stdarch-verify/src/lib.rs
@@ -197,6 +197,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream {
"_MM_MANTISSA_SIGN_ENUM" => quote! { &MM_MANTISSA_SIGN_ENUM },
"_MM_PERM_ENUM" => quote! { &MM_PERM_ENUM },
"bool" => quote! { &BOOL },
+ "bf16" => quote! { &BF16 },
"f32" => quote! { &F32 },
"f64" => quote! { &F64 },
"i16" => quote! { &I16 },
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index 15d2454f43..8de2c88b81 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -22,6 +22,7 @@ struct Function {
has_test: bool,
}
+static BF16: Type = Type::BFloat16;
static F32: Type = Type::PrimFloat(32);
static F64: Type = Type::PrimFloat(64);
static I8: Type = Type::PrimSigned(8);
@@ -65,6 +66,7 @@ enum Type {
PrimFloat(u8),
PrimSigned(u8),
PrimUnsigned(u8),
+ BFloat16,
MutPtr(&'static Type),
ConstPtr(&'static Type),
M128,
@@ -700,6 +702,7 @@ fn equate(
(&Type::PrimSigned(64), "__int64" | "long long") => {}
(&Type::PrimUnsigned(8), "unsigned char") => {}
(&Type::PrimUnsigned(16), "unsigned short") => {}
+ (&Type::BFloat16, "__bfloat16") => {}
(
&Type::PrimUnsigned(32),
"unsigned __int32" | "unsigned int" | "unsigned long" | "const unsigned int",
@@ -761,6 +764,7 @@ fn equate(
(&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {}
(&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {}
(&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {}
+ (&Type::ConstPtr(&Type::BFloat16), "__bf16 const*") => {}
(&Type::ConstPtr(&Type::M128), "__m128 const*") => {}
(&Type::ConstPtr(&Type::M128BH), "__m128bh const*") => {}