From 4fc85989c9c721237b1d5b7d082c49d3579a59e7 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sun, 30 Jun 2024 01:42:36 +0530 Subject: [PATCH 1/6] Fix the stream intrinsics They should use a platform-specific address management. --- crates/core_arch/src/x86/avx.rs | 12 ++--- crates/core_arch/src/x86/avx2.rs | 4 +- crates/core_arch/src/x86/avx512bw.rs | 2 - crates/core_arch/src/x86/avx512f.rs | 69 +++++++++++++--------------- crates/core_arch/src/x86/macros.rs | 30 ++++++++++++ crates/core_arch/src/x86/sse.rs | 4 +- crates/core_arch/src/x86/sse2.rs | 12 ++--- crates/core_arch/src/x86/sse41.rs | 4 +- crates/core_arch/src/x86_64/sse2.rs | 4 +- 9 files changed, 82 insertions(+), 59 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index aa5a5d8c18..7726a188f2 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1738,8 +1738,8 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { crate::arch::asm!( - "vmovntdq [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntdq", ",{a}"), + p = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), ); @@ -1766,8 +1766,8 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { crate::arch::asm!( - "vmovntpd [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntpd", ",{a}"), + p = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), ); @@ -1795,8 +1795,8 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntps", ",{a}"), + p = in(reg) mem_addr, a = in(ymm_reg) a, options(nostack, preserves_flags), ); diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 0343416a92..fa32c7fcc4 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -3149,9 +3149,9 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i { let dst: __m256i; crate::arch::asm!( - "vmovntdqa {a}, [{mem_addr}]", + vpl!("vmovntdqa {a}"), a = out(ymm_reg) dst, - mem_addr = in(reg) mem_addr, + p = in(reg) mem_addr, options(pure, readonly, nostack, preserves_flags), ); dst diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 1f786d01f0..dd74d11786 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -8,8 +8,6 @@ use crate::{ #[cfg(test)] use stdarch_test::assert_instr; -use super::avx512f::{vpl, vps}; - /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_epi16&expand=30) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 8c88d3aa2f..ef7a7ccaeb 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -6,37 +6,6 @@ use crate::{ mem, ptr, }; -// x86-32 wants to use a 32-bit address size, but asm! 
defaults to using the full -// register name (e.g. rax). We have to explicitly override the placeholder to -// use the 32-bit register name in that case. - -#[cfg(target_pointer_width = "32")] -macro_rules! vpl { - ($inst:expr) => { - concat!($inst, ", [{p:e}]") - }; -} -#[cfg(target_pointer_width = "64")] -macro_rules! vpl { - ($inst:expr) => { - concat!($inst, ", [{p}]") - }; -} -#[cfg(target_pointer_width = "32")] -macro_rules! vps { - ($inst1:expr, $inst2:expr) => { - concat!($inst1, " [{p:e}]", $inst2) - }; -} -#[cfg(target_pointer_width = "64")] -macro_rules! vps { - ($inst1:expr, $inst2:expr) => { - concat!($inst1, " [{p}]", $inst2) - }; -} - -pub(crate) use {vpl, vps}; - #[cfg(test)] use stdarch_test::assert_instr; @@ -27899,8 +27868,8 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { crate::arch::asm!( - "vmovntps [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntps", ",{a}"), + p = in(reg) mem_addr, a = in(zmm_reg) a, options(nostack, preserves_flags), ); @@ -27925,8 +27894,8 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { crate::arch::asm!( - "vmovntpd [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntpd", ",{a}"), + p = in(reg) mem_addr, a = in(zmm_reg) a, options(nostack, preserves_flags), ); @@ -27951,13 +27920,32 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) { crate::arch::asm!( - "vmovntdq [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("vmovntdq", ",{a}"), + p = in(reg) mem_addr, a = in(zmm_reg) a, options(nostack, preserves_flags), ); } +/// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr +/// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si256) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i { + let dst: __m512i; + crate::arch::asm!( + vpl!("vmovntdqa {a}"), + a = out(zmm_reg) dst, + p = in(reg) mem_addr, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + /// Sets packed 32-bit integers in `dst` with the supplied values. /// /// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931) @@ -54587,6 +54575,13 @@ mod tests { } } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_load_si512() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _); + assert_eq_m512i(a, r); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_add_epi32() { let a = _mm512_set1_epi32(1); diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 17d64f5bbf..ddf38aa506 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -57,3 +57,33 @@ macro_rules! 
assert_approx_eq { ); }}; } + +// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. + +#[cfg(target_pointer_width = "32")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p:e}]") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p}]") + }; +} + +#[cfg(target_pointer_width = "32")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p:e}]", $inst2) + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p}]", $inst2) + }; +} diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index a4602301de..ea6e685acb 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1992,8 +1992,8 @@ extern "C" { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { crate::arch::asm!( - "movntps [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("movntps", ",{a}"), + p = in(reg) mem_addr, a = in(xmm_reg) a, options(nostack, preserves_flags), ); diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 289d41a0ff..0dee597410 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1312,8 +1312,8 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { crate::arch::asm!( - "movntdq [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("movntdq", ",{a}"), + p = in(reg) mem_addr, a = in(xmm_reg) a, options(nostack, preserves_flags), ); @@ -1339,8 +1339,8 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { crate::arch::asm!( - "movnti [{mem_addr}], {a:e}", // `:e` for 32bit value - mem_addr = in(reg) mem_addr, + vps!("movnti", ",{a:e}"), // `:e` for 32bit value + p = in(reg) mem_addr, a = in(reg) a, options(nostack, preserves_flags), ); @@ -2542,8 +2542,8 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { crate::arch::asm!( - "movntpd [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + vps!("movntpd", ",{a}"), + p = in(reg) mem_addr, a = in(xmm_reg) a, options(nostack, preserves_flags), ); diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index c8b260bec8..daf89bc3fd 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -1154,9 +1154,9 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { let dst: __m128i; crate::arch::asm!( - "movntdqa {a}, [{mem_addr}]", + vpl!("movntdqa {a}"), a = out(xmm_reg) dst, - mem_addr = in(reg) mem_addr, + p = in(reg) mem_addr, options(pure, readonly, nostack, preserves_flags), ); dst diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs index e5069058cd..8f85d4e282 100644 --- a/crates/core_arch/src/x86_64/sse2.rs +++ b/crates/core_arch/src/x86_64/sse2.rs @@ -79,8 +79,8 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { 
#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { crate::arch::asm!( - "movnti [{mem_addr}], {a}", - mem_addr = in(reg) mem_addr, + "movnti [{p}], {a}", + p = in(reg) mem_addr, a = in(reg) a, options(nostack, preserves_flags), ); From fdc73f0150c4804ae8d775d2bb863eb76e05b785 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sun, 30 Jun 2024 12:34:35 +0530 Subject: [PATCH 2/6] Implemented missing gather-scatters --- crates/core_arch/missing-x86.md | 72 -- crates/core_arch/src/x86/avx512f.rs | 1444 +++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 574 +++++++++- 3 files changed, 1981 insertions(+), 109 deletions(-) diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md index 4c70c1b435..8d75a237a4 100644 --- a/crates/core_arch/missing-x86.md +++ b/crates/core_arch/missing-x86.md @@ -147,78 +147,6 @@

-<details><summary>["AVX512F"]</summary><p>

- - * [ ] [`_mm512_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64) - * [ ] [`_mm512_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd) - * [ ] [`_mm512_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64) - * [ ] [`_mm512_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd) - * [ ] [`_mm512_mask_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64) - * [ ] [`_mm512_mask_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd) - * [ ] [`_mm512_mask_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64) - * [ ] [`_mm512_mask_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd) - * [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512) - * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd) - * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss) - * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd) - * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss) - * [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd) - * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss) -

-</p></details>
-
-
-<details><summary>["AVX512F", "AVX512VL"]</summary><p>

- - * [ ] [`_mm256_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32) - * [ ] [`_mm256_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd) - * [ ] [`_mm256_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps) - * [ ] [`_mm256_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32) - * [ ] [`_mm256_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64) - * [ ] [`_mm256_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd) - * [ ] [`_mm256_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps) - * [ ] [`_mm256_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32) - * [ ] [`_mm256_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi64) - * [ ] [`_mm256_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_pd) - * [ ] [`_mm256_mask_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps) - * [ ] [`_mm256_mask_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32) - * [ ] [`_mm256_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64) - * [ ] [`_mm256_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd) - * [ ] [`_mm256_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps) - * [ ] [`_mm256_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32) - * [ ] [`_mm256_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64) - * [ ] [`_mm256_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd) - * [ ] [`_mm256_mmask_i32gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps) - * [ ] [`_mm256_mmask_i64gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32) - * [ ] [`_mm256_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64) - * [ ] [`_mm256_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd) - * [ ] [`_mm256_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps) - * [ ] [`_mm_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32) - * [ ] [`_mm_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64) - * [ ] [`_mm_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd) - * [ ] [`_mm_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps) - * [ ] 
[`_mm_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32) - * [ ] [`_mm_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64) - * [ ] [`_mm_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd) - * [ ] [`_mm_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps) - * [ ] [`_mm_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32) - * [ ] [`_mm_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64) - * [ ] [`_mm_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd) - * [ ] [`_mm_mask_i32scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps) - * [ ] [`_mm_mask_i64scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32) - * [ ] [`_mm_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64) - * [ ] [`_mm_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd) - * [ ] [`_mm_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps) - * [ ] [`_mm_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32) - * [ ] [`_mm_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64) - * [ ] [`_mm_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd) - * [ ] [`_mm_mmask_i32gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps) - * [ ] [`_mm_mmask_i64gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32) - * [ ] [`_mm_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64) - * [ ] [`_mm_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd) - * [ ] [`_mm_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps) -

-</p></details>
-
<details><summary>["AVX512_BF16", "AVX512F"]</summary><p>

* [ ] [`_mm512_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index ef7a7ccaeb..fc82344348 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -16534,27 +16534,6 @@ pub unsafe fn _mm512_mask_i32scatter_epi64( vpscatterdq(slice, mask, offsets, src, SCALE); } -/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099) -#[inline] -#[target_feature(enable = "avx512f,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_i32scatter_epi64( - slice: *mut u8, - offsets: __m128i, - src: __m256i, -) { - static_assert_imm8_scale!(SCALE); - let src = src.as_i64x4(); - let neg_one = -1; - let slice = slice as *mut i8; - let offsets = offsets.as_i32x4(); - vpscatterdq256(slice, neg_one, offsets, src, SCALE); -} - /// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_i64scatter_epi64&expand=3116) @@ -16684,6 +16663,1153 @@ pub unsafe fn _mm512_mask_i64scatter_epi32( vpscatterqd(slice, mask, offsets, src, SCALE); } +/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in the lower half of vindex scaled by scale and stores them in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_i32logather_epi64( + vindex: __m512i, + base_addr: *const u8, +) -> __m512i { + _mm512_i32gather_epi64::(_mm512_castsi512_si256(vindex), base_addr as _) +} + +/// Loads 8 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in the lower half of vindex scaled by scale and stores them in dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_mask_i32logather_epi64( + src: __m512i, + k: __mmask8, + vindex: __m512i, + base_addr: *const u8, +) -> __m512i { + _mm512_mask_i32gather_epi64::(src, k, _mm512_castsi512_si256(vindex), base_addr as _) +} + +/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(2)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_i32logather_pd( + vindex: __m512i, + base_addr: *const u8, +) -> __m512d { + _mm512_i32gather_pd::(_mm512_castsi512_si256(vindex), base_addr as _) +} + +/// Loads 8 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale and stores them in dst +/// using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_mask_i32logather_pd( + src: __m512d, + k: __mmask8, + vindex: __m512i, + base_addr: *const u8, +) -> __m512d { + _mm512_mask_i32gather_pd::(src, k, _mm512_castsi512_si256(vindex), base_addr as _) +} + +/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in the lower half of vindex scaled by scale. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_i32loscatter_epi64( + base_addr: *mut u8, + vindex: __m512i, + a: __m512i, +) { + _mm512_i32scatter_epi64::(base_addr as _, _mm512_castsi512_si256(vindex), a) +} + +/// Stores 8 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in the lower half of vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_mask_i32loscatter_epi64( + base_addr: *mut u8, + k: __mmask8, + vindex: __m512i, + a: __m512i, +) { + _mm512_mask_i32scatter_epi64::(base_addr as _, k, _mm512_castsi512_si256(vindex), a) +} + +/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_i32loscatter_pd( + base_addr: *mut u8, + vindex: __m512i, + a: __m512d, +) { + _mm512_i32scatter_pd::(base_addr as _, _mm512_castsi512_si256(vindex), a) +} + +/// Stores 8 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in the lower half of vindex scaled by scale using writemask k +/// (elements whose corresponding mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_mask_i32loscatter_pd( + base_addr: *mut u8, + k: __mmask8, + vindex: __m512i, + a: __m512d, +) { + _mm512_mask_i32scatter_pd::(base_addr as _, k, _mm512_castsi512_si256(vindex), a) +} + +/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_i32scatter_epi32( + base_addr: *mut u8, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_i32x8(), SCALE) +} + +/// Stores 8 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i32scatter_epi32( + base_addr: *mut u8, + k: __mmask8, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_256(base_addr as _, k, vindex.as_i32x8(), a.as_i32x8(), SCALE) +} + +/// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32scatter_epi64&expand=4099) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_i32scatter_epi64( + slice: *mut u8, + offsets: __m128i, + src: __m256i, +) { + static_assert_imm8_scale!(SCALE); + let src = src.as_i64x4(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x4(); + vpscatterdq_256(slice, 0xff, offsets, src, SCALE); +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i32scatter_epi64( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_256(base_addr as _, k, vindex.as_i32x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_i32scatter_pd( + base_addr: *mut u8, + vindex: __m128i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_256(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i32scatter_pd( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_256(base_addr as _, k, vindex.as_i32x4(), a.as_f64x4(), SCALE) +} + +/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_i32scatter_ps( + base_addr: *mut u8, + vindex: __m256i, + a: __m256, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_256(base_addr as _, 0xff, vindex.as_i32x8(), a.as_f32x8(), SCALE) +} + +/// Stores 8 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i32scatter_ps( + base_addr: *mut u8, + k: __mmask8, + vindex: __m256i, + a: __m256, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_256(base_addr as _, k, vindex.as_i32x8(), a.as_f32x8(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_i64scatter_epi32( + base_addr: *mut u8, + vindex: __m256i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i64scatter_epi32( + base_addr: *mut u8, + k: __mmask8, + vindex: __m256i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_256(base_addr as _, k, vindex.as_i64x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_i64scatter_epi64( + base_addr: *mut u8, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i64scatter_epi64( + base_addr: *mut u8, + k: __mmask8, + vindex: __m256i, + a: __m256i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_256(base_addr as _, k, vindex.as_i64x4(), a.as_i64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_i64scatter_pd( + base_addr: *mut u8, + vindex: __m256i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterqpd_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i64scatter_pd( + base_addr: *mut u8, + k: __mmask8, + vindex: __m256i, + a: __m256d, +) { + static_assert_imm8_scale!(SCALE); + vscatterqpd_256(base_addr as _, k, vindex.as_i64x4(), a.as_f64x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_i64scatter_ps( + base_addr: *mut u8, + vindex: __m256i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterqps_256(base_addr as _, 0xff, vindex.as_i64x4(), a.as_f32x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_i64scatter_ps( + base_addr: *mut u8, + k: __mmask8, + vindex: __m256i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterqps_256(base_addr as _, k, vindex.as_i64x4(), a.as_f32x4(), SCALE) +} + +/// Loads 8 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i32gather_epi32( + src: __m256i, + k: __mmask8, + vindex: __m256i, + base_addr: *const u8, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdd_256( + src.as_i32x8(), + base_addr as _, + vindex.as_i32x8(), + k, + SCALE, + )) +} + +/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i32gather_epi64( + src: __m256i, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdq_256( + src.as_i64x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i32gather_pd( + src: __m256d, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdpd_256( + src.as_f64x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 8 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i32gather_ps( + src: __m256, + k: __mmask8, + vindex: __m256i, + base_addr: *const u8, +) -> __m256 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdps_256( + src.as_f32x8(), + base_addr as _, + vindex.as_i32x8(), + k, + SCALE, + )) +} + +/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i64gather_epi32( + src: __m128i, + k: __mmask8, + vindex: __m256i, + base_addr: *const u8, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqd_256( + src.as_i32x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i64gather_epi64( + src: __m256i, + k: __mmask8, + vindex: __m256i, + base_addr: *const u8, +) -> __m256i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqq_256( + src.as_i64x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i64gather_pd( + src: __m256d, + k: __mmask8, + vindex: __m256i, + base_addr: *const u8, +) -> __m256d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqpd_256( + src.as_f64x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mmask_i64gather_ps( + src: __m128, + k: __mmask8, + vindex: __m256i, + base_addr: *const u8, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqps_256( + src.as_f32x4(), + base_addr as _, + vindex.as_i64x4(), + k, + SCALE, + )) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_i32scatter_epi32( + base_addr: *mut u8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i32x4(), SCALE) +} + +/// Stores 4 32-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_i32scatter_epi32( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdd_128(base_addr as _, k, vindex.as_i32x4(), a.as_i32x4(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_i32scatter_epi64( + base_addr: *mut u8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_i64x2(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 32-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_i32scatter_epi64( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterdq_128(base_addr as _, k, vindex.as_i32x4(), a.as_i64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_i32scatter_pd(base_addr: *mut u8, vindex: __m128i, a: __m128d) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_i32scatter_pd( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m128d, +) { + static_assert_imm8_scale!(SCALE); + vscatterdpd_128(base_addr as _, k, vindex.as_i32x4(), a.as_f64x2(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_i32scatter_ps(base_addr: *mut u8, vindex: __m128i, a: __m128) { + static_assert_imm8_scale!(SCALE); + vscatterdps_128(base_addr as _, 0xff, vindex.as_i32x4(), a.as_f32x4(), SCALE) +} + +/// Stores 4 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_i32scatter_ps( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m128, +) { + static_assert_imm8_scale!(SCALE); + vscatterdps_128(base_addr as _, k, vindex.as_i32x4(), a.as_f32x4(), SCALE) +} + +/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_i64scatter_epi32( + base_addr: *mut u8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i32x4(), SCALE) +} + +/// Stores 2 32-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_i64scatter_epi32( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqd_128(base_addr as _, k, vindex.as_i64x2(), a.as_i32x4(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_i64scatter_epi64( + base_addr: *mut u8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_i64x2(), SCALE) +} + +/// Stores 2 64-bit integer elements from a to memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements whose corresponding mask bit is not set +/// are not written to memory). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_i64scatter_epi64( + base_addr: *mut u8, + k: __mmask8, + vindex: __m128i, + a: __m128i, +) { + static_assert_imm8_scale!(SCALE); + vpscatterqq_128(base_addr as _, k, vindex.as_i64x2(), a.as_i64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_i64scatter_pd(base_addr: *mut u8, vindex: __m128i, a: __m128d) { + static_assert_imm8_scale!(SCALE); + vscatterqpd_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f64x2(), SCALE) +} + +/// Stores 2 double-precision (64-bit) floating-point elements from a to memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding +/// mask bit is not set are not written to memory). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_pd<const SCALE: i32>(
+    base_addr: *mut u8,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128d,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqpd_128(base_addr as _, k, vindex.as_i64x2(), a.as_f64x2(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_i64scatter_ps<const SCALE: i32>(base_addr: *mut u8, vindex: __m128i, a: __m128) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqps_128(base_addr as _, 0xff, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Stores 2 single-precision (32-bit) floating-point elements from a to memory starting at location base_addr
+/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements whose corresponding
+/// mask bit is not set are not written to memory).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_i64scatter_ps<const SCALE: i32>(
+    base_addr: *mut u8,
+    k: __mmask8,
+    vindex: __m128i,
+    a: __m128,
+) {
+    static_assert_imm8_scale!(SCALE);
+    vscatterqps_128(base_addr as _, k, vindex.as_i64x2(), a.as_f32x4(), SCALE)
+}
+
+/// Loads 4 32-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mmask_i32gather_epi32<const SCALE: i32>(
+    src: __m128i,
+    k: __mmask8,
+    vindex: __m128i,
+    base_addr: *const u8,
+) -> __m128i {
+    static_assert_imm8_scale!(SCALE);
+    transmute(vpgatherdd_128(
+        src.as_i32x4(),
+        base_addr as _,
+        vindex.as_i32x4(),
+        k,
+        SCALE,
+    ))
+}
+
+/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 32-bit integer
+/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding
+/// mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mmask_i32gather_epi64( + src: __m128i, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherdq_128( + src.as_i64x2(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mmask_i32gather_pd( + src: __m128d, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m128d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdpd_128( + src.as_f64x2(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 4 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 32-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mmask_i32gather_ps( + src: __m128, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherdps_128( + src.as_f32x4(), + base_addr as _, + vindex.as_i32x4(), + k, + SCALE, + )) +} + +/// Loads 2 32-bit integer elements from memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mmask_i64gather_epi32( + src: __m128i, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqd_128( + src.as_i32x4(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + +/// Loads 2 64-bit integer elements from memory starting at location base_addr at packed 64-bit integer +/// indices stored in vindex scaled by scale using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mmask_i64gather_epi64( + src: __m128i, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m128i { + static_assert_imm8_scale!(SCALE); + transmute(vpgatherqq_128( + src.as_i64x2(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + +/// Loads 2 double-precision (64-bit) floating-point elements from memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mmask_i64gather_pd( + src: __m128d, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m128d { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqpd_128( + src.as_f64x2(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + +/// Loads 2 single-precision (32-bit) floating-point elements from memory starting at location base_addr +/// at packed 64-bit integer indices stored in vindex scaled by scale using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))] +#[rustc_legacy_const_generics(4)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mmask_i64gather_ps( + src: __m128, + k: __mmask8, + vindex: __m128i, + base_addr: *const u8, +) -> __m128 { + static_assert_imm8_scale!(SCALE); + transmute(vgatherqps_128( + src.as_f32x4(), + base_addr as _, + vindex.as_i64x2(), + k, + SCALE, + )) +} + /// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_compress_epi32&expand=1198) @@ -33865,6 +34991,94 @@ pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { dst } +/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper +/// 3 packed elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "sse,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_load_ss(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + vpl!("vmovss {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a single-precision (32-bit) floating-point element from memory into the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper 3 packed +/// elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "sse,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_maskz_load_ss(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + vpl!("vmovss {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst +/// using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper +/// element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection +/// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "sse,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_load_sd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + vpl!("vmovsd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + +/// Load a double-precision (64-bit) floating-point element from memory into the lower element of dst +/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element +/// of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception +/// may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "sse,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + vpl!("vmovsd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack, preserves_flags), + ); + dst +} + /// Store packed 32-bit integers from a into memory using writemask k. /// mem_addr does not need to be aligned on any particular boundary. 
/// @@ -34285,6 +35499,42 @@ pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) ); } +/// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. mem_addr +/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss) +#[inline] +#[cfg_attr(test, assert_instr(vmovss))] +#[target_feature(enable = "sse,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_store_ss(mem_addr: *mut f32, k: __mmask8, a: __m128) { + asm!( + vps!("vmovss", "{{{k}}}, {a}"), + p = in(reg) mem_addr, + k = in(kreg) k, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + +/// Store a double-precision (64-bit) floating-point element from a into memory using writemask k. mem_addr +/// must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd) +#[inline] +#[cfg_attr(test, assert_instr(vmovsd))] +#[target_feature(enable = "sse,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_store_sd(mem_addr: *mut f64, k: __mmask8, a: __m128d) { + asm!( + vps!("vmovsd", "{{{k}}}, {a}"), + p = in(reg) mem_addr, + k = in(kreg) k, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); +} + /// Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_expandloadu_epi32) @@ -41183,8 +42433,6 @@ extern "C" { fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); #[link_name = "llvm.x86.avx512.scatter.dpq.512"] fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scattersiv4.di"] - fn vpscatterdq256(slice: *mut i8, mask: i8, offsets: i32x4, src: i64x4, scale: i32); #[link_name = "llvm.x86.avx512.scatter.dpi.512"] fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32); @@ -41193,6 +42441,74 @@ extern "C" { #[link_name = "llvm.x86.avx512.scatter.qpi.512"] fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.si"] + fn vpscatterdd_128(slice: *mut i8, k: u8, offsets: i32x4, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv2.di"] + fn vpscatterdq_128(slice: *mut i8, k: u8, offsets: i32x4, src: i64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv2.df"] + fn vscatterdpd_128(slice: *mut i8, k: u8, offsets: i32x4, src: f64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.sf"] + fn vscatterdps_128(slice: *mut i8, k: u8, offsets: i32x4, src: f32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.si"] + fn vpscatterqd_128(slice: *mut i8, k: u8, offsets: i64x2, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv2.di"] + fn vpscatterqq_128(slice: *mut i8, k: u8, offsets: i64x2, src: i64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv2.df"] + fn vscatterqpd_128(slice: *mut i8, k: u8, offsets: i64x2, src: f64x2, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.sf"] + fn vscatterqps_128(slice: *mut i8, k: u8, offsets: i64x2, src: f32x4, scale: i32); + + #[link_name = "llvm.x86.avx512.scattersiv8.si"] + fn vpscatterdd_256(slice: *mut i8, k: u8, offsets: i32x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.di"] + fn vpscatterdq_256(slice: *mut i8, k: u8, offsets: i32x4, src: i64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv4.df"] + fn vscatterdpd_256(slice: *mut i8, k: u8, offsets: i32x4, src: f64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scattersiv8.sf"] + fn vscatterdps_256(slice: *mut i8, k: u8, offsets: i32x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv8.si"] + fn vpscatterqd_256(slice: *mut i8, k: u8, offsets: i64x4, src: i32x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.di"] + fn vpscatterqq_256(slice: *mut i8, k: u8, offsets: i64x4, src: i64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv4.df"] + fn vscatterqpd_256(slice: *mut i8, k: u8, offsets: i64x4, src: f64x4, scale: i32); + #[link_name = "llvm.x86.avx512.scatterdiv8.sf"] + fn vscatterqps_256(slice: *mut i8, k: u8, offsets: i64x4, src: f32x4, scale: i32); + + #[link_name = "llvm.x86.avx512.gather3siv4.si"] + fn vpgatherdd_128(src: i32x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3siv2.di"] + fn vpgatherdq_128(src: i64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x2; + #[link_name = "llvm.x86.avx512.gather3siv2.df"] + fn vgatherdpd_128(src: f64x2, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.gather3siv4.sf"] + fn vgatherdps_128(src: f32x4, slice: *const u8, offsets: i32x4, k: u8, 
scale: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.gather3div4.si"] + fn vpgatherqd_128(src: i32x4, slice: *const u8, offsets: i64x2, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3div2.di"] + fn vpgatherqq_128(src: i64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> i64x2; + #[link_name = "llvm.x86.avx512.gather3div2.df"] + fn vgatherqpd_128(src: f64x2, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.gather3div4.sf"] + fn vgatherqps_128(src: f32x4, slice: *const i8, offsets: i64x2, k: u8, scale: i32) -> f32x4; + + #[link_name = "llvm.x86.avx512.gather3siv8.si"] + fn vpgatherdd_256(src: i32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.gather3siv4.di"] + fn vpgatherdq_256(src: i64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.gather3siv4.df"] + fn vgatherdpd_256(src: f64x4, slice: *const i8, offsets: i32x4, k: u8, scale: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.gather3siv8.sf"] + fn vgatherdps_256(src: f32x8, slice: *const i8, offsets: i32x8, k: u8, scale: i32) -> f32x8; + #[link_name = "llvm.x86.avx512.gather3div8.si"] + fn vpgatherqd_256(src: i32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i32x4; + #[link_name = "llvm.x86.avx512.gather3div4.di"] + fn vpgatherqq_256(src: i64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> i64x4; + #[link_name = "llvm.x86.avx512.gather3div4.df"] + fn vgatherqpd_256(src: f64x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f64x4; + #[link_name = "llvm.x86.avx512.gather3div8.sf"] + fn vgatherqps_256(src: f32x4, slice: *const i8, offsets: i64x4, k: u8, scale: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.cmp.ss"] fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.sd"] @@ -50274,6 +51590,60 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_load_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let src = _mm_set_ss(2.0); + let mem = Align { data: 1.0 }; + let r = _mm_mask_load_ss(src, 0b1, &mem.data); + assert_eq_m128(r, _mm_set_ss(1.0)); + let r = _mm_mask_load_ss(src, 0b0, &mem.data); + assert_eq_m128(r, _mm_set_ss(2.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_load_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let mem = Align { data: 1.0 }; + let r = _mm_maskz_load_ss(0b1, &mem.data); + assert_eq_m128(r, _mm_set_ss(1.0)); + let r = _mm_maskz_load_ss(0b0, &mem.data); + assert_eq_m128(r, _mm_set_ss(0.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_load_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let src = _mm_set_sd(2.0); + let mem = Align { data: 1.0 }; + let r = _mm_mask_load_sd(src, 0b1, &mem.data); + assert_eq_m128d(r, _mm_set_sd(1.0)); + let r = _mm_mask_load_sd(src, 0b0, &mem.data); + assert_eq_m128d(r, _mm_set_sd(2.0)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_load_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let mem = Align { data: 1.0 }; + let r = _mm_maskz_load_sd(0b1, &mem.data); + assert_eq_m128d(r, _mm_set_sd(1.0)); + let r = _mm_maskz_load_sd(0b0, &mem.data); + assert_eq_m128d(r, _mm_set_sd(0.0)); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_storeu_pd() { let mut r = [42_f64; 2]; @@ -50298,6 +51668,34 @@ mod tests { 
assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_store_ss() { + #[repr(align(16))] + struct Align { + data: f32, + } + let a = _mm_set_ss(2.0); + let mut mem = Align { data: 1.0 }; + _mm_mask_store_ss(&mut mem.data, 0b1, a); + assert_eq!(mem.data, 2.0); + _mm_mask_store_ss(&mut mem.data, 0b0, a); + assert_eq!(mem.data, 2.0); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_store_sd() { + #[repr(align(16))] + struct Align { + data: f64, + } + let a = _mm_set_sd(2.0); + let mut mem = Align { data: 1.0 }; + _mm_mask_store_sd(&mut mem.data, 0b1, a); + assert_eq!(mem.data, 2.0); + _mm_mask_store_sd(&mut mem.data, 0b0, a); + assert_eq!(mem.data, 2.0); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_setr_pd() { let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index a2b2496caf..5ea6dcc026 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -7649,20 +7649,6 @@ mod tests { assert_eq!(&arr[..], &expected[..],); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_i32scatter_epi64() { - let mut arr = [0i64; 64]; - let index = _mm_setr_epi32(0, 16, 32, 48); - let src = _mm256_setr_epi64x(1, 2, 3, 4); - // A multiplier of 8 is word-addressing - _mm256_i32scatter_epi64::<8>(arr.as_mut_ptr() as *mut u8, index, src); - let mut expected = [0i64; 64]; - for i in 0..4 { - expected[i * 16] = (i + 1) as i64; - } - assert_eq!(&arr[..], &expected[..],); - } - #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i64scatter_epi64() { let mut arr = [0i64; 128]; @@ -7721,6 +7707,566 @@ mod tests { assert_eq!(&arr[..], &expected[..],); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32logather_epi64() { + let base_addr: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_i32logather_epi64::<8>(vindex, base_addr.as_ptr().cast()); + let expected = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1); + assert_eq_m512i(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32logather_epi64() { + let base_addr: [i64; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let src = _mm512_setr_epi64(9, 10, 11, 12, 13, 14, 15, 16); + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = + _mm512_mask_i32logather_epi64::<8>(src, 0b01010101, vindex, base_addr.as_ptr().cast()); + let expected = _mm512_setr_epi64(2, 10, 4, 12, 6, 14, 8, 16); + assert_eq_m512i(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32logather_pd() { + let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_i32logather_pd::<8>(vindex, base_addr.as_ptr().cast()); + let expected = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.); + assert_eq_m512d(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32logather_pd() { + let base_addr: [f64; 8] = [1., 2., 3., 4., 5., 6., 7., 8.]; + let src = _mm512_setr_pd(9., 10., 11., 12., 13., 14., 15., 16.); + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let r = _mm512_mask_i32logather_pd::<8>(src, 0b01010101, vindex, base_addr.as_ptr().cast()); + let expected = _mm512_setr_pd(2., 10., 4., 12., 6., 14., 8., 16.); + 
assert_eq_m512d(expected, r); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32loscatter_epi64() { + let mut base_addr: [i64; 8] = [0; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1); + _mm512_i32loscatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2, 3, 4, 5, 6, 7, 8]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32loscatter_epi64() { + let mut base_addr: [i64; 8] = [0; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_epi64(2, 3, 4, 5, 6, 7, 8, 1); + _mm512_mask_i32loscatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src); + let expected = [0, 2, 0, 4, 0, 6, 0, 8]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32loscatter_pd() { + let mut base_addr: [f64; 8] = [0.; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.); + _mm512_i32loscatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2., 3., 4., 5., 6., 7., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32loscatter_pd() { + let mut base_addr: [f64; 8] = [0.; 8]; + let vindex = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1); + let src = _mm512_setr_pd(2., 3., 4., 5., 6., 7., 8., 1.); + _mm512_mask_i32loscatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src); + let expected = [0., 2., 0., 4., 0., 6., 0., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_epi32() { + let base_addr: [i32; 4] = [1, 2, 3, 4]; + let src = _mm_setr_epi32(5, 6, 7, 8); + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let r = _mm_mmask_i32gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_epi32(2, 6, 4, 8); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_epi64() { + let base_addr: [i64; 2] = [1, 2]; + let src = _mm_setr_epi64x(5, 6); + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let r = _mm_mmask_i32gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_epi64x(2, 6); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_pd() { + let base_addr: [f64; 2] = [1., 2.]; + let src = _mm_setr_pd(5., 6.); + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let r = _mm_mmask_i32gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_pd(2., 6.); + assert_eq_m128d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i32gather_ps() { + let base_addr: [f32; 4] = [1., 2., 3., 4.]; + let src = _mm_setr_ps(5., 6., 7., 8.); + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let r = _mm_mmask_i32gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_ps(2., 6., 4., 8.); + assert_eq_m128(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_epi32() { + let base_addr: [i32; 2] = [1, 2]; + let src = _mm_setr_epi32(5, 6, 7, 8); + let vindex = _mm_setr_epi64x(1, 0); + let r = 
_mm_mmask_i64gather_epi32::<4>(src, 0b01, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_epi32(2, 6, 0, 0); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_epi64() { + let base_addr: [i64; 2] = [1, 2]; + let src = _mm_setr_epi64x(5, 6); + let vindex = _mm_setr_epi64x(1, 0); + let r = _mm_mmask_i64gather_epi64::<8>(src, 0b01, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_epi64x(2, 6); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_pd() { + let base_addr: [f64; 2] = [1., 2.]; + let src = _mm_setr_pd(5., 6.); + let vindex = _mm_setr_epi64x(1, 0); + let r = _mm_mmask_i64gather_pd::<8>(src, 0b01, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_pd(2., 6.); + assert_eq_m128d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mmask_i64gather_ps() { + let base_addr: [f32; 2] = [1., 2.]; + let src = _mm_setr_ps(5., 6., 7., 8.); + let vindex = _mm_setr_epi64x(1, 0); + let r = _mm_mmask_i64gather_ps::<4>(src, 0b01, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_ps(2., 6., 0., 0.); + assert_eq_m128(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_epi32() { + let base_addr: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let src = _mm256_setr_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let r = + _mm256_mmask_i32gather_epi32::<4>(src, 0b01010101, vindex, base_addr.as_ptr().cast()); + let expected = _mm256_setr_epi32(2, 10, 4, 12, 6, 14, 8, 16); + assert_eq_m256i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_epi64() { + let base_addr: [i64; 4] = [1, 2, 3, 4]; + let src = _mm256_setr_epi64x(9, 10, 11, 12); + let vindex = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm256_mmask_i32gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm256_setr_epi64x(2, 10, 4, 12); + assert_eq_m256i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_pd() { + let base_addr: [f64; 4] = [1., 2., 3., 4.]; + let src = _mm256_setr_pd(9., 10., 11., 12.); + let vindex = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm256_mmask_i32gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm256_setr_pd(2., 10., 4., 12.); + assert_eq_m256d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i32gather_ps() { + let base_addr: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.]; + let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let r = _mm256_mmask_i32gather_ps::<4>(src, 0b01010101, vindex, base_addr.as_ptr().cast()); + let expected = _mm256_setr_ps(2., 10., 4., 12., 6., 14., 8., 16.); + assert_eq_m256(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_epi32() { + let base_addr: [i32; 4] = [1, 2, 3, 4]; + let src = _mm_setr_epi32(9, 10, 11, 12); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_epi32::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_epi32(2, 10, 4, 12); + assert_eq_m128i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_epi64() { + let base_addr: [i64; 4] = [1, 2, 3, 4]; + 
let src = _mm256_setr_epi64x(9, 10, 11, 12); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_epi64::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm256_setr_epi64x(2, 10, 4, 12); + assert_eq_m256i(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_pd() { + let base_addr: [f64; 4] = [1., 2., 3., 4.]; + let src = _mm256_setr_pd(9., 10., 11., 12.); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_pd::<8>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm256_setr_pd(2., 10., 4., 12.); + assert_eq_m256d(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mmask_i64gather_ps() { + let base_addr: [f32; 4] = [1., 2., 3., 4.]; + let src = _mm_setr_ps(9., 10., 11., 12.); + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let r = _mm256_mmask_i64gather_ps::<4>(src, 0b0101, vindex, base_addr.as_ptr().cast()); + let expected = _mm_setr_ps(2., 10., 4., 12.); + assert_eq_m128(expected, r); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_epi64x(2, 1); + _mm_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_epi64x(2, 1); + _mm_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src); + let expected = [0, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_pd(2., 1.); + _mm_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi32(1, 0, -1, -1); + let src = _mm_setr_pd(2., 1.); + _mm_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src); + let expected = [0., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i32scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); 
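        // Illustrative note (not part of the original patch): with SCALE = 4, each
        // 32-bit index selects a 4-byte slot, so the scatter above behaves like the
        // scalar loop
        //     for i in 0..4 { *base_addr.as_mut_ptr().add(vindex[i] as usize) = src[i]; }
        // (treating vindex and src as plain arrays). The rotated index vector
        // (1, 2, 3, 0) therefore permutes src back into ascending order, which is
        // exactly what `expected` asserts.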
+ } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i32scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_epi32() { + let mut base_addr: [i32; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi32(2, 1, -1, -1); + _mm_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_epi32() { + let mut base_addr: [i32; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi32(2, 1, -1, -1); + _mm_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src); + let expected = [0, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi64x(2, 1); + _mm_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_epi64() { + let mut base_addr: [i64; 2] = [0; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_epi64x(2, 1); + _mm_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src); + let expected = [0, 2]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_pd(2., 1.); + _mm_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_pd() { + let mut base_addr: [f64; 2] = [0.; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_pd(2., 1.); + _mm_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src); + let expected = [0., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_i64scatter_ps() { + let mut base_addr: [f32; 2] = [0.; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_ps(2., 1., -1., -1.); + _mm_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_i64scatter_ps() { + let mut base_addr: [f32; 2] = [0.; 2]; + let vindex = _mm_setr_epi64x(1, 0); + let src = _mm_setr_ps(2., 1., -1., -1.); + _mm_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b01, vindex, src); + let expected = [0., 2.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_epi32() { + let mut base_addr: [i32; 8] = [0; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1); + _mm256_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2, 3, 4, 5, 6, 7, 8]; + assert_eq!(expected, base_addr); + } + + 
#[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_epi32() { + let mut base_addr: [i32; 8] = [0; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 8, 1); + _mm256_mask_i32scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src); + let expected = [0, 2, 0, 4, 0, 6, 0, 8]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_mask_i32scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm_setr_epi32(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_mask_i32scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i32scatter_ps() { + let mut base_addr: [f32; 8] = [0.; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.); + _mm256_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2., 3., 4., 5., 6., 7., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i32scatter_ps() { + let mut base_addr: [f32; 8] = [0.; 8]; + let vindex = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); + let src = _mm256_setr_ps(2., 3., 4., 5., 6., 7., 8., 1.); + _mm256_mask_i32scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b01010101, vindex, src); + let expected = [0., 2., 0., 4., 0., 6., 0., 8.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i64scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm256_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_epi32() { + let mut base_addr: [i32; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_epi32(2, 3, 4, 1); + _mm256_mask_i64scatter_epi32::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_i64scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1, 2, 3, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_epi64() { + let mut base_addr: [i64; 4] = [0; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_epi64x(2, 3, 4, 1); + _mm256_mask_i64scatter_epi64::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0, 2, 0, 4]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i64scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_pd() { + let mut base_addr: [f64; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm256_setr_pd(2., 3., 4., 1.); + _mm256_mask_i64scatter_pd::<8>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_i64scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm256_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), vindex, src); + let expected = [1., 2., 3., 4.]; + assert_eq!(expected, base_addr); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_i64scatter_ps() { + let mut base_addr: [f32; 4] = [0.; 4]; + let vindex = _mm256_setr_epi64x(1, 2, 3, 0); + let src = _mm_setr_ps(2., 3., 4., 1.); + _mm256_mask_i64scatter_ps::<4>(base_addr.as_mut_ptr().cast(), 0b0101, vindex, src); + let expected = [0., 2., 0., 4.]; + assert_eq!(expected, base_addr); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_rol_epi64() { #[rustfmt::skip] From 5a4089cb38e433674ca2932878dbf2969c7f401e Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 1 Jul 2024 12:41:17 +0530 Subject: [PATCH 3/6] Implemented VEX versions Modified stdarch-test to accept VEX versions --- crates/core_arch/missing-x86.md | 62 -- crates/core_arch/src/x86/avx512ifma.rs | 128 ++++ crates/core_arch/src/x86/avx512vnni.rs | 876 +++++++++++++++++++++++ crates/core_arch/src/x86/avxneconvert.rs | 188 +++++ crates/core_arch/src/x86/mod.rs | 5 +- crates/stdarch-test/src/lib.rs | 2 +- crates/stdarch-verify/tests/x86-intel.rs | 2 +- 7 files changed, 1198 insertions(+), 65 deletions(-) create mode 100644 crates/core_arch/src/x86/avxneconvert.rs diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md index 8d75a237a4..03c25d41fa 100644 --- a/crates/core_arch/missing-x86.md +++ b/crates/core_arch/missing-x86.md @@ -1135,82 +1135,20 @@

-
["AVX_IFMA"]

- - * [ ] [`_mm256_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52hi_avx_epu64) - * [ ] [`_mm256_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52lo_avx_epu64) - * [ ] [`_mm_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52hi_avx_epu64) - * [ ] [`_mm_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52lo_avx_epu64) -

- -
["AVX_NE_CONVERT"]

- * [ ] [`_mm256_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) * [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) - * [ ] [`_mm256_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) * [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) - * [ ] [`_mm256_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) * [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) * [ ] [`_mm256_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) - * [ ] [`_mm_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) * [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) - * [ ] [`_mm_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) * [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) - * [ ] [`_mm_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) * [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) * [ ] [`_mm_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)

-
["AVX_VNNI"]

- - * [ ] [`_mm256_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_avx_epi32) - * [ ] [`_mm256_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_avx_epi32) - * [ ] [`_mm256_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_avx_epi32) - * [ ] [`_mm256_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_avx_epi32) - * [ ] [`_mm_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_avx_epi32) - * [ ] [`_mm_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_avx_epi32) - * [ ] [`_mm_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_avx_epi32) - * [ ] [`_mm_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_avx_epi32) -

- - -
["AVX_VNNI_INT16"]

- - * [ ] [`_mm256_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsud_epi32) - * [ ] [`_mm256_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsuds_epi32) - * [ ] [`_mm256_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusd_epi32) - * [ ] [`_mm256_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusds_epi32) - * [ ] [`_mm256_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuud_epi32) - * [ ] [`_mm256_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuuds_epi32) - * [ ] [`_mm_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsud_epi32) - * [ ] [`_mm_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsuds_epi32) - * [ ] [`_mm_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusd_epi32) - * [ ] [`_mm_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusds_epi32) - * [ ] [`_mm_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuud_epi32) - * [ ] [`_mm_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuuds_epi32) -

- - -
["AVX_VNNI_INT8"]

- - * [ ] [`_mm256_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssd_epi32) - * [ ] [`_mm256_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssds_epi32) - * [ ] [`_mm256_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsud_epi32) - * [ ] [`_mm256_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsuds_epi32) - * [ ] [`_mm256_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuud_epi32) - * [ ] [`_mm256_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuuds_epi32) - * [ ] [`_mm_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssd_epi32) - * [ ] [`_mm_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssds_epi32) - * [ ] [`_mm_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsud_epi32) - * [ ] [`_mm_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsuds_epi32) - * [ ] [`_mm_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuud_epi32) - * [ ] [`_mm_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuuds_epi32) -

- -
["CET_SS"]

* [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy) diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs index 01bb704ae7..3bf9958e3d 100644 --- a/crates/core_arch/src/x86/avx512ifma.rs +++ b/crates/core_arch/src/x86/avx512ifma.rs @@ -114,6 +114,24 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64( simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52huq) +)] +pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + vpmadd52huq_256(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit /// unsigned integer from the intermediate result with the @@ -169,6 +187,24 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64( simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52luq) +)] +pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + vpmadd52luq_256(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit /// unsigned integer from the intermediate result with the @@ -224,6 +260,24 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64( simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52huq) +)] +pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + vpmadd52huq_128(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit /// unsigned integer from the intermediate result with the @@ -269,6 +323,24 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _ simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52luq) +)] +pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + vpmadd52luq_128(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit /// unsigned integer from the intermediate result with the @@ -427,6 +499,20 @@ mod tests { assert_eq_m512i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52hi_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm256_set1_epi64x(11030549757952); + + assert_eq_m256i(expected, actual); + } + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm256_madd52hi_epu64() { let a = _mm256_set1_epi64x(10 << 40); @@ -471,6 +557,20 @@ mod tests { assert_eq_m256i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52lo_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm256_set1_epi64x(100055558127628); + + assert_eq_m256i(expected, actual); + } + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm256_madd52lo_epu64() { let a = _mm256_set1_epi64x(10 << 40); @@ -515,6 +615,20 @@ mod tests { assert_eq_m256i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52hi_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(expected, actual); + 
} + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm_madd52hi_epu64() { let a = _mm_set1_epi64x(10 << 40); @@ -559,6 +673,20 @@ mod tests { assert_eq_m128i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52lo_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm_set1_epi64x(100055558127628); + + assert_eq_m128i(expected, actual); + } + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm_madd52lo_epu64() { let a = _mm_set1_epi64x(10 << 40); diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs index 67a626b7ed..2ed800d295 100644 --- a/crates/core_arch/src/x86/avx512vnni.rs +++ b/crates/core_arch/src/x86/avx512vnni.rs @@ -50,6 +50,20 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssd) +)] +pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216) @@ -96,6 +110,20 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssd) +)] +pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213) @@ -178,6 +206,20 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssds) +)] +pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225) @@ -224,6 +266,20 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssds) +)] +pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222) @@ -311,6 +367,20 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusd) +)] +pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198) @@ -357,6 +427,20 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusd) +)] +pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195) @@ -439,6 +523,20 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusds) +)] +pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207) @@ -485,6 +583,20 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusds) +)] +pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204) @@ -526,6 +638,390 @@ pub unsafe fn _mm_maskz_dpbusds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.vpdpwssd.512"] @@ -555,6 +1051,66 @@ extern "C" { fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx512.vpdpbusds.128"] fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx2.vpdpbssd.128"] + fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssd.256"] + fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbssds.128"] + fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssds.256"] + fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsud.128"] + fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsud.256"] + fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsuds.128"] + fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsuds.256"] + fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuud.128"] + fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuud.256"] + fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuuds.128"] + fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuuds.256"] + fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsud.128"] + fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsud.256"] + fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsuds.128"] + fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsuds.256"] + fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwusd.128"] + fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusd.256"] + fn 
vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwusds.128"] + fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusds.256"] + fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuud.128"] + fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuud.256"] + fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuuds.128"] + fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuuds.256"] + fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; } #[cfg(test)] @@ -597,6 +1153,16 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpwssd_epi32() { let src = _mm256_set1_epi32(1); @@ -631,6 +1197,16 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpwssd_epi32() { let src = _mm_set1_epi32(1); @@ -699,6 +1275,16 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpwssds_epi32() { let src = _mm256_set1_epi32(1); @@ -733,6 +1319,16 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpwssds_epi32() { let src = _mm_set1_epi32(1); @@ -801,6 +1397,16 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpbusd_epi32() { let src = _mm256_set1_epi32(1); @@ -835,6 +1441,16 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); 
+ } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpbusd_epi32() { let src = _mm_set1_epi32(1); @@ -903,6 +1519,16 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpbusds_epi32() { let src = _mm256_set1_epi32(1); @@ -937,6 +1563,16 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpbusds_epi32() { let src = _mm_set1_epi32(1); @@ -970,4 +1606,244 @@ mod tests { let e = _mm_set1_epi32(5); assert_eq_m128i(r, e); } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssd_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssd_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbsud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = 
_mm_dpbsuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusd_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = 
_mm256_dpwusd_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwusds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } } diff --git a/crates/core_arch/src/x86/avxneconvert.rs b/crates/core_arch/src/x86/avxneconvert.rs new file mode 100644 index 0000000000..1d29cfc0f5 --- /dev/null +++ b/crates/core_arch/src/x86/avxneconvert.rs @@ -0,0 +1,188 @@ +use crate::core_arch::{simd::*, x86::*}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vbcstnebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_bcstnebf16_ps(a: *const u16) -> __m128 { + transmute(bcstnebf162ps_128(a)) +} + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point +/// elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vbcstnebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_bcstnebf16_ps(a: *const u16) -> __m256 { + transmute(bcstnebf162ps_256(a)) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneebf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneebf162ps_256(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneobf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneobf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneobf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneobf162ps_256(a)) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.vbcstnebf162ps128"] + fn bcstnebf162ps_128(a: *const u16) -> f32x4; + #[link_name = "llvm.x86.vbcstnebf162ps256"] + fn bcstnebf162ps_256(a: *const u16) -> f32x8; + + #[link_name = "llvm.x86.vcvtneebf162ps128"] + fn cvtneebf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneebf162ps256"] + fn cvtneebf162ps_256(a: *const __m256bh) -> __m256; + + #[link_name = "llvm.x86.vcvtneobf162ps128"] + fn cvtneobf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneobf162ps256"] + fn cvtneobf162ps_256(a: *const __m256bh) -> __m256; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use std::ptr::addr_of; + use stdarch_test::simd_test; + + const BF16_ONE: u16 = 0b0_01111111_0000000; + const BF16_TWO: u16 = 0b0_10000000_0000000; + const BF16_THREE: u16 = 0b0_10000000_1000000; + const BF16_FOUR: u16 = 0b0_10000001_0000000; + const BF16_FIVE: u16 = 0b0_10000001_0100000; + const BF16_SIX: u16 = 0b0_10000001_1000000; + const BF16_SEVEN: u16 = 0b0_10000001_1100000; + const BF16_EIGHT: u16 = 0b0_10000010_0000000; + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_bcstnebf16_ps() { + let a = BF16_ONE; + let r = _mm_bcstnebf16_ps(addr_of!(a)); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_bcstnebf16_ps() { + let a = BF16_ONE; + let r = _mm256_bcstnebf16_ps(addr_of!(a)); + let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneebf16_ps() { + let a = __m128bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm_cvtneebf16_ps(addr_of!(a)); + let e = _mm_setr_ps(1., 3., 5., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneebf16_ps() { + let a = __m256bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm256_cvtneebf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneobf16_ps() { + let a = __m128bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm_cvtneobf16_ps(addr_of!(a)); + let e = _mm_setr_ps(2., 4., 6., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneobf16_ps() { + let a = __m256bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm256_cvtneobf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.); + assert_eq_m256(r, e); + } +} diff --git a/crates/core_arch/src/x86/mod.rs 
b/crates/core_arch/src/x86/mod.rs index 8b1d3bbbb6..6f8c51c16a 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -894,6 +894,9 @@ mod f16c; pub use self::f16c::*; mod avx512bf16; - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub use self::avx512bf16::*; + +mod avxneconvert; +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub use self::avxneconvert::*; diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index a8c2d36e11..a2835e3b0c 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -84,7 +84,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { // 2. It is a mark, indicating that the instruction will be // compiled into other instructions - mainly because of llvm // optimization. - let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected)); + let found = expected == "nop" || instrs.iter().any(|s| s.contains(expected)); // Look for subroutine call instructions in the disassembly to detect whether // inlining failed: all intrinsics are `#[inline(always)]`, so calling one diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 15d2454f43..f12fa5f0ee 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -758,7 +758,7 @@ fn equate( (&Type::ConstPtr(&Type::PrimSigned(8)), "char const*") => {} (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32 const*" | "int const*") => {} (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64 const*") => {} - (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {} + (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*" | "__bf16 const*") => {} (&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {} (&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {} From 2f14492b8ac538c19be8790185631b717df50fe7 Mon Sep 17 00:00:00 2001 From: sayantn Date: Mon, 1 Jul 2024 16:13:23 +0530 Subject: [PATCH 4/6] Implemented the missing AVX512BF16 intrinsics --- crates/core_arch/missing-x86.md | 15 -- crates/core_arch/src/x86/avx512bf16.rs | 244 +++++++++++++++++++++++ crates/stdarch-verify/tests/x86-intel.rs | 2 +- 3 files changed, 245 insertions(+), 16 deletions(-) diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md index 03c25d41fa..0cc767daf1 100644 --- a/crates/core_arch/missing-x86.md +++ b/crates/core_arch/missing-x86.md @@ -147,27 +147,12 @@
-["AVX512_BF16", "AVX512F"]
-
- * [ ] [`_mm512_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
- * [ ] [`_mm512_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
- * [ ] [`_mm512_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
- * [ ] [`_mm_cvtsbh_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
-
-
-
 ["AVX512_BF16", "AVX512VL"]
 
- * [ ] [`_mm256_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
- * [ ] [`_mm256_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
- * [ ] [`_mm256_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
  * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
  * [ ] [`_mm_cvtness_sbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
- * [ ] [`_mm_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
  * [ ] [`_mm_mask_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
- * [ ] [`_mm_mask_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
  * [ ] [`_mm_maskz_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
- * [ ] [`_mm_maskz_cvtpbh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
 
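The `cvtpbh_ps`/`cvtsbh_ss` entries removed above need no new LLVM intrinsics: a BF16 value occupies the upper 16 bits of an `f32`, so the widening direction is a zero-fill left shift, which the implementations in the diff below express with `_mm*_cvtepi16_epi32` followed by `_mm*_slli_epi32::<16>`. A minimal scalar sketch of the same conversion (the helper name is illustrative, not part of the patch):

```rust
/// Widen a raw bf16 bit pattern to f32. This is lossless: bf16 keeps the
/// sign, the full 8-bit exponent and the top 7 mantissa bits of an f32,
/// so placing the 16 bits in the upper half of a u32 reproduces the value.
fn bf16_bits_to_f32(bits: u16) -> f32 {
    f32::from_bits((bits as u32) << 16)
}

fn main() {
    // The constants match BF16_ONE / BF16_TWO used by the tests in this patch.
    assert_eq!(bf16_bits_to_f32(0x3F80), 1.0);
    assert_eq!(bf16_bits_to_f32(0x4000), 2.0);
}
```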
diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs index dbd8b4a6f3..6fe6cd5d86 100644 --- a/crates/core_arch/src/x86/avx512bf16.rs +++ b/crates/core_arch/src/x86/avx512bf16.rs @@ -365,6 +365,131 @@ pub unsafe fn _mm512_maskz_dpbf16_ps( transmute(simd_select_bitmask(k, rst, zero)) } +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { + _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 { + let cvt = _mm512_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16())) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { + let cvt = _mm512_cvtpbh_ps(a); + let zero = _mm512_setzero_ps(); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), zero.as_f32x16())) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { + _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 { + let cvt = _mm256_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8())) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { + let cvt = _mm256_cvtpbh_ps(a); + let zero = _mm256_setzero_ps(); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), zero.as_f32x8())) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { + _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 { + let cvt = _mm_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4())) +} + +/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point +/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding +/// mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { + let cvt = _mm_cvtpbh_ps(a); + let zero = _mm_setzero_ps(); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), zero.as_f32x4())) +} + +/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point +/// element, and store the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss) +#[inline] +#[target_feature(enable = "avx512bf16,avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtsbh_ss(a: u16) -> f32 { + f32::from_bits((a as u32) << 16) +} + #[cfg(test)] mod tests { use crate::{core_arch::x86::*, mem::transmute}; @@ -1592,4 +1717,123 @@ mod tests { ]; assert_eq!(result, expected_result); } + + const BF16_ONE: u16 = 0b0_01111111_0000000; + const BF16_TWO: u16 = 0b0_10000000_0000000; + const BF16_THREE: u16 = 0b0_10000000_1000000; + const BF16_FOUR: u16 = 0b0_10000001_0000000; + const BF16_FIVE: u16 = 0b0_10000001_0100000; + const BF16_SIX: u16 = 0b0_10000001_1000000; + const BF16_SEVEN: u16 = 0b0_10000001_1100000; + const BF16_EIGHT: u16 = 0b0_10000010_0000000; + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_cvtpbh_ps() { + let a = __m256bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm512_cvtpbh_ps(a); + let e = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_mask_cvtpbh_ps() { + let a = __m256bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let src = _mm512_setr_ps( + 9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let k = 0b1010_1010_1010_1010; + let r = _mm512_mask_cvtpbh_ps(src, k, a); + let e = _mm512_setr_ps( + 9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm512_maskz_cvtpbh_ps() { + let a = __m256bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let k = 0b1010_1010_1010_1010; + let r = _mm512_maskz_cvtpbh_ps(k, a); + let e = _mm512_setr_ps( + 0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_cvtpbh_ps() { + let a = __m128bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm256_cvtpbh_ps(a); + let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_mask_cvtpbh_ps() { + let a = __m128bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); + let k = 0b1010_1010; + let r = _mm256_mask_cvtpbh_ps(src, k, a); + let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm256_maskz_cvtpbh_ps() { + let a = __m128bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let k = 0b1010_1010; + let r = _mm256_maskz_cvtpbh_ps(k, a); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = 
"avx512bf16,avx512vl")] + unsafe fn test_mm_cvtpbh_ps() { + let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0); + let r = _mm_cvtpbh_ps(a); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtpbh_ps() { + let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0); + let src = _mm_setr_ps(9., 10., 11., 12.); + let k = 0b1010; + let r = _mm_mask_cvtpbh_ps(src, k, a); + let e = _mm_setr_ps(9., 2., 11., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtpbh_ps() { + let a = __m128bh(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0); + let k = 0b1010; + let r = _mm_maskz_cvtpbh_ps(k, a); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512bf16")] + unsafe fn test_mm_cvtsbh_ss() { + let r = _mm_cvtsbh_ss(BF16_ONE); + assert_eq!(r, 1.); + } } diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index f12fa5f0ee..f38c7b1a3f 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -699,7 +699,7 @@ fn equate( (&Type::PrimSigned(32), "__int32" | "const int" | "int") => {} (&Type::PrimSigned(64), "__int64" | "long long") => {} (&Type::PrimUnsigned(8), "unsigned char") => {} - (&Type::PrimUnsigned(16), "unsigned short") => {} + (&Type::PrimUnsigned(16), "unsigned short" | "__bfloat16") => {} ( &Type::PrimUnsigned(32), "unsigned __int32" | "unsigned int" | "unsigned long" | "const unsigned int", From 0f1f4d1befa0be7457734d467b11a0bc3c3da240 Mon Sep 17 00:00:00 2001 From: sayantn Date: Tue, 2 Jul 2024 16:36:29 +0530 Subject: [PATCH 5/6] Implemented some missing functions These cannot be linked with LLVM because of the lack of `bfloat16` and `i1` types in Rust. So, inline asm was the only way --- crates/core_arch/missing-x86.md | 12 --- crates/core_arch/src/x86/avx512bf16.rs | 112 ++++++++++++++++++++++- crates/core_arch/src/x86/avxneconvert.rs | 65 +++++++++++++ 3 files changed, 176 insertions(+), 13 deletions(-) diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md index 0cc767daf1..16f6c58cbb 100644 --- a/crates/core_arch/missing-x86.md +++ b/crates/core_arch/missing-x86.md @@ -147,15 +147,6 @@

-["AVX512_BF16", "AVX512VL"]
-
- * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
- * [ ] [`_mm_cvtness_sbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
- * [ ] [`_mm_mask_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
- * [ ] [`_mm_maskz_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
-
-
-
 ["AVX512_FP16"]
 
  * [ ] [`_mm256_castpd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
@@ -1125,12 +1116,9 @@
  * [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
  * [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
  * [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
- * [ ] [`_mm256_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
  * [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
  * [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
  * [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
- * [ ] [`_mm_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
- * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
 
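The narrowing entries dropped here are implemented in the diff below with inline `vcvtneps2bf16` assembly, since the matching LLVM intrinsics take `bfloat` and `i1` mask arguments that Rust cannot yet express. The instruction rounds to nearest even before discarding the low 16 mantissa bits; a rough scalar model of that rounding (NaN handling simplified, helper name illustrative only, not part of the patch):

```rust
/// Approximate scalar model of the f32 -> bf16 narrowing performed by
/// `vcvtneps2bf16`: round to nearest, ties to even, then keep the top 16 bits.
/// NaN payload details and MXCSR-related behaviour are deliberately ignored.
fn f32_to_bf16_rne(x: f32) -> u16 {
    let bits = x.to_bits();
    if x.is_nan() {
        // Force a quiet NaN and keep the sign.
        return ((bits >> 16) as u16) | 0x0040;
    }
    // Adding 0x7FFF rounds up exactly when the discarded bits exceed half an
    // ULP; the extra `(bits >> 16) & 1` term breaks ties towards an even result.
    let bias = 0x7FFF + ((bits >> 16) & 1);
    ((bits + bias) >> 16) as u16
}

fn main() {
    assert_eq!(f32_to_bf16_rne(1.0), 0x3F80); // BF16_ONE in the patch's tests
    assert_eq!(f32_to_bf16_rne(3.0), 0x4040); // BF16_THREE
}
```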
diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs index 6fe6cd5d86..a9be7de5db 100644 --- a/crates/core_arch/src/x86/avx512bf16.rs +++ b/crates/core_arch/src/x86/avx512bf16.rs @@ -2,6 +2,7 @@ //! //! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16 +use crate::arch::asm; use crate::core_arch::{simd::*, x86::*}; use crate::intrinsics::simd::*; @@ -490,9 +491,85 @@ pub unsafe fn _mm_cvtsbh_ss(a: u16) -> f32 { f32::from_bits((a as u32) << 16) } +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl,sse")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}, {src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k (elements are copied +/// from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh { + let mut dst = src; + asm!( + "vcvtneps2bf16 {dst}{{{k}}},{src}", + dst = inlateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst +} + +/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out +/// when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl,sse,avx512f")] +#[cfg_attr(test, assert_instr("vcvtneps2bf16"))] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst +} + +/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point +/// element, and store the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh) +#[inline] +#[target_feature(enable = "avx512bf16,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtness_sbh(a: f32) -> u16 { + simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0) +} + #[cfg(test)] mod tests { - use crate::{core_arch::x86::*, mem::transmute}; + use crate::core_arch::simd::u16x4; + use crate::{ + core_arch::x86::*, + mem::{transmute, transmute_copy}, + }; use stdarch_test::simd_test; #[simd_test(enable = "avx512bf16,avx512vl")] @@ -1836,4 +1913,37 @@ mod tests { let r = _mm_cvtsbh_ss(BF16_ONE); assert_eq!(r, 1.); } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let r: u16x4 = transmute_copy(&_mm_cvtneps_pbh(a)); + let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_mask_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let src = __m128bh(5, 6, 7, 8, !0, !0, !0, !0); + let k = 0b1010; + let r: u16x4 = transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)); + let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_maskz_cvtneps_pbh() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let k = 0b1010; + let r: u16x4 = transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)); + let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bf16,avx512vl")] + unsafe fn test_mm_cvtness_sbh() { + let r = _mm_cvtness_sbh(1.); + assert_eq!(r, BF16_ONE); + } } diff --git a/crates/core_arch/src/x86/avxneconvert.rs b/crates/core_arch/src/x86/avxneconvert.rs index 1d29cfc0f5..1834b43a81 100644 --- a/crates/core_arch/src/x86/avxneconvert.rs +++ b/crates/core_arch/src/x86/avxneconvert.rs @@ -1,3 +1,4 @@ +use crate::arch::asm; use crate::core_arch::{simd::*, x86::*}; #[cfg(test)] @@ -95,6 +96,50 @@ pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { transmute(cvtneobf162ps_256(a)) } +/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_bf16) +#[inline] +#[target_feature(enable = "avxneconvert,sse")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneps2bf16) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst +} + +/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point +/// elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_bf16) +#[inline] +#[target_feature(enable = "avxneconvert,sse,avx")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneps2bf16) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(ymm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.vbcstnebf162ps128"] @@ -115,7 +160,9 @@ extern "C" { #[cfg(test)] mod tests { + use crate::core_arch::simd::{u16x4, u16x8}; use crate::core_arch::x86::*; + use crate::mem::transmute_copy; use std::ptr::addr_of; use stdarch_test::simd_test; @@ -185,4 +232,22 @@ mod tests { let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.); assert_eq_m256(r, e); } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneps_avx_pbh() { + let a = _mm_setr_ps(1., 2., 3., 4.); + let r: u16x4 = transmute_copy(&_mm_cvtneps_avx_pbh(a)); + let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR); + assert_eq!(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneps_avx_pbh() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r: u16x8 = transmute(_mm256_cvtneps_avx_pbh(a)); + let e = u16x8::new( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + assert_eq!(r, e); + } } From 5dc20a2432df6bd8005dd76a197b3d54b49e55cf Mon Sep 17 00:00:00 2001 From: sayantn Date: Fri, 5 Jul 2024 14:36:33 +0530 Subject: [PATCH 6/6] Added a `bf16` type --- crates/core_arch/src/x86/avx512bf16.rs | 17 ++++++++-------- crates/core_arch/src/x86/avxneconvert.rs | 22 ++++++++++----------- crates/core_arch/src/x86/mod.rs | 25 ++++++++++++++++++++++++ crates/stdarch-verify/src/lib.rs | 1 + crates/stdarch-verify/tests/x86-intel.rs | 8 ++++++-- 5 files changed, 52 insertions(+), 21 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs index a9be7de5db..7d99809353 100644 --- a/crates/core_arch/src/x86/avx512bf16.rs +++ b/crates/core_arch/src/x86/avx512bf16.rs @@ -486,9 +486,9 @@ pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] -#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtsbh_ss(a: u16) -> f32 { - f32::from_bits((a as u32) << 16) +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 { + f32::from_bits((a.to_bits() as u32) << 16) } /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -558,9 +558,10 @@ pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] -#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtness_sbh(a: f32) -> u16 { - simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0) +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm_cvtness_sbh(a: 
f32) -> bf16 { + let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0); + bf16::from_bits(value) } #[cfg(test)] @@ -1910,7 +1911,7 @@ mod tests { #[simd_test(enable = "avx512bf16")] unsafe fn test_mm_cvtsbh_ss() { - let r = _mm_cvtsbh_ss(BF16_ONE); + let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE)); assert_eq!(r, 1.); } @@ -1944,6 +1945,6 @@ mod tests { #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm_cvtness_sbh() { let r = _mm_cvtness_sbh(1.); - assert_eq!(r, BF16_ONE); + assert_eq!(r.to_bits(), BF16_ONE); } } diff --git a/crates/core_arch/src/x86/avxneconvert.rs b/crates/core_arch/src/x86/avxneconvert.rs index 1834b43a81..4eb1a9b30c 100644 --- a/crates/core_arch/src/x86/avxneconvert.rs +++ b/crates/core_arch/src/x86/avxneconvert.rs @@ -1,5 +1,5 @@ use crate::arch::asm; -use crate::core_arch::{simd::*, x86::*}; +use crate::core_arch::x86::*; #[cfg(test)] use stdarch_test::assert_instr; @@ -15,9 +15,9 @@ use stdarch_test::assert_instr; all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vbcstnebf162ps) )] -#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_bcstnebf16_ps(a: *const u16) -> __m128 { - transmute(bcstnebf162ps_128(a)) +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 { + bcstnebf162ps_128(a) } /// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location @@ -31,9 +31,9 @@ pub unsafe fn _mm_bcstnebf16_ps(a: *const u16) -> __m128 { all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vbcstnebf162ps) )] -#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_bcstnebf16_ps(a: *const u16) -> __m256 { - transmute(bcstnebf162ps_256(a)) +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 { + bcstnebf162ps_256(a) } /// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at @@ -143,9 +143,9 @@ pub unsafe fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.vbcstnebf162ps128"] - fn bcstnebf162ps_128(a: *const u16) -> f32x4; + fn bcstnebf162ps_128(a: *const bf16) -> __m128; #[link_name = "llvm.x86.vbcstnebf162ps256"] - fn bcstnebf162ps_256(a: *const u16) -> f32x8; + fn bcstnebf162ps_256(a: *const bf16) -> __m256; #[link_name = "llvm.x86.vcvtneebf162ps128"] fn cvtneebf162ps_128(a: *const __m128bh) -> __m128; @@ -177,7 +177,7 @@ mod tests { #[simd_test(enable = "avxneconvert")] unsafe fn test_mm_bcstnebf16_ps() { - let a = BF16_ONE; + let a = bf16::from_bits(BF16_ONE); let r = _mm_bcstnebf16_ps(addr_of!(a)); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); @@ -185,7 +185,7 @@ mod tests { #[simd_test(enable = "avxneconvert")] unsafe fn test_mm256_bcstnebf16_ps() { - let a = BF16_ONE; + let a = bf16::from_bits(BF16_ONE); let r = _mm256_bcstnebf16_ps(addr_of!(a)); let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); assert_eq_m256(r, e); diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 6f8c51c16a..9365fe10a2 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -337,6 +337,31 @@ types! { ); } +/// The BFloat16 type used in AVX-512 intrinsics. 
+#[repr(transparent)] +#[derive(Copy, Clone, Debug)] +#[allow(non_camel_case_types)] +#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] +pub struct bf16(u16); + +impl bf16 { + /// Raw transmutation from `u16` + #[inline] + #[must_use] + #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] + pub const fn from_bits(bits: u16) -> bf16 { + bf16(bits) + } + + /// Raw transmutation to `u16` + #[inline] + #[must_use = "this returns the result of the operation, without modifying the original"] + #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] + pub const fn to_bits(self) -> u16 { + self.0 + } +} + /// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer #[allow(non_camel_case_types)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index ff31c31c89..94569dfd0c 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -197,6 +197,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "_MM_MANTISSA_SIGN_ENUM" => quote! { &MM_MANTISSA_SIGN_ENUM }, "_MM_PERM_ENUM" => quote! { &MM_PERM_ENUM }, "bool" => quote! { &BOOL }, + "bf16" => quote! { &BF16 }, "f32" => quote! { &F32 }, "f64" => quote! { &F64 }, "i16" => quote! { &I16 }, diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index f38c7b1a3f..8de2c88b81 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -22,6 +22,7 @@ struct Function { has_test: bool, } +static BF16: Type = Type::BFloat16; static F32: Type = Type::PrimFloat(32); static F64: Type = Type::PrimFloat(64); static I8: Type = Type::PrimSigned(8); @@ -65,6 +66,7 @@ enum Type { PrimFloat(u8), PrimSigned(u8), PrimUnsigned(u8), + BFloat16, MutPtr(&'static Type), ConstPtr(&'static Type), M128, @@ -699,7 +701,8 @@ fn equate( (&Type::PrimSigned(32), "__int32" | "const int" | "int") => {} (&Type::PrimSigned(64), "__int64" | "long long") => {} (&Type::PrimUnsigned(8), "unsigned char") => {} - (&Type::PrimUnsigned(16), "unsigned short" | "__bfloat16") => {} + (&Type::PrimUnsigned(16), "unsigned short") => {} + (&Type::BFloat16, "__bfloat16") => {} ( &Type::PrimUnsigned(32), "unsigned __int32" | "unsigned int" | "unsigned long" | "const unsigned int", @@ -758,9 +761,10 @@ fn equate( (&Type::ConstPtr(&Type::PrimSigned(8)), "char const*") => {} (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32 const*" | "int const*") => {} (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64 const*") => {} - (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*" | "__bf16 const*") => {} + (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {} (&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {} (&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {} + (&Type::ConstPtr(&Type::BFloat16), "__bf16 const*") => {} (&Type::ConstPtr(&Type::M128), "__m128 const*") => {} (&Type::ConstPtr(&Type::M128BH), "__m128bh const*") => {}