diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md
index 8da6074cac..11ad3f04c1 100644
--- a/crates/core_arch/missing-x86.md
+++ b/crates/core_arch/missing-x86.md
@@ -149,21 +149,10 @@
["AVX512F"]

- * [ ] [`_cvtmask16_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask16_u32) - * [ ] [`_cvtu32_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtu32_mask16) - * [ ] [`_kortest_mask16_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortest_mask16_u8) - * [ ] [`_kortestc_mask16_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestc_mask16_u8) - * [ ] [`_kortestz_mask16_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestz_mask16_u8) - * [ ] [`_kshiftli_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftli_mask16) - * [ ] [`_kshiftri_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftri_mask16) - * [ ] [`_load_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_load_mask16) - * [ ] [`_mm512_cvtsd_f64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsd_f64) - * [ ] [`_mm512_cvtss_f32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtss_f32) * [ ] [`_mm512_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_epi64) * [ ] [`_mm512_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32logather_pd) * [ ] [`_mm512_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_epi64) * [ ] [`_mm512_i32loscatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32loscatter_pd) - * [ ] [`_mm512_kortestz`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kortestz) * [ ] [`_mm512_mask_i32logather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_epi64) * [ ] [`_mm512_mask_i32logather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32logather_pd) * [ ] [`_mm512_mask_i32loscatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32loscatter_epi64) @@ -175,7 +164,6 @@ * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss) * [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd) * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss) - * [ ] [`_store_mask16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_store_mask16)

@@ -204,9 +192,6 @@ * [ ] [`_mm256_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_epi64) * [ ] [`_mm256_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_pd) * [ ] [`_mm256_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mmask_i64gather_ps) - * [ ] [`_mm256_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt14_pd) - * [ ] [`_mm256_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt14_ps) - * [ ] [`_mm_abs_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi64) * [ ] [`_mm_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi32) * [ ] [`_mm_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_epi64) * [ ] [`_mm_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32scatter_pd) @@ -215,7 +200,6 @@ * [ ] [`_mm_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_epi64) * [ ] [`_mm_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_pd) * [ ] [`_mm_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64scatter_ps) - * [ ] [`_mm_mask_abs_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_abs_epi64) * [ ] [`_mm_mask_i32scatter_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi32) * [ ] [`_mm_mask_i32scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_epi64) * [ ] [`_mm_mask_i32scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32scatter_pd) @@ -224,10 +208,6 @@ * [ ] [`_mm_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_epi64) * [ ] [`_mm_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_pd) * [ ] [`_mm_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64scatter_ps) - * [ ] [`_mm_mask_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epi64) - * [ ] [`_mm_maskz_abs_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_abs_epi64) - * [ ] [`_mm_maskz_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epi64) - * [ ] [`_mm_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi64) * [ ] [`_mm_mmask_i32gather_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi32) * [ ] [`_mm_mmask_i32gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_epi64) * [ ] [`_mm_mmask_i32gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i32gather_pd) @@ -236,8 +216,6 @@ * [ ] [`_mm_mmask_i64gather_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_epi64) * [ ] [`_mm_mmask_i64gather_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_pd) * [ ] 
[`_mm_mmask_i64gather_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mmask_i64gather_ps)
- * [ ] [`_mm_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_pd)
- * [ ] [`_mm_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ps)

diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e5f78a2ffb..8c88d3aa2f 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2,6 +2,7 @@ use crate::{ arch::asm, core_arch::{simd::*, x86::*}, intrinsics::simd::*, + intrinsics::{fmaf32, fmaf64}, mem, ptr, }; @@ -48,11 +49,9 @@ use stdarch_test::assert_instr; #[cfg_attr(test, assert_instr(vpabsd))] pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i { let a = a.as_i32x16(); - // all-0 is a properly initialized i32x16 - let zero: i32x16 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i32x16 = simd_gt(a, zero); - transmute(simd_select(cmp, a, sub)) + let zero = i32x16::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) } /// Computes the absolute value of packed 32-bit integers in `a`, and store the @@ -143,11 +142,9 @@ pub unsafe fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpabsq))] pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i { let a = a.as_i64x8(); - // all-0 is a properly initialized i64x8 - let zero: i64x8 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i64x8 = simd_gt(a, zero); - transmute(simd_select(cmp, a, sub)) + let zero = i64x8::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -184,11 +181,9 @@ pub unsafe fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { #[cfg_attr(test, assert_instr(vpabsq))] pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i { let a = a.as_i64x4(); - // all-0 is a properly initialized i64x4 - let zero: i64x4 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i64x4 = simd_gt(a, zero); - transmute(simd_select(cmp, a, sub)) + let zero = i64x4::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -205,7 +200,7 @@ pub unsafe fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __ /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi64&expand=45) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_abs_epi64) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] @@ -216,18 +211,54 @@ pub unsafe fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { transmute(simd_select_bitmask(k, abs, zero)) } +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub unsafe fn _mm_abs_epi64(a: __m128i) -> __m128i { + let a = a.as_i64x2(); + let zero = i64x2::splat(0); + let r = simd_select::(simd_lt(a, zero), simd_neg(a), a); + transmute(r) +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub unsafe fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, src.as_i64x2())) +} + +/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_abs_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpabsq))] +pub unsafe fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { + let abs = _mm_abs_epi64(a).as_i64x2(); + let zero = i64x2::splat(0); + transmute(simd_select_bitmask(k, abs, zero)) +} + /// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_abs_ps&expand=65) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vpandq))] +#[cfg_attr(test, assert_instr(vpandd))] pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 { - let a = _mm512_set1_epi32(0x7FFFFFFF); // from LLVM code - let b = transmute::(v2.as_f32x16()); - let abs = _mm512_and_epi32(a, b); - transmute(abs) + simd_fabs(v2) } /// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -238,8 +269,7 @@ pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 { - let abs = _mm512_abs_ps(v2).as_f32x16(); - transmute(simd_select_bitmask(k, abs, src.as_f32x16())) + simd_select_bitmask(k, simd_fabs(v2), src) } /// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst. 
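All of the integer `abs` rewrites above use the same select-on-sign pattern: negate a lane when it compares below zero, otherwise keep it. A minimal per-lane model in plain Rust (standalone and illustrative, not part of the patch):

    // Per-lane model of simd_select(simd_lt(a, zero), simd_neg(a), a).
    // Like vpabsq, it wraps on i64::MIN because simd_neg is a wrapping negate.
    fn abs_lane(a: i64) -> i64 {
        if a < 0 { a.wrapping_neg() } else { a }
    }

    fn main() {
        assert_eq!(abs_lane(-7), 7);
        assert_eq!(abs_lane(5), 5);
        assert_eq!(abs_lane(i64::MIN), i64::MIN); // wraps, matching the vector instruction
    }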
@@ -250,10 +280,7 @@ pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d { - let a = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF); // from LLVM code - let b = transmute::(v2.as_f64x8()); - let abs = _mm512_and_epi64(a, b); - transmute(abs) + simd_fabs(v2) } /// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -264,8 +291,7 @@ pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d { - let abs = _mm512_abs_pd(v2).as_f64x8(); - transmute(simd_select_bitmask(k, abs, src.as_f64x8())) + simd_select_bitmask(k, simd_fabs(v2), src) } /// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1264,7 +1290,9 @@ pub unsafe fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] pub unsafe fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmuldq(a.as_i32x16(), b.as_i32x16())) + let a = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(a.as_i64x8())); + let b = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(b.as_i64x8())); + transmute(simd_mul(a, b)) } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1476,7 +1504,10 @@ pub unsafe fn _mm512_mask_mullox_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] pub unsafe fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmuludq(a.as_u32x16(), b.as_u32x16())) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let mask = u64x8::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1906,7 +1937,9 @@ pub unsafe fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsd))] pub unsafe fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxsd(a.as_i32x16(), b.as_i32x16())) + let a = a.as_i32x16(); + let b = b.as_i32x16(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
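The `vpmuldq`/`vpmuludq` replacements lean on the fact that a widening 32 x 32 to 64-bit multiply is the same as a full 64-bit multiply of the sign-extended (or, for the unsigned form, masked) low half of each lane. A scalar sketch of the per-lane arithmetic (illustrative only):

    // Model of _mm512_mul_epi32 per 64-bit lane: sign-extend the low 32 bits,
    // then multiply in 64 bits.
    fn mul_epi32_lane(a: u64, b: u64) -> i64 {
        let lo_a = a as u32 as i32 as i64;
        let lo_b = b as u32 as i32 as i64;
        lo_a.wrapping_mul(lo_b)
    }

    // Model of _mm512_mul_epu32 per 64-bit lane: zero-extend via masking,
    // then multiply as unsigned 64-bit values.
    fn mul_epu32_lane(a: u64, b: u64) -> u64 {
        (a & 0xFFFF_FFFF).wrapping_mul(b & 0xFFFF_FFFF)
    }

    fn main() {
        assert_eq!(mul_epi32_lane(0xFFFF_FFFF, 2), -2); // low half is -1
        assert_eq!(mul_epu32_lane(0xFFFF_FFFF, 2), 0x1_FFFF_FFFE);
    }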
@@ -1992,7 +2025,9 @@ pub unsafe fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] pub unsafe fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxsq(a.as_i64x8(), b.as_i64x8())) + let a = a.as_i64x8(); + let b = b.as_i64x8(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2028,7 +2063,9 @@ pub unsafe fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] pub unsafe fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpmaxsq256(a.as_i64x4(), b.as_i64x4())) + let a = a.as_i64x4(); + let b = b.as_i64x4(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2064,7 +2101,9 @@ pub unsafe fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] pub unsafe fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i { - transmute(vpmaxsq128(a.as_i64x2(), b.as_i64x2())) + let a = a.as_i64x2(); + let b = b.as_i64x2(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2276,7 +2315,9 @@ pub unsafe fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxud))] pub unsafe fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxud(a.as_u32x16(), b.as_u32x16())) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2362,7 +2403,9 @@ pub unsafe fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuq))] pub unsafe fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxuq(a.as_u64x8(), b.as_u64x8())) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
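Every min/max rewrite in this patch reduces to `simd_select` over a `simd_gt` (or `simd_lt`) comparison, and the signed/unsigned distinction comes entirely from the lane type the comparison runs on (`i64x8` versus `u64x8`). A scalar illustration of why the two cases have to stay separate (not part of the patch):

    // The same bit pattern orders differently as i64 and as u64, which is why
    // _mm512_max_epi64 and _mm512_max_epu64 compare on different lane types.
    fn main() {
        let a: u64 = 0xFFFF_FFFF_FFFF_FFFF; // -1 when reinterpreted as i64
        let b: u64 = 1;
        assert_eq!(std::cmp::max(a as i64, b as i64), 1);
        assert_eq!(std::cmp::max(a, b), u64::MAX);
    }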
@@ -2398,7 +2441,9 @@ pub unsafe fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuq))] pub unsafe fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpmaxuq256(a.as_u64x4(), b.as_u64x4())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2434,7 +2479,9 @@ pub unsafe fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuq))] pub unsafe fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i { - transmute(vpmaxuq128(a.as_u64x2(), b.as_u64x2())) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2470,7 +2517,9 @@ pub unsafe fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsd))] pub unsafe fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminsd(a.as_i32x16(), b.as_i32x16())) + let a = a.as_i32x16(); + let b = b.as_i32x16(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2556,7 +2605,9 @@ pub unsafe fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsq))] pub unsafe fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminsq(a.as_i64x8(), b.as_i64x8())) + let a = a.as_i64x8(); + let b = b.as_i64x8(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2592,7 +2643,9 @@ pub unsafe fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsq))] pub unsafe fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpminsq256(a.as_i64x4(), b.as_i64x4())) + let a = a.as_i64x4(); + let b = b.as_i64x4(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2620,6 +2673,44 @@ pub unsafe fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m transmute(simd_select_bitmask(k, min, zero)) } +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub unsafe fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i { + let a = a.as_i64x2(); + let b = b.as_i64x2(); + transmute(simd_select::(simd_lt(a, b), a, b)) +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub unsafe fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let min = _mm_min_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, min, src.as_i64x2())) +} + +/// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_min_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vpminsq))] +pub unsafe fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let min = _mm_min_epi64(a, b).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, min, zero)) +} + /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_min_ps&expand=3769) @@ -2806,7 +2897,9 @@ pub unsafe fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminud))] pub unsafe fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminud(a.as_u32x16(), b.as_u32x16())) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2892,7 +2985,9 @@ pub unsafe fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] pub unsafe fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminuq(a.as_u64x8(), b.as_u64x8())) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
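The new `_mm_min_epi64`, `_mm_mask_min_epi64` and `_mm_maskz_min_epi64` follow the usual plain/writemask/zeromask triple, where the masked forms are just `simd_select_bitmask` over the unmasked result. A per-lane scalar model of that selection (illustrative; `mask_lane` is a hypothetical helper, not part of the patch):

    // Lane i takes the computed result when bit i of k is set; otherwise it
    // keeps src (writemask form) or becomes zero (zeromask form).
    fn mask_lane(k: u8, i: u32, result: i64, src_or_zero: i64) -> i64 {
        if (k >> i) & 1 != 0 { result } else { src_or_zero }
    }

    fn main() {
        let min = (-5i64).min(3);
        assert_eq!(mask_lane(0b01, 0, min, 99), -5); // bit set: minimum written
        assert_eq!(mask_lane(0b01, 1, min, 99), 99); // writemask: copied from src
        assert_eq!(mask_lane(0b01, 1, min, 0), 0);   // zeromask: zeroed
    }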
@@ -2928,7 +3023,9 @@ pub unsafe fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] pub unsafe fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i { - transmute(vpminuq256(a.as_u64x4(), b.as_u64x4())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2964,7 +3061,9 @@ pub unsafe fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] pub unsafe fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i { - transmute(vpminuq128(a.as_u64x2(), b.as_u64x2())) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3000,7 +3099,7 @@ pub unsafe fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 { - transmute(vsqrtps(a.as_f32x16(), _MM_FROUND_CUR_DIRECTION)) + simd_fsqrt(a) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3011,8 +3110,7 @@ pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - let sqrt = _mm512_sqrt_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, sqrt, src.as_f32x16())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3023,9 +3121,7 @@ pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { - let sqrt = _mm512_sqrt_ps(a).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_ps()) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -3036,8 +3132,7 @@ pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - let sqrt = _mm256_sqrt_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, sqrt, src.as_f32x8())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3048,9 +3143,7 @@ pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { - let sqrt = _mm256_sqrt_ps(a).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_ps()) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3061,8 +3154,7 @@ pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - let sqrt = _mm_sqrt_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, sqrt, src.as_f32x4())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3073,9 +3165,7 @@ pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { - let sqrt = _mm_sqrt_ps(a).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_ps()) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. @@ -3086,7 +3176,7 @@ pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d { - transmute(vsqrtpd(a.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) + simd_fsqrt(a) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -3097,8 +3187,7 @@ pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - let sqrt = _mm512_sqrt_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, sqrt, src.as_f64x8())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3109,9 +3198,7 @@ pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m5 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { - let sqrt = _mm512_sqrt_pd(a).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_pd()) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3122,8 +3209,7 @@ pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - let sqrt = _mm256_sqrt_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, sqrt, src.as_f64x4())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3134,9 +3220,7 @@ pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { - let sqrt = _mm256_sqrt_pd(a).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_pd()) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3147,8 +3231,7 @@ pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - let sqrt = _mm_sqrt_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, sqrt, src.as_f64x2())) + simd_select_bitmask(k, simd_fsqrt(a), src) } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3159,9 +3242,7 @@ pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { - let sqrt = _mm_sqrt_pd(a).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, sqrt, zero)) + simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. @@ -3172,7 +3253,7 @@ pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16())) + simd_fma(a, b, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3183,8 +3264,7 @@ pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmadd, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3195,9 +3275,7 @@ pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
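The fmadd family now goes through `simd_fma`, which is a genuinely fused operation: the product is not rounded before the addition. That this differs from a separate multiply and add can be checked in stable Rust with `f32::mul_add` as the scalar analogue (illustrative only):

    fn main() {
        let eps = f32::EPSILON;        // 2^-23
        let a = 1.0f32 + eps;
        let b = 1.0f32 - eps;
        // Unfused: a * b rounds to 1.0 first, so the subtraction cancels exactly.
        assert_eq!(a * b - 1.0, 0.0);
        // Fused: a single rounding at the end keeps the residual -eps^2.
        assert!(a.mul_add(b, -1.0) < 0.0);
    }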
@@ -3208,8 +3286,7 @@ pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmadd, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3220,8 +3297,7 @@ pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmadd, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3232,9 +3308,7 @@ pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3245,8 +3319,7 @@ pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmadd = _mm256_fmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmadd, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3257,8 +3330,7 @@ pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmadd, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3269,9 +3341,7 @@ pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3282,8 +3352,7 @@ pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmadd = _mm_fmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmadd, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. @@ -3294,7 +3363,7 @@ pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8())) + simd_fma(a, b, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3305,8 +3374,7 @@ pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmadd, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3317,9 +3385,7 @@ pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3330,8 +3396,7 @@ pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmadd, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3342,8 +3407,7 @@ pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmadd, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3354,9 +3418,7 @@ pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3367,8 +3429,7 @@ pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmadd = _mm256_fmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmadd, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3379,8 +3440,7 @@ pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmadd, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3391,9 +3451,7 @@ pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmadd, zero)) + simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3404,8 +3462,7 @@ pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmadd = _mm_fmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmadd, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. @@ -3414,11 +3471,9 @@ pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - transmute(vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub)) + simd_fma(a, b, simd_neg(c)) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3427,10 +3482,9 @@ pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsub, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3439,11 +3493,9 @@ pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3452,10 +3504,9 @@ pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsub, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3466,8 +3517,7 @@ pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsub, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3478,9 +3528,7 @@ pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3491,8 +3539,7 @@ pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmsub = _mm256_fmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsub, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3503,8 +3550,7 @@ pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsub, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3515,9 +3561,7 @@ pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
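The fmsub variants are expressed as `simd_fma(a, b, simd_neg(c))`, using the identity fmsub(a, b, c) = fma(a, b, -c). A scalar spot-check with `f64::mul_add` (illustrative only):

    fn main() {
        let (a, b, c) = (1.5f64, -2.0, 0.25);
        // a*b - c with a single rounding is the same as fma(a, b, -c);
        // here everything is exact, so both routes give -3.25.
        assert_eq!(a.mul_add(b, -c), a * b - c);
        assert_eq!(a.mul_add(b, -c), -3.25);
    }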
@@ -3528,8 +3572,7 @@ pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmsub = _mm_fmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsub, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. @@ -3538,11 +3581,9 @@ pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - transmute(vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub)) + simd_fma(a, b, simd_neg(c)) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3551,10 +3592,9 @@ pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsub, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3563,11 +3603,9 @@ pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3576,10 +3614,9 @@ pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub +#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsub, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3590,8 +3627,7 @@ pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsub, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3602,9 +3638,7 @@ pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3615,8 +3649,7 @@ pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmsub = _mm256_fmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsub, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3627,8 +3660,7 @@ pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsub, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3639,9 +3671,7 @@ pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmsub, zero)) + simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3652,8 +3682,7 @@ pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmsub = _mm_fmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsub, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. 
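The `_mm512_fmsub_pd` hunk above drops the `vfmadd132pd` call with a manually zero-subtracted `c` in favour of `simd_fma(a, b, simd_neg(c))`. A minimal scalar sketch of the per-lane math, using `f64::mul_add` as a stand-in for the fused operation (`fmsub_lane` is illustrative only, not part of the patch):

```rust
// Per-lane model of the fmsub rewrite: a single fused multiply-add with a
// negated addend computes a*b - c with one rounding step.
fn fmsub_lane(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(b, -c) // fused, unlike (a * b) - c which rounds twice
}
```

Keeping it as one fused call preserves the single-rounding behaviour of the vfmsub instruction family.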
@@ -3664,12 +3693,13 @@ pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - transmute(vfmaddsub213ps( - a.as_f32x16(), - b.as_f32x16(), - c.as_f32x16(), - _MM_FROUND_CUR_DIRECTION, - )) + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15] + ) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3680,8 +3710,7 @@ pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3692,9 +3721,7 @@ pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3705,8 +3732,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
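The `_mm512_fmaddsub_ps` hunk above builds both `a*b + c` and `a*b - c` and interleaves them with `simd_shuffle!`: indices `0..=15` name lanes of `add` and `16..=31` name lanes of `sub`, so the pattern `[16, 1, 18, 3, ...]` takes the subtracted value on even lanes and the added value on odd lanes. A scalar reference model of that behaviour (illustrative helper, not part of the patch):

```rust
// Scalar reference for fmaddsub: subtract c on even lanes, add it on odd lanes.
fn fmaddsub_ref(a: [f32; 16], b: [f32; 16], c: [f32; 16]) -> [f32; 16] {
    core::array::from_fn(|i| {
        if i % 2 == 0 {
            a[i].mul_add(b[i], -c[i]) // even lane: a*b - c (picked from `sub`)
        } else {
            a[i].mul_add(b[i], c[i]) // odd lane: a*b + c (picked from `add`)
        }
    })
}
```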
@@ -3717,8 +3743,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3729,9 +3754,7 @@ pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3742,8 +3765,7 @@ pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmaddsub = _mm256_fmaddsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3754,8 +3776,7 @@ pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3766,9 +3787,7 @@ pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3779,8 +3798,7 @@ pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmaddsub = _mm_fmaddsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. @@ -3791,12 +3809,9 @@ pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - transmute(vfmaddsub213pd( - a.as_f64x8(), - b.as_f64x8(), - c.as_f64x8(), - _MM_FROUND_CUR_DIRECTION, - )) + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3807,8 +3822,7 @@ pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3819,9 +3833,7 @@ pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3832,8 +3844,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3844,8 +3855,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3856,9 +3866,7 @@ pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3869,8 +3877,7 @@ pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3881,8 +3888,7 @@ pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3893,9 +3899,7 @@ pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmaddsub, zero)) + simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3906,8 +3910,7 @@ pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. 
@@ -3916,16 +3919,15 @@ pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - transmute(vfmaddsub213ps( - a.as_f32x16(), - b.as_f32x16(), + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, sub, - _MM_FROUND_CUR_DIRECTION, - )) + [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31] + ) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3934,10 +3936,9 @@ pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3946,11 +3947,9 @@ pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
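`_mm512_fmsubadd_ps` above is the mirror image: the shuffle pattern `[0, 17, 2, 19, ...]` keeps the added value on even lanes and takes the subtracted value on odd lanes. A hedged scalar sketch (the helper name is made up for illustration):

```rust
// Scalar reference for fmsubadd: add c on even lanes, subtract it on odd lanes.
fn fmsubadd_ref(a: [f32; 16], b: [f32; 16], c: [f32; 16]) -> [f32; 16] {
    core::array::from_fn(|i| {
        if i % 2 == 0 {
            a[i].mul_add(b[i], c[i]) // even lane: a*b + c (picked from `add`)
        } else {
            a[i].mul_add(b[i], -c[i]) // odd lane: a*b - c (picked from `sub`)
        }
    })
}
```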
@@ -3959,10 +3958,9 @@ pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3973,8 +3971,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3985,9 +3982,7 @@ pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3998,8 +3993,7 @@ pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fmsubadd = _mm256_fmsubadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4010,8 +4004,7 @@ pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x4())) + simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4022,9 +4015,7 @@ pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4035,8 +4026,7 @@ pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fmsubadd = _mm_fmsubadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x4())) + simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. @@ -4045,16 +4035,11 @@ pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - transmute(vfmaddsub213pd( - a.as_f64x8(), - b.as_f64x8(), - sub, - _MM_FROUND_CUR_DIRECTION, - )) + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4063,10 +4048,9 @@ pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4075,11 +4059,9 @@ pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4088,10 +4070,9 @@ pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4102,8 +4083,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4114,9 +4094,7 @@ pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4127,8 +4105,7 @@ pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4139,8 +4116,7 @@ pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x2())) + simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4151,9 +4127,7 @@ pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fmsubadd, zero)) + simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4164,8 +4138,7 @@ pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x2())) + simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. @@ -4174,11 +4147,9 @@ pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - transmute(vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16())) + simd_fma(simd_neg(a), b, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4187,10 +4158,9 @@ pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
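For the `_mm512_fnmadd_ps` hunk above, negating one multiplicand instead of subtracting `a` from a zero vector keeps the whole computation as a single fused operation, `-(a*b) + c`. A scalar sketch of the per-lane result (illustrative only, not part of the patch):

```rust
// Per-lane model of the fnmadd rewrite: negating `a` yields -(a*b) + c
// as one fused multiply-add, with a single rounding step.
fn fnmadd_lane(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, c)
}
```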
@@ -4199,11 +4169,9 @@ pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4212,10 +4180,9 @@ pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4226,8 +4193,7 @@ pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4238,9 +4204,7 @@ pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -4251,8 +4215,7 @@ pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fnmadd = _mm256_fnmadd_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4263,8 +4226,7 @@ pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f32x4())) + simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4275,9 +4237,7 @@ pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4288,8 +4248,7 @@ pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fnmadd = _mm_fnmadd_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f32x4())) + simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. 
@@ -4298,11 +4257,9 @@ pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - transmute(vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8())) + simd_fma(simd_neg(a), b, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4311,10 +4268,9 @@ pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4323,11 +4279,9 @@ pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -4336,10 +4290,9 @@ pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4350,8 +4303,7 @@ pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4362,9 +4314,7 @@ pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4375,8 +4325,7 @@ pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fnmadd = _mm256_fnmadd_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4387,8 +4336,7 @@ pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmadd, a.as_f64x2())) + simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4399,9 +4347,7 @@ pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fnmadd, zero)) + simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4412,8 +4358,7 @@ pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fnmadd = _mm_fnmadd_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmadd, c.as_f64x2())) + simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. @@ -4422,12 +4367,9 @@ pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); - transmute(vfmadd132ps(suba, b.as_f32x16(), subc)) + simd_fma(simd_neg(a), b, simd_neg(c)) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
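FNMSUB negates both the product and the addend, so the new bodies use the identity -(a * b) - c == fma(-a, b, -c), again with a single rounding. A per-lane sketch (illustrative; `fnmsub_lane` is a made-up name):

// Per-lane reference for FNMSUB: -(a * b) - c with one rounding,
// mirroring simd_fma(simd_neg(a), b, simd_neg(c)).
fn fnmsub_lane(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, -c)
}

fn main() {
    // -(2.0 * 3.0) - 1.0 == -7.0
    assert_eq!(fnmsub_lane(2.0, 3.0, 1.0), -7.0);
}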
@@ -4436,10 +4378,9 @@ pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4448,11 +4389,9 @@ pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4461,10 +4400,9 @@ pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16())) + simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
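These fnmadd/fnmsub intrinsics are fused operations: the product is never rounded separately before the add or subtract. A small f32 demonstration of why that single rounding matters (the values are chosen only so the fused and unfused results differ; this is not taken from the patch):

fn main() {
    let a = 1.0f32 + f32::EPSILON; // 1 + 2^-23
    let b = 1.0f32 - f32::EPSILON; // 1 - 2^-23
    let c = -1.0f32;

    // The exact product is 1 - 2^-46. Rounding it on its own gives 1.0,
    // so the unfused form loses the low-order bits entirely.
    let unfused = a * b + c;     // (a*b) rounds to 1.0, then 1.0 - 1.0 = 0.0
    let fused = a.mul_add(b, c); // one rounding: exactly -(2^-46)

    assert_eq!(unfused, 0.0);
    assert!(fused < 0.0);
}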
@@ -4475,8 +4413,7 @@ pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4487,9 +4424,7 @@ pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8(); - let zero = _mm256_setzero_ps().as_f32x8(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), _mm256_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4500,8 +4435,7 @@ pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - let fnmsub = _mm256_fnmsub_ps(a, b, c).as_f32x8(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f32x8())) + simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4512,8 +4446,7 @@ pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f32x4())) + simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4524,9 +4457,7 @@ pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), _mm_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4537,8 +4468,7 @@ pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let fnmsub = _mm_fnmsub_ps(a, b, c).as_f32x4(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f32x4())) + simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. @@ -4547,12 +4477,9 @@ pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); - transmute(vfmadd132pd(suba, b.as_f64x8(), subc)) + simd_fma(simd_neg(a), b, simd_neg(c)) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4561,10 +4488,9 @@ pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
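A usage sketch contrasting the writemask and zeromask forms of the 128-bit single-precision variants defined above (assumes a nightly toolchain with the stdarch_x86_avx512 feature and a CPU with AVX512F + AVX512VL; the input values are arbitrary):

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo() -> (__m128, __m128) {
    let a = _mm_set_ps(8.0, 6.0, 4.0, 2.0); // element 0 (low lane) = 2.0
    let b = _mm_set1_ps(3.0);
    let c = _mm_set1_ps(1.0);
    // k = 0b0011: lanes 0 and 1 are computed as -(a*b) - c, lanes 2 and 3 are not.
    let merged = _mm_mask_fnmsub_ps(a, 0b0011, b, c);  // untouched lanes keep a's values
    let zeroed = _mm_maskz_fnmsub_ps(0b0011, a, b, c); // untouched lanes become 0.0
    (merged, zeroed)
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        let (merged, zeroed) = unsafe { demo() };
        // The low two lanes match between the two results (-7.0 and -13.0);
        // the high lanes differ (6.0/8.0 for the writemask form, 0.0 for the zeromask form).
        println!("{:?}\n{:?}", merged, zeroed);
    }
}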
@@ -4573,11 +4499,9 @@ pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4586,10 +4510,9 @@ pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8())) + simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4600,8 +4523,7 @@ pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4612,9 +4534,7 @@ pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4(); - let zero = _mm256_setzero_pd().as_f64x4(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), _mm256_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -4625,8 +4545,7 @@ pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - let fnmsub = _mm256_fnmsub_pd(a, b, c).as_f64x4(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f64x4())) + simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4637,8 +4556,7 @@ pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmsub, a.as_f64x2())) + simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4649,9 +4567,7 @@ pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - transmute(simd_select_bitmask(k, fnmsub, zero)) + simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), _mm_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4662,8 +4578,7 @@ pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd pub unsafe fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let fnmsub = _mm_fnmsub_pd(a, b, c).as_f64x2(); - transmute(simd_select_bitmask(k, fnmsub, c.as_f64x2())) + simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), c) } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4929,6 +4844,21 @@ pub unsafe fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { )) } +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub unsafe fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { + transmute(vrsqrt14ps256( + a.as_f32x8(), + _mm256_setzero_ps().as_f32x8(), + 0b11111111, + )) +} + /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_ps&expand=4815) @@ -4955,6 +4885,21 @@ pub unsafe fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { )) } +/// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14ps))] +pub unsafe fn _mm_rsqrt14_ps(a: __m128) -> __m128 { + transmute(vrsqrt14ps128( + a.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b00001111, + )) +} + /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_ps&expand=4813) @@ -5014,6 +4959,21 @@ pub unsafe fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { transmute(vrsqrt14pd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k)) } +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub unsafe fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { + transmute(vrsqrt14pd256( + a.as_f64x4(), + _mm256_setzero_pd().as_f64x4(), + 0b00001111, + )) +} + /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_rsqrt14_pd&expand=4808) @@ -5040,6 +5000,21 @@ pub unsafe fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { )) } +/// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt14_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vrsqrt14pd))] +pub unsafe fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { + transmute(vrsqrt14pd128( + a.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b00000011, + )) +} + /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_rsqrt14_pd&expand=4806) @@ -8153,11 +8128,7 @@ pub unsafe fn _mm512_fmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - transmute(r) + vfmadd132psround(a, b, c, ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8182,11 +8153,7 @@ pub unsafe fn _mm512_mask_fmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in a using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8211,12 +8178,7 @@ pub unsafe fn _mm512_maskz_fmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8241,11 +8203,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmadd132psround(a, b, c, 
ROUNDING), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ @@ -8269,11 +8227,7 @@ pub unsafe fn _mm512_fmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - transmute(r) + vfmadd132pdround(a, b, c, ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8298,11 +8252,7 @@ pub unsafe fn _mm512_mask_fmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8327,12 +8277,7 @@ pub unsafe fn _mm512_maskz_fmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8357,11 +8302,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ @@ -8377,7 +8318,7 @@ pub unsafe fn _mm512_mask3_fmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_ps( a: __m512, @@ -8385,12 +8326,7 @@ pub unsafe fn _mm512_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, ROUNDING); - transmute(r) + vfmadd132psround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract 
packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8406,7 +8342,7 @@ pub unsafe fn _mm512_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_ps( a: __m512, @@ -8415,12 +8351,8 @@ pub unsafe fn _mm512_mask_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8436,7 +8368,7 @@ pub unsafe fn _mm512_mask_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_ps( k: __mmask16, @@ -8445,12 +8377,8 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8466,7 +8394,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_ps( a: __m512, @@ -8475,13 +8403,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let c = c.as_f32x16(); - let sub = simd_sub(zero, c); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmadd132psround(a, b, sub, 
ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ @@ -8497,7 +8420,7 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsub_round_pd( a: __m512d, @@ -8505,12 +8428,7 @@ pub unsafe fn _mm512_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(r) + vfmadd132pdround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8526,7 +8444,7 @@ pub unsafe fn _mm512_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsub_round_pd( a: __m512d, @@ -8535,12 +8453,8 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8556,7 +8470,7 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsub_round_pd( k: __mmask8, @@ -8565,12 +8479,8 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8586,7 +8496,7 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsub_round_pd( a: __m512d, @@ -8595,13 +8505,8 @@ pub unsafe fn _mm512_mask3_fmsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let c = c.as_f64x8(); - let sub = simd_sub(zero, c); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmadd132pdround(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ @@ -8625,11 +8530,7 @@ pub unsafe fn _mm512_fmaddsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - transmute(r) + vfmaddsubpsround(a, b, c, ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8654,11 +8555,7 @@ pub unsafe fn _mm512_mask_fmaddsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8683,12 +8580,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - let zero = 
_mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8713,11 +8605,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmaddsub213ps(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ @@ -8741,11 +8629,7 @@ pub unsafe fn _mm512_fmaddsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - transmute(r) + vfmaddsubpdround(a, b, c, ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8770,11 +8654,7 @@ pub unsafe fn _mm512_mask_fmaddsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8799,12 +8679,7 @@ pub unsafe fn _mm512_maskz_fmaddsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, r, zero)) + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), _mm512_setzero_pd()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8829,11 +8704,7 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmaddsub213pd(a, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ @@ -8849,7 +8720,7 @@ pub unsafe fn 
_mm512_mask3_fmaddsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_ps( a: __m512, @@ -8857,12 +8728,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(r) + vfmaddsubpsround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8878,7 +8744,7 @@ pub unsafe fn _mm512_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_ps( a: __m512, @@ -8887,12 +8753,8 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8908,7 +8770,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_ps( k: __mmask16, @@ -8917,12 +8779,8 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f32x16()); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8938,7 
+8796,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_ps( a: __m512, @@ -8947,13 +8805,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let c = c.as_f32x16(); - let sub = simd_sub(zero, c); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vfmaddsub213ps(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ @@ -8969,7 +8822,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fmsubadd_round_pd( a: __m512d, @@ -8977,12 +8830,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(r) + vfmaddsubpdround(a, b, simd_neg(c), ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8998,7 +8846,7 @@ pub unsafe fn _mm512_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fmsubadd_round_pd( a: __m512d, @@ -9007,12 +8855,8 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9028,7 +8872,7 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( #[inline] 
#[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fmsubadd_round_pd( k: __mmask8, @@ -9037,12 +8881,8 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, c.as_f64x8()); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9058,7 +8898,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd +#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fmsubadd_round_pd( a: __m512d, @@ -9067,13 +8907,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let c = c.as_f64x8(); - let sub = simd_sub(zero, c); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vfmaddsub213pd(a, b, sub, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ @@ -9089,7 +8924,7 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmadd_round_ps( a: __m512, @@ -9097,12 +8932,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(r) + vfmadd132psround(simd_neg(a), b, c, ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9118,7 +8948,7 @@ pub unsafe fn _mm512_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] 
-#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmadd_round_ps( a: __m512, @@ -9127,12 +8957,8 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a.as_f32x16())) + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9148,7 +8974,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmadd_round_ps( k: __mmask16, @@ -9157,12 +8983,8 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9178,7 +9000,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmadd_round_ps( a: __m512, @@ -9187,12 +9009,8 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f32x16()); - let b = b.as_f32x16(); - let c = c.as_f32x16(); - let r = vfmadd132psround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ @@ -9208,7 +9026,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or 
vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmadd_round_pd( a: __m512d, @@ -9216,12 +9034,7 @@ pub unsafe fn _mm512_fnmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(r) + vfmadd132pdround(simd_neg(a), b, c, ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9237,7 +9050,7 @@ pub unsafe fn _mm512_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmadd_round_pd( a: __m512d, @@ -9246,13 +9059,8 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let a = a.as_f64x8(); - let sub = simd_sub(zero, a); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9268,7 +9076,7 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmadd_round_pd( k: __mmask8, @@ -9277,12 +9085,8 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9298,7 +9102,7 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd 
or vfnmadd231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmadd_round_pd( a: __m512d, @@ -9307,12 +9111,8 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let sub = simd_sub(zero, a.as_f64x8()); - let b = b.as_f64x8(); - let c = c.as_f64x8(); - let r = vfmadd132pdround(sub, b, c, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ @@ -9328,7 +9128,7 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmsub_round_ps( a: __m512, @@ -9336,12 +9136,7 @@ pub unsafe fn _mm512_fnmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(r) + vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9357,7 +9152,7 @@ pub unsafe fn _mm512_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmsub_round_ps( a: __m512, @@ -9366,13 +9161,8 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let a = a.as_f32x16(); - let suba = simd_sub(zero, a); - let subc = simd_sub(zero, c.as_f32x16()); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9388,7 +9178,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmsub_round_ps( k: __mmask16, @@ -9397,12 
+9187,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps( c: __m512, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let subc = simd_sub(zero, c.as_f32x16()); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9418,7 +9204,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmsub_round_ps( a: __m512, @@ -9427,13 +9213,8 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps( k: __mmask16, ) -> __m512 { static_assert_rounding!(ROUNDING); - let zero: f32x16 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f32x16()); - let c = c.as_f32x16(); - let subc = simd_sub(zero, c); - let b = b.as_f32x16(); - let r = vfmadd132psround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ @@ -9449,7 +9230,7 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_fnmsub_round_pd( a: __m512d, @@ -9457,12 +9238,7 @@ pub unsafe fn _mm512_fnmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(r) + vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -9478,7 +9254,7 @@ pub unsafe fn _mm512_fnmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_fnmsub_round_pd( a: __m512d, @@ -9487,13 +9263,8 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd( c: 
__m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let a = a.as_f64x8(); - let suba = simd_sub(zero, a); - let subc = simd_sub(zero, c.as_f64x8()); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, a)) + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9509,7 +9280,7 @@ pub unsafe fn _mm512_mask_fnmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_maskz_fnmsub_round_pd( k: __mmask8, @@ -9518,12 +9289,8 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd( c: __m512d, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let subc = simd_sub(zero, c.as_f64x8()); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, zero)) + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9539,7 +9306,7 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask3_fnmsub_round_pd( a: __m512d, @@ -9548,13 +9315,8 @@ pub unsafe fn _mm512_mask3_fnmsub_round_pd( k: __mmask8, ) -> __m512d { static_assert_rounding!(ROUNDING); - let zero: f64x8 = mem::zeroed(); - let suba = simd_sub(zero, a.as_f64x8()); - let c = c.as_f64x8(); - let subc = simd_sub(zero, c); - let b = b.as_f64x8(); - let r = vfmadd132pdround(suba, b, subc, ROUNDING); - transmute(simd_select_bitmask(k, r, c)) + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\ @@ -15947,7 +15709,7 @@ pub unsafe fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_pd() -> __m512d { // All-0 is a properly initialized __m512d - mem::zeroed() + const { mem::zeroed() } } /// Returns vector of type `__m512` with all elements set to zero. 
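The fnmadd/fnmsub hunks above all follow one pattern: instead of building a negated operand with `simd_sub(zero, x)`, they negate with `simd_neg` and feed the result straight into the rounding fused-multiply-add helper, relying on the identities fnmadd(a, b, c) = -(a*b) + c = fma(-a, b, c) and fnmsub(a, b, c) = -(a*b) - c = fma(-a, b, -c). A minimal scalar sketch of those identities (plain `f64`, no intrinsics, purely illustrative):

```rust
// Scalar model of the identities the refactor relies on.
fn fnmadd(a: f64, b: f64, c: f64) -> f64 {
    // -(a*b) + c expressed as a single fused multiply-add of the negated a.
    (-a).mul_add(b, c)
}

fn fnmsub(a: f64, b: f64, c: f64) -> f64 {
    // -(a*b) - c: negate both a and c, then reuse the same fused form.
    (-a).mul_add(b, -c)
}

fn main() {
    // Values chosen so the products are exact; fused and unfused results agree.
    let (a, b, c) = (1.5, -2.0, 0.25);
    assert_eq!(fnmadd(a, b, c), -(a * b) + c);
    assert_eq!(fnmsub(a, b, c), -(a * b) - c);
}
```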
@@ -15959,7 +15721,7 @@ pub unsafe fn _mm512_setzero_pd() -> __m512d { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_ps() -> __m512 { // All-0 is a properly initialized __m512 - mem::zeroed() + const { mem::zeroed() } } /// Return vector of type `__m512` with all elements set to zero. @@ -15971,7 +15733,7 @@ pub unsafe fn _mm512_setzero_ps() -> __m512 { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero() -> __m512 { // All-0 is a properly initialized __m512 - mem::zeroed() + const { mem::zeroed() } } /// Returns vector of type `__m512i` with all elements set to zero. @@ -15983,7 +15745,7 @@ pub unsafe fn _mm512_setzero() -> __m512 { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_si512() -> __m512i { // All-0 is a properly initialized __m512i - mem::zeroed() + const { mem::zeroed() } } /// Return vector of type `__m512i` with all elements set to zero. @@ -15995,7 +15757,7 @@ pub unsafe fn _mm512_setzero_si512() -> __m512i { #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm512_setzero_epi32() -> __m512i { // All-0 is a properly initialized __m512i - mem::zeroed() + const { mem::zeroed() } } /// Sets packed 32-bit integers in `dst` with the supplied values in reverse @@ -25556,8 +25318,27 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(vmovd))] pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { - let extract: i32 = simd_extract!(a.as_i32x16(), 0); - extract + simd_extract!(a.as_i32x16(), 0) +} + +/// Copy the lower single-precision (32-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtss_f32) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_cvtss_f32(a: __m512) -> f32 { + simd_extract!(a, 0) +} + +/// Copy the lower double-precision (64-bit) floating-point element of a to dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtsd_f64) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm512_cvtsd_f64(a: __m512d) -> f64 { + simd_extract!(a, 0) } /// Broadcast the low packed 32-bit integer from a to all elements of dst. @@ -27493,6 +27274,26 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) } +/// Convert 16-bit mask a into an integer value, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask16_u32) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _cvtmask16_u32(a: __mmask16) -> u32 { + a as u32 +} + +/// Convert 32-bit integer value a to an 16-bit mask and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _cvtu32_mask16(a: u32) -> __mmask16 { + a as __mmask16 +} + /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. 
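The newly added `_mm512_cvtss_f32`/`_mm512_cvtsd_f64` simply extract lane 0, and `_cvtmask16_u32`/`_cvtu32_mask16` are plain integer casts. A hypothetical usage sketch, assuming a nightly toolchain (the intrinsics are still gated behind `stdarch_x86_avx512`), this patch applied, and an AVX-512F capable CPU:

```rust
// Nightly-only: the AVX-512 intrinsics used here are unstable.
#![feature(stdarch_x86_avx512)]

#[cfg(target_arch = "x86_64")]
fn lane0_and_mask_demo() {
    use std::arch::x86_64::*;
    if !is_x86_feature_detected!("avx512f") {
        return; // nothing to demonstrate without AVX-512F
    }
    unsafe {
        // _mm512_cvtss_f32 just copies the lowest single-precision lane.
        let v = _mm512_set1_ps(3.5);
        assert_eq!(_mm512_cvtss_f32(v), 3.5);

        // The mask <-> integer conversions are plain casts.
        let k: __mmask16 = _cvtu32_mask16(0b1010);
        assert_eq!(_cvtmask16_u32(k), 0b1010);
    }
}

fn main() {
    #[cfg(target_arch = "x86_64")]
    lane0_and_mask_demo();
}
```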
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=kand_mask16&expand=3212) @@ -27623,6 +27424,83 @@ pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 { _mm512_knot(_mm512_kxor(a, b)) } +/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask16_u8) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortest_mask16_u8(a: __mmask16, b: __mmask16, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask16(a, b); + *all_ones = (tmp == 0xffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 16-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask16_u8) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { + (_kor_mask16(a, b) == 0xffff) as u8 +} + +/// Compute the bitwise OR of 16-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask16_u8) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { + (_kor_mask16(a, b) == 0) as u8 +} + +/// Shift 16-bit mask a left by count bits while shifting in zeros, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kshiftli_mask16(a: __mmask16) -> __mmask16 { + a << COUNT +} + +/// Shift 16-bit mask a right by count bits while shifting in zeros, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_legacy_const_generics(1)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kshiftri_mask16(a: __mmask16) -> __mmask16 { + a >> COUNT +} + +/// Load 16-bit mask from memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_load_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _load_mask16(mem_addr: *const __mmask16) -> __mmask16 { + *mem_addr +} + +/// Store 16-bit mask to memory +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_store_mask16) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _store_mask16(mem_addr: *mut __mmask16, a: __mmask16) { + *mem_addr = a; +} + /// Copy 16-bit mask a to k. 
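The new `_kortest*` helpers mirror the KORTESTW flag semantics: the ZF-style result tracks whether the OR of the two masks is all zeros, the CF-style result whether it is all ones (`_kortest_mask16_u8` returns the former and writes the latter through `all_ones`). A scalar reference model using ordinary `u16` values, not the intrinsics themselves:

```rust
// Returns (all_zeros, all_ones) for the OR of two 16-bit masks.
fn kortest(a: u16, b: u16) -> (u8, u8) {
    let or = a | b;
    ((or == 0) as u8, (or == 0xffff) as u8)
}

fn main() {
    assert_eq!(kortest(0, 0), (1, 0));           // OR is all zeros
    assert_eq!(kortest(0xff00, 0x00ff), (0, 1)); // OR is all ones
    assert_eq!(kortest(0x0001, 0), (0, 0));      // neither
}
```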
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm512_kmov&expand=3228) @@ -27674,12 +27552,20 @@ pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(cmp))] // generate normal and code instead of kortestw pub unsafe fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { - let r = a | b; - if r == 0b11111111_11111111 { - 1 - } else { - 0 - } + let r = (a | b) == 0b11111111_11111111; + r as i32 +} + +/// Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=512_kortestz) +#[inline] +#[target_feature(enable = "avx512f")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kortestw +pub unsafe fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { + let r = (a | b) == 0; + r as i32 } /// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. @@ -29208,7 +29094,7 @@ pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epu32_mask&expand=1057) #[inline] @@ -29216,7 +29102,7 @@ pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmplt_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29230,7 +29116,7 @@ pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_u32x8(), b.as_u32x8())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epu32_mask&expand=1055) #[inline] @@ -29238,7 +29124,7 @@ pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29252,7 +29138,7 @@ pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_u32x4(), b.as_u32x4())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epu32_mask&expand=1053) #[inline] @@ -29260,7 +29146,7 @@ pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29282,7 +29168,7 @@ pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpgt_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29304,7 +29190,7 @@ pub unsafe fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29326,7 +29212,7 @@ pub unsafe fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
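All of the `_mm*_mask_cmp*` rewrites in this region preserve the same observable behaviour: a masked compare produces a 1 bit only in lanes where both the `k1` bit is set and the predicate holds, which is exactly the old `cmp(a, b) & k1`; delegating to `_mm*_mask_cmp_*_mask::<_MM_CMPINT_*>` just lets the masked `vpcmp` form be emitted directly. A scalar sketch of that contract for the 16-lane unsigned less-than case (hypothetical helper, not the intrinsic):

```rust
// Per-lane model: result bit i = k1 bit i AND (a[i] < b[i]).
fn mask_cmplt_epu32(k1: u16, a: [u32; 16], b: [u32; 16]) -> u16 {
    let mut k = 0u16;
    for i in 0..16 {
        if (k1 >> i) & 1 == 1 && a[i] < b[i] {
            k |= 1 << i;
        }
    }
    k
}

fn main() {
    let a = [0u32; 16];
    let mut b = [0u32; 16];
    b[0] = 1;
    b[5] = 7;
    // Lanes 0 and 5 satisfy a < b, but only lane 0 survives the write mask.
    assert_eq!(mask_cmplt_epu32(0b0000_0000_0000_0001, a, b), 0b1);
}
```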
@@ -29340,7 +29226,7 @@ pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_le(a.as_u32x16(), b.as_u32x16())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epu32_mask&expand=996) #[inline] @@ -29348,7 +29234,7 @@ pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmple_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29362,7 +29248,7 @@ pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_le(a.as_u32x8(), b.as_u32x8())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epu32_mask&expand=994) #[inline] @@ -29370,7 +29256,7 @@ pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29384,7 +29270,7 @@ pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_le(a.as_u32x4(), b.as_u32x4())) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epu32_mask&expand=992) #[inline] @@ -29392,7 +29278,7 @@ pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29414,7 +29300,7 @@ pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpge_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29436,7 +29322,7 @@ pub unsafe fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29458,7 +29344,7 @@ pub unsafe fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -29480,7 +29366,7 @@ pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpeq_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -29502,7 +29388,7 @@ pub unsafe fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k. 
@@ -29524,7 +29410,7 @@ pub unsafe fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -29546,7 +29432,7 @@ pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpneq_epu32_mask(a, b) & k1 + _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -29568,7 +29454,7 @@ pub unsafe fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epu32_mask(a, b) & k1 + _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -29590,7 +29476,7 @@ pub unsafe fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud pub unsafe fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epu32_mask(a, b) & k1 + _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29606,11 +29492,19 @@ pub unsafe fn _mm512_cmp_epu32_mask( b: __m512i, ) -> __mmask16 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let r = vpcmpud(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29627,10 +29521,20 @@ pub unsafe fn _mm512_mask_cmp_epu32_mask( b: __m512i, ) -> __mmask16 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let r = vpcmpud(a, b, IMM3, k1 as i16); - transmute(r) + let a = a.as_u32x16(); + let b = b.as_u32x16(); + let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29646,11 +29550,19 @@ pub unsafe fn _mm256_cmp_epu32_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r = vpcmpud256(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u32x8(); + let b = b.as_u32x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29667,10 +29579,20 @@ pub unsafe fn _mm256_mask_cmp_epu32_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r = vpcmpud256(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u32x8(); + let b = b.as_u32x8(); + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29683,11 +29605,19 @@ pub unsafe fn _mm256_mask_cmp_epu32_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r = vpcmpud128(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
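The new `match IMM3` bodies above implement the eight encodings of the 3-bit `_MM_CMPINT_*` operand directly with portable lane-wise comparisons (0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT, 6 = NLE, 7 = TRUE) instead of calling the `vpcmpud` LLVM intrinsics. A scalar reference model of that encoding for unsigned 32-bit lanes, purely for illustration:

```rust
// Scalar model of the 3-bit comparison predicate used by the IMM3 match.
// 0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT (>=), 6 = NLE (>), 7 = TRUE.
fn cmp_lane(imm3: u8, a: u32, b: u32) -> bool {
    match imm3 {
        0 => a == b,
        1 => a < b,
        2 => a <= b,
        3 => false,
        4 => a != b,
        5 => a >= b,
        6 => a > b,
        _ => true,
    }
}

// One mask bit per lane, analogous to simd_bitmask over the lane-wise result.
fn cmp_epu32_mask(imm3: u8, a: [u32; 16], b: [u32; 16]) -> u16 {
    let mut k = 0u16;
    for i in 0..16 {
        if cmp_lane(imm3, a[i], b[i]) {
            k |= 1 << i;
        }
    }
    k
}

fn main() {
    let a = [1u32; 16];
    let b = [2u32; 16];
    assert_eq!(cmp_epu32_mask(1, a, b), 0xffff); // LT holds in every lane
    assert_eq!(cmp_epu32_mask(3, a, b), 0x0000); // FALSE never holds
    assert_eq!(cmp_epu32_mask(7, a, b), 0xffff); // TRUE always holds
}
```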
@@ -29704,10 +29634,20 @@ pub unsafe fn _mm_mask_cmp_epu32_mask( b: __m128i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r = vpcmpud128(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29721,7 +29661,7 @@ pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) } -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmplt_epi32_mask&expand=1031) #[inline] @@ -29729,7 +29669,7 @@ pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmplt_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29743,7 +29683,7 @@ pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_i32x8(), b.as_i32x8())) } -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmplt_epi32_mask&expand=1028) #[inline] @@ -29751,7 +29691,7 @@ pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. 
@@ -29765,7 +29705,7 @@ pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_lt(a.as_i32x4(), b.as_i32x4())) } -/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmplt_epi32_mask&expand=1026) #[inline] @@ -29773,7 +29713,7 @@ pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29795,7 +29735,7 @@ pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpgt_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29817,7 +29757,7 @@ pub unsafe fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -29839,7 +29779,7 @@ pub unsafe fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29853,7 +29793,7 @@ pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { simd_bitmask::(simd_le(a.as_i32x16(), b.as_i32x16())) } -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cmple_epi32_mask&expand=972) #[inline] @@ -29861,7 +29801,7 @@ pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmple_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29875,7 +29815,7 @@ pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { simd_bitmask::(simd_le(a.as_i32x8(), b.as_i32x8())) } -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cmple_epi32_mask&expand=970) #[inline] @@ -29883,7 +29823,7 @@ pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -29897,7 +29837,7 @@ pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { simd_bitmask::(simd_le(a.as_i32x4(), b.as_i32x4())) } -/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cmple_epi32_mask&expand=968) #[inline] @@ -29905,7 +29845,7 @@ pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
@@ -29927,7 +29867,7 @@ pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpge_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29949,7 +29889,7 @@ pub unsafe fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -29971,7 +29911,7 @@ pub unsafe fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -29993,7 +29933,7 @@ pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpeq_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -30015,7 +29955,7 @@ pub unsafe fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k. @@ -30037,7 +29977,7 @@ pub unsafe fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -30059,7 +29999,7 @@ pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmpneq_epi32_mask(a, b) & k1 + _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30081,7 +30021,7 @@ pub unsafe fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epi32_mask(a, b) & k1 + _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30103,7 +30043,7 @@ pub unsafe fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd pub unsafe fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epi32_mask(a, b) & k1 + _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30119,11 +30059,19 @@ pub unsafe fn _mm512_cmp_epi32_mask( b: __m512i, ) -> __mmask16 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i32x16(); let b = b.as_i32x16(); - let r = vpcmpd(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30142,8 +30090,18 @@ pub unsafe fn _mm512_mask_cmp_epi32_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i32x16(); let b = b.as_i32x16(); - let r = vpcmpd(a, b, IMM3, k1 as i16); - transmute(r) + let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
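The signed `_mm*_cmp_epi32_mask` bodies are structurally identical to the unsigned ones; the only difference is that the operands stay as `i32` lanes, which matters because the same bit pattern orders differently under signed and unsigned comparison:

```rust
fn main() {
    let x: i32 = -1; // bit pattern 0xFFFF_FFFF
    let y: i32 = 1;
    assert!(x < y);                   // signed order, as in the epi32 family
    assert!((x as u32) > (y as u32)); // unsigned order, as in the epu32 family
}
```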
@@ -30159,11 +30117,19 @@ pub unsafe fn _mm256_cmp_epi32_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i32x8(); let b = b.as_i32x8(); - let r = vpcmpd256(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30182,8 +30148,18 @@ pub unsafe fn _mm256_mask_cmp_epi32_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i32x8(); let b = b.as_i32x8(); - let r = vpcmpd256(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30196,11 +30172,19 @@ pub unsafe fn _mm256_mask_cmp_epi32_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i32x4(); let b = b.as_i32x4(); - let r = vpcmpd128(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30219,8 +30203,18 @@ pub unsafe fn _mm_mask_cmp_epi32_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i32x4(); let b = b.as_i32x4(); - let r = vpcmpd128(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30242,7 +30236,7 @@ pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmplt_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. 
@@ -30264,7 +30258,7 @@ pub unsafe fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30286,7 +30280,7 @@ pub unsafe fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30308,7 +30302,7 @@ pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpgt_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30330,7 +30324,7 @@ pub unsafe fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30352,7 +30346,7 @@ pub unsafe fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30374,7 +30368,7 @@ pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmple_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
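Each named unsigned comparison is now a thin wrapper over the generic compare, so a masked greater-than is a single vpcmpuq rather than a full compare followed by a scalar AND with k1. A hedged usage sketch, assuming a nightly toolchain with #![feature(stdarch_x86_avx512)] and an AVX-512F target, showing that the two spellings agree (NLE, "not less-or-equal", is greater-than):

use core::arch::x86_64::*;

// Sketch: the named wrapper and the generic predicate produce the same mask.
#[target_feature(enable = "avx512f")]
unsafe fn gt_two_ways(k: __mmask8, a: __m512i, b: __m512i) -> (__mmask8, __mmask8) {
    let named = _mm512_mask_cmpgt_epu64_mask(k, a, b);
    let generic = _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k, a, b);
    (named, generic) // equal for every k, a, b
}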
@@ -30396,7 +30390,7 @@ pub unsafe fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30418,7 +30412,7 @@ pub unsafe fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30440,7 +30434,7 @@ pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpge_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30462,7 +30456,7 @@ pub unsafe fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30484,7 +30478,7 @@ pub unsafe fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -30506,7 +30500,7 @@ pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpeq_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. 
@@ -30528,7 +30522,7 @@ pub unsafe fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -30550,7 +30544,7 @@ pub unsafe fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30572,7 +30566,7 @@ pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpneq_epu64_mask(a, b) & k1 + _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30594,7 +30588,7 @@ pub unsafe fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epu64_mask(a, b) & k1 + _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -30616,7 +30610,7 @@ pub unsafe fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq pub unsafe fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epu64_mask(a, b) & k1 + _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30632,11 +30626,19 @@ pub unsafe fn _mm512_cmp_epu64_mask( b: __m512i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r = vpcmpuq(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
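Two of the eight predicates ignore the operands entirely, which is why their arms splat a constant instead of comparing: _MM_CMPINT_FALSE (3) always produces an empty mask and _MM_CMPINT_TRUE (7) a full one. For the unmasked 512-bit unsigned form, under the same nightly/feature assumptions as the sketch above:

use core::arch::x86_64::*;

// Sketch: data-independent predicates of _mm512_cmp_epu64_mask.
#[target_feature(enable = "avx512f")]
unsafe fn degenerate_predicates(a: __m512i, b: __m512i) -> (__mmask8, __mmask8) {
    let none = _mm512_cmp_epu64_mask::<_MM_CMPINT_FALSE>(a, b); // always 0
    let all = _mm512_cmp_epu64_mask::<_MM_CMPINT_TRUE>(a, b); // always 0xff
    (none, all)
}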
@@ -30653,10 +30655,20 @@ pub unsafe fn _mm512_mask_cmp_epu64_mask( b: __m512i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r = vpcmpuq(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30672,11 +30684,19 @@ pub unsafe fn _mm256_cmp_epu64_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r = vpcmpuq256(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30693,10 +30713,20 @@ pub unsafe fn _mm256_mask_cmp_epu64_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r = vpcmpuq256(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30709,11 +30739,19 @@ pub unsafe fn _mm256_mask_cmp_epu64_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let r = vpcmpuq128(a, b, IMM3, neg_one); - transmute(r) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
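Note the switch from as_i64x8() to as_u64x8() throughout the unsigned family: simd_lt, simd_le, simd_gt and simd_ge take their ordering from the element type, so reinterpreting the lanes as u64 is what makes these compares unsigned, while the epi64 versions below stay on i64. A one-lane illustration of why the reinterpretation matters:

// Sketch: the same bit pattern orders differently under signed and unsigned comparison.
fn signedness_matters() {
    let a: i64 = -1; // bit pattern 0xFFFF_FFFF_FFFF_FFFF
    let b: i64 = 1;
    assert!(a < b); // signed: -1 < 1
    assert!((a as u64) > (b as u64)); // unsigned: u64::MAX > 1
}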
@@ -30730,10 +30768,20 @@ pub unsafe fn _mm_mask_cmp_epu64_mask( b: __m128i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let r = vpcmpuq128(a, b, IMM3, k1 as i8); - transmute(r) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30755,7 +30803,7 @@ pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmplt_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30777,7 +30825,7 @@ pub unsafe fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmplt_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30799,7 +30847,7 @@ pub unsafe fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30821,7 +30869,7 @@ pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpgt_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -30843,7 +30891,7 @@ pub unsafe fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpgt_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k. 
@@ -30865,7 +30913,7 @@ pub unsafe fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30887,7 +30935,7 @@ pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmple_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30909,7 +30957,7 @@ pub unsafe fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmple_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -30931,7 +30979,7 @@ pub unsafe fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30953,7 +31001,7 @@ pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpge_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -30975,7 +31023,7 @@ pub unsafe fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpge_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
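The returned __mmask8 maps bit i to lane i; for vectors narrower than eight lanes the upper bits stay zero. A hedged sketch (nightly, stdarch_x86_avx512, and avx512f plus avx512vl for the 128-bit form) with values chosen so only lane 0 passes:

use core::arch::x86_64::*;

// Sketch: bit 0 of the mask corresponds to lane 0 of the 128-bit operands.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn gt_lane_bits() -> __mmask8 {
    let a = _mm_set_epi64x(0, 5); // lane 1 = 0, lane 0 = 5
    let b = _mm_set_epi64x(7, 2); // lane 1 = 7, lane 0 = 2
    _mm_cmpgt_epi64_mask(a, b) // only lane 0 has a > b, so this is 0b01
}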
@@ -30997,7 +31045,7 @@ pub unsafe fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -31019,7 +31067,7 @@ pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpeq_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -31041,7 +31089,7 @@ pub unsafe fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpeq_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k. @@ -31063,7 +31111,7 @@ pub unsafe fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -31085,7 +31133,7 @@ pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpneq_epi64_mask(a, b) & k1 + _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -31107,7 +31155,7 @@ pub unsafe fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { - _mm256_cmpneq_epi64_mask(a, b) & k1 + _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -31129,7 +31177,7 @@ pub unsafe fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq pub unsafe fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epi64_mask(a, b) & k1 + _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -31145,11 +31193,19 @@ pub unsafe fn _mm512_cmp_epi64_mask( b: __m512i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i64x8(); let b = b.as_i64x8(); - let r = vpcmpq(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31168,8 +31224,18 @@ pub unsafe fn _mm512_mask_cmp_epi64_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i64x8(); let b = b.as_i64x8(); - let r = vpcmpq(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -31185,11 +31251,19 @@ pub unsafe fn _mm256_cmp_epi64_mask( b: __m256i, ) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i64x4(); let b = b.as_i64x4(); - let r = vpcmpq256(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31208,8 +31282,18 @@ pub unsafe fn _mm256_mask_cmp_epi64_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i64x4(); let b = b.as_i64x4(); - let r = vpcmpq256(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
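IMM3 is a const generic, and static_assert_uimm_bits!(IMM3, 3) rejects any value wider than three bits when the call is instantiated, so the predicate has to be a compile-time constant; the _MM_CMPINT_* names are the intended arguments. A usage sketch under the same nightly/feature assumptions as above:

use core::arch::x86_64::*;

// Sketch: the predicate is a const generic, checked when the call is compiled.
#[target_feature(enable = "avx512f")]
unsafe fn le_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // _mm512_cmp_epi64_mask::<9>(a, b) would be rejected: 9 does not fit in 3 bits.
    _mm512_cmp_epi64_mask::<_MM_CMPINT_LE>(a, b)
}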
@@ -31222,11 +31306,19 @@ pub unsafe fn _mm256_mask_cmp_epi64_mask( #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] pub unsafe fn _mm_cmp_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { static_assert_uimm_bits!(IMM3, 3); - let neg_one = -1; let a = a.as_i64x2(); let b = b.as_i64x2(); - let r = vpcmpq128(a, b, IMM3, neg_one); - transmute(r) + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31245,8 +31337,18 @@ pub unsafe fn _mm_mask_cmp_epi64_mask( static_assert_uimm_bits!(IMM3, 3); let a = a.as_i64x2(); let b = b.as_i64x2(); - let r = vpcmpq128(a, b, IMM3, k1 as i8); - transmute(r) + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::splat(0)); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) } /// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a. @@ -35212,13 +35314,7 @@ pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss))] pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35229,13 +35325,7 @@ pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss))] pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtss(a, b, _mm_setzero_ps(), k, _MM_FROUND_CUR_DIRECTION) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
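The scalar sqrt changes above replace the hard-coded _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC with _MM_FROUND_CUR_DIRECTION and only ever touch lane 0: merge-masking takes that lane from src when bit 0 of k is clear, zero-masking writes 0.0 instead, and the upper lanes always come from a. The fused multiply-add helpers further down follow the same single-lane pattern with an explicit "if (k & 0b00000001) != 0" test. A plain-f32 model of the two masking modes (a sketch of the semantics, not the intrinsics; src0 and b0 stand in for lane 0 of the vectors):

// Sketch: lane-0 masking shared by _mm_mask_sqrt_ss and _mm_maskz_sqrt_ss.
fn mask_sqrt_lane0(src0: f32, k: u8, b0: f32) -> f32 {
    if k & 1 != 0 { b0.sqrt() } else { src0 } // merge: keep lane 0 of src
}

fn maskz_sqrt_lane0(k: u8, b0: f32) -> f32 {
    if k & 1 != 0 { b0.sqrt() } else { 0.0 } // zero: clear lane 0
}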
@@ -35246,13 +35336,7 @@ pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd))] pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35263,13 +35347,7 @@ pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd))] pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) + vsqrtsd(a, b, _mm_setzero_pd(), k, _MM_FROUND_CUR_DIRECTION) } /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. @@ -36014,13 +36092,13 @@ pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf32(fmadd, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36031,14 +36109,14 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fmadd: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36049,13 +36127,13 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); - fmadd = 
vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf32(extracta, extractb, fmadd); } simd_insert!(c, 0, fmadd) } @@ -36066,13 +36144,13 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf64(fmadd, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36083,14 +36161,14 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fmadd: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fmadd) } @@ -36101,13 +36179,13 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] +#[cfg_attr(test, assert_instr(vfmadd))] pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); - fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + fmadd = fmaf64(extracta, extractb, fmadd); } simd_insert!(c, 0, fmadd) } @@ -36118,14 +36196,14 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf32(fmsub, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36136,7 +36214,7 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fmsub: f32 = 0.; if (k & 0b00000001) != 0 { @@ -36144,7 +36222,7 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - let 
extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36155,14 +36233,14 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(c, 0, fmsub) } @@ -36173,14 +36251,14 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf64(fmsub, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36191,7 +36269,7 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fmsub: f64 = 0.; if (k & 0b00000001) != 0 { @@ -36199,7 +36277,7 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fmsub) } @@ -36210,14 +36288,14 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] +#[cfg_attr(test, assert_instr(vfmsub))] pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(c, 0, fmsub) } @@ -36228,14 +36306,14 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] +#[cfg_attr(test, 
assert_instr(vfnmadd))] pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fnmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmadd) } @@ -36246,7 +36324,7 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fnmadd: f32 = 0.; if (k & 0b00000001) != 0 { @@ -36254,7 +36332,7 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmadd) } @@ -36265,14 +36343,14 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fnmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); - fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf32(extracta, extractb, fnmadd); } simd_insert!(c, 0, fnmadd) } @@ -36283,14 +36361,14 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fnmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fnmadd) } @@ -36301,7 +36379,7 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fnmadd: f64 = 0.; if (k & 0b00000001) != 0 { @@ -36309,7 +36387,7 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, 
fnmadd) } @@ -36320,14 +36398,14 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] +#[cfg_attr(test, assert_instr(vfnmadd))] pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fnmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); - fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + fnmadd = fmaf64(extracta, extractb, fnmadd); } simd_insert!(c, 0, fnmadd) } @@ -36338,7 +36416,7 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { let mut fnmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { @@ -36346,7 +36424,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36357,7 +36435,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fnmsub: f32 = 0.; if (k & 0b00000001) != 0 { @@ -36366,7 +36444,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36377,7 +36455,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { let mut fnmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { @@ -36385,7 +36463,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf32(extracta, extractb, extractc); } simd_insert!(c, 0, fnmsub) } @@ -36396,7 +36474,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn 
_mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { let mut fnmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { @@ -36404,7 +36482,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36415,7 +36493,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fnmsub: f64 = 0.; if (k & 0b00000001) != 0 { @@ -36424,7 +36502,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(a, 0, fnmsub) } @@ -36435,7 +36513,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] +#[cfg_attr(test, assert_instr(vfnmsub))] pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { let mut fnmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { @@ -36443,7 +36521,7 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + fnmsub = fmaf64(extracta, extractb, extractc); } simd_insert!(c, 0, fnmsub) } @@ -37357,11 +37435,7 @@ pub unsafe fn _mm_maskz_min_round_sd( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, 0b1, ROUNDING); - transmute(r) + vsqrtss(a, b, _mm_setzero_ps(), 0b1, ROUNDING) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37386,11 +37460,7 @@ pub unsafe fn _mm_mask_sqrt_round_ss( b: __m128, ) -> __m128 { static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vsqrtss(a, b, src, k, ROUNDING); - transmute(r) + vsqrtss(a, b, src, k, ROUNDING) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37414,11 +37484,7 @@ pub unsafe fn _mm_maskz_sqrt_round_ss( b: __m128, ) -> __m128 { 
static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, k, ROUNDING); - transmute(r) + vsqrtss(a, b, _mm_setzero_ps(), k, ROUNDING) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -37438,11 +37504,7 @@ pub unsafe fn _mm_maskz_sqrt_round_ss( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); - transmute(r) + vsqrtsd(a, b, _mm_setzero_pd(), 0b1, ROUNDING) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37467,11 +37529,7 @@ pub unsafe fn _mm_mask_sqrt_round_sd( b: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vsqrtsd(a, b, src, k, ROUNDING); - transmute(r) + vsqrtsd(a, b, src, k, ROUNDING) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37495,11 +37553,7 @@ pub unsafe fn _mm_maskz_sqrt_round_sd( b: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, k, ROUNDING); - transmute(r) + vsqrtsd(a, b, _mm_setzero_pd(), k, ROUNDING) } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -38198,14 +38252,14 @@ pub unsafe fn _mm_maskz_scalef_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let r = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, r) } @@ -38222,7 +38276,7 @@ pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmadd_round_ss( a: __m128, @@ -38235,7 +38289,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss( if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING); + fmadd = vfmaddssround(fmadd, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38253,7 +38307,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmadd_round_ss( k: __mmask8, @@ -38267,7 +38321,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38285,7 +38339,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmadd_round_ss( a: __m128, @@ -38298,7 +38352,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); - fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING); + fmadd = vfmaddssround(extracta, extractb, fmadd, ROUNDING); } simd_insert!(c, 0, fmadd) } @@ -38316,7 +38370,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmadd_round_sd( a: __m128d, @@ -38327,7 +38381,7 @@ pub unsafe fn _mm_fmadd_round_sd( let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let 
fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fmadd) } @@ -38344,7 +38398,7 @@ pub unsafe fn _mm_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmadd_round_sd( a: __m128d, @@ -38357,7 +38411,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd( if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING); + fmadd = vfmaddsdround(fmadd, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38375,7 +38429,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmadd_round_sd( k: __mmask8, @@ -38389,7 +38443,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmadd) } @@ -38407,7 +38461,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmadd_round_sd( a: __m128d, @@ -38420,7 +38474,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); - fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING); + fmadd = vfmaddsdround(extracta, extractb, fmadd, ROUNDING); } simd_insert!(c, 0, fmadd) } @@ -38438,7 +38492,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); @@ -38446,7 +38500,7 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: _ let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fmsub) } @@ -38463,7 +38517,7 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: _ #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmsub_round_ss( a: __m128, @@ -38477,7 +38531,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = 
-extractc; - fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING); + fmsub = vfmaddssround(fmsub, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38495,7 +38549,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmsub_round_ss( k: __mmask8, @@ -38510,7 +38564,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38528,7 +38582,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmsub_round_ss( a: __m128, @@ -38542,7 +38596,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( let extracta: f32 = simd_extract!(a, 0); let extractb: f32 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fmsub) } @@ -38560,7 +38614,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmsub_round_sd( a: __m128d, @@ -38572,7 +38626,7 @@ pub unsafe fn _mm_fmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fmsub) } @@ -38589,7 +38643,7 @@ pub unsafe fn _mm_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fmsub_round_sd( a: __m128d, @@ -38603,7 +38657,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING); + fmsub = vfmaddsdround(fmsub, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38621,7 +38675,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fmsub_round_sd( k: __mmask8, @@ -38636,7 +38690,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fmsub = vfmadd132sd(extracta, extractb, 
extractc, ROUNDING); + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fmsub) } @@ -38654,7 +38708,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fmsub_round_sd( a: __m128d, @@ -38668,7 +38722,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( let extracta: f64 = simd_extract!(a, 0); let extractb: f64 = simd_extract!(b, 0); let extractc = -fmsub; - fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fmsub) } @@ -38686,7 +38740,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); @@ -38694,7 +38748,7 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmadd) } @@ -38711,7 +38765,7 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmadd_round_ss( a: __m128, @@ -38725,7 +38779,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( let extracta = -fnmadd; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38743,7 +38797,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmadd_round_ss( k: __mmask8, @@ -38758,7 +38812,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38776,7 +38830,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmadd_round_ss( a: __m128, @@ -38790,7 +38844,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( let extracta: f32 = simd_extract!(a, 0); let extracta = 
-extracta; let extractb: f32 = simd_extract!(b, 0); - fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING); + fnmadd = vfmaddssround(extracta, extractb, fnmadd, ROUNDING); } simd_insert!(c, 0, fnmadd) } @@ -38808,7 +38862,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmadd_round_sd( a: __m128d, @@ -38820,7 +38874,7 @@ pub unsafe fn _mm_fnmadd_round_sd( let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmadd) } @@ -38837,7 +38891,7 @@ pub unsafe fn _mm_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmadd_round_sd( a: __m128d, @@ -38851,7 +38905,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( let extracta = -fnmadd; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38869,7 +38923,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmadd_round_sd( k: __mmask8, @@ -38884,7 +38938,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmadd) } @@ -38902,7 +38956,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmadd_round_sd( a: __m128d, @@ -38916,7 +38970,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); - fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING); + fnmadd = vfmaddsdround(extracta, extractb, fnmadd, ROUNDING); } simd_insert!(c, 0, fnmadd) } @@ -38934,7 +38988,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); @@ -38943,7 +38997,7 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: 
__m128, c: let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmsub) } @@ -38960,7 +39014,7 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmsub_round_ss( a: __m128, @@ -38975,7 +39029,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -38993,7 +39047,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmsub_round_ss( k: __mmask8, @@ -39009,7 +39063,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( let extractb: f32 = simd_extract!(b, 0); let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -39027,7 +39081,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmsub_round_ss( a: __m128, @@ -39042,7 +39096,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( let extracta = -extracta; let extractb: f32 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fnmsub) } @@ -39060,7 +39114,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmsub_round_sd( a: __m128d, @@ -39073,7 +39127,7 @@ pub unsafe fn _mm_fnmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + let fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); simd_insert!(a, 0, fnmsub) } @@ -39090,7 +39144,7 @@ pub unsafe fn _mm_fnmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask_fnmsub_round_sd( a: __m128d, @@ -39105,7 +39159,7 @@ pub unsafe fn 
_mm_mask_fnmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -39123,7 +39177,7 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_maskz_fnmsub_round_sd( k: __mmask8, @@ -39139,7 +39193,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( let extractb: f64 = simd_extract!(b, 0); let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(a, 0, fnmsub) } @@ -39157,7 +39211,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm_mask3_fnmsub_round_sd( a: __m128d, @@ -39172,7 +39226,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd( let extracta = -extracta; let extractb: f64 = simd_extract!(b, 0); let extractc = -fnmsub; - fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); } simd_insert!(c, 0, fnmsub) } @@ -39512,7 +39566,7 @@ pub unsafe fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) b.as_f64x2(), src.as_f32x4(), k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + _MM_FROUND_CUR_DIRECTION, )) } @@ -39529,7 +39583,7 @@ pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { b.as_f64x2(), _mm_setzero_ps().as_f32x4(), k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + _MM_FROUND_CUR_DIRECTION, )) } @@ -40682,70 +40736,20 @@ pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF; #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.avx512.pmul.dq.512"] - fn vpmuldq(a: i32x16, b: i32x16) -> i64x8; - #[link_name = "llvm.x86.avx512.pmulu.dq.512"] - fn vpmuludq(a: u32x16, b: u32x16) -> u64x8; - - #[link_name = "llvm.x86.avx512.mask.pmaxs.d.512"] - fn vpmaxsd(a: i32x16, b: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.mask.pmaxs.q.512"] - fn vpmaxsq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.pmaxs.q.256"] - fn vpmaxsq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.pmaxs.q.128"] - fn vpmaxsq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.pmins.d.512"] - fn vpminsd(a: i32x16, b: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.mask.pmins.q.512"] - fn vpminsq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.pmins.q.256"] - fn vpminsq256(a: i64x4, b: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.pmins.q.128"] - fn vpminsq128(a: i64x2, b: i64x2) -> i64x2; - - #[link_name = "llvm.x86.avx512.mask.pmaxu.d.512"] - fn vpmaxud(a: u32x16, b: u32x16) -> u32x16; - - #[link_name = "llvm.x86.avx512.mask.pmaxu.q.512"] - fn vpmaxuq(a: u64x8, b: u64x8) -> u64x8; - #[link_name = "llvm.x86.avx512.mask.pmaxu.q.256"] - fn vpmaxuq256(a: u64x4, b: u64x4) -> u64x4; - #[link_name = 
"llvm.x86.avx512.mask.pmaxu.q.128"] - fn vpmaxuq128(a: u64x2, b: u64x2) -> u64x2; - - #[link_name = "llvm.x86.avx512.mask.pminu.d.512"] - fn vpminud(a: u32x16, b: u32x16) -> u32x16; - - #[link_name = "llvm.x86.avx512.mask.pminu.q.512"] - fn vpminuq(a: u64x8, b: u64x8) -> u64x8; - #[link_name = "llvm.x86.avx512.mask.pminu.q.256"] - fn vpminuq256(a: u64x4, b: u64x4) -> u64x4; - #[link_name = "llvm.x86.avx512.mask.pminu.q.128"] - fn vpminuq128(a: u64x2, b: u64x2) -> u64x2; - #[link_name = "llvm.x86.avx512.sqrt.ps.512"] fn vsqrtps(a: f32x16, rounding: i32) -> f32x16; #[link_name = "llvm.x86.avx512.sqrt.pd.512"] fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8; - #[link_name = "llvm.fma.v16f32"] - fn vfmadd132ps(a: f32x16, b: f32x16, c: f32x16) -> f32x16; - #[link_name = "llvm.fma.v8f64"] - fn vfmadd132pd(a: f64x8, b: f64x8, c: f64x8) -> f64x8; - #[link_name = "llvm.x86.avx512.vfmadd.ps.512"] - fn vfmadd132psround(a: f32x16, b: f32x16, c: f32x16, rounding: i32) -> f32x16; + fn vfmadd132psround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; #[link_name = "llvm.x86.avx512.vfmadd.pd.512"] - fn vfmadd132pdround(a: f64x8, b: f64x8, c: f64x8, rounding: i32) -> f64x8; + fn vfmadd132pdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"] - fn vfmaddsub213ps(a: f32x16, b: f32x16, c: f32x16, d: i32) -> f32x16; //from clang + fn vfmaddsubpsround(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512; //from clang #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"] - fn vfmaddsub213pd(a: f64x8, b: f64x8, c: f64x8, d: i32) -> f64x8; //from clang + fn vfmaddsubpdround(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d; //from clang #[link_name = "llvm.x86.avx512.add.ps.512"] fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16; @@ -41220,34 +41224,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"] fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] - fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.q.256"] - fn vpcmpuq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.q.128"] - fn vpcmpuq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] - fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.q.256"] - fn vpcmpq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.q.128"] - fn vpcmpq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.ucmp.d.512"] - fn vpcmpud(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; - #[link_name = "llvm.x86.avx512.mask.ucmp.d.256"] - fn vpcmpud256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.ucmp.d.128"] - fn vpcmpud128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8; - - #[link_name = "llvm.x86.avx512.mask.cmp.d.512"] - fn vpcmpd(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; - #[link_name = "llvm.x86.avx512.mask.cmp.d.256"] - fn vpcmpd256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.cmp.d.128"] - fn vpcmpd128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.mask.prol.d.512"] fn vprold(a: i32x16, i8: i32) -> i32x16; #[link_name = "llvm.x86.avx512.mask.prol.d.256"] @@ -41500,9 +41476,9 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.min.sd.round"] fn vminsd(a: f64x2, b: f64x2, src: f64x2, 
mask: u8, sae: i32) -> f64x2; #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] - fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + fn vsqrtss(a: __m128, b: __m128, src: __m128, mask: u8, rounding: i32) -> __m128; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] - fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + fn vsqrtsd(a: __m128d, b: __m128d, src: __m128d, mask: u8, rounding: i32) -> __m128d; #[link_name = "llvm.x86.avx512.mask.getexp.ss"] fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.getexp.sd"] @@ -41531,9 +41507,9 @@ extern "C" { fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; #[link_name = "llvm.x86.avx512.vfmadd.f32"] - fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32; + fn vfmaddssround(a: f32, b: f32, c: f32, rounding: i32) -> f32; #[link_name = "llvm.x86.avx512.vfmadd.f64"] - fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64; + fn vfmaddsdround(a: f64, b: f64, c: f64, rounding: i32) -> f64; #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"] fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4; @@ -44152,6 +44128,14 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rsqrt14_ps() { + let a = _mm256_set1_ps(3.); + let r = _mm256_rsqrt14_ps(a); + let e = _mm256_set1_ps(0.5773392); + assert_eq_m256(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rsqrt14_ps() { let a = _mm256_set1_ps(3.); @@ -44172,6 +44156,14 @@ mod tests { assert_eq_m256(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rsqrt14_ps() { + let a = _mm_set1_ps(3.); + let r = _mm_rsqrt14_ps(a); + let e = _mm_set1_ps(0.5773392); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rsqrt14_ps() { let a = _mm_set1_ps(3.); @@ -54192,6 +54184,22 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_cvtmask16_u32() { + let a: __mmask16 = 0b11001100_00110011; + let r = _cvtmask16_u32(a); + let e: u32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_cvtu32_mask16() { + let a: u32 = 0b11001100_00110011; + let r = _cvtu32_mask16(a); + let e: __mmask16 = 0b11001100_00110011; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_kand() { let a: u16 = 0b11001100_00110011; @@ -54298,6 +54306,65 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortest_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask16_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestc_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = _kortestc_mask16_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kortestz_mask16_u8() { + let a: __mmask16 = 0b0110100101101001; + let b: __mmask16 = 0b1011011010110110; + let r = _kortestz_mask16_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftli_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _kshiftli_mask16::<3>(a); + let e: __mmask16 = 0b1011011000011000; + assert_eq!(r, e); + } + + 
#[simd_test(enable = "avx512dq")] + unsafe fn test_kshiftri_mask16() { + let a: __mmask16 = 0b0110100100111100; + let r = _kshiftri_mask16::<3>(a); + let e: __mmask16 = 0b0000110100100111; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_load_mask16() { + let a: __mmask16 = 0b1001011011000011; + let r = _load_mask16(&a); + let e: __mmask16 = 0b1001011011000011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_store_mask16() { + let a: __mmask16 = 0b0110100100111100; + let mut r = 0; + _store_mask16(&mut r, a); + let e: __mmask16 = 0b0110100100111100; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_kmov() { let a: u16 = 0b11001100_00110011; @@ -54342,6 +54409,16 @@ mod tests { assert_eq!(r, 1); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_kortestz() { + let a: u16 = 0b11001100_00110011; + let b: u16 = 0b00101110_00001011; + let r = _mm512_kortestz(a, b); + assert_eq!(r, 0); + let r = _mm512_kortestz(0, 0); + assert_eq!(r, 1); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_test_epi32_mask() { let a = _mm512_set1_epi32(1 << 0); @@ -58410,6 +58487,20 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtss_f32() { + let a = _mm512_setr_ps( + 312.0134, 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq!(_mm512_cvtss_f32(a), 312.0134); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsd_f64() { + let r = _mm512_cvtsd_f64(_mm512_setr_pd(-1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8)); + assert_eq!(r, -1.1); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_pd() { let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 359a668582..a2b2496caf 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -574,6 +574,46 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_abs_epi64(a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_abs_epi64(a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_mask_abs_epi64(a, 0, a); + assert_eq_m128i(r, a); + let r = _mm_mask_abs_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_mask_abs_epi64(a, 0b00000011, a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_abs_epi64() { + let a = _mm_set_epi64x(i64::MAX, i64::MIN); + let r = _mm_maskz_abs_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_abs_epi64(0b00000011, a); + let e = _mm_set_epi64x(i64::MAX, i64::MAX.wrapping_add(1)); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(100, -100); + let r = _mm_maskz_abs_epi64(0b00000011, a); + let e = _mm_set_epi64x(100, 100); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_abs_pd() { let a = _mm512_setr_pd(0., 1., -1., f64::MAX, f64::MIN, 100., -100., -32.); @@ -1673,6 +1713,42 @@ mod 
tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_min_epi64(a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(1, 0); + let r = _mm_min_epi64(a, b); + let e = _mm_set_epi64x(1, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_mask_min_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_min_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_min_epi64() { + let a = _mm_set_epi64x(0, 1); + let b = _mm_set_epi64x(3, 2); + let r = _mm_maskz_min_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_min_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(0, 1); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_min_pd() { let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); @@ -2745,6 +2821,14 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_rsqrt14_pd() { + let a = _mm256_set1_pd(3.); + let r = _mm256_rsqrt14_pd(a); + let e = _mm256_set1_pd(0.5773391723632813); + assert_eq_m256d(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rsqrt14_pd() { let a = _mm256_set1_pd(3.); @@ -2765,6 +2849,14 @@ mod tests { assert_eq_m256d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_rsqrt14_pd() { + let a = _mm_set1_pd(3.); + let r = _mm_rsqrt14_pd(a); + let e = _mm_set1_pd(0.5773391723632813); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rsqrt14_pd() { let a = _mm_set1_pd(3.);