diff --git a/crates/core_arch/src/simd_llvm.rs b/crates/core_arch/src/simd_llvm.rs
index 5b5c6dd0a8..cbccae416f 100644
--- a/crates/core_arch/src/simd_llvm.rs
+++ b/crates/core_arch/src/simd_llvm.rs
@@ -3,6 +3,7 @@
 //! TODO: should use `link_llvm_intrinsic` instead: issue #112
 
 extern "platform-intrinsic" {
+    //pub fn simd_select_bitmask
     pub fn simd_eq<T, U>(x: T, y: T) -> U;
     pub fn simd_ne<T, U>(x: T, y: T) -> U;
     pub fn simd_lt<T, U>(x: T, y: T) -> U;
@@ -27,6 +28,8 @@ extern "platform-intrinsic" {
     pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
     pub fn simd_extract<T, U>(x: T, idx: u32) -> U;
 
+    //pub fn simd_select
+    pub fn simd_bitmask<T, U>(x: T) -> U;
 
     pub fn simd_cast<T, U>(x: T) -> U;
@@ -40,6 +43,12 @@ extern "platform-intrinsic" {
     pub fn simd_or<T>(x: T, y: T) -> T;
     pub fn simd_xor<T>(x: T, y: T) -> T;
 
+    pub fn simd_saturating_add<T>(x: T, y: T) -> T;
+    pub fn simd_saturating_sub<T>(x: T, y: T) -> T;
+
+    pub fn simd_gather<T, U, V>(values: T, pointers: U, mask: V) -> T;
+    pub fn simd_scatter<T, U, V>(values: T, pointers: U, mask: V);
+
     pub fn simd_reduce_add_unordered<T, U>(x: T) -> U;
     pub fn simd_reduce_mul_unordered<T, U>(x: T) -> U;
     pub fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;
@@ -61,5 +70,17 @@ extern "platform-intrinsic" {
     pub fn simd_fmax<T>(a: T, b: T) -> T;
 
     pub fn simd_fsqrt<T>(a: T) -> T;
+    pub fn simd_fsin<T>(a: T) -> T;
+    pub fn simd_fcos<T>(a: T) -> T;
+    pub fn simd_fabs<T>(a: T) -> T;
+    pub fn simd_floor<T>(a: T) -> T;
+    pub fn simd_ceil<T>(a: T) -> T;
+    pub fn simd_fexp<T>(a: T) -> T;
+    pub fn simd_fexp2<T>(a: T) -> T;
+    pub fn simd_flog10<T>(a: T) -> T;
+    pub fn simd_flog2<T>(a: T) -> T;
+    pub fn simd_flog<T>(a: T) -> T;
+    //pub fn simd_fpowi
+    //pub fn simd_fpow
     pub fn simd_fma<T>(a: T, b: T, c: T) -> T;
 }
diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs
index 90867e4ad7..68896e0aad 100644
--- a/crates/core_arch/src/x86/avx.rs
+++ b/crates/core_arch/src/x86/avx.rs
@@ -255,7 +255,7 @@ pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vmaxpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
-    maxpd256(a, b)
+    simd_fmax(a, b)
 }
 
 /// Compares packed single-precision (32-bit) floating-point elements in `a`
@@ -267,7 +267,7 @@ pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vmaxps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
-    maxps256(a, b)
+    simd_fmax(a, b)
 }
 
 /// Compares packed double-precision (64-bit) floating-point elements
@@ -279,7 +279,7 @@ pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vminpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
-    minpd256(a, b)
+    simd_fmin(a, b)
 }
 
 /// Compares packed single-precision (32-bit) floating-point elements in `a`
@@ -291,7 +291,7 @@ pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vminps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
-    minps256(a, b)
+    simd_fmin(a, b)
 }
 
 /// Multiplies packed double-precision (64-bit) floating-point elements
@@ -426,7 +426,7 @@ pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d {
 #[cfg_attr(test, assert_instr(vroundpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
-    roundpd256(a, 0x02)
+    simd_ceil(a)
 }
 
 /// Rounds packed double-precision (64-bit) floating point elements in `a`
@@ -438,7 +438,7 @@ pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vroundpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
-    roundpd256(a, 0x01)
+    simd_floor(a)
 }
 
 /// Rounds packed single-precision (32-bit) floating point elements in `a`
@@ -477,7 +477,7 @@ pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 {
 #[cfg_attr(test, assert_instr(vroundps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
-    roundps256(a, 0x02)
+    simd_ceil(a)
 }
 
 /// Rounds packed single-precision (32-bit) floating point elements in `a`
@@ -489,7 +489,7 @@ pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vroundps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
-    roundps256(a, 0x01)
+    simd_floor(a)
 }
 
 /// Returns the square root of packed single-precision (32-bit) floating point
@@ -513,7 +513,7 @@ pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vsqrtpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
-    sqrtpd256(a)
+    simd_fsqrt(a)
 }
 
 /// Blends packed double-precision (64-bit) floating-point elements from
@@ -3166,20 +3166,10 @@ extern "C" {
     fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
     #[link_name = "llvm.x86.avx.addsub.ps.256"]
     fn addsubps256(a: __m256, b: __m256) -> __m256;
-    #[link_name = "llvm.x86.avx.max.pd.256"]
-    fn maxpd256(a: __m256d, b: __m256d) -> __m256d;
-    #[link_name = "llvm.x86.avx.max.ps.256"]
-    fn maxps256(a: __m256, b: __m256) -> __m256;
-    #[link_name = "llvm.x86.avx.min.pd.256"]
-    fn minpd256(a: __m256d, b: __m256d) -> __m256d;
-    #[link_name = "llvm.x86.avx.min.ps.256"]
-    fn minps256(a: __m256, b: __m256) -> __m256;
     #[link_name = "llvm.x86.avx.round.pd.256"]
     fn roundpd256(a: __m256d, b: i32) -> __m256d;
     #[link_name = "llvm.x86.avx.round.ps.256"]
     fn roundps256(a: __m256, b: i32) -> __m256;
-    #[link_name = "llvm.x86.avx.sqrt.pd.256"]
-    fn sqrtpd256(a: __m256d) -> __m256d;
     #[link_name = "llvm.x86.avx.sqrt.ps.256"]
     fn sqrtps256(a: __m256) -> __m256;
     #[link_name = "llvm.x86.avx.blendv.pd.256"]
diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index e137f0ce59..193e1ca3ae 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -111,7 +111,7 @@ pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpaddsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(paddsb(a.as_i8x32(), b.as_i8x32()))
+    transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
 }
 
 /// Adds packed 16-bit integers in `a` and `b` using saturation.
@@ -122,7 +122,7 @@ pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpaddsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(paddsw(a.as_i16x16(), b.as_i16x16()))
+    transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
 }
 
 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
@@ -133,7 +133,7 @@ pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpaddusb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(paddusb(a.as_u8x32(), b.as_u8x32()))
+    transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
 }
 
 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
@@ -144,7 +144,7 @@ pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpaddusw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(paddusw(a.as_u16x16(), b.as_u16x16()))
+    transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
 }
 
 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
@@ -3331,7 +3331,7 @@ pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsubsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(psubsw(a.as_i16x16(), b.as_i16x16()))
+    transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
 }
 
 /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
@@ -3343,7 +3343,7 @@ pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsubsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(psubsb(a.as_i8x32(), b.as_i8x32()))
+    transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
 }
 
 /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
@@ -3355,7 +3355,7 @@ pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsubusw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
-    transmute(psubusw(a.as_u16x16(), b.as_u16x16()))
+    transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
 }
 
 /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
@@ -3367,7 +3367,7 @@ pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsubusb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
-    transmute(psubusb(a.as_u8x32(), b.as_u8x32()))
+    transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
 }
 
 /// Unpacks and interleave 8-bit integers from the high half of each
@@ -3807,14 +3807,6 @@ extern "C" {
     fn pabsw(a: i16x16) -> u16x16;
     #[link_name = "llvm.x86.avx2.pabs.d"]
     fn pabsd(a: i32x8) -> u32x8;
-    #[link_name = "llvm.x86.avx2.padds.b"]
-    fn paddsb(a: i8x32, b: i8x32) -> i8x32;
-    #[link_name = "llvm.x86.avx2.padds.w"]
-    fn paddsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.paddus.b"]
-    fn paddusb(a: u8x32, b: u8x32) -> u8x32;
-    #[link_name = "llvm.x86.avx2.paddus.w"]
-    fn paddusw(a: u16x16, b: u16x16) -> u16x16;
     #[link_name = "llvm.x86.avx2.pavg.b"]
     fn pavgb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.pavg.w"]
@@ -3959,14 +3951,6 @@ extern "C" {
     fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
     #[link_name = "llvm.x86.avx2.psrlv.q.256"]
     fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
-    #[link_name = "llvm.x86.avx2.psubs.b"]
-    fn psubsb(a: i8x32, b: i8x32) -> i8x32;
-    #[link_name = "llvm.x86.avx2.psubs.w"]
-    fn psubsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.psubus.b"]
-    fn psubusb(a: u8x32, b: u8x32) -> u8x32;
-    #[link_name = "llvm.x86.avx2.psubus.w"]
-    fn psubusw(a: u16x16, b: u16x16) -> u16x16;
     #[link_name = "llvm.x86.avx2.pshuf.b"]
     fn pshufb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.permd"]
diff --git a/crates/core_arch/src/x86/bswap.rs b/crates/core_arch/src/x86/bswap.rs
index 20e3aa6fc3..fcaad26fb6 100644
--- a/crates/core_arch/src/x86/bswap.rs
+++ b/crates/core_arch/src/x86/bswap.rs
@@ -11,13 +11,7 @@ use stdarch_test::assert_instr;
 #[cfg_attr(test, assert_instr(bswap))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bswap(x: i32) -> i32 {
-    bswap_i32(x)
-}
-
-#[allow(improper_ctypes)]
-extern "C" {
-    #[link_name = "llvm.bswap.i32"]
-    fn bswap_i32(x: i32) -> i32;
+    x.swap_bytes()
 }
 
 #[cfg(test)]
diff --git a/crates/core_arch/src/x86/fma.rs b/crates/core_arch/src/x86/fma.rs
index 48abe9f49a..f3dda6d527 100644
--- a/crates/core_arch/src/x86/fma.rs
+++ b/crates/core_arch/src/x86/fma.rs
@@ -18,6 +18,7 @@
 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
 
+use crate::core_arch::simd_llvm::simd_fma;
 use crate::core_arch::x86::*;
 
 #[cfg(test)]
@@ -32,7 +33,7 @@ use stdarch_test::assert_instr;
 #[cfg_attr(test, assert_instr(vfmadd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
-    vfmaddpd(a, b, c)
+    simd_fma(a, b, c)
 }
 
 /// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -44,7 +45,7 @@ pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(vfmadd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
-    vfmaddpd256(a, b, c)
+    simd_fma(a, b, c)
 }
 
 /// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -56,7 +57,7 @@ pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vfmadd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
-    vfmaddps(a, b, c)
+    simd_fma(a, b, c)
 }
 
 /// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -68,7 +69,7 @@ pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(vfmadd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
-    vfmaddps256(a, b, c)
+    simd_fma(a, b, c)
 }
 
 /// Multiplies the lower double-precision (64-bit) floating-point elements in
@@ -439,14 +440,6 @@ pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
 
 #[allow(improper_ctypes)]
 extern "C" {
-    #[link_name = "llvm.x86.fma.vfmadd.pd"]
-    fn vfmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
-    #[link_name = "llvm.x86.fma.vfmadd.pd.256"]
-    fn vfmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
-    #[link_name = "llvm.x86.fma.vfmadd.ps"]
-    fn vfmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
-    #[link_name = "llvm.x86.fma.vfmadd.ps.256"]
-    fn vfmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
     #[link_name = "llvm.x86.fma.vfmadd.sd"]
     fn vfmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
     #[link_name = "llvm.x86.fma.vfmadd.ss"]
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index 68965ad93d..6719a84b45 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -351,6 +351,49 @@ mod test;
 #[cfg(test)]
 pub use self::test::*;
 
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "0")]
+pub(crate) trait m64Ext: Sized {
+    fn as_m64(self) -> __m64;
+
+    #[inline]
+    fn as_u8x8(self) -> crate::core_arch::simd::u8x8 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_u16x4(self) -> crate::core_arch::simd::u16x4 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_u32x2(self) -> crate::core_arch::simd::u32x2 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_i8x8(self) -> crate::core_arch::simd::i8x8 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_i16x4(self) -> crate::core_arch::simd::i16x4 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_i32x2(self) -> crate::core_arch::simd::i32x2 {
+        unsafe { transmute(self.as_m64()) }
+    }
+}
+
+impl m64Ext for __m64 {
+    #[inline]
+    fn as_m64(self) -> Self {
+        self
+    }
+}
+
 #[allow(non_camel_case_types)]
 #[unstable(feature = "stdsimd_internal", issue = "0")]
 pub(crate) trait m128iExt: Sized {
diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs
index 1dcb94ef3b..b07971c500 100644
--- a/crates/core_arch/src/x86/sse.rs
+++ b/crates/core_arch/src/x86/sse.rs
@@ -194,6 +194,7 @@ pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(minps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
+    // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmin`.
    minps(a, b)
 }
 
@@ -219,6 +220,7 @@ pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(maxps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
+    // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmax`.
    maxps(a, b)
 }
 
@@ -2618,6 +2620,21 @@ mod tests {
         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
         let r = _mm_min_ps(a, b);
         assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
+
+        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
+        // is lowered by the llvm codegen backend to the `llvm.minnum.v*` llvm intrinsic. This
+        // intrinsic doesn't specify how -0.0 is handled, and it happens to behave differently
+        // from the `minps` x86 instruction: under `llvm.minnum.v*` semantics, `r1` below would
+        // equal `a` and `r2` would equal `b`. A scalar model of this is sketched after the patch.
+        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
+        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
+        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
+        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
+        let a: [u8; 16] = transmute(a);
+        let b: [u8; 16] = transmute(b);
+        assert_eq!(r1, b);
+        assert_eq!(r2, a);
+        assert_ne!(a, b); // sanity check that -0.0 is actually present
     }
 
     #[simd_test(enable = "sse")]
diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs
index 5df06c8194..3e738e3352 100644
--- a/crates/core_arch/src/x86/sse2.rs
+++ b/crates/core_arch/src/x86/sse2.rs
@@ -121,7 +121,7 @@ pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(paddsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(paddsb(a.as_i8x16(), b.as_i8x16()))
+    transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16()))
 }
 
 /// Adds packed 16-bit integers in `a` and `b` using saturation.
@@ -132,7 +132,7 @@ pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(paddsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(paddsw(a.as_i16x8(), b.as_i16x8()))
+    transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8()))
 }
 
 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
@@ -143,7 +143,7 @@ pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(paddusb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(paddsub(a.as_u8x16(), b.as_u8x16()))
+    transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16()))
 }
 
 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
@@ -154,7 +154,7 @@ pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(paddusw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(paddsuw(a.as_u16x8(), b.as_u16x8()))
+    transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8()))
 }
 
 /// Averages packed unsigned 8-bit integers in `a` and `b`.
@@ -367,7 +367,7 @@ pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(psubsb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(psubsb(a.as_i8x16(), b.as_i8x16()))
+    transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16()))
 }
 
 /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
@@ -379,7 +379,7 @@ pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(psubsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(psubsw(a.as_i16x8(), b.as_i16x8()))
+    transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8()))
 }
 
 /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
@@ -391,7 +391,7 @@ pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(psubusb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
-    transmute(psubusb(a.as_u8x16(), b.as_u8x16()))
+    transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16()))
 }
 
 /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
@@ -403,7 +403,7 @@ pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(psubusw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(psubusw(a.as_u16x8(), b.as_u16x8()))
+    transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8()))
 }
 
 /// Shifts `a` left by `imm8` bytes while shifting in zeros.
@@ -1823,7 +1823,7 @@ pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(sqrtpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d {
-    sqrtpd(a)
+    simd_fsqrt(a)
 }
 
 /// Returns a new vector with the low element of `a` replaced by subtracting the
@@ -3021,14 +3021,6 @@ extern "C" {
     fn lfence();
     #[link_name = "llvm.x86.sse2.mfence"]
     fn mfence();
-    #[link_name = "llvm.x86.sse2.padds.b"]
-    fn paddsb(a: i8x16, b: i8x16) -> i8x16;
-    #[link_name = "llvm.x86.sse2.padds.w"]
-    fn paddsw(a: i16x8, b: i16x8) -> i16x8;
-    #[link_name = "llvm.x86.sse2.paddus.b"]
-    fn paddsub(a: u8x16, b: u8x16) -> u8x16;
-    #[link_name = "llvm.x86.sse2.paddus.w"]
-    fn paddsuw(a: u16x8, b: u16x8) -> u16x8;
     #[link_name = "llvm.x86.sse2.pavg.b"]
     fn pavgb(a: u8x16, b: u8x16) -> u8x16;
     #[link_name = "llvm.x86.sse2.pavg.w"]
@@ -3051,14 +3043,6 @@ extern "C" {
     fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
     #[link_name = "llvm.x86.sse2.psad.bw"]
     fn psadbw(a: u8x16, b: u8x16) -> u64x2;
-    #[link_name = "llvm.x86.sse2.psubs.b"]
-    fn psubsb(a: i8x16, b: i8x16) -> i8x16;
-    #[link_name = "llvm.x86.sse2.psubs.w"]
-    fn psubsw(a: i16x8, b: i16x8) -> i16x8;
-    #[link_name = "llvm.x86.sse2.psubus.b"]
-    fn psubusb(a: u8x16, b: u8x16) -> u8x16;
-    #[link_name = "llvm.x86.sse2.psubus.w"]
-    fn psubusw(a: u16x8, b: u16x8) -> u16x8;
     #[link_name = "llvm.x86.sse2.pslli.w"]
     fn pslliw(a: i16x8, imm8: i32) -> i16x8;
     #[link_name = "llvm.x86.sse2.psll.w"]
diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs
index f861629671..2b08c9bb1c 100644
--- a/crates/core_arch/src/x86/sse41.rs
+++ b/crates/core_arch/src/x86/sse41.rs
@@ -601,7 +601,7 @@ pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
 #[cfg_attr(test, assert_instr(roundpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
-    roundpd(a, _MM_FROUND_FLOOR)
+    simd_floor(a)
 }
 
 /// Round the packed single-precision (32-bit) floating-point elements in `a`
@@ -614,7 +614,7 @@ pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(roundps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
-    roundps(a, _MM_FROUND_FLOOR)
+    simd_floor(a)
 }
 
 /// Round the lower double-precision (64-bit) floating-point element in `b`
@@ -657,7 +657,7 @@ pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
 #[cfg_attr(test, assert_instr(roundpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
-    roundpd(a, _MM_FROUND_CEIL)
+    simd_ceil(a)
 }
 
 /// Round the packed single-precision (32-bit) floating-point elements in `a`
@@ -670,7 +670,7 @@ pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(roundps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
-    roundps(a, _MM_FROUND_CEIL)
+    simd_ceil(a)
 }
 
 /// Round the lower double-precision (64-bit) floating-point element in `b`
diff --git a/crates/core_arch/src/x86_64/bswap.rs b/crates/core_arch/src/x86_64/bswap.rs
index 9e8e76d4f7..90a209ce30 100644
--- a/crates/core_arch/src/x86_64/bswap.rs
+++ b/crates/core_arch/src/x86_64/bswap.rs
@@ -12,13 +12,7 @@ use stdarch_test::assert_instr;
 #[cfg_attr(test, assert_instr(bswap))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _bswap64(x: i64) -> i64 {
-    bswap_i64(x)
-}
-
-#[allow(improper_ctypes)]
-extern "C" {
-    #[link_name = "llvm.bswap.i64"]
-    fn bswap_i64(x: i64) -> i64;
+    x.swap_bytes()
 }
 
 #[cfg(test)]
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index a0bc0f7730..4e25d2a02d 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -90,7 +90,7 @@ pub fn assert(_fnptr: usize, fnname: &str, expected: &str) {
 
     // Look for `expected` as the first part of any instruction in this
     // function, e.g., tzcntl in tzcntl %rax,%rax.
-    let found = instrs.iter().any(|s| s.contains(expected));
+    let found = instrs.iter().any(|s| s.starts_with(expected));
 
     // Look for `call` instructions in the disassembly to detect whether
     // inlining failed: all intrinsics are `#[inline(always)]`, so