Use more simd_* intrinsics #790

Merged: 17 commits, Dec 18, 2019
21 changes: 21 additions & 0 deletions crates/core_arch/src/simd_llvm.rs
@@ -3,6 +3,7 @@
//! TODO: should use `link_llvm_intrinsic` instead: issue #112

extern "platform-intrinsic" {
+//pub fn simd_select_bitmask
pub fn simd_eq<T, U>(x: T, y: T) -> U;
pub fn simd_ne<T, U>(x: T, y: T) -> U;
pub fn simd_lt<T, U>(x: T, y: T) -> U;
@@ -27,6 +28,8 @@ extern "platform-intrinsic" {

pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
pub fn simd_extract<T, U>(x: T, idx: u32) -> U;
+//pub fn simd_select
+pub fn simd_bitmask<T, U>(x: T) -> U;

pub fn simd_cast<T, U>(x: T) -> U;

@@ -40,6 +43,12 @@ extern "platform-intrinsic" {
pub fn simd_or<T>(x: T, y: T) -> T;
pub fn simd_xor<T>(x: T, y: T) -> T;

+pub fn simd_saturating_add<T>(x: T, y: T) -> T;
+pub fn simd_saturating_sub<T>(x: T, y: T) -> T;
+
+pub fn simd_gather<T, U, V>(values: T, pointers: U, mask: V) -> T;
+pub fn simd_scatter<T, U, V>(values: T, pointers: U, mask: V);
+
pub fn simd_reduce_add_unordered<T, U>(x: T) -> U;
pub fn simd_reduce_mul_unordered<T, U>(x: T) -> U;
pub fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;
@@ -61,5 +70,17 @@ extern "platform-intrinsic" {
pub fn simd_fmax<T>(a: T, b: T) -> T;

pub fn simd_fsqrt<T>(a: T) -> T;
+pub fn simd_fsin<T>(a: T) -> T;
+pub fn simd_fcos<T>(a: T) -> T;
+pub fn simd_fabs<T>(a: T) -> T;
+pub fn simd_floor<T>(a: T) -> T;
+pub fn simd_ceil<T>(a: T) -> T;
+pub fn simd_fexp<T>(a: T) -> T;
+pub fn simd_fexp2<T>(a: T) -> T;
+pub fn simd_flog10<T>(a: T) -> T;
+pub fn simd_flog2<T>(a: T) -> T;
+pub fn simd_flog<T>(a: T) -> T;
+//pub fn simd_fpowi
+//pub fn simd_fpow
pub fn simd_fma<T>(a: T, b: T, c: T) -> T;
}
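
The saturating intrinsics added above do lane-wise what the stable scalar methods already do; a minimal scalar sketch of the semantics (plain Rust, not part of the diff):

fn main() {
    // Saturating arithmetic clamps at the type bounds instead of wrapping:
    assert_eq!(i8::max_value().saturating_add(1), 127);  // i8 clamps at MAX
    assert_eq!(i8::min_value().saturating_sub(1), -128); // and at MIN
    assert_eq!(250u8.saturating_add(10), 255);           // u8 clamps at 255
}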
28 changes: 9 additions & 19 deletions crates/core_arch/src/x86/avx.rs
@@ -255,7 +255,7 @@ pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
-maxpd256(a, b)
+simd_fmax(a, b)

Review comment (Contributor): Is the behavior of these the same, e.g., for subnormals, or when one argument contains NaN?

}
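
One way to probe the reviewer's question (a sketch, not part of the PR; assumes an AVX-capable x86_64 host, and the helper name is hypothetical): x86's `vmaxpd` is specified as `if a > b then a else b`, so it returns the second operand whenever a comparison involves NaN, and a replacement lowering has to preserve that asymmetry.

#[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
unsafe fn probe_max_nan_semantics() {
    use std::arch::x86_64::*;
    let nan = _mm256_set1_pd(std::f64::NAN);
    let one = _mm256_set1_pd(1.0);
    let mut out = [0.0f64; 4];
    // NaN > 1.0 is false, so vmaxpd returns the second operand:
    _mm256_storeu_pd(out.as_mut_ptr(), _mm256_max_pd(nan, one));
    assert!(out.iter().all(|x| *x == 1.0)); // max(NaN, 1.0) == 1.0
    // 1.0 > NaN is also false, so the NaN in the second operand wins:
    _mm256_storeu_pd(out.as_mut_ptr(), _mm256_max_pd(one, nan));
    assert!(out.iter().all(|x| x.is_nan())); // max(1.0, NaN) == NaN
}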

/// Compares packed single-precision (32-bit) floating-point elements in `a`
@@ -267,7 +267,7 @@ pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
-maxps256(a, b)
+simd_fmax(a, b)
}

/// Compares packed double-precision (64-bit) floating-point elements
@@ -279,7 +279,7 @@ pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
-minpd256(a, b)
+simd_fmin(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
@@ -291,7 +291,7 @@ pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
-minps256(a, b)
+simd_fmin(a, b)
}

/// Multiplies packed double-precision (64-bit) floating-point elements
@@ -426,7 +426,7 @@ pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d {
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
-roundpd256(a, 0x02)
+simd_ceil(a)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
@@ -438,7 +438,7 @@ pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
-roundpd256(a, 0x01)
+simd_floor(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
@@ -477,7 +477,7 @@ pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 {
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
-roundps256(a, 0x02)
+simd_ceil(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
@@ -489,7 +489,7 @@ pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
-roundps256(a, 0x01)
+simd_floor(a)
}
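
The immediates dropped in these hunks encode fixed rounding modes: `0x02` rounds toward positive infinity and `0x01` toward negative infinity, which is exactly what `simd_ceil` and `simd_floor` express. A scalar sketch of the equivalence:

fn main() {
    assert_eq!((-1.5f64).ceil(), -1.0);  // round toward +inf, the old 0x02 immediate
    assert_eq!((-1.5f64).floor(), -2.0); // round toward -inf, the old 0x01 immediate
}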

/// Returns the square root of packed single-precision (32-bit) floating point
Expand All @@ -513,7 +513,7 @@ pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
-sqrtpd256(a)
+simd_fsqrt(a)
}

/// Blends packed double-precision (64-bit) floating-point elements from
@@ -3166,20 +3166,10 @@ extern "C" {
fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.addsub.ps.256"]
fn addsubps256(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.max.pd.256"]
fn maxpd256(a: __m256d, b: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.max.ps.256"]
fn maxps256(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.min.pd.256"]
fn minpd256(a: __m256d, b: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.min.ps.256"]
fn minps256(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.round.pd.256"]
fn roundpd256(a: __m256d, b: i32) -> __m256d;
#[link_name = "llvm.x86.avx.round.ps.256"]
fn roundps256(a: __m256, b: i32) -> __m256;
#[link_name = "llvm.x86.avx.sqrt.pd.256"]
fn sqrtpd256(a: __m256d) -> __m256d;
#[link_name = "llvm.x86.avx.sqrt.ps.256"]
fn sqrtps256(a: __m256) -> __m256;
#[link_name = "llvm.x86.avx.blendv.pd.256"]
32 changes: 8 additions & 24 deletions crates/core_arch/src/x86/avx2.rs
@@ -111,7 +111,7 @@ pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpaddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
-transmute(paddsb(a.as_i8x32(), b.as_i8x32()))
+transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
@@ -122,7 +122,7 @@ pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
-transmute(paddsw(a.as_i16x16(), b.as_i16x16()))
+transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
@@ -133,7 +133,7 @@ pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpaddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
-transmute(paddusb(a.as_u8x32(), b.as_u8x32()))
+transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
@@ -144,7 +144,7 @@ pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpaddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
-transmute(paddusw(a.as_u16x16(), b.as_u16x16()))
+transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
}

/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
@@ -3331,7 +3331,7 @@ pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
-transmute(psubsw(a.as_i16x16(), b.as_i16x16()))
+transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
@@ -3343,7 +3343,7 @@ pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpsubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
-transmute(psubsb(a.as_i8x32(), b.as_i8x32()))
+transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
}

/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
@@ -3355,7 +3355,7 @@ pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpsubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
-transmute(psubusw(a.as_u16x16(), b.as_u16x16()))
+transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
}

/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
@@ -3367,7 +3367,7 @@ pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpsubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
-transmute(psubusb(a.as_u8x32(), b.as_u8x32()))
+transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
}

/// Unpacks and interleave 8-bit integers from the high half of each
@@ -3807,14 +3807,6 @@ extern "C" {
fn pabsw(a: i16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pabs.d"]
fn pabsd(a: i32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.padds.b"]
fn paddsb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.padds.w"]
fn paddsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.paddus.b"]
fn paddusb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.paddus.w"]
fn paddusw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pavg.b"]
fn pavgb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pavg.w"]
@@ -3959,14 +3951,6 @@ extern "C" {
fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
#[link_name = "llvm.x86.avx2.psrlv.q.256"]
fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx2.psubs.b"]
fn psubsb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.psubs.w"]
fn psubsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.psubus.b"]
fn psubusb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.psubus.w"]
fn psubusw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pshuf.b"]
fn pshufb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.permd"]
8 changes: 1 addition & 7 deletions crates/core_arch/src/x86/bswap.rs
@@ -11,13 +11,7 @@ use stdarch_test::assert_instr;
#[cfg_attr(test, assert_instr(bswap))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _bswap(x: i32) -> i32 {
-    bswap_i32(x)
-}
-
-#[allow(improper_ctypes)]
-extern "C" {
-    #[link_name = "llvm.bswap.i32"]
-    fn bswap_i32(x: i32) -> i32;
+    x.swap_bytes()
}
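
`i32::swap_bytes` lowers to the same single `bswap` instruction (the `assert_instr(bswap)` attribute above still checks this), so the extern declaration can go. A scalar sketch of the byte-reversal semantics:

fn main() {
    // Byte order is fully reversed: 0x12 0x34 0x56 0x78 -> 0x78 0x56 0x34 0x12.
    assert_eq!(0x1234_5678i32.swap_bytes(), 0x7856_3412);
}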

#[cfg(test)]
17 changes: 5 additions & 12 deletions crates/core_arch/src/x86/fma.rs
@@ -18,6 +18,7 @@
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

+use crate::core_arch::simd_llvm::simd_fma;
use crate::core_arch::x86::*;

#[cfg(test)]
@@ -32,7 +33,7 @@ use stdarch_test::assert_instr;
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
-vfmaddpd(a, b, c)
+simd_fma(a, b, c)
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -44,7 +45,7 @@ pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
-vfmaddpd256(a, b, c)
+simd_fma(a, b, c)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -56,7 +57,7 @@ pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
-vfmaddps(a, b, c)
+simd_fma(a, b, c)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -68,7 +69,7 @@ pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
-vfmaddps256(a, b, c)
+simd_fma(a, b, c)
}
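
`simd_fma` computes `a * b + c` with a single rounding, the property the vfmadd instructions provide. A scalar sketch of why that differs from multiply-then-add (using the stable `f64::mul_add`, which has the same fused semantics):

use std::f64::EPSILON;

fn main() {
    let a = 1.0f64 + EPSILON;        // 1 + 2^-52
    let c = -(1.0 + 2.0 * EPSILON);  // -(1 + 2^-51), exactly representable
    // Fused: the exact product 1 + 2^-51 + 2^-104 is kept until the final rounding.
    assert_eq!(a.mul_add(a, c), EPSILON * EPSILON);
    // Unfused: a * a rounds to 1 + 2^-51 first, so the 2^-104 tail is lost.
    assert_eq!(a * a + c, 0.0);
}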

/// Multiplies the lower double-precision (64-bit) floating-point elements in
@@ -439,14 +440,6 @@ pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {

#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.fma.vfmadd.pd"]
fn vfmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
#[link_name = "llvm.x86.fma.vfmadd.pd.256"]
fn vfmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
#[link_name = "llvm.x86.fma.vfmadd.ps"]
fn vfmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
#[link_name = "llvm.x86.fma.vfmadd.ps.256"]
fn vfmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
#[link_name = "llvm.x86.fma.vfmadd.sd"]
fn vfmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
#[link_name = "llvm.x86.fma.vfmadd.ss"]
43 changes: 43 additions & 0 deletions crates/core_arch/src/x86/mod.rs
@@ -351,6 +351,49 @@ mod test;
#[cfg(test)]
pub use self::test::*;

+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdimd_internal", issue = "0")]
+pub(crate) trait m64Ext: Sized {
+    fn as_m64(self) -> __m64;
+
+    #[inline]
+    fn as_u8x8(self) -> crate::core_arch::simd::u8x8 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_u16x4(self) -> crate::core_arch::simd::u16x4 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_u32x2(self) -> crate::core_arch::simd::u32x2 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_i8x8(self) -> crate::core_arch::simd::i8x8 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_i16x4(self) -> crate::core_arch::simd::i16x4 {
+        unsafe { transmute(self.as_m64()) }
+    }
+
+    #[inline]
+    fn as_i32x2(self) -> crate::core_arch::simd::i32x2 {
+        unsafe { transmute(self.as_m64()) }
+    }
+}
+
+impl m64Ext for __m64 {
+    #[inline]
+    fn as_m64(self) -> Self {
+        self
+    }
+}
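
A sketch of how these helpers read at a call site (hypothetical function, shown as it would appear inside `core_arch` where `transmute` and the portable lane types are in scope; existing MMX wrappers follow the same pattern):

#[inline]
unsafe fn lanes_roundtrip(a: __m64) -> __m64 {
    let lanes = a.as_i16x4(); // zero-cost view of the 64 bits as 4 i16 lanes
    transmute(lanes)          // lane-wise work would happen here; same bits back
}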

#[allow(non_camel_case_types)]
#[unstable(feature = "stdimd_internal", issue = "0")]
pub(crate) trait m128iExt: Sized {