Skip to content

Commit 244fc32

Browse files
authored
Merge pull request rust-lang#2026 from folkertdev/neon-vpadd
use `intrinsics::simd` for vpadd
2 parents 9ce6076 + 6671f95 commit 244fc32

File tree

3 files changed

+102
-168
lines changed

3 files changed

+102
-168
lines changed

library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs

Lines changed: 40 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -16025,14 +16025,11 @@ pub fn vpaddd_u64(a: uint64x2_t) -> u64 {
1602516025
#[cfg(not(target_arch = "arm64ec"))]
1602616026
#[cfg_attr(test, assert_instr(faddp))]
1602716027
pub fn vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
16028-
unsafe extern "unadjusted" {
16029-
#[cfg_attr(
16030-
any(target_arch = "aarch64", target_arch = "arm64ec"),
16031-
link_name = "llvm.aarch64.neon.faddp.v8f16"
16032-
)]
16033-
fn _vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t;
16028+
unsafe {
16029+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
16030+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
16031+
simd_add(even, odd)
1603416032
}
16035-
unsafe { _vpaddq_f16(a, b) }
1603616033
}
1603716034
#[doc = "Floating-point add pairwise"]
1603816035
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f32)"]
@@ -16041,14 +16038,11 @@ pub fn vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
1604116038
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1604216039
#[cfg_attr(test, assert_instr(faddp))]
1604316040
pub fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
16044-
unsafe extern "unadjusted" {
16045-
#[cfg_attr(
16046-
any(target_arch = "aarch64", target_arch = "arm64ec"),
16047-
link_name = "llvm.aarch64.neon.faddp.v4f32"
16048-
)]
16049-
fn _vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
16041+
unsafe {
16042+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
16043+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
16044+
simd_add(even, odd)
1605016045
}
16051-
unsafe { _vpaddq_f32(a, b) }
1605216046
}
1605316047
#[doc = "Floating-point add pairwise"]
1605416048
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f64)"]
@@ -16057,14 +16051,11 @@ pub fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
1605716051
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1605816052
#[cfg_attr(test, assert_instr(faddp))]
1605916053
pub fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
16060-
unsafe extern "unadjusted" {
16061-
#[cfg_attr(
16062-
any(target_arch = "aarch64", target_arch = "arm64ec"),
16063-
link_name = "llvm.aarch64.neon.faddp.v2f64"
16064-
)]
16065-
fn _vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
16054+
unsafe {
16055+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
16056+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
16057+
simd_add(even, odd)
1606616058
}
16067-
unsafe { _vpaddq_f64(a, b) }
1606816059
}
1606916060
#[doc = "Add Pairwise"]
1607016061
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s8)"]
@@ -16073,14 +16064,11 @@ pub fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
1607316064
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1607416065
#[cfg_attr(test, assert_instr(addp))]
1607516066
pub fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
16076-
unsafe extern "unadjusted" {
16077-
#[cfg_attr(
16078-
any(target_arch = "aarch64", target_arch = "arm64ec"),
16079-
link_name = "llvm.aarch64.neon.addp.v16i8"
16080-
)]
16081-
fn _vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
16067+
unsafe {
16068+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<16>());
16069+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<16>());
16070+
simd_add(even, odd)
1608216071
}
16083-
unsafe { _vpaddq_s8(a, b) }
1608416072
}
1608516073
#[doc = "Add Pairwise"]
1608616074
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s16)"]
@@ -16089,14 +16077,11 @@ pub fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
1608916077
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1609016078
#[cfg_attr(test, assert_instr(addp))]
1609116079
pub fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
16092-
unsafe extern "unadjusted" {
16093-
#[cfg_attr(
16094-
any(target_arch = "aarch64", target_arch = "arm64ec"),
16095-
link_name = "llvm.aarch64.neon.addp.v8i16"
16096-
)]
16097-
fn _vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
16080+
unsafe {
16081+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
16082+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
16083+
simd_add(even, odd)
1609816084
}
16099-
unsafe { _vpaddq_s16(a, b) }
1610016085
}
1610116086
#[doc = "Add Pairwise"]
1610216087
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s32)"]
@@ -16105,14 +16090,11 @@ pub fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
1610516090
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1610616091
#[cfg_attr(test, assert_instr(addp))]
1610716092
pub fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
16108-
unsafe extern "unadjusted" {
16109-
#[cfg_attr(
16110-
any(target_arch = "aarch64", target_arch = "arm64ec"),
16111-
link_name = "llvm.aarch64.neon.addp.v4i32"
16112-
)]
16113-
fn _vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
16093+
unsafe {
16094+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
16095+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
16096+
simd_add(even, odd)
1611416097
}
16115-
unsafe { _vpaddq_s32(a, b) }
1611616098
}
1611716099
#[doc = "Add Pairwise"]
1611816100
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s64)"]
@@ -16121,119 +16103,62 @@ pub fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
1612116103
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1612216104
#[cfg_attr(test, assert_instr(addp))]
1612316105
pub fn vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
16124-
unsafe extern "unadjusted" {
16125-
#[cfg_attr(
16126-
any(target_arch = "aarch64", target_arch = "arm64ec"),
16127-
link_name = "llvm.aarch64.neon.addp.v2i64"
16128-
)]
16129-
fn _vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
16106+
unsafe {
16107+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
16108+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
16109+
simd_add(even, odd)
1613016110
}
16131-
unsafe { _vpaddq_s64(a, b) }
1613216111
}
1613316112
#[doc = "Add Pairwise"]
1613416113
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
1613516114
#[inline(always)]
16136-
#[cfg(target_endian = "little")]
16137-
#[target_feature(enable = "neon")]
16138-
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
16139-
#[cfg_attr(test, assert_instr(addp))]
16140-
pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
16141-
unsafe { transmute(vpaddq_s8(transmute(a), transmute(b))) }
16142-
}
16143-
#[doc = "Add Pairwise"]
16144-
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
16145-
#[inline(always)]
16146-
#[cfg(target_endian = "big")]
1614716115
#[target_feature(enable = "neon")]
1614816116
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1614916117
#[cfg_attr(test, assert_instr(addp))]
1615016118
pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
16151-
let a: uint8x16_t =
16152-
unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) };
16153-
let b: uint8x16_t =
16154-
unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) };
1615516119
unsafe {
16156-
let ret_val: uint8x16_t = transmute(vpaddq_s8(transmute(a), transmute(b)));
16157-
simd_shuffle!(
16158-
ret_val,
16159-
ret_val,
16160-
[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
16161-
)
16120+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<16>());
16121+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<16>());
16122+
simd_add(even, odd)
1616216123
}
1616316124
}
1616416125
#[doc = "Add Pairwise"]
1616516126
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"]
1616616127
#[inline(always)]
16167-
#[cfg(target_endian = "little")]
16168-
#[target_feature(enable = "neon")]
16169-
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
16170-
#[cfg_attr(test, assert_instr(addp))]
16171-
pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
16172-
unsafe { transmute(vpaddq_s16(transmute(a), transmute(b))) }
16173-
}
16174-
#[doc = "Add Pairwise"]
16175-
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"]
16176-
#[inline(always)]
16177-
#[cfg(target_endian = "big")]
1617816128
#[target_feature(enable = "neon")]
1617916129
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1618016130
#[cfg_attr(test, assert_instr(addp))]
1618116131
pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
16182-
let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) };
16183-
let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) };
1618416132
unsafe {
16185-
let ret_val: uint16x8_t = transmute(vpaddq_s16(transmute(a), transmute(b)));
16186-
simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0])
16133+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
16134+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
16135+
simd_add(even, odd)
1618716136
}
1618816137
}
1618916138
#[doc = "Add Pairwise"]
1619016139
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"]
1619116140
#[inline(always)]
16192-
#[cfg(target_endian = "little")]
16193-
#[target_feature(enable = "neon")]
16194-
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
16195-
#[cfg_attr(test, assert_instr(addp))]
16196-
pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
16197-
unsafe { transmute(vpaddq_s32(transmute(a), transmute(b))) }
16198-
}
16199-
#[doc = "Add Pairwise"]
16200-
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"]
16201-
#[inline(always)]
16202-
#[cfg(target_endian = "big")]
1620316141
#[target_feature(enable = "neon")]
1620416142
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1620516143
#[cfg_attr(test, assert_instr(addp))]
1620616144
pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
16207-
let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) };
16208-
let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [3, 2, 1, 0]) };
1620916145
unsafe {
16210-
let ret_val: uint32x4_t = transmute(vpaddq_s32(transmute(a), transmute(b)));
16211-
simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0])
16146+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
16147+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
16148+
simd_add(even, odd)
1621216149
}
1621316150
}
1621416151
#[doc = "Add Pairwise"]
1621516152
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"]
1621616153
#[inline(always)]
16217-
#[cfg(target_endian = "little")]
16218-
#[target_feature(enable = "neon")]
16219-
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
16220-
#[cfg_attr(test, assert_instr(addp))]
16221-
pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
16222-
unsafe { transmute(vpaddq_s64(transmute(a), transmute(b))) }
16223-
}
16224-
#[doc = "Add Pairwise"]
16225-
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"]
16226-
#[inline(always)]
16227-
#[cfg(target_endian = "big")]
1622816154
#[target_feature(enable = "neon")]
1622916155
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
1623016156
#[cfg_attr(test, assert_instr(addp))]
1623116157
pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
16232-
let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) };
16233-
let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [1, 0]) };
1623416158
unsafe {
16235-
let ret_val: uint64x2_t = transmute(vpaddq_s64(transmute(a), transmute(b)));
16236-
simd_shuffle!(ret_val, ret_val, [1, 0])
16159+
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
16160+
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
16161+
simd_add(even, odd)
1623716162
}
1623816163
}
1623916164
#[doc = "Floating-point add pairwise"]

library/stdarch/crates/core_arch/src/macros.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,31 @@ macro_rules! simd_masked_store {
187187
};
188188
}
189189

190+
/// The first N even indices `[0, 2, 4, ...]`.
191+
pub(crate) const fn even<const N: usize>() -> [u32; N] {
192+
let mut out = [0u32; N];
193+
let mut i = 0usize;
194+
while i < N {
195+
out[i] = (2 * i) as u32;
196+
i += 1;
197+
}
198+
out
199+
}
200+
201+
/// The first N odd indices `[1, 3, 5, ...]`.
202+
pub(crate) const fn odd<const N: usize>() -> [u32; N] {
203+
let mut out = [0u32; N];
204+
let mut i = 0usize;
205+
while i < N {
206+
out[i] = (2 * i + 1) as u32;
207+
i += 1;
208+
}
209+
out
210+
}
211+
212+
/// Multiples of N offset by K `[K, K+N, K+2N, ...]`.
190213
pub(crate) const fn deinterleave_mask<const LANES: usize, const N: usize, const K: usize>()
191214
-> [u32; LANES] {
192-
// Produces: [K, K+N, K+2N, ...]
193215
let mut out = [0u32; LANES];
194216
let mut i = 0usize;
195217
while i < LANES {

0 commit comments

Comments
 (0)