Added widening multiply for u16x8, u32x4, u32x8, i32x4 and i32x8 #182

Merged · 16 commits · Nov 19, 2024
39 changes: 39 additions & 0 deletions src/i32x4_.rs
@@ -490,6 +490,45 @@ impl i32x4 {
}
}
}

#[inline]
#[must_use]
pub fn mul_widen(self, rhs: Self) -> i64x4 {
// todo: WASM simd128, but not sure it would really be faster
// than what the compiler comes up with.
pick! {
if #[cfg(target_feature="avx2")] {
let a = convert_to_i64_m256i_from_i32_m128i(self.sse);
let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse);
cast(mul_i64_low_bits_m256i(a, b))
} else if #[cfg(target_feature="sse4.1")] {
let evenp = mul_widen_i32_odd_m128i(self.sse, rhs.sse);

let oddp = mul_widen_i32_odd_m128i(
shr_imm_u64_m128i::<32>(self.sse),
shr_imm_u64_m128i::<32>(rhs.sse));

i64x4 {
a: i64x2 { sse: unpack_low_i64_m128i(evenp, oddp)},
b: i64x2 { sse: unpack_high_i64_m128i(evenp, oddp)}}
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {
i64x4 { a: i64x2 { neon: vmull_s32(vget_low_s32(self.neon), vget_low_s32(rhs.neon)) },
b: i64x2 { neon: vmull_s32(vget_high_s32(self.neon), vget_high_s32(rhs.neon)) } }
}
} else {
let a: [i32; 4] = cast(self);
let b: [i32; 4] = cast(rhs);
cast([
i64::from(a[0]) * i64::from(b[0]),
i64::from(a[1]) * i64::from(b[1]),
i64::from(a[2]) * i64::from(b[2]),
i64::from(a[3]) * i64::from(b[3]),
])
}
}
}

#[inline]
#[must_use]
pub fn abs(self) -> Self {
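For orientation (an editorial sketch, not part of this PR; the lane values are made up), the new i32x4::mul_widen returns full 64-bit products, so multiplications that would overflow an i32 come out exact:

let a = i32x4::from([100_000, -200_000, 300_000, 1]);
let b = i32x4::from([100_000, 100_000, 100_000, -1]);
// every lane but the last needs more than 32 bits for its product
assert_eq!(
  a.mul_widen(b),
  i64x4::from([10_000_000_000, -20_000_000_000, 30_000_000_000, -1])
);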
4 changes: 2 additions & 2 deletions src/i64x4_.rs
@@ -4,11 +4,11 @@ pick! {
if #[cfg(target_feature="avx2")] {
#[derive(Default, Clone, Copy, PartialEq, Eq)]
#[repr(C, align(32))]
pub struct i64x4 { avx2: m256i }
pub struct i64x4 { pub(crate) avx2: m256i }
} else {
#[derive(Default, Clone, Copy, PartialEq, Eq)]
#[repr(C, align(32))]
pub struct i64x4 { a : i64x2, b : i64x2 }
pub struct i64x4 { pub(crate) a : i64x2, pub(crate) b : i64x2 }
}
}

38 changes: 38 additions & 0 deletions src/u16x8_.rs
@@ -591,6 +591,44 @@ impl u16x8 {
}
}

  /// Multiplies two `u16x8` and returns the high part of the intermediate `u32x8`.
#[inline]
#[must_use]
pub fn mul_keep_high(self, rhs: Self) -> Self {
pick! {
if #[cfg(target_feature="sse2")] {
Self { sse: mul_u16_keep_high_m128i(self.sse, rhs.sse) }
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
let lhs_low = unsafe { vget_low_u16(self.neon) };
let rhs_low = unsafe { vget_low_u16(rhs.neon) };

let lhs_high = unsafe { vget_high_u16(self.neon) };
let rhs_high = unsafe { vget_high_u16(rhs.neon) };

let low = unsafe { vmull_u16(lhs_low, rhs_low) };
let high = unsafe { vmull_u16(lhs_high, rhs_high) };

u16x8 { neon: unsafe { vuzpq_u16(vreinterpretq_u16_u32(low), vreinterpretq_u16_u32(high)).1 } }
} else if #[cfg(target_feature="simd128")] {
let low = u32x4_extmul_low_u16x8(self.simd, rhs.simd);
let high = u32x4_extmul_high_u16x8(self.simd, rhs.simd);

Self { simd: u16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) }
} else {
u16x8::new([
((u32::from(rhs.as_array_ref()[0]) * u32::from(self.as_array_ref()[0])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[1]) * u32::from(self.as_array_ref()[1])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[2]) * u32::from(self.as_array_ref()[2])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[3]) * u32::from(self.as_array_ref()[3])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[4]) * u32::from(self.as_array_ref()[4])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[5]) * u32::from(self.as_array_ref()[5])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[6]) * u32::from(self.as_array_ref()[6])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[7]) * u32::from(self.as_array_ref()[7])) >> 16) as u16,
])
}
}
}

#[inline]
pub fn to_array(self) -> [u16; 8] {
cast(self)
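A quick aside (a hedged sketch, not part of the diff; the values are made up): u16x8::mul_keep_high is effectively a Q0.16 fixed-point multiply, computing (a * b) >> 16 per lane. Scaling by 0.25, expressed as 16384/65536:

let x = u16x8::from([4, 400, 65535, 1000, 0, 1, 2, 3]);
let quarter = u16x8::from([16384; 8]); // 0.25 in Q0.16 fixed point
let scaled: [u16; 8] = x.mul_keep_high(quarter).into();
assert_eq!(scaled, [1, 100, 16383, 250, 0, 0, 0, 0]);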
95 changes: 94 additions & 1 deletion src/u32x4_.rs
@@ -431,7 +431,7 @@ impl u32x4 {
Self { sse: cmp_gt_mask_i32_m128i((self ^ h).sse, (rhs ^ h).sse) }
} else if #[cfg(target_feature="simd128")] {
Self { simd: u32x4_gt(self.simd, rhs.simd) }
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {Self { neon: vcgtq_u32(self.neon, rhs.neon) }}
} else {
Self { arr: [
@@ -450,6 +450,99 @@ impl u32x4 {
rhs.cmp_gt(self)
}

  /// Multiplies 32x32 bits to a 64-bit intermediate and then keeps only the high 32 bits of the result.
  /// Useful for implementing division by a constant value (see the t_usefulness example).
#[inline]
#[must_use]
pub fn mul_keep_high(self, rhs: Self) -> Self {
// todo: WASM simd128, but not sure it would really be faster
// than what the compiler comes up with.

pick! {
if #[cfg(target_feature="avx2")] {
let a = convert_to_i64_m256i_from_u32_m128i(self.sse);
let b = convert_to_i64_m256i_from_u32_m128i(rhs.sse);
let r = mul_u64_low_bits_m256i(a, b);

// the compiler does a good job shuffling the lanes around
let b : [u32;8] = cast(r);
cast([b[1],b[3],b[5],b[7]])
} else if #[cfg(target_feature="sse2")] {
let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);

let oddp = mul_widen_u32_odd_m128i(
shr_imm_u64_m128i::<32>(self.sse),
shr_imm_u64_m128i::<32>(rhs.sse));

// the compiler does a good job shuffling the lanes around
let a : [u32;4]= cast(evenp);
let b : [u32;4]= cast(oddp);
cast([a[1],b[1],a[3],b[3]])

} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {
let l = vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon));
let h = vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon));
u32x4 { neon: vcombine_u32(vshrn_n_u64(l,32), vshrn_n_u64(h,32)) }
}
} else {
let a: [u32; 4] = cast(self);
let b: [u32; 4] = cast(rhs);
cast([
((u64::from(a[0]) * u64::from(b[0])) >> 32) as u32,
((u64::from(a[1]) * u64::from(b[1])) >> 32) as u32,
((u64::from(a[2]) * u64::from(b[2])) >> 32) as u32,
((u64::from(a[3]) * u64::from(b[3])) >> 32) as u32,
])
}
}
}

  /// Multiplies corresponding 32-bit lanes and returns the 64-bit results
  /// in the corresponding lanes.
  ///
  /// Effectively does two multiplies on 128-bit platforms, but is easier
  /// to use than the even/odd widening intrinsics, and runs fast on AVX2.
#[inline]
#[must_use]
pub fn mul_widen(self, rhs: Self) -> u64x4 {
// todo: WASM simd128, but not sure it would really be faster
// than what the compiler comes up with.

pick! {
if #[cfg(target_feature="avx2")] {
        // ok to sign extend since the multiply only reads the low 32 bits of each 64-bit lane anyway
let a = convert_to_i64_m256i_from_i32_m128i(self.sse);
let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse);
cast(mul_u64_low_bits_m256i(a, b))
} else if #[cfg(target_feature="sse2")] {
let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);

let oddp = mul_widen_u32_odd_m128i(
shr_imm_u64_m128i::<32>(self.sse),
shr_imm_u64_m128i::<32>(rhs.sse));

u64x4 {
a: u64x2 { sse: unpack_low_i64_m128i(evenp, oddp)},
b: u64x2 { sse: unpack_high_i64_m128i(evenp, oddp)}}
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {
u64x4 { a: u64x2 { neon: vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon)) },
b: u64x2 { neon: vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon)) } }
}
} else {
let a: [u32; 4] = cast(self);
let b: [u32; 4] = cast(rhs);
cast([
u64::from(a[0]) * u64::from(b[0]),
u64::from(a[1]) * u64::from(b[1]),
u64::from(a[2]) * u64::from(b[2]),
u64::from(a[3]) * u64::from(b[3]),
])
}
}
}

#[inline]
#[must_use]
pub fn blend(self, t: Self, f: Self) -> Self {
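The doc comment above mentions division by a constant; a minimal sketch of that trick (not from this PR, and assuming wide's scalar >> operator for u32x4): dividing every lane by 3 uses the magic constant 0xAAAA_AAAB = ceil(2^33 / 3), so (n * magic) >> 33 equals n / 3, done here as mul_keep_high (which drops 32 bits) followed by >> 1:

let n = u32x4::from([7, 99, 1_000_000, u32::MAX]);
let magic = u32x4::from([0xAAAA_AAABu32; 4]);
let q = n.mul_keep_high(magic) >> 1;
assert_eq!(q, u32x4::from([2, 33, 333_333, u32::MAX / 3]));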
24 changes: 24 additions & 0 deletions src/u32x8_.rs
@@ -301,6 +301,30 @@ impl u32x8 {
rhs.cmp_gt(self)
}

  /// Multiplies 32x32 bits to a 64-bit intermediate and then keeps only the high 32 bits of the result.
  /// Useful for implementing division by a constant value (see the t_usefulness example).
#[inline]
#[must_use]
pub fn mul_keep_high(self, rhs: u32x8) -> u32x8 {
pick! {
if #[cfg(target_feature="avx2")] {
let a : [u32;8]= cast(self);
let b : [u32;8]= cast(rhs);

// let the compiler shuffle the values around, it does the right thing
let r1 : [u32;8] = cast(mul_u64_low_bits_m256i(cast([a[0], 0, a[1], 0, a[2], 0, a[3], 0]), cast([b[0], 0, b[1], 0, b[2], 0, b[3], 0])));
let r2 : [u32;8] = cast(mul_u64_low_bits_m256i(cast([a[4], 0, a[5], 0, a[6], 0, a[7], 0]), cast([b[4], 0, b[5], 0, b[6], 0, b[7], 0])));

cast([r1[1], r1[3], r1[5], r1[7], r2[1], r2[3], r2[5], r2[7]])
} else {
Self {
a : self.a.mul_keep_high(rhs.a),
b : self.b.mul_keep_high(rhs.b),
}
}
}
}

#[inline]
#[must_use]
pub fn blend(self, t: Self, f: Self) -> Self {
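The 8-lane version keeps the same per-lane semantics (an editorial sketch with made-up values, assuming u32x8::to_array exists as it does for the other types):

let a = u32x8::from([1, 2, 3, 4, 5, 6, 7, 3_000_000_000]);
let b = u32x8::from([8, 7, 6, 5, 4, 3, 2, 3_000_000_000]);
let hi = a.mul_keep_high(b).to_array();
assert_eq!(hi[0], 0); // 1 * 8 fits in 32 bits, so the high half is 0
assert_eq!(hi[7], ((3_000_000_000u64 * 3_000_000_000u64) >> 32) as u32);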
23 changes: 20 additions & 3 deletions tests/all_tests/t_i16x8.rs
@@ -369,10 +369,27 @@ fn impl_i16x8_reduce_max() {

#[test]
fn impl_mul_keep_high() {
let a = i16x8::from([1, 200, 300, 4568, -1, -2, -3, -4]);
let b = i16x8::from([5, 600, 700, 8910, -15, -26, -37, 48]);
let a = i16x8::from([i16::MAX, 200, 300, 4568, -1, -2, -3, -4]);
let b = i16x8::from([i16::MIN, 600, 700, 8910, -15, -26, -37, 48]);
let c: [i16; 8] = i16x8::mul_keep_high(a, b).into();
assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]);
assert_eq!(
c,
[
(i32::from(i16::MAX) * i32::from(i16::MIN) >> 16) as i16,
1,
3,
621,
0,
0,
0,
-1
]
);

crate::test_random_vector_vs_scalar(
|a: i16x8, b| i16x8::mul_keep_high(a, b),
|a, b| ((i32::from(a) * i32::from(b)) >> 16) as i16,
);
}

#[test]
19 changes: 19 additions & 0 deletions tests/all_tests/t_i32x4.rs
@@ -266,3 +266,22 @@ fn impl_i32x4_shl_each() {
|a, b| a.wrapping_shl(b as u32),
);
}

#[test]
fn impl_i32x4_mul_widen() {
let a = i32x4::from([1, 2, 3 * -1000000, i32::MAX]);
let b = i32x4::from([5, 6, 7 * -1000000, i32::MIN]);
let expected = i64x4::from([
1 * 5,
2 * 6,
3 * 7 * 1000000 * 1000000,
i32::MIN as i64 * i32::MAX as i64,
]);
let actual = a.mul_widen(b);
assert_eq!(expected, actual);

crate::test_random_vector_vs_scalar(
|a: i32x4, b| a.mul_widen(b),
|a, b| a as i64 * b as i64,
);
}
25 changes: 25 additions & 0 deletions tests/all_tests/t_u16x8.rs
@@ -218,6 +218,31 @@ fn impl_u16x8_from_u8x16_high() {
assert_eq!(expected, actual);
}

#[test]
fn impl_u16x8_mul_keep_high() {
let a = u16x8::from([u16::MAX, 200, 300, 4568, 1, 2, 3, 200]);
let b = u16x8::from([u16::MAX, 600, 700, 8910, 15, 26, 37, 600]);
let c: [u16; 8] = u16x8::mul_keep_high(a, b).into();
assert_eq!(
c,
[
(u32::from(u16::MAX) * u32::from(u16::MAX) >> 16) as u16,
1,
3,
621,
0,
0,
0,
1
]
);

crate::test_random_vector_vs_scalar(
|a: u16x8, b| u16x8::mul_keep_high(a, b),
|a, b| ((u32::from(a) * u32::from(b)) >> 16) as u16,
);
}

#[test]
fn impl_u16x8_mul_widen() {
let a = u16x8::from([1, 2, 3, 4, 5, 6, i16::MAX as u16, u16::MAX]);
40 changes: 40 additions & 0 deletions tests/all_tests/t_u32x4.rs
@@ -1,4 +1,5 @@
use std::num::Wrapping;

use wide::*;

#[test]
@@ -234,3 +235,42 @@ fn test_u32x4_none() {
let a = u32x4::from([0; 4]);
assert!(a.none());
}

#[test]
fn impl_u32x4_mul_widen() {
let a = u32x4::from([1, 2, 3 * 1000000, u32::MAX]);
let b = u32x4::from([5, 6, 7 * 1000000, u32::MAX]);
let expected = u64x4::from([
1 * 5,
2 * 6,
3 * 7 * 1000000 * 1000000,
u32::MAX as u64 * u32::MAX as u64,
]);
let actual = a.mul_widen(b);
assert_eq!(expected, actual);

crate::test_random_vector_vs_scalar(
|a: u32x4, b| a.mul_widen(b),
|a, b| u64::from(a) * u64::from(b),
);
}

#[test]
fn impl_u32x4_mul_keep_high() {
let mul_high = |a: u32, b: u32| ((u64::from(a) * u64::from(b)) >> 32) as u32;
let a = u32x4::from([1, 2 * 10000000, 3 * 1000000, u32::MAX]);
let b = u32x4::from([5, 6 * 100, 7 * 1000000, u32::MAX]);
let expected = u32x4::from([
mul_high(1, 5),
mul_high(2 * 10000000, 6 * 100),
mul_high(3 * 1000000, 7 * 1000000),
mul_high(u32::MAX, u32::MAX),
]);
let actual = a.mul_keep_high(b);
assert_eq!(expected, actual);

crate::test_random_vector_vs_scalar(
|a: u32x4, b| a.mul_keep_high(b),
|a, b| ((u64::from(a) * u64::from(b)) >> 32) as u32,
);
}
8 changes: 8 additions & 0 deletions tests/all_tests/t_u32x8.rs
@@ -295,3 +295,11 @@ fn test_u32x8_none() {
let a = u32x8::from([0; 8]);
assert!(a.none());
}

#[test]
fn impl_u32x8_mul_keep_high() {
crate::test_random_vector_vs_scalar(
|a: u32x8, b| u32x8::mul_keep_high(a, b),
|a, b| ((u64::from(a) * u64::from(b)) >> 32) as u32,
);
}