From 86cb2a06682a8f533966813e8bf48f08515b356e Mon Sep 17 00:00:00 2001 From: Kristof Roomp Date: Mon, 13 May 2024 18:17:59 +0200 Subject: [PATCH] Added i16 widening mul since this is implemented badly on some platforms (#153) * add reduce min/max along with tests. Also optimize i16 abs for sse2 * mulwiden * better mulwiden * better mulwiden * fix test * add must_use and inline * fix spacing * remove unnecessary paren --- src/i16x8_.rs | 44 ++++++++++++++++++++++++++++++++++++++ tests/all_tests/t_i16x8.rs | 18 ++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/src/i16x8_.rs b/src/i16x8_.rs index 43404d2c..8b01d0b5 100644 --- a/src/i16x8_.rs +++ b/src/i16x8_.rs @@ -933,6 +933,50 @@ impl i16x8 { } } + /// multiplies two i16x8 and returns the result as a widened i32x8 + #[inline] + #[must_use] + pub fn mul_widen(self, rhs: Self) -> i32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + let a = convert_to_i32_m256i_from_i16_m128i(self.sse); + let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse); + i32x8 { avx2: mul_i32_keep_low_m256i(a,b) } + } else if #[cfg(target_feature="sse2")] { + let low = mul_i16_keep_low_m128i(self.sse, rhs.sse); + let high = mul_i16_keep_high_m128i(self.sse, rhs.sse); + i32x8 { + a: i32x4 { sse:unpack_low_i16_m128i(low, high) }, + b: i32x4 { sse:unpack_high_i16_m128i(low, high) } + } + } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { + let lhs_low = unsafe { vget_low_s16(self.neon) }; + let rhs_low = unsafe { vget_low_s16(rhs.neon) }; + + let lhs_high = unsafe { vget_high_s16(self.neon) }; + let rhs_high = unsafe { vget_high_s16(rhs.neon) }; + + let low = unsafe { vmull_s16(lhs_low, rhs_low) }; + let high = unsafe { vmull_s16(lhs_high, rhs_high) }; + + i32x8 { a: i32x4 { neon: low }, b: i32x4 {neon: high } } + } else { + let a = self.as_array_ref(); + let b = rhs.as_array_ref(); + i32x8::new([ + i32::from(a[0]) * i32::from(b[0]), + i32::from(a[1]) * i32::from(b[1]), + i32::from(a[2]) * i32::from(b[2]), + 
i32::from(a[3]) * i32::from(b[3]), + i32::from(a[4]) * i32::from(b[4]), + i32::from(a[5]) * i32::from(b[5]), + i32::from(a[6]) * i32::from(b[6]), + i32::from(a[7]) * i32::from(b[7]), + ]) + } + } + } + /// transpose matrix of 8x8 i16 matrix #[must_use] #[inline] diff --git a/tests/all_tests/t_i16x8.rs b/tests/all_tests/t_i16x8.rs index b69f11d0..8ed17944 100644 --- a/tests/all_tests/t_i16x8.rs +++ b/tests/all_tests/t_i16x8.rs @@ -361,3 +361,21 @@ fn impl_mul_keep_high() { let c: [i16; 8] = i16x8::mul_keep_high(a, b).into(); assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]); } + +#[test] +fn impl_i16x8_mul_widen() { + let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN, i16::MAX]); + let b = i16x8::from([17, -18, 190, -20, 21, -22, i16::MAX, i16::MAX]); + let expected = i32x8::from([ + 17, + -36, + 570, + -80, + 105, + -132, + (i16::MIN as i32) * (i16::MAX as i32), + (i16::MAX as i32) * (i16::MAX as i32), + ]); + let actual = a.mul_widen(b); + assert_eq!(expected, actual); +}