From 86cb2a06682a8f533966813e8bf48f08515b356e Mon Sep 17 00:00:00 2001
From: Kristof Roomp <kristofr@gmail.com>
Date: Mon, 13 May 2024 18:17:59 +0200
Subject: [PATCH] Added i16 widening mul since this is implemented badly on
 some platforms (#153)

* add reduce min/max along with tests. Also optimize i16 abs for sse2

* mulwiden

* better mulwiden

* better mulwiden

* fix test

* add must_use and inline

* fix spacing

* remove unnecessary paren
---
 src/i16x8_.rs              | 44 ++++++++++++++++++++++++++++++++++++++
 tests/all_tests/t_i16x8.rs | 18 ++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/src/i16x8_.rs b/src/i16x8_.rs
index 43404d2c..8b01d0b5 100644
--- a/src/i16x8_.rs
+++ b/src/i16x8_.rs
@@ -933,6 +933,50 @@ impl i16x8 {
     }
   }
 
+  /// Lane-wise multiply that widens every `i16` product to `i32`, so none overflow.
+  #[inline]
+  #[must_use]
+  pub fn mul_widen(self, rhs: Self) -> i32x8 {
+    pick! {
+      if #[cfg(target_feature="avx2")] {
+        let a = convert_to_i32_m256i_from_i16_m128i(self.sse); // i16 lanes -> i32 lanes
+        let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse); // i16 lanes -> i32 lanes
+        i32x8 { avx2: mul_i32_keep_low_m256i(a,b) } // an i16*i16 product always fits in the low 32 bits
+      } else if #[cfg(target_feature="sse2")] {
+         let low = mul_i16_keep_low_m128i(self.sse, rhs.sse); // low 16 bits of each product
+         let high = mul_i16_keep_high_m128i(self.sse, rhs.sse); // high 16 bits of each product
+         i32x8 {
+          a: i32x4 { sse:unpack_low_i16_m128i(low, high) }, // pair low/high halves for lanes 0..=3
+          b: i32x4 { sse:unpack_high_i16_m128i(low, high) } // pair low/high halves for lanes 4..=7
+        }
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
+         let lhs_low = unsafe { vget_low_s16(self.neon) }; // lanes 0..=3 of self
+         let rhs_low = unsafe { vget_low_s16(rhs.neon) }; // lanes 0..=3 of rhs
+
+         let lhs_high = unsafe { vget_high_s16(self.neon) }; // lanes 4..=7 of self
+         let rhs_high = unsafe { vget_high_s16(rhs.neon) }; // lanes 4..=7 of rhs
+
+         let low = unsafe { vmull_s16(lhs_low, rhs_low) }; // widening multiply: i16 x i16 -> i32
+         let high = unsafe { vmull_s16(lhs_high, rhs_high) }; // widening multiply: i16 x i16 -> i32
+
+         i32x8 { a: i32x4 { neon: low }, b: i32x4 {neon: high } }
+       } else {
+        let a = self.as_array_ref(); // fallback when none of the cfg branches above apply
+        let b = rhs.as_array_ref();
+         i32x8::new([
+           i32::from(a[0]) * i32::from(b[0]), // widen both operands before multiplying
+           i32::from(a[1]) * i32::from(b[1]),
+           i32::from(a[2]) * i32::from(b[2]),
+           i32::from(a[3]) * i32::from(b[3]),
+           i32::from(a[4]) * i32::from(b[4]),
+           i32::from(a[5]) * i32::from(b[5]),
+           i32::from(a[6]) * i32::from(b[6]),
+           i32::from(a[7]) * i32::from(b[7]),
+         ])
+       }
+    }
+  }
+
   /// transpose matrix of 8x8 i16 matrix
   #[must_use]
   #[inline]
diff --git a/tests/all_tests/t_i16x8.rs b/tests/all_tests/t_i16x8.rs
index b69f11d0..8ed17944 100644
--- a/tests/all_tests/t_i16x8.rs
+++ b/tests/all_tests/t_i16x8.rs
@@ -361,3 +361,21 @@ fn impl_mul_keep_high() {
   let c: [i16; 8] = i16x8::mul_keep_high(a, b).into();
   assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]);
 }
+
+#[test]
+fn impl_i16x8_mul_widen() { // checks mul_widen lane by lane against i32 products
+  let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN, i16::MAX]);
+  let b = i16x8::from([17, -18, 190, -20, 21, -22, i16::MAX, i16::MAX]);
+  let expected = i32x8::from([
+    17,
+    -36,
+    570,
+    -80,
+    105,
+    -132,
+    (i16::MIN as i32) * (i16::MAX as i32), // magnitude far exceeds the i16 range
+    (i16::MAX as i32) * (i16::MAX as i32), // magnitude far exceeds the i16 range
+  ]);
+  let actual = a.mul_widen(b);
+  assert_eq!(expected, actual);
+}