Added widening multiply for u16x8, u32x4, u32x8, i32x4 and i32x8 #182

Merged · 16 commits · Nov 19, 2024
39 changes: 39 additions & 0 deletions src/i32x4_.rs
@@ -490,6 +490,45 @@ impl i32x4 {
}
}
}

#[inline]
#[must_use]
pub fn mul_widen(self, rhs: Self) -> i64x4 {
// todo: WASM simd128, but not sure it would really be faster
// than what the compiler comes up with.
pick! {
if #[cfg(target_feature="avx2")] {
let a = convert_to_i64_m256i_from_i32_m128i(self.sse);
let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse);
cast(mul_i64_low_bits_m256i(a, b))
} else if #[cfg(target_feature="sse4.1")] {
let evenp = mul_widen_i32_odd_m128i(self.sse, rhs.sse);

let oddp = mul_widen_i32_odd_m128i(
shr_imm_u64_m128i::<32>(self.sse),
shr_imm_u64_m128i::<32>(rhs.sse));

i64x4 {
a: i64x2 { sse: unpack_low_i64_m128i(evenp, oddp)},
b: i64x2 { sse: unpack_high_i64_m128i(evenp, oddp)}}
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {
i64x4 { a: i64x2 { neon: vmull_s32(vget_low_s32(self.neon), vget_low_s32(rhs.neon)) },
b: i64x2 { neon: vmull_s32(vget_high_s32(self.neon), vget_high_s32(rhs.neon)) } }
}
} else {
let a: [i32; 4] = cast(self);
let b: [i32; 4] = cast(rhs);
cast([
i64::from(a[0]) * i64::from(b[0]),
i64::from(a[1]) * i64::from(b[1]),
i64::from(a[2]) * i64::from(b[2]),
i64::from(a[3]) * i64::from(b[3]),
])
}
}
}

#[inline]
#[must_use]
pub fn abs(self) -> Self {
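For orientation (an editorial sketch, not part of this PR; the lane values are made up), the new i32x4::mul_widen returns full 64-bit products, so multiplications that would overflow an i32 come out exact:

let a = i32x4::from([100_000, -200_000, 300_000, 1]);
let b = i32x4::from([100_000, 100_000, 100_000, -1]);
// every lane but the last needs more than 32 bits for its product
assert_eq!(
  a.mul_widen(b),
  i64x4::from([10_000_000_000, -20_000_000_000, 30_000_000_000, -1])
);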
4 changes: 2 additions & 2 deletions src/i64x4_.rs
@@ -4,11 +4,11 @@ pick! {
if #[cfg(target_feature="avx2")] {
#[derive(Default, Clone, Copy, PartialEq, Eq)]
#[repr(C, align(32))]
pub struct i64x4 { avx2: m256i }
pub struct i64x4 { pub(crate) avx2: m256i }
} else {
#[derive(Default, Clone, Copy, PartialEq, Eq)]
#[repr(C, align(32))]
pub struct i64x4 { a : i64x2, b : i64x2 }
pub struct i64x4 { pub(crate) a : i64x2, pub(crate) b : i64x2 }
}
}

38 changes: 38 additions & 0 deletions src/u16x8_.rs
@@ -591,6 +591,44 @@ impl u16x8 {
}
}

  /// Multiplies two `u16x8` and returns the high part of the intermediate `u32x8`.
#[inline]
#[must_use]
pub fn mul_keep_high(self, rhs: Self) -> Self {
pick! {
if #[cfg(target_feature="sse2")] {
Self { sse: mul_u16_keep_high_m128i(self.sse, rhs.sse) }
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
let lhs_low = unsafe { vget_low_u16(self.neon) };
let rhs_low = unsafe { vget_low_u16(rhs.neon) };

let lhs_high = unsafe { vget_high_u16(self.neon) };
let rhs_high = unsafe { vget_high_u16(rhs.neon) };

let low = unsafe { vmull_u16(lhs_low, rhs_low) };
let high = unsafe { vmull_u16(lhs_high, rhs_high) };

u16x8 { neon: unsafe { vuzpq_u16(vreinterpretq_u16_u32(low), vreinterpretq_u16_u32(high)).1 } }
} else if #[cfg(target_feature="simd128")] {
let low = u32x4_extmul_low_u16x8(self.simd, rhs.simd);
let high = u32x4_extmul_high_u16x8(self.simd, rhs.simd);

Self { simd: u16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) }
} else {
u16x8::new([
((u32::from(rhs.as_array_ref()[0]) * u32::from(self.as_array_ref()[0])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[1]) * u32::from(self.as_array_ref()[1])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[2]) * u32::from(self.as_array_ref()[2])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[3]) * u32::from(self.as_array_ref()[3])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[4]) * u32::from(self.as_array_ref()[4])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[5]) * u32::from(self.as_array_ref()[5])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[6]) * u32::from(self.as_array_ref()[6])) >> 16) as u16,
((u32::from(rhs.as_array_ref()[7]) * u32::from(self.as_array_ref()[7])) >> 16) as u16,
])
}
}
}

#[inline]
pub fn to_array(self) -> [u16; 8] {
cast(self)
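A quick aside (a hedged sketch, not part of the diff; the values are made up): u16x8::mul_keep_high is effectively a Q0.16 fixed-point multiply, computing (a * b) >> 16 per lane. Scaling by 0.25, expressed as 16384/65536:

let x = u16x8::from([4, 400, 65535, 1000, 0, 1, 2, 3]);
let quarter = u16x8::from([16384; 8]); // 0.25 in Q0.16 fixed point
let scaled: [u16; 8] = x.mul_keep_high(quarter).into();
assert_eq!(scaled, [1, 100, 16383, 250, 0, 0, 0, 0]);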
95 changes: 94 additions & 1 deletion src/u32x4_.rs
@@ -431,7 +431,7 @@ impl u32x4 {
Self { sse: cmp_gt_mask_i32_m128i((self ^ h).sse, (rhs ^ h).sse) }
} else if #[cfg(target_feature="simd128")] {
Self { simd: u32x4_gt(self.simd, rhs.simd) }
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {Self { neon: vcgtq_u32(self.neon, rhs.neon) }}
} else {
Self { arr: [
@@ -450,6 +450,99 @@ impl u32x4 {
rhs.cmp_gt(self)
}

  /// Multiplies 32x32 bits to a 64-bit intermediate and then keeps only the high 32 bits of the result.
  /// Useful for implementing division by a constant value (see the t_usefulness example).
#[inline]
#[must_use]
pub fn mul_keep_high(self, rhs: Self) -> Self {
// todo: WASM simd128, but not sure it would really be faster
// than what the compiler comes up with.

pick! {
if #[cfg(target_feature="avx2")] {
let a = convert_to_i64_m256i_from_u32_m128i(self.sse);
let b = convert_to_i64_m256i_from_u32_m128i(rhs.sse);
let r = mul_u64_low_bits_m256i(a, b);

// the compiler does a good job shuffling the lanes around
let b : [u32;8] = cast(r);
cast([b[1],b[3],b[5],b[7]])
} else if #[cfg(target_feature="sse2")] {
let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);

let oddp = mul_widen_u32_odd_m128i(
shr_imm_u64_m128i::<32>(self.sse),
shr_imm_u64_m128i::<32>(rhs.sse));

// the compiler does a good job shuffling the lanes around
let a : [u32;4]= cast(evenp);
let b : [u32;4]= cast(oddp);
cast([a[1],b[1],a[3],b[3]])

} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {
let l = vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon));
let h = vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon));
u32x4 { neon: vcombine_u32(vshrn_n_u64(l,32), vshrn_n_u64(h,32)) }
}
} else {
let a: [u32; 4] = cast(self);
let b: [u32; 4] = cast(rhs);
cast([
((u64::from(a[0]) * u64::from(b[0])) >> 32) as u32,
((u64::from(a[1]) * u64::from(b[1])) >> 32) as u32,
((u64::from(a[2]) * u64::from(b[2])) >> 32) as u32,
((u64::from(a[3]) * u64::from(b[3])) >> 32) as u32,
])
}
}
}

  /// Multiplies corresponding 32-bit lanes and returns the 64-bit results
  /// in the corresponding lanes.
  ///
  /// Effectively does two multiplies on 128-bit platforms, but is easier
  /// to use than the even/odd widening intrinsics, and runs fast on AVX2.
#[inline]
#[must_use]
pub fn mul_widen(self, rhs: Self) -> u64x4 {
// todo: WASM simd128, but not sure it would really be faster
// than what the compiler comes up with.

pick! {
if #[cfg(target_feature="avx2")] {
        // ok to sign extend since the multiply only reads the low 32 bits of each 64-bit lane anyway
let a = convert_to_i64_m256i_from_i32_m128i(self.sse);
let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse);
cast(mul_u64_low_bits_m256i(a, b))
} else if #[cfg(target_feature="sse2")] {
let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);

let oddp = mul_widen_u32_odd_m128i(
shr_imm_u64_m128i::<32>(self.sse),
shr_imm_u64_m128i::<32>(rhs.sse));

u64x4 {
a: u64x2 { sse: unpack_low_i64_m128i(evenp, oddp)},
b: u64x2 { sse: unpack_high_i64_m128i(evenp, oddp)}}
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
unsafe {
u64x4 { a: u64x2 { neon: vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon)) },
b: u64x2 { neon: vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon)) } }
}
} else {
let a: [u32; 4] = cast(self);
let b: [u32; 4] = cast(rhs);
cast([
u64::from(a[0]) * u64::from(b[0]),
u64::from(a[1]) * u64::from(b[1]),
u64::from(a[2]) * u64::from(b[2]),
u64::from(a[3]) * u64::from(b[3]),
])
}
}
}

#[inline]
#[must_use]
pub fn blend(self, t: Self, f: Self) -> Self {
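The doc comment above mentions division by a constant; a minimal sketch of that trick (not from this PR, and assuming wide's scalar >> operator for u32x4): dividing every lane by 3 uses the magic constant 0xAAAA_AAAB = ceil(2^33 / 3), so (n * magic) >> 33 equals n / 3, done here as mul_keep_high (which drops 32 bits) followed by >> 1:

let n = u32x4::from([7, 99, 1_000_000, u32::MAX]);
let magic = u32x4::from([0xAAAA_AAABu32; 4]);
let q = n.mul_keep_high(magic) >> 1;
assert_eq!(q, u32x4::from([2, 33, 333_333, u32::MAX / 3]));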
24 changes: 24 additions & 0 deletions src/u32x8_.rs
@@ -301,6 +301,30 @@ impl u32x8 {
rhs.cmp_gt(self)
}

  /// Multiplies 32x32 bits to a 64-bit intermediate and then keeps only the high 32 bits of the result.
  /// Useful for implementing division by a constant value (see the t_usefulness example).
#[inline]
#[must_use]
pub fn mul_keep_high(self, rhs: u32x8) -> u32x8 {
pick! {
if #[cfg(target_feature="avx2")] {
let a : [u32;8]= cast(self);
let b : [u32;8]= cast(rhs);

// let the compiler shuffle the values around, it does the right thing
let r1 : [u32;8] = cast(mul_u64_low_bits_m256i(cast([a[0], 0, a[1], 0, a[2], 0, a[3], 0]), cast([b[0], 0, b[1], 0, b[2], 0, b[3], 0])));
let r2 : [u32;8] = cast(mul_u64_low_bits_m256i(cast([a[4], 0, a[5], 0, a[6], 0, a[7], 0]), cast([b[4], 0, b[5], 0, b[6], 0, b[7], 0])));

cast([r1[1], r1[3], r1[5], r1[7], r2[1], r2[3], r2[5], r2[7]])
} else {
Self {
a : self.a.mul_keep_high(rhs.a),
b : self.b.mul_keep_high(rhs.b),
}
}
}
}

#[inline]
#[must_use]
pub fn blend(self, t: Self, f: Self) -> Self {
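The 8-lane version keeps the same per-lane semantics (an editorial sketch with made-up values, assuming u32x8::to_array exists as it does for the other types):

let a = u32x8::from([1, 2, 3, 4, 5, 6, 7, 3_000_000_000]);
let b = u32x8::from([8, 7, 6, 5, 4, 3, 2, 3_000_000_000]);
let hi = a.mul_keep_high(b).to_array();
assert_eq!(hi[0], 0); // 1 * 8 fits in 32 bits, so the high half is 0
assert_eq!(hi[7], ((3_000_000_000u64 * 3_000_000_000u64) >> 32) as u32);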
23 changes: 20 additions & 3 deletions tests/all_tests/t_i16x8.rs
@@ -369,10 +369,27 @@ fn impl_i16x8_reduce_max() {

#[test]
fn impl_mul_keep_high() {
let a = i16x8::from([1, 200, 300, 4568, -1, -2, -3, -4]);
let b = i16x8::from([5, 600, 700, 8910, -15, -26, -37, 48]);
let a = i16x8::from([i16::MAX, 200, 300, 4568, -1, -2, -3, -4]);
let b = i16x8::from([i16::MIN, 600, 700, 8910, -15, -26, -37, 48]);
let c: [i16; 8] = i16x8::mul_keep_high(a, b).into();
assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]);
assert_eq!(
c,
[
(i32::from(i16::MAX) * i32::from(i16::MIN) >> 16) as i16,
1,
3,
621,
0,
0,
0,
-1
]
);

crate::test_random_vector_vs_scalar(
|a: i16x8, b| i16x8::mul_keep_high(a, b),
|a, b| ((i32::from(a) * i32::from(b)) >> 16) as i16,
);
}

#[test]
19 changes: 19 additions & 0 deletions tests/all_tests/t_i32x4.rs
@@ -266,3 +266,22 @@ fn impl_i32x4_shl_each() {
|a, b| a.wrapping_shl(b as u32),
);
}

#[test]
fn impl_i32x4_mul_widen() {
let a = i32x4::from([1, 2, 3 * -1000000, i32::MAX]);
let b = i32x4::from([5, 6, 7 * -1000000, i32::MIN]);
let expected = i64x4::from([
1 * 5,
2 * 6,
3 * 7 * 1000000 * 1000000,
i32::MIN as i64 * i32::MAX as i64,
]);
let actual = a.mul_widen(b);
assert_eq!(expected, actual);

crate::test_random_vector_vs_scalar(
|a: i32x4, b| a.mul_widen(b),
|a, b| a as i64 * b as i64,
);
}
25 changes: 25 additions & 0 deletions tests/all_tests/t_u16x8.rs
@@ -218,6 +218,31 @@ fn impl_u16x8_from_u8x16_high() {
assert_eq!(expected, actual);
}

#[test]
fn impl_u16x8_mul_keep_high() {
let a = u16x8::from([u16::MAX, 200, 300, 4568, 1, 2, 3, 200]);
let b = u16x8::from([u16::MAX, 600, 700, 8910, 15, 26, 37, 600]);
let c: [u16; 8] = u16x8::mul_keep_high(a, b).into();
assert_eq!(
c,
[
(u32::from(u16::MAX) * u32::from(u16::MAX) >> 16) as u16,
1,
3,
621,
0,
0,
0,
1
]
);

crate::test_random_vector_vs_scalar(
|a: u16x8, b| u16x8::mul_keep_high(a, b),
|a, b| ((u32::from(a) * u32::from(b)) >> 16) as u16,
);
}

#[test]
fn impl_u16x8_mul_widen() {
let a = u16x8::from([1, 2, 3, 4, 5, 6, i16::MAX as u16, u16::MAX]);
40 changes: 40 additions & 0 deletions tests/all_tests/t_u32x4.rs
@@ -1,4 +1,5 @@
use std::num::Wrapping;

use wide::*;

#[test]
@@ -234,3 +235,42 @@ fn test_u32x4_none() {
let a = u32x4::from([0; 4]);
assert!(a.none());
}

#[test]
fn impl_u32x4_mul_widen() {
let a = u32x4::from([1, 2, 3 * 1000000, u32::MAX]);
let b = u32x4::from([5, 6, 7 * 1000000, u32::MAX]);
let expected = u64x4::from([
1 * 5,
2 * 6,
3 * 7 * 1000000 * 1000000,
u32::MAX as u64 * u32::MAX as u64,
]);
let actual = a.mul_widen(b);
assert_eq!(expected, actual);

crate::test_random_vector_vs_scalar(
|a: u32x4, b| a.mul_widen(b),
|a, b| u64::from(a) * u64::from(b),
);
}

#[test]
fn impl_u32x4_mul_keep_high() {
let mul_high = |a: u32, b: u32| ((u64::from(a) * u64::from(b)) >> 32) as u32;
let a = u32x4::from([1, 2 * 10000000, 3 * 1000000, u32::MAX]);
let b = u32x4::from([5, 6 * 100, 7 * 1000000, u32::MAX]);
let expected = u32x4::from([
mul_high(1, 5),
mul_high(2 * 10000000, 6 * 100),
mul_high(3 * 1000000, 7 * 1000000),
mul_high(u32::MAX, u32::MAX),
]);
let actual = a.mul_keep_high(b);
assert_eq!(expected, actual);

crate::test_random_vector_vs_scalar(
|a: u32x4, b| a.mul_keep_high(b),
|a, b| ((u64::from(a) * u64::from(b)) >> 32) as u32,
);
}
8 changes: 8 additions & 0 deletions tests/all_tests/t_u32x8.rs
@@ -295,3 +295,11 @@ fn test_u32x8_none() {
let a = u32x8::from([0; 8]);
assert!(a.none());
}

#[test]
fn impl_u32x8_mul_keep_high() {
crate::test_random_vector_vs_scalar(
|a: u32x8, b| u32x8::mul_keep_high(a, b),
|a, b| ((u64::from(a) * u64::from(b)) >> 32) as u32,
);
}