From bd6850c9af234b107588bcb090adbcfed8c8e36f Mon Sep 17 00:00:00 2001
From: Alex Yusiuk <55661041+RRRadicalEdward@users.noreply.github.com>
Date: Mon, 29 Jan 2024 23:25:14 +0200
Subject: [PATCH] feat: add u8x16::narrow_i16x8 (#148)

* feat: add u8x16::narrow_i16x8

* style: run cargo fmt and fix clippy warnings
---
 src/f32x4_.rs              |  2 +-
 src/f64x2_.rs              |  2 +-
 src/f64x4_.rs              |  2 +-
 src/i16x16_.rs             |  8 ++++--
 src/i16x8_.rs              |  8 ++++--
 src/i32x4_.rs              |  2 +-
 src/i32x8_.rs              |  2 +-
 src/i64x2_.rs              |  2 +-
 src/i64x4_.rs              |  2 +-
 src/i8x16_.rs              |  2 +-
 src/i8x32_.rs              |  2 +-
 src/u16x8_.rs              |  2 +-
 src/u32x4_.rs              |  2 +-
 src/u32x8_.rs              |  2 +-
 src/u64x4_.rs              |  2 +-
 src/u8x16_.rs              | 52 +++++++++++++++++++++++++++++++++++++-
 tests/all_tests/t_u8x16.rs | 14 ++++++++--
 17 files changed, 88 insertions(+), 20 deletions(-)

diff --git a/src/f32x4_.rs b/src/f32x4_.rs
index ae9aa6de..8b5cd62d 100644
--- a/src/f32x4_.rs
+++ b/src/f32x4_.rs
@@ -1575,7 +1575,7 @@ impl f32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/f64x2_.rs b/src/f64x2_.rs
index 7b637b1d..dc7b6cf9 100644
--- a/src/f64x2_.rs
+++ b/src/f64x2_.rs
@@ -1605,7 +1605,7 @@ impl f64x2 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f64; 2] {
+  pub fn as_array_mut(&mut self) -> &mut [f64; 2] {
     cast_mut(self)
   }
 }
diff --git a/src/f64x4_.rs b/src/f64x4_.rs
index 186dbb0b..1d9ddefd 100644
--- a/src/f64x4_.rs
+++ b/src/f64x4_.rs
@@ -1470,7 +1470,7 @@ impl f64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [f64; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/i16x16_.rs b/src/i16x16_.rs
index af4c6667..21b3ece6 100644
--- a/src/i16x16_.rs
+++ b/src/i16x16_.rs
@@ -486,7 +486,11 @@ impl i16x16 {
   }
 
   /// Calculates partial dot product.
-  /// Multiplies packed signed 16-bit integers, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers.
+  /// Multiplies packed signed 16-bit integers, producing intermediate signed
+  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
+  /// integers.
+  #[inline]
+  #[must_use]
   pub fn dot(self, rhs: Self) -> i32x8 {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -555,7 +559,7 @@ impl i16x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i16; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [i16; 16] {
     cast_mut(self)
   }
 }
diff --git a/src/i16x8_.rs b/src/i16x8_.rs
index 6af8cf7e..625bee5a 100644
--- a/src/i16x8_.rs
+++ b/src/i16x8_.rs
@@ -781,7 +781,11 @@ impl i16x8 {
   }
 
   /// Calculates partial dot product.
-  /// Multiplies packed signed 16-bit integers, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers.
+  /// Multiplies packed signed 16-bit integers, producing intermediate signed
+  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
+  /// integers.
+  #[inline]
+  #[must_use]
   pub fn dot(self, rhs: Self) -> i32x4 {
     pick! {
       if #[cfg(target_feature="sse2")] {
@@ -1040,7 +1044,7 @@ impl i16x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i16; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [i16; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/i32x4_.rs b/src/i32x4_.rs
index 5b05e5fb..d36b4a2e 100644
--- a/src/i32x4_.rs
+++ b/src/i32x4_.rs
@@ -601,7 +601,7 @@ impl i32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [i32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/i32x8_.rs b/src/i32x8_.rs
index 2caeeec7..819297fa 100644
--- a/src/i32x8_.rs
+++ b/src/i32x8_.rs
@@ -537,7 +537,7 @@ impl i32x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i32; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [i32; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/i64x2_.rs b/src/i64x2_.rs
index 1459ca4a..52298b42 100644
--- a/src/i64x2_.rs
+++ b/src/i64x2_.rs
@@ -416,7 +416,7 @@ impl i64x2 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i64; 2] {
+  pub fn as_array_mut(&mut self) -> &mut [i64; 2] {
     cast_mut(self)
   }
 }
diff --git a/src/i64x4_.rs b/src/i64x4_.rs
index 46a6eeef..8ac18892 100644
--- a/src/i64x4_.rs
+++ b/src/i64x4_.rs
@@ -326,7 +326,7 @@ impl i64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [i64; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/i8x16_.rs b/src/i8x16_.rs
index 9a65308b..3091c0b5 100644
--- a/src/i8x16_.rs
+++ b/src/i8x16_.rs
@@ -734,7 +734,7 @@ impl i8x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i8; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [i8; 16] {
     cast_mut(self)
   }
 }
diff --git a/src/i8x32_.rs b/src/i8x32_.rs
index 592e33cd..afba849a 100644
--- a/src/i8x32_.rs
+++ b/src/i8x32_.rs
@@ -342,7 +342,7 @@ impl i8x32 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i8; 32] {
+  pub fn as_array_mut(&mut self) -> &mut [i8; 32] {
     cast_mut(self)
   }
 }
diff --git a/src/u16x8_.rs b/src/u16x8_.rs
index 1148cede..18b33aed 100644
--- a/src/u16x8_.rs
+++ b/src/u16x8_.rs
@@ -512,7 +512,7 @@ impl u16x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u16; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [u16; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/u32x4_.rs b/src/u32x4_.rs
index bde75b50..fe6f3ff2 100644
--- a/src/u32x4_.rs
+++ b/src/u32x4_.rs
@@ -463,7 +463,7 @@ impl u32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [u32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/u32x8_.rs b/src/u32x8_.rs
index 9d02aabc..d18cf190 100644
--- a/src/u32x8_.rs
+++ b/src/u32x8_.rs
@@ -279,7 +279,7 @@ impl u32x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u32; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [u32; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/u64x4_.rs b/src/u64x4_.rs
index f39fd73d..09000840 100644
--- a/src/u64x4_.rs
+++ b/src/u64x4_.rs
@@ -294,7 +294,7 @@ impl u64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [u64; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/u8x16_.rs b/src/u8x16_.rs
index 3c2156ea..30a448aa 100644
--- a/src/u8x16_.rs
+++ b/src/u8x16_.rs
@@ -460,6 +460,8 @@ impl u8x16 {
   }
 
   /// Unpack and interleave low lanes of two u8x16
+  #[inline]
+  #[must_use]
   pub fn unpack_low(lhs: u8x16, rhs: u8x16) -> u8x16 {
     pick! {
       if #[cfg(target_feature = "sse2")] {
@@ -488,6 +490,8 @@ impl u8x16 {
   }
 
   /// Unpack and interleave high lanes of two u8x16
+  #[inline]
+  #[must_use]
   pub fn unpack_high(lhs: u8x16, rhs: u8x16) -> u8x16 {
     pick! {
       if #[cfg(target_feature = "sse2")] {
@@ -515,6 +519,52 @@ impl u8x16 {
     }
   }
 
+  /// Pack and saturate two i16x8 to u8x16
+  #[inline]
+  #[must_use]
+  pub fn narrow_i16x8(lhs: i16x8, rhs: i16x8) -> Self {
+    pick! {
+      if #[cfg(target_feature = "sse2")] {
+        u8x16 { sse: pack_i16_to_u8_m128i(lhs.sse, rhs.sse) }
+      } else if #[cfg(target_feature = "simd128")] {
+        u8x16 { simd: u8x16_narrow_i16x8(lhs.simd, rhs.simd) }
+      } else if #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] {
+        let lhs = unsafe { vqmovun_s16(lhs.neon) };
+        let rhs = unsafe { vqmovun_s16(rhs.neon) };
+        u8x16 { neon: unsafe { vcombine_u8(lhs, rhs) } }
+      } else {
+        fn clamp(a: i16) -> u8 {
+          if a < u8::MIN as i16 {
+            u8::MIN
+          } else if a > u8::MAX as i16 {
+            u8::MAX
+          } else {
+            a as u8
+          }
+        }
+
+        Self { arr: [
+          clamp(lhs.as_array_ref()[0]),
+          clamp(lhs.as_array_ref()[1]),
+          clamp(lhs.as_array_ref()[2]),
+          clamp(lhs.as_array_ref()[3]),
+          clamp(lhs.as_array_ref()[4]),
+          clamp(lhs.as_array_ref()[5]),
+          clamp(lhs.as_array_ref()[6]),
+          clamp(lhs.as_array_ref()[7]),
+          clamp(rhs.as_array_ref()[0]),
+          clamp(rhs.as_array_ref()[1]),
+          clamp(rhs.as_array_ref()[2]),
+          clamp(rhs.as_array_ref()[3]),
+          clamp(rhs.as_array_ref()[4]),
+          clamp(rhs.as_array_ref()[5]),
+          clamp(rhs.as_array_ref()[6]),
+          clamp(rhs.as_array_ref()[7]),
+        ]}
+      }
+    }
+  }
+
   #[inline]
   pub fn to_array(self) -> [u8; 16] {
     cast(self)
@@ -526,7 +576,7 @@ impl u8x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u8; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [u8; 16] {
     cast_mut(self)
   }
 }
diff --git a/tests/all_tests/t_u8x16.rs b/tests/all_tests/t_u8x16.rs
index 9815403a..93958902 100644
--- a/tests/all_tests/t_u8x16.rs
+++ b/tests/all_tests/t_u8x16.rs
@@ -167,7 +167,8 @@ fn impl_u8x16_min() {
 #[test]
 fn impl_unpack_low_u8() {
   let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
-  let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
+  let b =
+    u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
   let c: [u8; 16] = u8x16::unpack_low(a, b).into();
   assert_eq!(c, [0, 12, 1, 11, 2, 22, 3, 13, 4, 99, 5, 15, 6, 16, 7, 17]);
 }
@@ -175,7 +176,16 @@ fn impl_unpack_low_u8() {
 #[test]
 fn impl_unpack_high_u8() {
   let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
-  let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
+  let b =
+    u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
   let c: [u8; 16] = u8x16::unpack_high(a, b).into();
   assert_eq!(c, [8, 8, 9, 19, 10, 2, 11, 21, 12, 22, 13, 3, 14, 24, 15, 127]);
 }
+
+#[test]
+fn impl_narrow_i16x8() {
+  let a = i16x8::from([-1, 2, -3, 4, -5, 6, -7, 8]);
+  let b = i16x8::from([9, 10, 11, 12, 13, -14, 15, -16]);
+  let c: [u8; 16] = u8x16::narrow_i16x8(a, b).into();
+  assert_eq!(c, [0, 2, 0, 4, 0, 6, 0, 8, 9, 10, 11, 12, 13, 0, 15, 0]);
+}
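
As a quick illustration of the new API, the sketch below shows how u8x16::narrow_i16x8 might be called from downstream code that uses the wide crate. It follows only from the signature and the test in this patch; the saturating_narrow_demo name and the sample values are made up for the example.

    use wide::{i16x8, u8x16};

    fn saturating_narrow_demo() {
      // Two vectors of signed 16-bit lanes; values outside 0..=255 saturate.
      let lo = i16x8::from([-300, 0, 7, 255, 256, 1000, -1, 42]);
      let hi = i16x8::from([1, 2, 3, 4, 5, 6, 7, 8]);

      // Lanes of `lo` become bytes 0..8 and lanes of `hi` become bytes 8..16,
      // each clamped into the u8 range (negative -> 0, above 255 -> 255).
      let packed: [u8; 16] = u8x16::narrow_i16x8(lo, hi).into();
      assert_eq!(packed, [0, 0, 7, 255, 255, 255, 0, 42, 1, 2, 3, 4, 5, 6, 7, 8]);
    }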