feat: add u8x16::narrow_i16x8 (#148)

* feat: add u8x16::narrow_i16x8
* style: run cargo fmt and fix clippy warnings
Lokathor · Jan 29, 2024 · bd6850c · bd6850c
1 parent a048507
commit bd6850c
Show file tree

Hide file tree

Showing 17 changed files with 88 additions and 20 deletions.
diff --git a/src/f32x4_.rs b/src/f32x4_.rs
@@ -1575,7 +1575,7 @@ impl f32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/f64x2_.rs b/src/f64x2_.rs
@@ -1605,7 +1605,7 @@ impl f64x2 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f64; 2] {
+  pub fn as_array_mut(&mut self) -> &mut [f64; 2] {
     cast_mut(self)
   }
 }

diff --git a/src/f64x4_.rs b/src/f64x4_.rs
@@ -1470,7 +1470,7 @@ impl f64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [f64; 4] {
     cast_mut(self)
   }
 }

diff --git a/src/i16x16_.rs b/src/i16x16_.rs
@@ -486,7 +486,11 @@ impl i16x16 {
   }
 
   /// Calculates partial dot product.
-  /// Multiplies packed signed 16-bit integers, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers.
+  /// Multiplies packed signed 16-bit integers, producing intermediate signed
+  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
+  /// integers.
+  #[inline]
+  #[must_use]
   pub fn dot(self, rhs: Self) -> i32x8 {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -555,7 +559,7 @@ impl i16x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i16; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [i16; 16] {
     cast_mut(self)
   }
 }
diff --git a/src/i16x8_.rs b/src/i16x8_.rs
@@ -781,7 +781,11 @@ impl i16x8 {
   }
 
   /// Calculates partial dot product.
-  /// Multiplies packed signed 16-bit integers, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers.
+  /// Multiplies packed signed 16-bit integers, producing intermediate signed
+  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
+  /// integers.
+  #[inline]
+  #[must_use]
   pub fn dot(self, rhs: Self) -> i32x4 {
     pick! {
       if #[cfg(target_feature="sse2")] {
@@ -1040,7 +1044,7 @@ impl i16x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i16; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [i16; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/i32x4_.rs b/src/i32x4_.rs
@@ -601,7 +601,7 @@ impl i32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [i32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/i32x8_.rs b/src/i32x8_.rs
@@ -537,7 +537,7 @@ impl i32x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i32; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [i32; 8] {
     cast_mut(self)
   }
 }

diff --git a/src/i64x2_.rs b/src/i64x2_.rs
@@ -416,7 +416,7 @@ impl i64x2 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i64; 2] {
+  pub fn as_array_mut(&mut self) -> &mut [i64; 2] {
     cast_mut(self)
   }
 }
diff --git a/src/i64x4_.rs b/src/i64x4_.rs
@@ -326,7 +326,7 @@ impl i64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [i64; 4] {
     cast_mut(self)
   }
 }

diff --git a/src/i8x16_.rs b/src/i8x16_.rs
@@ -734,7 +734,7 @@ impl i8x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i8; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [i8; 16] {
     cast_mut(self)
   }
 }
diff --git a/src/i8x32_.rs b/src/i8x32_.rs
@@ -342,7 +342,7 @@ impl i8x32 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i8; 32] {
+  pub fn as_array_mut(&mut self) -> &mut [i8; 32] {
     cast_mut(self)
   }
 }
diff --git a/src/u16x8_.rs b/src/u16x8_.rs
@@ -512,7 +512,7 @@ impl u16x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u16; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [u16; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/u32x4_.rs b/src/u32x4_.rs
@@ -463,7 +463,7 @@ impl u32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [u32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/u32x8_.rs b/src/u32x8_.rs
@@ -279,7 +279,7 @@ impl u32x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u32; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [u32; 8] {
     cast_mut(self)
   }
 }

diff --git a/src/u64x4_.rs b/src/u64x4_.rs
@@ -294,7 +294,7 @@ impl u64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [u64; 4] {
     cast_mut(self)
   }
 }

diff --git a/src/u8x16_.rs b/src/u8x16_.rs
@@ -460,6 +460,8 @@ impl u8x16 {
   }
 
   /// Unpack and interleave low lanes of two u8x16
+  #[inline]
+  #[must_use]
   pub fn unpack_low(lhs: u8x16, rhs: u8x16) -> u8x16 {
     pick! {
         if #[cfg(target_feature = "sse2")] {
@@ -488,6 +490,8 @@ impl u8x16 {
   }
 
   /// Unpack and interleave high lanes of two u8x16
+  #[inline]
+  #[must_use]
   pub fn unpack_high(lhs: u8x16, rhs: u8x16) -> u8x16 {
     pick! {
         if #[cfg(target_feature = "sse2")] {
@@ -515,6 +519,52 @@ impl u8x16 {
     }
   }
 
+  /// Saturating-narrow two i16x8 to one u8x16: lhs -> lanes 0-7, rhs -> 8-15.
+  #[inline]
+  #[must_use]
+  pub fn narrow_i16x8(lhs: i16x8, rhs: i16x8) -> Self {
+    pick! {
+        if #[cfg(target_feature = "sse2")] {
+            u8x16 { sse: pack_i16_to_u8_m128i(lhs.sse, rhs.sse) }
+        } else if #[cfg(target_feature = "simd128")] {
+            u8x16 { simd: u8x16_narrow_i16x8(lhs.simd, rhs.simd) }
+        } else if #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] {
+            let lhs = unsafe { vqmovun_s16(lhs.neon) }; // narrow each input on its own
+            let rhs = unsafe { vqmovun_s16(rhs.neon) };
+            u8x16 { neon: unsafe { vcombine_u8(lhs, rhs) } } // then join: lhs first, rhs second
+        } else { // no SIMD feature matched: scalar fallback
+            fn clamp(a: i16) -> u8 { // clamp to u8::MIN..=u8::MAX before casting
+                  if a < u8::MIN as i16 {
+                      u8::MIN
+                  } else if a > u8::MAX as i16 {
+                      u8::MAX
+                  } else {
+                      a as u8 // in range, so the cast is lossless
+                  }
+            }
+
+            Self { arr: [
+                clamp(lhs.as_array_ref()[0]),
+                clamp(lhs.as_array_ref()[1]),
+                clamp(lhs.as_array_ref()[2]),
+                clamp(lhs.as_array_ref()[3]),
+                clamp(lhs.as_array_ref()[4]),
+                clamp(lhs.as_array_ref()[5]),
+                clamp(lhs.as_array_ref()[6]),
+                clamp(lhs.as_array_ref()[7]),
+                clamp(rhs.as_array_ref()[0]),
+                clamp(rhs.as_array_ref()[1]),
+                clamp(rhs.as_array_ref()[2]),
+                clamp(rhs.as_array_ref()[3]),
+                clamp(rhs.as_array_ref()[4]),
+                clamp(rhs.as_array_ref()[5]),
+                clamp(rhs.as_array_ref()[6]),
+                clamp(rhs.as_array_ref()[7]),
+            ]}
+        }
+    }
+  }
+
   #[inline]
   pub fn to_array(self) -> [u8; 16] {
     cast(self)
@@ -526,7 +576,7 @@ impl u8x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u8; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [u8; 16] {
     cast_mut(self)
   }
 }
diff --git a/tests/all_tests/t_u8x16.rs b/tests/all_tests/t_u8x16.rs
@@ -167,15 +167,25 @@ fn impl_u8x16_min() {
 #[test]
 fn impl_unpack_low_u8() {
   let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
-  let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
+  let b =
+    u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
   let c: [u8; 16] = u8x16::unpack_low(a, b).into();
   assert_eq!(c, [0, 12, 1, 11, 2, 22, 3, 13, 4, 99, 5, 15, 6, 16, 7, 17]);
 }
 
 #[test]
 fn impl_unpack_high_u8() {
   let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
-  let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
+  let b =
+    u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
   let c: [u8; 16] = u8x16::unpack_high(a, b).into();
   assert_eq!(c, [8, 8, 9, 19, 10, 2, 11, 21, 12, 22, 13, 3, 14, 24, 15, 127]);
 }
+
+#[test]
+fn impl_narrow_i16x8() {
+  let a = i16x8::from([-1, 2, -3, 4, -5, 6, -7, 8]); // negative lanes must come back as 0
+  let b = i16x8::from([9, 10, 11, 12, 13, -14, 15, -16]); // NOTE(review): no lane above 255 is exercised
+  let c: [u8; 16] = u8x16::narrow_i16x8(a, b).into(); // expects `a` in lanes 0-7, `b` in lanes 8-15
+  assert_eq!(c, [0, 2, 0, 4, 0, 6, 0, 8, 9, 10, 11, 12, 13, 0, 15, 0]);
+}