From bd6850c9af234b107588bcb090adbcfed8c8e36f Mon Sep 17 00:00:00 2001
From: Alex Yusiuk <55661041+RRRadicalEdward@users.noreply.github.com>
Date: Mon, 29 Jan 2024 23:25:14 +0200
Subject: [PATCH] feat: add u8x16::narrow_i16x8 (#148)

* feat: add u8x16::narrow_i16x8

* style: run cargo fmt and fix clippy warnings
---
 src/f32x4_.rs              |  2 +-
 src/f64x2_.rs              |  2 +-
 src/f64x4_.rs              |  2 +-
 src/i16x16_.rs             |  8 ++++--
 src/i16x8_.rs              |  8 ++++--
 src/i32x4_.rs              |  2 +-
 src/i32x8_.rs              |  2 +-
 src/i64x2_.rs              |  2 +-
 src/i64x4_.rs              |  2 +-
 src/i8x16_.rs              |  2 +-
 src/i8x32_.rs              |  2 +-
 src/u16x8_.rs              |  2 +-
 src/u32x4_.rs              |  2 +-
 src/u32x8_.rs              |  2 +-
 src/u64x4_.rs              |  2 +-
 src/u8x16_.rs              | 52 +++++++++++++++++++++++++++++++++++++-
 tests/all_tests/t_u8x16.rs | 14 ++++++++--
 17 files changed, 88 insertions(+), 20 deletions(-)

diff --git a/src/f32x4_.rs b/src/f32x4_.rs
index ae9aa6de..8b5cd62d 100644
--- a/src/f32x4_.rs
+++ b/src/f32x4_.rs
@@ -1575,7 +1575,7 @@ impl f32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/f64x2_.rs b/src/f64x2_.rs
index 7b637b1d..dc7b6cf9 100644
--- a/src/f64x2_.rs
+++ b/src/f64x2_.rs
@@ -1605,7 +1605,7 @@ impl f64x2 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f64; 2] {
+  pub fn as_array_mut(&mut self) -> &mut [f64; 2] {
     cast_mut(self)
   }
 }
diff --git a/src/f64x4_.rs b/src/f64x4_.rs
index 186dbb0b..1d9ddefd 100644
--- a/src/f64x4_.rs
+++ b/src/f64x4_.rs
@@ -1470,7 +1470,7 @@ impl f64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[f64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [f64; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/i16x16_.rs b/src/i16x16_.rs
index af4c6667..21b3ece6 100644
--- a/src/i16x16_.rs
+++ b/src/i16x16_.rs
@@ -486,7 +486,11 @@ impl i16x16 {
   }
 
   /// Calculates partial dot product.
-  /// Multiplies packed signed 16-bit integers, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers.
+  /// Multiplies packed signed 16-bit integers, producing intermediate signed
+  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
+  /// integers.
+  #[inline]
+  #[must_use]
   pub fn dot(self, rhs: Self) -> i32x8 {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -555,7 +559,7 @@ impl i16x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i16; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [i16; 16] {
     cast_mut(self)
   }
 }
diff --git a/src/i16x8_.rs b/src/i16x8_.rs
index 6af8cf7e..625bee5a 100644
--- a/src/i16x8_.rs
+++ b/src/i16x8_.rs
@@ -781,7 +781,11 @@ impl i16x8 {
   }
 
   /// Calculates partial dot product.
-  /// Multiplies packed signed 16-bit integers, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers.
+  /// Multiplies packed signed 16-bit integers, producing intermediate signed
+  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
+  /// integers.
+  #[inline]
+  #[must_use]
   pub fn dot(self, rhs: Self) -> i32x4 {
     pick! {
       if #[cfg(target_feature="sse2")] {
@@ -1040,7 +1044,7 @@ impl i16x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i16; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [i16; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/i32x4_.rs b/src/i32x4_.rs
index 5b05e5fb..d36b4a2e 100644
--- a/src/i32x4_.rs
+++ b/src/i32x4_.rs
@@ -601,7 +601,7 @@ impl i32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [i32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/i32x8_.rs b/src/i32x8_.rs
index 2caeeec7..819297fa 100644
--- a/src/i32x8_.rs
+++ b/src/i32x8_.rs
@@ -537,7 +537,7 @@ impl i32x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i32; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [i32; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/i64x2_.rs b/src/i64x2_.rs
index 1459ca4a..52298b42 100644
--- a/src/i64x2_.rs
+++ b/src/i64x2_.rs
@@ -416,7 +416,7 @@ impl i64x2 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i64; 2] {
+  pub fn as_array_mut(&mut self) -> &mut [i64; 2] {
     cast_mut(self)
   }
 }
diff --git a/src/i64x4_.rs b/src/i64x4_.rs
index 46a6eeef..8ac18892 100644
--- a/src/i64x4_.rs
+++ b/src/i64x4_.rs
@@ -326,7 +326,7 @@ impl i64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [i64; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/i8x16_.rs b/src/i8x16_.rs
index 9a65308b..3091c0b5 100644
--- a/src/i8x16_.rs
+++ b/src/i8x16_.rs
@@ -734,7 +734,7 @@ impl i8x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i8; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [i8; 16] {
     cast_mut(self)
   }
 }
diff --git a/src/i8x32_.rs b/src/i8x32_.rs
index 592e33cd..afba849a 100644
--- a/src/i8x32_.rs
+++ b/src/i8x32_.rs
@@ -342,7 +342,7 @@ impl i8x32 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[i8; 32] {
+  pub fn as_array_mut(&mut self) -> &mut [i8; 32] {
     cast_mut(self)
   }
 }
diff --git a/src/u16x8_.rs b/src/u16x8_.rs
index 1148cede..18b33aed 100644
--- a/src/u16x8_.rs
+++ b/src/u16x8_.rs
@@ -512,7 +512,7 @@ impl u16x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u16; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [u16; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/u32x4_.rs b/src/u32x4_.rs
index bde75b50..fe6f3ff2 100644
--- a/src/u32x4_.rs
+++ b/src/u32x4_.rs
@@ -463,7 +463,7 @@ impl u32x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u32; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [u32; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/u32x8_.rs b/src/u32x8_.rs
index 9d02aabc..d18cf190 100644
--- a/src/u32x8_.rs
+++ b/src/u32x8_.rs
@@ -279,7 +279,7 @@ impl u32x8 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u32; 8] {
+  pub fn as_array_mut(&mut self) -> &mut [u32; 8] {
     cast_mut(self)
   }
 }
diff --git a/src/u64x4_.rs b/src/u64x4_.rs
index f39fd73d..09000840 100644
--- a/src/u64x4_.rs
+++ b/src/u64x4_.rs
@@ -294,7 +294,7 @@ impl u64x4 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u64; 4] {
+  pub fn as_array_mut(&mut self) -> &mut [u64; 4] {
     cast_mut(self)
   }
 }
diff --git a/src/u8x16_.rs b/src/u8x16_.rs
index 3c2156ea..30a448aa 100644
--- a/src/u8x16_.rs
+++ b/src/u8x16_.rs
@@ -460,6 +460,8 @@ impl u8x16 {
   }
 
   /// Unpack and interleave low lanes of two u8x16
+  #[inline]
+  #[must_use]
   pub fn unpack_low(lhs: u8x16, rhs: u8x16) -> u8x16 {
     pick! {
       if #[cfg(target_feature = "sse2")] {
@@ -488,6 +490,8 @@ impl u8x16 {
   }
 
   /// Unpack and interleave high lanes of two u8x16
+  #[inline]
+  #[must_use]
   pub fn unpack_high(lhs: u8x16, rhs: u8x16) -> u8x16 {
     pick! {
       if #[cfg(target_feature = "sse2")] {
@@ -515,6 +519,52 @@ impl u8x16 {
     }
   }
 
+  /// Pack and saturate two i16x8 to u8x16
+  #[inline]
+  #[must_use]
+  pub fn narrow_i16x8(lhs: i16x8, rhs: i16x8) -> Self {
+    pick! {
+      if #[cfg(target_feature = "sse2")] {
+        u8x16 { sse: pack_i16_to_u8_m128i(lhs.sse, rhs.sse) }
+      } else if #[cfg(target_feature = "simd128")] {
+        u8x16 { simd: u8x16_narrow_i16x8(lhs.simd, rhs.simd) }
+      } else if #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] {
+        let lhs = unsafe { vqmovun_s16(lhs.neon) };
+        let rhs = unsafe { vqmovun_s16(rhs.neon) };
+        u8x16 { neon: unsafe { vcombine_u8(lhs, rhs) } }
+      } else {
+        fn clamp(a: i16) -> u8 {
+          if a < u8::MIN as i16 {
+            u8::MIN
+          } else if a > u8::MAX as i16 {
+            u8::MAX
+          } else {
+            a as u8
+          }
+        }
+
+        Self { arr: [
+          clamp(lhs.as_array_ref()[0]),
+          clamp(lhs.as_array_ref()[1]),
+          clamp(lhs.as_array_ref()[2]),
+          clamp(lhs.as_array_ref()[3]),
+          clamp(lhs.as_array_ref()[4]),
+          clamp(lhs.as_array_ref()[5]),
+          clamp(lhs.as_array_ref()[6]),
+          clamp(lhs.as_array_ref()[7]),
+          clamp(rhs.as_array_ref()[0]),
+          clamp(rhs.as_array_ref()[1]),
+          clamp(rhs.as_array_ref()[2]),
+          clamp(rhs.as_array_ref()[3]),
+          clamp(rhs.as_array_ref()[4]),
+          clamp(rhs.as_array_ref()[5]),
+          clamp(rhs.as_array_ref()[6]),
+          clamp(rhs.as_array_ref()[7]),
+        ]}
+      }
+    }
+  }
+
   #[inline]
   pub fn to_array(self) -> [u8; 16] {
     cast(self)
@@ -526,7 +576,7 @@ impl u8x16 {
   }
 
   #[inline]
-  pub fn as_array_mut(&mut self) -> &mut[u8; 16] {
+  pub fn as_array_mut(&mut self) -> &mut [u8; 16] {
     cast_mut(self)
   }
 }
diff --git a/tests/all_tests/t_u8x16.rs b/tests/all_tests/t_u8x16.rs
index 9815403a..93958902 100644
--- a/tests/all_tests/t_u8x16.rs
+++ b/tests/all_tests/t_u8x16.rs
@@ -167,7 +167,8 @@ fn impl_u8x16_min() {
 #[test]
 fn impl_unpack_low_u8() {
   let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
-  let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
+  let b =
+    u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
   let c: [u8; 16] = u8x16::unpack_low(a, b).into();
   assert_eq!(c, [0, 12, 1, 11, 2, 22, 3, 13, 4, 99, 5, 15, 6, 16, 7, 17]);
 }
@@ -175,7 +176,16 @@ fn impl_unpack_low_u8() {
 #[test]
 fn impl_unpack_high_u8() {
   let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
-  let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
+  let b =
+    u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]);
   let c: [u8; 16] = u8x16::unpack_high(a, b).into();
   assert_eq!(c, [8, 8, 9, 19, 10, 2, 11, 21, 12, 22, 13, 3, 14, 24, 15, 127]);
 }
+
+#[test]
+fn impl_narrow_i16x8() {
+  let a = i16x8::from([-1, 2, -3, 4, -5, 6, -7, 8]);
+  let b = i16x8::from([9, 10, 11, 12, 13, -14, 15, -16]);
+  let c: [u8; 16] = u8x16::narrow_i16x8(a, b).into();
+  assert_eq!(c, [0, 2, 0, 4, 0, 6, 0, 8, 9, 10, 11, 12, 13, 0, 15, 0]);
+}
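
As a quick illustration of the new API, the sketch below shows how u8x16::narrow_i16x8 might be called from downstream code that uses the wide crate. It follows only from the signature and the test in this patch; the saturating_narrow_demo name and the sample values are made up for the example.

    use wide::{i16x8, u8x16};

    fn saturating_narrow_demo() {
      // Two vectors of signed 16-bit lanes; values outside 0..=255 saturate.
      let lo = i16x8::from([-300, 0, 7, 255, 256, 1000, -1, 42]);
      let hi = i16x8::from([1, 2, 3, 4, 5, 6, 7, 8]);

      // Lanes of `lo` become bytes 0..8 and lanes of `hi` become bytes 8..16,
      // each clamped into the u8 range (negative -> 0, above 255 -> 255).
      let packed: [u8; 16] = u8x16::narrow_i16x8(lo, hi).into();
      assert_eq!(packed, [0, 0, 7, 255, 255, 255, 0, 42, 1, 2, 3, 4, 5, 6, 7, 8]);
    }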