Skip to content

Commit

Permalink
Added i16 widening mul since this is implemented badly on some platfo…
Browse files Browse the repository at this point in the history
…rms (#153)

* add reduce min/max along with tests. Also optimize i16 abs for sse2

* mulwiden

* better mulwiden

* better mulwiden

* fix test

* add must_use and inline

* fix spacing

* remove unnecessary paren
  • Loading branch information
mcroomp authored May 13, 2024
1 parent 4b7b5f2 commit 86cb2a0
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
44 changes: 44 additions & 0 deletions src/i16x8_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -933,6 +933,50 @@ impl i16x8 {
}
}

/// Lane-wise multiply of two `i16x8` values, returning the products widened
/// to `i32` as an `i32x8`.
///
/// The product of two `i16` values always fits in an `i32`, so no lane of
/// the result can overflow.
#[inline]
#[must_use]
pub fn mul_widen(self, rhs: Self) -> i32x8 {
  pick! {
    if #[cfg(target_feature="avx2")] {
      // Widen both operands to eight i32 lanes first, then do a single
      // 32-bit multiply on the widened values.
      let lhs_wide = convert_to_i32_m256i_from_i16_m128i(self.sse);
      let rhs_wide = convert_to_i32_m256i_from_i16_m128i(rhs.sse);
      i32x8 { avx2: mul_i32_keep_low_m256i(lhs_wide, rhs_wide) }
    } else if #[cfg(target_feature="sse2")] {
      // Compute the low and high 16-bit halves of every product separately,
      // then interleave the halves to rebuild the 32-bit results.
      let lo_halves = mul_i16_keep_low_m128i(self.sse, rhs.sse);
      let hi_halves = mul_i16_keep_high_m128i(self.sse, rhs.sse);
      let first_four = i32x4 { sse: unpack_low_i16_m128i(lo_halves, hi_halves) };
      let last_four = i32x4 { sse: unpack_high_i16_m128i(lo_halves, hi_halves) };
      i32x8 { a: first_four, b: last_four }
    } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
      // SAFETY: this branch is only compiled for aarch64 targets with the
      // `neon` target feature enabled, so the NEON intrinsics are available.
      let first_four = unsafe {
        vmull_s16(vget_low_s16(self.neon), vget_low_s16(rhs.neon))
      };
      // SAFETY: same as above.
      let last_four = unsafe {
        vmull_s16(vget_high_s16(self.neon), vget_high_s16(rhs.neon))
      };

      i32x8 { a: i32x4 { neon: first_four }, b: i32x4 { neon: last_four } }
    } else {
      // Scalar fallback: widen each pair of lanes to i32 before multiplying.
      let lhs_lanes = self.as_array_ref();
      let rhs_lanes = rhs.as_array_ref();
      let mut products = [0_i32; 8];
      for (out, (&x, &y)) in
        products.iter_mut().zip(lhs_lanes.iter().zip(rhs_lanes.iter()))
      {
        *out = i32::from(x) * i32::from(y);
      }
      i32x8::new(products)
    }
  }
}

/// transpose matrix of 8x8 i16 matrix
#[must_use]
#[inline]
Expand Down
18 changes: 18 additions & 0 deletions tests/all_tests/t_i16x8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,3 +361,21 @@ fn impl_mul_keep_high() {
let c: [i16; 8] = i16x8::mul_keep_high(a, b).into();
assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]);
}

#[test]
fn impl_i16x8_mul_widen() {
  // Small mixed-sign operands in the first six lanes; the last two lanes use
  // the extreme i16 values, whose products do not fit in an i16.
  let lhs = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN, i16::MAX]);
  let rhs = i16x8::from([17, -18, 190, -20, 21, -22, i16::MAX, i16::MAX]);

  // Reference products for the extreme lanes, computed in i32.
  let min_times_max = i32::from(i16::MIN) * i32::from(i16::MAX);
  let max_squared = i32::from(i16::MAX) * i32::from(i16::MAX);

  let expected =
    i32x8::from([17, -36, 570, -80, 105, -132, min_times_max, max_squared]);

  assert_eq!(expected, lhs.mul_widen(rhs));
}

0 comments on commit 86cb2a0

Please sign in to comment.