Skip to content

Commit

Permalink
improve perf of reduce_min,reduce_max,reduce_add and add unsigned upc…
Browse files Browse the repository at this point in the history
…ast from u16x8 to i32x8
  • Loading branch information
mcroomp committed May 18, 2024
1 parent 264b466 commit 6948895
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 19 deletions.
74 changes: 63 additions & 11 deletions src/i16x8_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -654,32 +654,84 @@ impl i16x8 {
#[inline]
#[must_use]
pub fn reduce_add(self) -> i16 {
  // Horizontal sum of the eight i16 lanes. The scalar fallback uses
  // `wrapping_add`, so overflow wraps instead of panicking.
  pick! {
    if #[cfg(target_feature="sse2")] {
      // Fold the register onto itself three times (64-bit halves, then
      // 32-bit lanes, then 16-bit lanes) so that lane 0 ends up holding
      // the total of all eight lanes.
      // NOTE(review): shuffle immediates are read here as pshufd-style lane
      // selectors (swap halves / swap the two low 32-bit lanes) -- confirm
      // against the helper's documentation.
      let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
      let sum64 = add_i16_m128i(self.sse, hi64);
      let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
      let sum32 = add_i16_m128i(sum64, hi32);
      // Shifting each 32-bit lane right by 16 moves its upper i16 into the
      // lower i16 position.
      let lo16 = shr_imm_u32_m128i::<16>(sum32);
      let sum16 = add_i16_m128i(sum32, lo16);
      extract_i16_as_i32_m128i::<0>(sum16) as i16
    } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
      // SAFETY: this branch is only compiled when the `neon` target feature
      // is enabled on aarch64, which is what the intrinsic requires.
      unsafe { vaddvq_s16(self.neon) }
    } else {
      let arr: [i16; 8] = cast(self);

      // Deliberately simple sequential fold over the remaining lanes.
      let mut r = arr[0];
      for &lane in &arr[1..] {
        r = r.wrapping_add(lane);
      }

      r
    }
  }
}

/// horizontal min of all the elements of the vector
#[inline]
#[must_use]
pub fn reduce_min(self) -> i16 {
  pick! {
    if #[cfg(target_feature="sse2")] {
      // Fold the register onto itself three times (64-bit halves, then
      // 32-bit lanes, then 16-bit lanes) so that lane 0 ends up holding
      // the minimum of all eight lanes.
      // NOTE(review): shuffle immediates are read here as pshufd-style lane
      // selectors (swap halves / swap the two low 32-bit lanes) -- confirm
      // against the helper's documentation.
      let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
      let min64 = min_i16_m128i(self.sse, hi64);
      let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(min64);
      let min32 = min_i16_m128i(min64, hi32);
      // Shifting each 32-bit lane right by 16 moves its upper i16 into the
      // lower i16 position; only lane 0 of the result is read below.
      let lo16 = shr_imm_u32_m128i::<16>(min32);
      let min16 = min_i16_m128i(min32, lo16);
      extract_i16_as_i32_m128i::<0>(min16) as i16
    } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
      // SAFETY: this branch is only compiled when the `neon` target feature
      // is enabled on aarch64, which is what the intrinsic requires.
      unsafe { vminvq_s16(self.neon) }
    } else {
      let arr: [i16; 8] = cast(self);

      // Deliberately simple sequential scan over the remaining lanes.
      let mut r = arr[0];
      for &lane in &arr[1..] {
        r = r.min(lane);
      }

      r
    }
  }
}

/// horizontal max of all the elements of the vector
#[inline]
#[must_use]
pub fn reduce_max(self) -> i16 {
  pick! {
    if #[cfg(target_feature="sse2")] {
      // Fold the register onto itself three times (64-bit halves, then
      // 32-bit lanes, then 16-bit lanes) so that lane 0 ends up holding
      // the maximum of all eight lanes.
      // NOTE(review): shuffle immediates are read here as pshufd-style lane
      // selectors (swap halves / swap the two low 32-bit lanes) -- confirm
      // against the helper's documentation.
      let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
      let max64 = max_i16_m128i(self.sse, hi64);
      let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(max64);
      let max32 = max_i16_m128i(max64, hi32);
      // Shifting each 32-bit lane right by 16 moves its upper i16 into the
      // lower i16 position; only lane 0 of the result is read below.
      let lo16 = shr_imm_u32_m128i::<16>(max32);
      let max16 = max_i16_m128i(max32, lo16);
      extract_i16_as_i32_m128i::<0>(max16) as i16
    } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
      // SAFETY: this branch is only compiled when the `neon` target feature
      // is enabled on aarch64, which is what the intrinsic requires.
      unsafe { vmaxvq_s16(self.neon) }
    } else {
      let arr: [i16; 8] = cast(self);

      // Deliberately simple sequential scan over the remaining lanes.
      let mut r = arr[0];
      for &lane in &arr[1..] {
        r = r.max(lane);
      }

      r
    }
  }
}

#[inline]
Expand Down
43 changes: 35 additions & 8 deletions src/i32x8_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,14 +303,41 @@ impl i32x8 {
}
} else {
i32x8::new([
v.as_array_ref()[0] as i32,
v.as_array_ref()[1] as i32,
v.as_array_ref()[2] as i32,
v.as_array_ref()[3] as i32,
v.as_array_ref()[4] as i32,
v.as_array_ref()[5] as i32,
v.as_array_ref()[6] as i32,
v.as_array_ref()[7] as i32,
i32::from(v.as_array_ref()[0]),
i32::from(v.as_array_ref()[1]),
i32::from(v.as_array_ref()[2]),
i32::from(v.as_array_ref()[3]),
i32::from(v.as_array_ref()[4]),
i32::from(v.as_array_ref()[5]),
i32::from(v.as_array_ref()[6]),
i32::from(v.as_array_ref()[7]),
])
}
}
}

/// widens and zero extends to i32x8
#[inline]
#[must_use]
pub fn from_u16x8(v: u16x8) -> Self {
pick! {
if #[cfg(target_feature="avx2")] {
i32x8 { avx2:convert_to_i32_m256i_from_u16_m128i(v.sse) }
} else if #[cfg(target_feature="sse2")] {
i32x8 {
a: i32x4 { sse: shr_imm_u32_m128i::<16>( unpack_low_i16_m128i(v.sse, v.sse)) },
b: i32x4 { sse: shr_imm_u32_m128i::<16>( unpack_high_i16_m128i(v.sse, v.sse)) },
}
} else {
i32x8::new([
i32::from(v.as_array_ref()[0]),
i32::from(v.as_array_ref()[1]),
i32::from(v.as_array_ref()[2]),
i32::from(v.as_array_ref()[3]),
i32::from(v.as_array_ref()[4]),
i32::from(v.as_array_ref()[5]),
i32::from(v.as_array_ref()[6]),
i32::from(v.as_array_ref()[7]),
])
}
}
Expand Down
11 changes: 11 additions & 0 deletions tests/all_tests/t_i32x8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,17 @@ fn impl_from_i16x8() {
assert_eq!(actual, expected);
}

#[test]
fn impl_from_u16x8() {
  // Inputs include values above i16::MAX so that a sign-extending
  // conversion (instead of a zero-extending one) would produce negative
  // lanes and fail the comparison.
  let lanes: [u16; 8] =
    [1, 2, 3, 4, 5, i16::MAX as u16, u16::MAX - 1, u16::MAX];

  // Each u16 widens losslessly to the same non-negative i32.
  let expected = i32x8::from(lanes.map(i32::from));
  let actual = i32x8::from_u16x8(u16x8::from(lanes));

  assert_eq!(actual, expected);
}


#[test]
fn test_i16x8_move_mask() {
let a = i16x8::from([-1, 0, -2, -3, -1, 0, -2, -3]);
Expand Down

0 comments on commit 6948895

Please sign in to comment.