From 3c811c294e09880ded4edc4e35bf9f923e8979af Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 20 Sep 2023 09:12:14 -0400 Subject: [PATCH] teddy: replace _mm_extract_epi64 with transmute It turns out that _mm_extract_epi64 requires SSE 4.1. While it would be fine to just require that (virtually all CPUs have it available), the rest of Teddy only requires SSSE3. I don't love bumping the minimum required just to get the lanes out of a vector. So just replace it with a transmute. The AVX2 variant isn't impacted by this since AVX2 came with _mm256_extract_epi64. Kudos to https://github.com/llogiq/bytecount/issues/85 for making me check this. --- src/packed/vector.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/packed/vector.rs b/src/packed/vector.rs index f19b86c..ed3f890 100644 --- a/src/packed/vector.rs +++ b/src/packed/vector.rs @@ -320,7 +320,7 @@ pub(crate) trait FatVector: Vector { mod x86_64_ssse3 { use core::arch::x86_64::*; - use crate::util::int::{I32, I64, I8}; + use crate::util::int::{I32, I8}; use super::Vector; @@ -394,12 +394,14 @@ mod x86_64_ssse3 { self, mut f: impl FnMut(usize, u64) -> Option, ) -> Option { - let lane = _mm_extract_epi64(self, 0).to_bits(); - if let Some(t) = f(0, lane) { + // We could just use _mm_extract_epi64 here, but that requires + // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1, + // but everything else works with SSSE3 so we stick to that subset. + let lanes: [u64; 2] = core::mem::transmute(self); + if let Some(t) = f(0, lanes[0]) { return Some(t); } - let lane = _mm_extract_epi64(self, 1).to_bits(); - if let Some(t) = f(1, lane) { + if let Some(t) = f(1, lanes[1]) { return Some(t); } None