From 76016472441f955c00bb137804771e782f105c3b Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Tue, 18 Sep 2018 22:49:11 +0100
Subject: [PATCH 1/8] Comment out func used by commented out funcs

---
 benches/pathology.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/benches/pathology.rs b/benches/pathology.rs
index 9fe10a8..c516842 100644
--- a/benches/pathology.rs
+++ b/benches/pathology.rs
@@ -27,9 +27,11 @@ use test::{Bencher, black_box};
 use twoway::find_str as tw_find;
 use twoway::rfind_str as tw_rfind;
 
+/*
 pub fn is_prefix(text: &str, pattern: &str) -> bool {
     Str(pattern).is_prefix_of(text)
 }
+*/
 
 pub fn memmem(text: &str, pattern: &str) -> bool {
     #[allow(improper_ctypes)]
@@ -535,7 +537,7 @@ pub fn rfind_char_1(b: &mut Bencher) {
         t
     });
     b.bytes = haystack.len() as u64;
-} 
+}
 
 #[cfg(feature = "test-set")]
 fn bench_data() -> Vec<u8> { vec![0u8; 256 * 1024] }

From d6ada813a2519a0a0e4d442a58084068390cbe3e Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Tue, 18 Sep 2018 22:50:32 +0100
Subject: [PATCH 2/8] cargo fix

---
 src/tw.rs      | 14 +++++++-------
 tests/quick.rs |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/tw.rs b/src/tw.rs
index 5725f1a..660c8c9 100644
--- a/src/tw.rs
+++ b/src/tw.rs
@@ -302,19 +302,19 @@ fn test_find() {
 
 #[test]
 fn test_max_suf_pos() {
-    assert_eq!(2, compute_max_suf_pos((b"aab")));
+    assert_eq!(2, compute_max_suf_pos(b"aab"));
 
-    assert_eq!(2, compute_max_suf_pos((b"aabaa")));
+    assert_eq!(2, compute_max_suf_pos(b"aabaa"));
 
-    assert_eq!(0, compute_max_suf_pos((b"gcagagag")));
-    assert_eq!(2, compute_max_suf_pos((b"banana")));
+    assert_eq!(0, compute_max_suf_pos(b"gcagagag"));
+    assert_eq!(2, compute_max_suf_pos(b"banana"));
 }
 
 #[test]
 fn test_maxsuf_and_period() {
-    assert_eq!((2, 1), maxsuf_and_period((b"aab")));
-    assert_eq!((2, 3), maxsuf_and_period((b"aabaa")));
-    assert_eq!((0, 7), maxsuf_and_period((b"gcagagag")));
+    assert_eq!((2, 1), maxsuf_and_period(b"aab"));
+    assert_eq!((2, 3), maxsuf_and_period(b"aabaa"));
+    assert_eq!((0, 7), maxsuf_and_period(b"gcagagag"));
 }
 
 /*
diff --git a/tests/quick.rs b/tests/quick.rs
index 2a74f2d..553d125 100644
--- a/tests/quick.rs
+++ b/tests/quick.rs
@@ -291,7 +291,7 @@ fn test_search_steps() {
         let n = &b.0;
         let tws = StrSearcher::new(hay, n);
         // Make sure it covers the whole string
-        let mut search_steps = unfold(tws, |mut tws| {
+        let mut search_steps = unfold(tws, |tws| {
             match tws.next() {
                 SearchStep::Done => None,
                 otherwise => Some(otherwise),
@@ -323,7 +323,7 @@ fn test_search_steps_rev() {
         let n = &b.0;
         let tws = StrSearcher::new(hay, n);
         // Make sure it covers the whole string
-        let mut search_steps = unfold(tws, |mut tws| {
+        let mut search_steps = unfold(tws, |tws| {
             match tws.next_back() {
                 SearchStep::Done => None,
                 otherwise => Some(otherwise),

From 7befdde6f7117f8687367aa6478546f685566a61 Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Tue, 18 Sep 2018 23:25:34 +0100
Subject: [PATCH 3/8] Switch pcmp module to stable Rust intrinsics

---
 src/lib.rs  |  5 ++---
 src/pcmp.rs | 60 ++++++++++++-----------------------------------------
 2 files changed, 15 insertions(+), 50 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1639e26..5217871 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,5 @@
 #![cfg_attr(not(test), no_std)]
 #![cfg_attr(feature = "pattern", feature(pattern))]
-#![cfg_attr(feature = "pcmp", feature(asm))]
 
 #[cfg(not(test))]
 extern crate core as std;
@@ -492,7 +491,7 @@ impl TwoWaySearcher {
     }
 
     /// Return the zero-based critical position and period of the provided needle.
-    /// 
+    ///
     /// The returned period is incorrect when the actual period is "long." In
     /// that case the approximation must be computed separately.
     #[inline(always)]
@@ -913,7 +912,7 @@ fn test_contains() {
     assert!(contains(h, n));
     assert!(contains_rev(h, n));
 
-    let h = "\u{0}\u{0}\u{0}\u{0}"; 
+    let h = "\u{0}\u{0}\u{0}\u{0}";
     let n = "\u{0}";
     assert!(contains(h, n));
     assert!(contains_rev(h, n));
diff --git a/src/pcmp.rs b/src/pcmp.rs
index bc5960b..e924ff8 100644
--- a/src/pcmp.rs
+++ b/src/pcmp.rs
@@ -25,10 +25,11 @@ fn zip<I, J>(i: I, j: J) -> Zip<I::IntoIter, J::IntoIter>
     i.into_iter().zip(j)
 }
 
-/// `pcmpestri` flags
-const EQUAL_ANY: u8 = 0b0000;
-const EQUAL_EACH: u8 = 0b1000;
-const EQUAL_ORDERED: u8 = 0b1100;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
 
 /// `pcmpestri`
 ///
@@ -42,27 +43,9 @@ unsafe fn pcmpestri_16(text: *const u8, offset: usize, text_len: usize,
                        needle_1: u64, needle_2: u64, needle_len: usize) -> u32 {
     //debug_assert!(text_len + offset <= text.len()); // saturates at 16
     //debug_assert!(needle_len <= 16); // saturates at 16
-    let res: u32;
-    // 0xC = 12, Equal Ordered comparison
-    //
-    // movlhps xmm0, xmm1  Move low word of xmm1 to high word of xmm0
-    asm!("movlhps $1, $2
-          pcmpestri $1, [$3 + $4], $5"
-         : // output operands
-         "={ecx}"(res)
-         : // input operands
-         "x"(needle_1),        // operand 1 = needle  `x` = sse register
-         "x"(needle_2),        // operand 1 = needle
-         "r"(text), // operand 2 pointer = haystack
-         "r"(offset),        // operand 2 offset
-         "i"(EQUAL_ORDERED),
-         "{rax}"(needle_len),// length of operand 1 = needle
-         "{rdx}"(text_len)   // length of operand 2 = haystack
-         : // clobbers
-         "cc"
-         : "intel" // options
-    );
-    res
+    let needle = _mm_set_epi64x(needle_2 as _, needle_1 as _);
+    let text = _mm_loadu_si128(text.offset(offset as _) as *const _);
+    _mm_cmpestri(needle, needle_len as _, text, text_len as _, _SIDD_CMP_EQUAL_ORDERED) as _
 }
 
 /// `pcmpestrm`
@@ -79,27 +62,10 @@ unsafe fn pcmpestrm_eq_each(text: *const u8, offset: usize, text_len: usize,
     // NOTE: needle *must* be readable for 16 bytes
     //debug_assert!(text_len + offset <= text.len()); // saturates at 16
     //debug_assert!(needle_len <= 16); // saturates at 16
-    let res: u64;
-    // 0xC = 12, Equal Ordered comparison
-    //
-    // movlhps xmm0, xmm1  Move low word of xmm1 to high word of xmm0
-    asm!("movdqu xmm0, [$1 + $2]
-          pcmpestrm xmm0, [$3 + $4], $5"
-         : // output operands
-         "={xmm0}"(res)
-         : // input operands
-         "r"(needle),         // operand 1 = needle
-         "r"(noffset),        // operand 1 = needle offset
-         "r"(text), // operand 2 pointer = haystack
-         "r"(offset),        // operand 2 offset
-         "i"(EQUAL_EACH),
-         "{rax}"(needle_len),// length of operand 1 = needle
-         "{rdx}"(text_len)   // length of operand 2 = haystack
-         : // clobbers
-         "cc"
-         : "intel" // options
-    );
-    res
+    let needle = _mm_loadu_si128(needle.offset(noffset as _) as *const _);
+    let text = _mm_loadu_si128(text.offset(offset as _) as *const _);
+    let mask = _mm_cmpestrm(needle, needle_len as _, text, text_len as _, _SIDD_CMP_EQUAL_EACH);
+    _mm_extract_epi64(mask, 0) as _
 }
 
 
@@ -527,7 +493,7 @@ fn pat128(pat: &[u8]) -> (u64, u64) {
 }
 
 /// Find longest shared prefix, return its length
-/// 
+///
 /// Alignment safe: works for any text, pat.
 pub fn shared_prefix(text: &[u8], pat: &[u8]) -> usize {
     let tp = text.as_ptr();

From bf728b28ad7ac5af1248d01c6d7552b45789ee37 Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Tue, 18 Sep 2018 23:50:01 +0100
Subject: [PATCH 4/8] Add x86 fallback for absent _mm_extract_epi64

---
 src/pcmp.rs | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/pcmp.rs b/src/pcmp.rs
index e924ff8..69805b2 100644
--- a/src/pcmp.rs
+++ b/src/pcmp.rs
@@ -65,7 +65,16 @@ unsafe fn pcmpestrm_eq_each(text: *const u8, offset: usize, text_len: usize,
     let needle = _mm_loadu_si128(needle.offset(noffset as _) as *const _);
     let text = _mm_loadu_si128(text.offset(offset as _) as *const _);
     let mask = _mm_cmpestrm(needle, needle_len as _, text, text_len as _, _SIDD_CMP_EQUAL_EACH);
-    _mm_extract_epi64(mask, 0) as _
+
+    #[cfg(target_arch = "x86")] {
+        let mut res: u64 = ::std::mem::uninitialized();
+        _mm_storel_epi64(&mut res, mask);
+        res
+    }
+
+    #[cfg(target_arch = "x86_64")] {
+        _mm_extract_epi64(mask, 0) as _
+    }
 }
 
 

From e30ad65c629ae8551a05cfd86daf39882451f9fc Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Wed, 19 Sep 2018 01:09:44 +0100
Subject: [PATCH 5/8] Remove compile-time pcmp feature

Switch to compile-time cfg for just x86/x86-64 and runtime detection for SSE 4.2.

Also fix no_std fallbacks, including for detection (without `use_std`, CPU features can't be detected at runtime, but it will still be possible to enable pcmp with an explicit `-C target-feature=+sse4.2`).
---
 Cargo.toml                 | 11 ++++-------
 README.rst                 |  4 +---
 benches/pathology.rs       |  1 -
 fuzz/Cargo.toml            |  3 ---
 fuzz/run1_pcmp.sh          |  2 +-
 fuzz/run_substring_pcmp.sh |  2 +-
 src/lib.rs                 | 24 ++++++++----------------
 src/pcmp.rs                |  9 +++++++++
 tests/quick.rs             |  5 -----
 9 files changed, 24 insertions(+), 37 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index a8f83d4..1655828 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ name = "twoway"
 version = "0.1.8"
 authors = ["bluss"]
 
-description = "Fast substring search for strings and byte strings. Optional SSE4.2 acceleration (requires nightly and cargo feature flag pcmp) using pcmpestri. Memchr is the only mandatory dependency. The two way algorithm is also used by rust's libstd itself, but here it is exposed both for byte strings, using memchr, and optionally using a SSE4.2 accelerated version."
+description = "Fast substring search for strings and byte strings. Optional SSE4.2 acceleration (if detected at runtime) using pcmpestri. Memchr is the only mandatory dependency. The two way algorithm is also used by rust's libstd itself, but here it is exposed both for byte strings, using memchr, and optionally using a SSE4.2 accelerated version."
 
 license = "MIT/Apache-2.0"
 repository = "https://github.com/bluss/twoway"
@@ -14,7 +14,7 @@ categories = ["algorithms", "no-std"]
 
 [dependencies]
 memchr = { version = "2.0", default-features = false }
-unchecked-index = { version = "0.2.2", optional = true }
+unchecked-index = { version = "0.2.2" }
 jetscii = {version = "0.3", features= ["unstable"], optional = true }
 galil-seiferas = { version = "0.1.1", optional = true }
 
@@ -31,14 +31,11 @@ quickcheck = { version = "0.5", default-features = false }
 default = ["use_std"]
 use_std = ["memchr/use_std"]
 
-# pcmpestri, requires nightly
-pcmp = ["unchecked-index"]
-
 # Internal features for testing & benchmarking & development
 pattern = []
 test-set = []
-benchmarks = ["galil-seiferas", "pattern", "unchecked-index"]
-all = ["jetscii", "pcmp", "pattern", "test-set"]
+benchmarks = ["galil-seiferas", "pattern"]
+all = ["jetscii", "pattern", "test-set"]
 
 
 [package.metadata.release]
diff --git a/README.rst b/README.rst
index 1484a01..be460db 100644
--- a/README.rst
+++ b/README.rst
@@ -22,11 +22,9 @@ This is the same code as is included in Rust's libstd to “power” ``str::find
 but here it is exposed with some improvements:
 
 - Available for byte string searches using ``&[u8]``
-- Having an optional SSE4.2 accelerated version which is even faster.
+- Having an optional SSE4.2 accelerated version (if detected at runtime) which is even faster.
 - Using ``memchr`` for the single byte case, which is ultra fast.
 
-Use cargo feature ``pcmp`` to enable SSE4.2 / pcmpestri accelerated version (only the forward search).
-
 - ``twoway::find_bytes(text: &[u8], pattern: &[u8]) -> Option<usize>``
 - ``twoway::rfind_bytes(text: &[u8], pattern: &[u8]) -> Option<usize>``
 - ``twoway::find_str(text: &str, pattern: &str) -> Option<usize>``
diff --git a/benches/pathology.rs b/benches/pathology.rs
index c516842..754be60 100644
--- a/benches/pathology.rs
+++ b/benches/pathology.rs
@@ -164,7 +164,6 @@ macro_rules! bench_contains_vs_tw {
             }
             */
 
-            #[cfg(feature = "pcmp")]
             #[bench]
             pub fn pcmp_find(b: &mut Bencher) {
                 let haystack = black_box($hay);
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index f22b780..256ceda 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -13,9 +13,6 @@ path = ".."
 [dependencies.libfuzzer-sys]
 git = "https://github.com/rust-fuzz/libfuzzer-sys.git"
 
-[features]
-pcmp = ["twoway/pcmp"]
-
 # Prevent this from interfering with workspaces
 [workspace]
 members = ["."]
diff --git a/fuzz/run1_pcmp.sh b/fuzz/run1_pcmp.sh
index d0bfd20..8e6cc73 100755
--- a/fuzz/run1_pcmp.sh
+++ b/fuzz/run1_pcmp.sh
@@ -2,4 +2,4 @@
 
 DIR=$(dirname "$0")
 V=$(cat "$DIR"/nightly-version)
-cargo +$V fuzz run --features=pcmp -O -a fuzz_target_1 -- -only_ascii=1 -max_len=5000 "$@"
+cargo +$V fuzz run -O -a fuzz_target_1 -- -only_ascii=1 -max_len=5000 "$@"
diff --git a/fuzz/run_substring_pcmp.sh b/fuzz/run_substring_pcmp.sh
index cd83886..a528a84 100755
--- a/fuzz/run_substring_pcmp.sh
+++ b/fuzz/run_substring_pcmp.sh
@@ -2,4 +2,4 @@
 
 DIR=$(dirname "$0")
 V=$(cat "$DIR"/nightly-version)
-cargo +$V fuzz run --features=pcmp -O substring -- -only_ascii=1 -max_len=256 "$@"
+cargo +$V fuzz run -O substring -- -only_ascii=1 -max_len=256 "$@"
diff --git a/src/lib.rs b/src/lib.rs
index 5217871..c24316a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,7 +1,7 @@
-#![cfg_attr(not(test), no_std)]
+#![cfg_attr(not(feature = "use_std"), no_std)]
 #![cfg_attr(feature = "pattern", feature(pattern))]
 
-#[cfg(not(test))]
+#[cfg(not(feature = "use_std"))]
 extern crate core as std;
 
 use std::cmp;
@@ -10,7 +10,7 @@ use std::usize;
 extern crate memchr;
 
 mod tw;
-#[cfg(feature = "pcmp")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub mod pcmp;
 pub mod bmh;
 #[cfg(feature = "test-set")]
@@ -27,7 +27,7 @@ use std::str::pattern::{
 
 /// `find_str` finds the first ocurrence of `pattern` in the `text`.
 ///
-/// Uses the SSE42 version if it is compiled in.
+/// Uses the SSE42 version if it is available at runtime.
 #[inline]
 pub fn find_str(text: &str, pattern: &str) -> Option<usize> {
     find_bytes(text.as_bytes(), pattern.as_bytes())
@@ -35,19 +35,11 @@ pub fn find_str(text: &str, pattern: &str) -> Option<usize> {
 
 /// `find_bytes` finds the first ocurrence of `pattern` in the `text`.
 ///
-/// Uses the SSE42 version if it is compiled in.
-#[cfg(feature = "pcmp")]
-#[inline]
+/// Uses the SSE42 version if it is available at runtime.
 pub fn find_bytes(text: &[u8], pattern: &[u8]) -> Option<usize> {
-    pcmp::find(text, pattern)
-}
-
-/// `find_bytes` finds the first ocurrence of `pattern` in the `text`.
-///
-/// Uses the SSE42 version if it is compiled in.
-#[cfg(not(feature = "pcmp"))]
-pub fn find_bytes(text: &[u8], pattern: &[u8]) -> Option<usize> {
-    if pattern.is_empty() {
+    if cfg!(any(target_arch = "x86", target_arch = "x86_64")) && pcmp::is_supported() {
+        pcmp::find(text, pattern)
+    } else if pattern.is_empty() {
         Some(0)
     } else if pattern.len() == 1 {
         memchr::memchr(pattern[0], text)
diff --git a/src/pcmp.rs b/src/pcmp.rs
index 69805b2..8b0afc9 100644
--- a/src/pcmp.rs
+++ b/src/pcmp.rs
@@ -283,6 +283,15 @@ fn find_short_pat(text: &[u8], pat: &[u8]) -> Option<usize> {
     }
 }
 
+/// Returns `true` when SSE 4.2 is usable: detected at runtime with `use_std`, else fixed at compile time.
+pub fn is_supported() -> bool {
+    // Must be `#[cfg]`, not `cfg!`: the detection macro does not exist in no_std builds.
+    #[cfg(feature = "use_std")]
+    return is_x86_feature_detected!("sse4.2");
+    #[cfg(not(feature = "use_std"))]
+    return cfg!(target_feature = "sse4.2");
+}
+
 /// `find` finds the first ocurrence of `pattern` in the `text`.
 ///
 /// This is the SSE42 accelerated version.
diff --git a/tests/quick.rs b/tests/quick.rs
index 553d125..5405a70 100644
--- a/tests/quick.rs
+++ b/tests/quick.rs
@@ -485,7 +485,6 @@ fn test_find_rev_period() {
 }
 
 
-#[cfg(feature = "pcmp")]
 // pcmpestr tests
 #[test]
 fn test_pcmp_contains() {
@@ -498,7 +497,6 @@ fn test_pcmp_contains() {
     quickcheck(prop as fn(_, _) -> _);
 }
 
-#[cfg(feature = "pcmp")]
 #[test]
 fn test_pcmp_contains_plus() {
     fn prop(a: Text, b: Short<Text>) -> TestResult {
@@ -514,7 +512,6 @@ fn test_pcmp_contains_plus() {
     quickcheck(prop as fn(_, _) -> _);
 }
 
-#[cfg(feature = "pcmp")]
 // pcmpestr tests
 #[test]
 fn test_pcmp_find() {
@@ -527,7 +524,6 @@ fn test_pcmp_find() {
     quickcheck(prop as fn(_, _) -> _);
 }
 
-#[cfg(feature = "pcmp")]
 // pcmpestr tests
 #[test]
 fn test_pcmp_find_simple() {
@@ -540,7 +536,6 @@ fn test_pcmp_find_simple() {
     quickcheck(prop as fn(_, _) -> _);
 }
 
-#[cfg(feature = "pcmp")]
 // pcmpestr tests
 #[test]
 fn test_pcmp_find_period() {

From 26ef11653b7b90068d29d387e63db0483426c34b Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Wed, 19 Sep 2018 01:40:41 +0100
Subject: [PATCH 6/8] Preconstruct SIMD vector for needle

---
 src/pcmp.rs | 54 ++++++++++++++++++-----------------------------------
 1 file changed, 18 insertions(+), 36 deletions(-)

diff --git a/src/pcmp.rs b/src/pcmp.rs
index 8b0afc9..5115f5a 100644
--- a/src/pcmp.rs
+++ b/src/pcmp.rs
@@ -12,7 +12,6 @@ extern crate memchr;
 
 use std::cmp;
 use std::iter::Zip;
-use std::ptr;
 
 use self::unchecked_index::get_unchecked;
 
@@ -40,10 +39,9 @@ use std::arch::x86_64::*;
 /// Return value: least index for start of (partial) match, (16 if no match).
 #[inline(always)]
 unsafe fn pcmpestri_16(text: *const u8, offset: usize, text_len: usize,
-                       needle_1: u64, needle_2: u64, needle_len: usize) -> u32 {
+                       needle: __m128i, needle_len: usize) -> u32 {
     //debug_assert!(text_len + offset <= text.len()); // saturates at 16
     //debug_assert!(needle_len <= 16); // saturates at 16
-    let needle = _mm_set_epi64x(needle_2 as _, needle_1 as _);
     let text = _mm_loadu_si128(text.offset(offset as _) as *const _);
     _mm_cmpestri(needle, needle_len as _, text, text_len as _, _SIDD_CMP_EQUAL_ORDERED) as _
 }
@@ -85,15 +83,13 @@ fn first_start_of_match(text: &[u8], pat: &[u8]) -> Option<(usize, usize)> {
     // not safe for text that is non aligned and ends at page boundary
     let patl = pat.len();
     assert!(patl <= 16);
-    // load pat as a little endian word
-    let (patw1, patw2) = pat128(pat);
-    first_start_of_match_inner(text, pat, patw1, patw2)
+    first_start_of_match_inner(text, pat, pat128(pat))
 }
 
 /// Safe wrapper around pcmpestri to find first match of `pat` in `text`.
-/// `p1`, `p2` are the first two words of `pat` and *must* match.
+/// `p` contains the first two words of `pat` and *must* match.
 /// Length given by length of `pat`, only first 16 bytes considered.
-fn first_start_of_match_inner(text: &[u8], pat: &[u8], p1: u64, p2: u64) -> Option<(usize, usize)> {
+fn first_start_of_match_inner(text: &[u8], pat: &[u8], p: __m128i) -> Option<(usize, usize)> {
     // align the text pointer
     let tp = text.as_ptr();
     let tp_align_offset = tp as usize & 0xF;
@@ -135,7 +131,7 @@ fn first_start_of_match_inner(text: &[u8], pat: &[u8], p1: u64, p2: u64) -> Opti
     while text.len() >= offset - tp_align_offset + patl {
         unsafe {
             let tlen = text.len() - (offset - tp_align_offset);
-            let ret = pcmpestri_16(tp_aligned, offset, tlen, p1, p2, patl) as usize;
+            let ret = pcmpestri_16(tp_aligned, offset, tlen, p, patl) as usize;
             if ret == 16 {
                 offset += 16;
             } else {
@@ -151,7 +147,7 @@ fn first_start_of_match_inner(text: &[u8], pat: &[u8], p1: u64, p2: u64) -> Opti
 /// safe to search unaligned for first start of match
 ///
 /// unsafe because the end of text must not be close (within 16 bytes) of a page boundary
-unsafe fn first_start_of_match_unaligned(text: &[u8], pat_len: usize, p1: u64, p2: u64) -> Option<(usize, usize)> {
+unsafe fn first_start_of_match_unaligned(text: &[u8], pat_len: usize, p: __m128i) -> Option<(usize, usize)> {
     let tp = text.as_ptr();
     debug_assert!(pat_len <= 16);
     debug_assert!(pat_len <= text.len());
@@ -160,7 +156,7 @@ unsafe fn first_start_of_match_unaligned(text: &[u8], pat_len: usize, p1: u64, p
 
     while text.len() - pat_len >= offset {
         let tlen = text.len() - offset;
-        let ret = pcmpestri_16(tp, offset, tlen, p1, p2, pat_len) as usize;
+        let ret = pcmpestri_16(tp, offset, tlen, p, pat_len) as usize;
         if ret == 16 {
             offset += 16;
         } else {
@@ -224,7 +220,7 @@ fn find_short_pat(text: &[u8], pat: &[u8]) -> Option<usize> {
         return find_2byte_pat(text, pat);
     }
     */
-    let (r1, _) = pat128(pat);
+    let r = pat128(pat);
 
     // safe part of text -- everything but the last 16 bytes
     let safetext = &text[..cmp::max(text.len(), 16) - 16];
@@ -235,7 +231,7 @@ fn find_short_pat(text: &[u8], pat: &[u8]) -> Option<usize> {
             break;
         }
         // find the next occurence
-        match unsafe { first_start_of_match_unaligned(&safetext[pos..], pat.len(), r1, 0) } {
+        match unsafe { first_start_of_match_unaligned(&safetext[pos..], pat.len(), r) } {
             None => break, // no matches
             Some((mpos, mlen)) => {
                 pos += mpos;
@@ -261,7 +257,7 @@ fn find_short_pat(text: &[u8], pat: &[u8]) -> Option<usize> {
             return None;
         }
         // find the next occurence
-        match first_start_of_match_inner(&text[pos..], pat, r1, 0) {
+        match first_start_of_match_inner(&text[pos..], pat, r) {
             None => return None, // no matches
             Some((mpos, mlen)) => {
                 pos += mpos;
@@ -331,7 +327,7 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
     let (right16, _right17) = right.split_at(cmp::min(16, right.len()));
     assert!(right.len() != 0);
 
-    let (r1, r2) = pat128(right);
+    let r = pat128(right);
 
     // safe part of text -- everything but the last 16 bytes
     let safetext = &text[..cmp::max(text.len(), 16) - 16];
@@ -345,7 +341,7 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
             }
             // find the next occurence of the right half
             let start = crit_pos;
-            match unsafe { first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r1, r2) } {
+            match unsafe { first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r) } {
                 None => break, // no matches
                 Some((mpos, mlen)) => {
                     pos += mpos;
@@ -383,7 +379,7 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
             //println!("memory trace pos={}, memory={}", pos, memory);
             let mut pfxlen = if memory == 0 {
                 let start = crit_pos;
-                match unsafe { first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r1, r2) } {
+                match unsafe { first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r) } {
                     None => break, // no matches
                     Some((mpos, mlen)) => {
                         pos += mpos;
@@ -425,7 +421,7 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
         }
         // find the next occurence of the right half
         let start = crit_pos;
-        match first_start_of_match_inner(&text[pos + start..], right16, r1, r2) {
+        match first_start_of_match_inner(&text[pos + start..], right16, r) {
             None => return None, // no matches
             Some((mpos, mlen)) => {
                 pos += mpos;
@@ -490,24 +486,10 @@ fn test_find() {
 
 }
 
-/// Load the first 16 bytes of `pat` into two words, little endian
-fn pat128(pat: &[u8]) -> (u64, u64) {
-    // load pat as a little endian word
-    let (mut p1, mut p2) = (0, 0);
-    unsafe {
-        let patl = pat.len();
-        ptr::copy_nonoverlapping(&pat[0],
-                                 &mut p1 as *mut _ as *mut _,
-                                 cmp::min(8, patl));
-
-        if patl > 8 {
-            ptr::copy_nonoverlapping(&pat[8],
-                                     &mut p2 as *mut _ as *mut _,
-                                     cmp::min(16, patl) - 8);
-
-        }
-    }
-    (p1, p2)
+/// Load the first 16 bytes of `pat` into a SIMD vector.
+#[inline(always)]
+fn pat128(pat: &[u8]) -> __m128i {
+    unsafe { _mm_loadu_si128(pat.as_ptr() as *const _) }
 }
 
 /// Find longest shared prefix, return its length

From d94de1f1765ee636ca2d082939203e14e50b439e Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Wed, 19 Sep 2018 18:47:00 +0100
Subject: [PATCH 7/8] Fix cfg for pcmp::is_supported

It should be a `#[cfg]` attribute rather than the `cfg!` macro, so that the block is not compiled at all on other architectures.
---
 src/lib.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index c24316a..78ab84d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -37,9 +37,12 @@ pub fn find_str(text: &str, pattern: &str) -> Option<usize> {
 ///
 /// Uses the SSE42 version if it is available at runtime.
 pub fn find_bytes(text: &[u8], pattern: &[u8]) -> Option<usize> {
-    if cfg!(any(target_arch = "x86", target_arch = "x86_64")) && pcmp::is_supported() {
-        pcmp::find(text, pattern)
-    } else if pattern.is_empty() {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        if pcmp::is_supported() {
+            return pcmp::find(text, pattern);
+        }
+    }
+    if pattern.is_empty() {
         Some(0)
     } else if pattern.len() == 1 {
         memchr::memchr(pattern[0], text)

From 6dedfa22a6528776969b29ea04d4ecf5992881d8 Mon Sep 17 00:00:00 2001
From: Ingvar Stepanyan <me@rreverser.com>
Date: Wed, 19 Sep 2018 19:22:03 +0100
Subject: [PATCH 8/8] Add target_feature everywhere...

This looks ugly due to infecting every private function in the pcmp chain, but apparently is required for inlining and does help performance: https://github.com/rust-lang/rust/issues/54353#issuecomment-422856654
---
 src/lib.rs  |  12 +++--
 src/pcmp.rs | 138 +++++++++++++++++++++++++++-------------------------
 2 files changed, 80 insertions(+), 70 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 78ab84d..c6dd818 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -37,16 +37,18 @@ pub fn find_str(text: &str, pattern: &str) -> Option<usize> {
 ///
 /// Uses the SSE42 version if it is available at runtime.
 pub fn find_bytes(text: &[u8], pattern: &[u8]) -> Option<usize> {
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
-        if pcmp::is_supported() {
-            return pcmp::find(text, pattern);
-        }
-    }
     if pattern.is_empty() {
         Some(0)
+    } else if text.len() < pattern.len() {
+        return None;
     } else if pattern.len() == 1 {
         memchr::memchr(pattern[0], text)
     } else {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+            if pcmp::is_supported() {
+                return unsafe { pcmp::find_inner(text, pattern) };
+            }
+        }
         let mut searcher = TwoWaySearcher::new(pattern, text.len());
         let is_long = searcher.memory == usize::MAX;
         // write out `true` and `false` cases to encourage the compiler
diff --git a/src/pcmp.rs b/src/pcmp.rs
index 5115f5a..b5bc0f4 100644
--- a/src/pcmp.rs
+++ b/src/pcmp.rs
@@ -37,7 +37,7 @@ use std::arch::x86_64::*;
 /// PCMPESTRI xmm1, xmm2/m128, imm8
 ///
 /// Return value: least index for start of (partial) match, (16 if no match).
-#[inline(always)]
+#[target_feature(enable = "sse4.2")]
 unsafe fn pcmpestri_16(text: *const u8, offset: usize, text_len: usize,
                        needle: __m128i, needle_len: usize) -> u32 {
     //debug_assert!(text_len + offset <= text.len()); // saturates at 16
@@ -53,7 +53,7 @@ unsafe fn pcmpestri_16(text: *const u8, offset: usize, text_len: usize,
 /// PCMPESTRM xmm1, xmm2/m128, imm8
 ///
 /// Return value: bitmask in the 16 lsb of the return value.
-#[inline(always)]
+#[target_feature(enable = "sse4.2")]
 unsafe fn pcmpestrm_eq_each(text: *const u8, offset: usize, text_len: usize,
                             needle: *const u8, noffset: usize, needle_len: usize) -> u64 {
     // NOTE: text *must* be readable for 16 bytes
@@ -83,26 +83,26 @@ fn first_start_of_match(text: &[u8], pat: &[u8]) -> Option<(usize, usize)> {
     // not safe for text that is non aligned and ends at page boundary
     let patl = pat.len();
     assert!(patl <= 16);
-    first_start_of_match_inner(text, pat, pat128(pat))
+    unsafe { first_start_of_match_inner(text, pat, pat128(pat)) }
 }
 
 /// Safe wrapper around pcmpestri to find first match of `pat` in `text`.
 /// `p` contains the first two words of `pat` and *must* match.
 /// Length given by length of `pat`, only first 16 bytes considered.
-fn first_start_of_match_inner(text: &[u8], pat: &[u8], p: __m128i) -> Option<(usize, usize)> {
+#[target_feature(enable = "sse4.2")]
+unsafe fn first_start_of_match_inner(text: &[u8], pat: &[u8], p: __m128i) -> Option<(usize, usize)> {
     // align the text pointer
     let tp = text.as_ptr();
     let tp_align_offset = tp as usize & 0xF;
     let init_len;
     let tp_aligned;
-    unsafe {
-        if tp_align_offset != 0 {
-            init_len = 16 - tp_align_offset;
-            tp_aligned = tp.offset(-(tp_align_offset as isize));
-        } else {
-            init_len = 0;
-            tp_aligned = tp;
-        };
+
+    if tp_align_offset != 0 {
+        init_len = 16 - tp_align_offset;
+        tp_aligned = tp.offset(-(tp_align_offset as isize));
+    } else {
+        init_len = 0;
+        tp_aligned = tp;
     }
 
     let patl = pat.len();
@@ -129,15 +129,13 @@ fn first_start_of_match_inner(text: &[u8], pat: &[u8], p: __m128i) -> Option<(us
         offset += 16;
     }
     while text.len() >= offset - tp_align_offset + patl {
-        unsafe {
-            let tlen = text.len() - (offset - tp_align_offset);
-            let ret = pcmpestri_16(tp_aligned, offset, tlen, p, patl) as usize;
-            if ret == 16 {
-                offset += 16;
-            } else {
-                let match_len = cmp::min(patl, 16 - ret);
-                return Some((offset - tp_align_offset + ret, match_len));
-            }
+        let tlen = text.len() - (offset - tp_align_offset);
+        let ret = pcmpestri_16(tp_aligned, offset, tlen, p, patl) as usize;
+        if ret == 16 {
+            offset += 16;
+        } else {
+            let match_len = cmp::min(patl, 16 - ret);
+            return Some((offset - tp_align_offset + ret, match_len));
         }
     }
 
@@ -213,7 +211,8 @@ fn find_2byte_pat(text: &[u8], pat: &[u8]) -> Option<(usize, usize)> {
 }
 
 /// Simd text search optimized for short patterns (<= 8 bytes)
-fn find_short_pat(text: &[u8], pat: &[u8]) -> Option<usize> {
+#[target_feature(enable = "sse4.2")]
+unsafe fn find_short_pat(text: &[u8], pat: &[u8]) -> Option<usize> {
     debug_assert!(pat.len() <= 8);
     /*
     if pat.len() == 2 {
@@ -231,7 +230,7 @@ fn find_short_pat(text: &[u8], pat: &[u8]) -> Option<usize> {
             break;
         }
         // find the next occurence
-        match unsafe { first_start_of_match_unaligned(&safetext[pos..], pat.len(), r) } {
+        match first_start_of_match_unaligned(&safetext[pos..], pat.len(), r) {
             None => break, // no matches
             Some((mpos, mlen)) => {
                 pos += mpos;
@@ -292,18 +291,22 @@ pub fn is_supported() -> bool {
 ///
 /// This is the SSE42 accelerated version.
 pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
-    let pat = pattern;
-    if pat.len() == 0 {
-        return Some(0);
-    }
+    assert!(is_supported());
 
-    if text.len() < pat.len() {
+    if pattern.is_empty() {
+        return Some(0);
+    } else if text.len() < pattern.len() {
         return None;
+    } else if pattern.len() == 1 {
+        return memchr::memchr(pattern[0], text);
+    } else {
+        unsafe { find_inner(text, pattern) }
     }
+}
 
-    if pat.len() == 1 {
-        return memchr::memchr(pat[0], text);
-    } else if pat.len() <= 6 {
+#[target_feature(enable = "sse4.2")]
+pub(crate) unsafe fn find_inner(text: &[u8], pat: &[u8]) -> Option<usize> {
+    if pat.len() <= 6 {
         return find_short_pat(text, pat);
     }
 
@@ -341,13 +344,13 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
             }
             // find the next occurence of the right half
             let start = crit_pos;
-            match unsafe { first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r) } {
+            match first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r) {
                 None => break, // no matches
                 Some((mpos, mlen)) => {
                     pos += mpos;
                     let mut pfxlen = mlen;
                     if pfxlen < right.len() {
-                        pfxlen += shared_prefix(&text[pos + start + mlen..], &right[mlen..]);
+                        pfxlen += shared_prefix_inner(&text[pos + start + mlen..], &right[mlen..]);
                     }
                     if pfxlen != right.len() {
                         // partial match
@@ -379,7 +382,7 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
             //println!("memory trace pos={}, memory={}", pos, memory);
             let mut pfxlen = if memory == 0 {
                 let start = crit_pos;
-                match unsafe { first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r) } {
+                match first_start_of_match_unaligned(&safetext[pos + start..], right16.len(), r) {
                     None => break, // no matches
                     Some((mpos, mlen)) => {
                         pos += mpos;
@@ -390,7 +393,7 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
                 memory - crit_pos
             };
             if pfxlen < right.len() {
-                pfxlen += shared_prefix(&text[pos + crit_pos + pfxlen..], &right[pfxlen..]);
+                pfxlen += shared_prefix_inner(&text[pos + crit_pos + pfxlen..], &right[pfxlen..]);
             }
             if pfxlen != right.len() {
                 // partial match
@@ -427,7 +430,7 @@ pub fn find(text: &[u8], pattern: &[u8]) -> Option<usize> {
                 pos += mpos;
                 let mut pfxlen = mlen;
                 if pfxlen < right.len() {
-                    pfxlen += shared_prefix(&text[pos + start + mlen..], &right[mlen..]);
+                    pfxlen += shared_prefix_inner(&text[pos + start + mlen..], &right[mlen..]);
                 }
                 if pfxlen != right.len() {
                     // partial match
@@ -496,44 +499,49 @@ fn pat128(pat: &[u8]) -> __m128i {
 ///
 /// Alignment safe: works for any text, pat.
 pub fn shared_prefix(text: &[u8], pat: &[u8]) -> usize {
+    assert!(is_supported());
+
+    unsafe { shared_prefix_inner(text, pat) }
+}
+
+#[target_feature(enable = "sse4.2")]
+unsafe fn shared_prefix_inner(text: &[u8], pat: &[u8]) -> usize {
     let tp = text.as_ptr();
     let tlen = text.len();
     let pp = pat.as_ptr();
     let plen = pat.len();
     let len = cmp::min(tlen, plen);
 
-    unsafe {
-        // TODO: do non-aligned prefix manually too(?) aligned text or pat..
-        // all but the end we can process with pcmpestrm
-        let initial_part = len.saturating_sub(16);
-        let mut prefix_len = 0;
-        let mut offset = 0;
-        while offset < initial_part {
-            let initial_tail = initial_part - offset;
-            let mask = pcmpestrm_eq_each(tp, offset, initial_tail, pp, offset, initial_tail);
-            // find zero in the first 16 bits
-            if mask != 0xffff {
-                let first_bit_set = (mask ^ 0xffff).trailing_zeros() as usize;
-                prefix_len += first_bit_set;
-                return prefix_len;
-            } else {
-                prefix_len += cmp::min(initial_tail, 16);
-            }
-            offset += 16;
+    // TODO: do non-aligned prefix manually too(?) aligned text or pat..
+    // all but the end we can process with pcmpestrm
+    let initial_part = len.saturating_sub(16);
+    let mut prefix_len = 0;
+    let mut offset = 0;
+    while offset < initial_part {
+        let initial_tail = initial_part - offset;
+        let mask = pcmpestrm_eq_each(tp, offset, initial_tail, pp, offset, initial_tail);
+        // find zero in the first 16 bits
+        if mask != 0xffff {
+            let first_bit_set = (mask ^ 0xffff).trailing_zeros() as usize;
+            prefix_len += first_bit_set;
+            return prefix_len;
+        } else {
+            prefix_len += cmp::min(initial_tail, 16);
         }
-        // so one block left, the last (up to) 16 bytes
-        // unchecked slicing .. we don't want panics in this function
-        let text_suffix = get_unchecked(text, prefix_len..len);
-        let pat_suffix = get_unchecked(pat, prefix_len..len);
-        for (&a, &b) in zip(text_suffix, pat_suffix) {
-            if a != b {
-                break;
-            }
-            prefix_len += 1;
+        offset += 16;
+    }
+    // so one block left, the last (up to) 16 bytes
+    // unchecked slicing .. we don't want panics in this function
+    let text_suffix = get_unchecked(text, prefix_len..len);
+    let pat_suffix = get_unchecked(pat, prefix_len..len);
+    for (&a, &b) in zip(text_suffix, pat_suffix) {
+        if a != b {
+            break;
         }
-
-        prefix_len
+        prefix_len += 1;
     }
+
+    prefix_len
 }
 
 #[test]