From c36db9f21dc5249e6fe9f27a6c821585ce5e92e3 Mon Sep 17 00:00:00 2001
From: Yacin Tmimi <yacintmimi@gmail.com>
Date: Tue, 9 Apr 2024 00:21:20 -0400
Subject: [PATCH 1/2] remove `packed_simd` in favor of `std::simd`

Fixes #91

As mentioned in the `packed_simd` README, the crate is superseded by
`#![feature(portable_simd)]`.
---
 Cargo.toml          |  3 +--
 src/lib.rs          |  2 ++
 src/simd/generic.rs | 29 +++++++++++++++--------------
 3 files changed, 18 insertions(+), 16 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 1b24de7..38b1bc7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,12 +18,11 @@ appveyor = { repository = "llogiq/bytecount" }
 bench = false
 
 [features]
-generic-simd = ["packed_simd"]
+generic-simd = []
 runtime-dispatch-simd = []
 html_report = []
 
 [dependencies]
-packed_simd = { version = "0.3.8", optional = true }
 
 [dev-dependencies]
 quickcheck = "1.0"
diff --git a/src/lib.rs b/src/lib.rs
index e24262b..ba7408c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -31,6 +31,8 @@
 //! [`naive_count_32`](fn.naive_count_32.html) method can be faster
 //! still on small strings.
 
+#![cfg_attr(feature = "generic-simd", feature(portable_simd))]
+
 #![deny(missing_docs)]
 #![cfg_attr(not(feature = "runtime-dispatch-simd"), no_std)]
 
diff --git a/src/simd/generic.rs b/src/simd/generic.rs
index 640ccd8..cc2dd69 100644
--- a/src/simd/generic.rs
+++ b/src/simd/generic.rs
@@ -1,11 +1,12 @@
-extern crate packed_simd;
+
 
 #[cfg(not(feature = "runtime-dispatch-simd"))]
-use core::mem;
+use core::{mem, simd};
+
 #[cfg(feature = "runtime-dispatch-simd")]
-use std::mem;
+use std::{mem, simd};
 
-use self::packed_simd::{u8x32, u8x64, FromCast};
+use simd::{u8x32, u8x64, cmp::SimdPartialEq, num::SimdInt};
 
 const MASK: [u8; 64] = [
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -14,20 +15,20 @@ const MASK: [u8; 64] = [
 ];
 
 unsafe fn u8x64_from_offset(slice: &[u8], offset: usize) -> u8x64 {
-    u8x64::from_slice_unaligned_unchecked(slice.get_unchecked(offset..))
+    u8x64::from_slice(slice.get_unchecked(offset..))
 }
 unsafe fn u8x32_from_offset(slice: &[u8], offset: usize) -> u8x32 {
-    u8x32::from_slice_unaligned_unchecked(slice.get_unchecked(offset..))
+    u8x32::from_slice(slice.get_unchecked(offset..))
 }
 
 fn sum_x64(u8s: &u8x64) -> usize {
     let mut store = [0; mem::size_of::<u8x64>()];
-    u8s.write_to_slice_unaligned(&mut store);
+    u8s.copy_to_slice(&mut store);
     store.iter().map(|&e| e as usize).sum()
 }
 fn sum_x32(u8s: &u8x32) -> usize {
     let mut store = [0; mem::size_of::<u8x32>()];
-    u8s.write_to_slice_unaligned(&mut store);
+    u8s.copy_to_slice(&mut store);
     store.iter().map(|&e| e as usize).sum()
 }
 
@@ -44,7 +45,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
         while haystack.len() >= offset + 64 * 255 {
             let mut counts = u8x64::splat(0);
             for _ in 0..255 {
-                counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64));
+                counts -= u8x64_from_offset(haystack, offset).simd_eq(needles_x64).to_int().cast();
                 offset += 64;
             }
             count += sum_x64(&counts);
@@ -54,7 +55,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
         if haystack.len() >= offset + 64 * 128 {
             let mut counts = u8x64::splat(0);
             for _ in 0..128 {
-                counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64));
+                counts -= u8x64_from_offset(haystack, offset).simd_eq(needles_x64).to_int().cast();
                 offset += 64;
             }
             count += sum_x64(&counts);
@@ -66,7 +67,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
         let mut counts = u8x32::splat(0);
         for i in 0..(haystack.len() - offset) / 32 {
             counts -=
-                u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32));
+                u8x32_from_offset(haystack, offset + i * 32).simd_eq(needles_x32).to_int().cast();
         }
         count += sum_x32(&counts);
 
@@ -74,7 +75,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
         counts = u8x32::splat(0);
         if haystack.len() % 32 != 0 {
             counts -=
-                u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32))
+                u8x32_from_offset(haystack, haystack.len() - 32).simd_eq(needles_x32).to_int().cast()
                     & u8x32_from_offset(&MASK, haystack.len() % 32);
         }
         count += sum_x32(&counts);
@@ -84,11 +85,11 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
 }
 
 fn is_leading_utf8_byte_x64(u8s: u8x64) -> u8x64 {
-    u8x64::from_cast((u8s & u8x64::splat(0b1100_0000)).ne(u8x64::splat(0b1000_0000)))
+    (u8s & u8x64::splat(0b1100_0000)).simd_ne(u8x64::splat(0b1000_0000)).to_int().cast()
 }
 
 fn is_leading_utf8_byte_x32(u8s: u8x32) -> u8x32 {
-    u8x32::from_cast((u8s & u8x32::splat(0b1100_0000)).ne(u8x32::splat(0b1000_0000)))
+    (u8s & u8x32::splat(0b1100_0000)).simd_ne(u8x32::splat(0b1000_0000)).to_int().cast()
 }
 
 pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize {

From 32a098dfd2f058bcf4f42ec354bb71b18c76da0d Mon Sep 17 00:00:00 2001
From: Yacin Tmimi <yacintmimi@gmail.com>
Date: Mon, 15 Apr 2024 21:33:11 -0400
Subject: [PATCH 2/2] update docs to mention `std::simd` and
 `#![feature(portable_simd)]`

This replaces the SIMD docs that used to reference `packed_simd`.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ee028de..12301e9 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Your users can then compile with runtime dispatch using:
 cargo build --release --features runtime-dispatch-simd
 ```
 
-The second, `generic-simd`, uses `packed_simd` to provide a fast
+The second, `generic-simd`, uses [`std::simd`](https://doc.rust-lang.org/std/simd/index.html) and [`#![feature(portable_simd)]`](https://github.com/rust-lang/rust/issues/86656) to provide a fast
 architecture-agnostic SIMD codepath, but requires running on nightly.
 
 Your users can compile with this codepath using: