Merge ce1d7e8 into dd7fc44

simd-lite · Sep 13, 2024 · a10a17e · a10a17e
2 parents dd7fc44 + ce1d7e8
commit a10a17e
Show file tree

Hide file tree

Showing 8 changed files with 119 additions and 130 deletions.
diff --git a/src/impls/avx2/stage1.rs b/src/impls/avx2/stage1.rs
@@ -1,16 +1,16 @@
 #![allow(dead_code)]
-use crate::{static_cast_i32, static_cast_i64, static_cast_u32, Stage1Parse};
+use crate::{static_cast_i32, static_cast_i64, static_cast_u32, Stage1Parse, SIMDINPUT_LENGTH};
 #[cfg(target_arch = "x86")]
 use std::arch::x86 as arch;
 
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64 as arch;
 
 use arch::{
-    __m256i, _mm256_add_epi32, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_loadu_si256,
-    _mm256_max_epu8, _mm256_movemask_epi8, _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8,
-    _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi32, _mm256_storeu_si256,
-    _mm_clmulepi64_si128, _mm_set1_epi8, _mm_set_epi64x,
+    __m256i, _mm256_add_epi32, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_load_si256,
+    _mm256_loadu_si256, _mm256_max_epu8, _mm256_movemask_epi8, _mm256_set1_epi8, _mm256_set_epi32,
+    _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi32,
+    _mm256_storeu_si256, _mm_clmulepi64_si128, _mm_set1_epi8, _mm_set_epi64x,
 };
 
 macro_rules! low_nibble_mask {
@@ -41,13 +41,13 @@ impl Stage1Parse for SimdInput {
     type Utf8Validator = simdutf8::basic::imp::x86::avx2::ChunkedUtf8ValidatorImp;
     type SimdRepresentation = __m256i;
     #[cfg_attr(not(feature = "no-inline"), inline)]
-    // _mm256_loadu_si256 does not need alignment
+    // _mm256_loadu_si256 does not need alignment; we align our input so we can use _mm256_load_si256
     #[allow(clippy::cast_ptr_alignment)]
     #[target_feature(enable = "avx2")]
-    unsafe fn new(ptr: &[u8]) -> Self {
+    unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
         Self {
-            v0: _mm256_loadu_si256(ptr.as_ptr().cast::<__m256i>()),
-            v1: _mm256_loadu_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
+            v0: _mm256_load_si256(ptr.as_ptr().cast::<__m256i>()),
+            v1: _mm256_load_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
         }
     }
 

diff --git a/src/impls/native/stage1.rs b/src/impls/native/stage1.rs
@@ -1,6 +1,6 @@
 #![allow(clippy::cast_lossless, clippy::cast_sign_loss)]
 
-use crate::{static_cast_i32, Stage1Parse};
+use crate::{static_cast_i32, Stage1Parse, SIMDINPUT_LENGTH};
 
 type V128 = [u8; 16];
 
@@ -296,12 +296,12 @@ pub(crate) struct SimdInput {
 impl Stage1Parse for SimdInput {
     type Utf8Validator = super::ChunkedUtf8ValidatorImp;
     type SimdRepresentation = V128;
-    unsafe fn new(ptr: &[u8]) -> Self {
+    unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
         SimdInput {
-            v0: *(ptr.as_ptr().cast::<V128>()),
-            v1: *(ptr.as_ptr().add(16).cast::<V128>()),
-            v2: *(ptr.as_ptr().add(32).cast::<V128>()),
-            v3: *(ptr.as_ptr().add(48).cast::<V128>()),
+            v0: ptr.as_ptr().cast::<V128>().read(),
+            v1: ptr.as_ptr().add(16).cast::<V128>().read(),
+            v2: ptr.as_ptr().add(32).cast::<V128>().read(),
+            v3: ptr.as_ptr().add(48).cast::<V128>().read(),
         }
     }
 

diff --git a/src/impls/neon/stage1.rs b/src/impls/neon/stage1.rs
@@ -1,4 +1,4 @@
-use crate::{static_cast_i32, Stage1Parse};
+use crate::{static_cast_i32, Stage1Parse, SIMDINPUT_LENGTH};
 use std::arch::aarch64::{
     int32x4_t, int8x16_t, uint8x16_t, vaddq_s32, vandq_u8, vceqq_u8, vcleq_u8, vdupq_n_s8,
     vgetq_lane_u64, vld1q_u8, vmovq_n_u8, vpaddq_u8, vqtbl1q_u8, vreinterpretq_u64_u8,
@@ -38,9 +38,6 @@ pub unsafe fn neon_movemask_bulk(
 
 // /NEON-SPECIFIC
 
-//pub const SIMDJSON_PADDING: usize = mem::size_of::<uint8x16_t>() * 4;
-//pub const SIMDINPUT_LENGTH: usize = 64;
-
 #[derive(Debug)]
 pub(crate) struct SimdInput {
     v0: uint8x16_t,
@@ -53,12 +50,12 @@ impl Stage1Parse for SimdInput {
     type Utf8Validator = simdutf8::basic::imp::aarch64::neon::ChunkedUtf8ValidatorImp;
     type SimdRepresentation = int8x16_t;
     #[cfg_attr(not(feature = "no-inline"), inline)]
-    unsafe fn new(ptr: &[u8]) -> Self {
+    unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
         Self {
-            v0: vld1q_u8(ptr.as_ptr().cast::<u8>()),
-            v1: vld1q_u8(ptr.as_ptr().add(16).cast::<u8>()),
-            v2: vld1q_u8(ptr.as_ptr().add(32).cast::<u8>()),
-            v3: vld1q_u8(ptr.as_ptr().add(48).cast::<u8>()),
+            v0: vld1q_u8(ptr.as_ptr()),
+            v1: vld1q_u8(ptr.as_ptr().add(16)),
+            v2: vld1q_u8(ptr.as_ptr().add(32)),
+            v3: vld1q_u8(ptr.as_ptr().add(48)),
         }
     }
 

diff --git a/src/impls/portable/stage1.rs b/src/impls/portable/stage1.rs
@@ -1,6 +1,6 @@
 use std::simd::{prelude::*, ToBitMask};
 
-use crate::{static_cast_i32, Stage1Parse};
+use crate::{static_cast_i32, Stage1Parse, SIMDINPUT_LENGTH};
 #[derive(Debug)]
 pub(crate) struct SimdInput {
     v: u8x64,
@@ -10,9 +10,9 @@ impl Stage1Parse for SimdInput {
     type Utf8Validator = simdutf8::basic::imp::portable::ChunkedUtf8ValidatorImp;
     type SimdRepresentation = u8x64;
     #[cfg_attr(not(feature = "no-inline"), inline)]
-    unsafe fn new(ptr: &[u8]) -> Self {
+    unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
         Self {
-            v: u8x64::from_array(*ptr.as_ptr().cast::<[u8; 64]>()),
+            v: u8x64::from_array(ptr),
         }
     }
 

diff --git a/src/impls/simd128/stage1.rs b/src/impls/simd128/stage1.rs
@@ -1,4 +1,4 @@
-use crate::Stage1Parse;
+use crate::{Stage1Parse, SIMDINPUT_LENGTH};
 use std::arch::wasm32::{
     i8x16_splat, u32x4, u32x4_add, u32x4_splat, u8x16, u8x16_bitmask, u8x16_eq, u8x16_le,
     u8x16_shr, u8x16_splat, u8x16_swizzle, v128, v128_and, v128_load, v128_store,
@@ -18,7 +18,7 @@ impl Stage1Parse for SimdInput {
 
     #[cfg_attr(not(feature = "no-inline"), inline)]
     #[allow(clippy::cast_ptr_alignment)]
-    unsafe fn new(ptr: &[u8]) -> Self {
+    unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
         Self {
             v0: v128_load(ptr.as_ptr().cast::<v128>()),
             v1: v128_load(ptr.as_ptr().add(16).cast::<v128>()),

diff --git a/src/impls/sse42/stage1.rs b/src/impls/sse42/stage1.rs
@@ -1,4 +1,4 @@
-use crate::{static_cast_i32, static_cast_u32, Stage1Parse};
+use crate::{static_cast_i32, static_cast_u32, Stage1Parse, SIMDINPUT_LENGTH};
 #[cfg(target_arch = "x86")]
 use std::arch::x86 as arch;
 
@@ -7,16 +7,17 @@ use std::arch::x86_64 as arch;
 
 #[cfg(target_arch = "x86")]
 use arch::{
-    __m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_cmpgt_epi8, _mm_loadu_si128,
-    _mm_max_epu8, _mm_movemask_epi8, _mm_or_si128, _mm_set1_epi8, _mm_set_epi32, _mm_setr_epi8,
-    _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128, _mm_testz_si128,
+    __m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_cmpgt_epi8, _mm_load_si128,
+    _mm_loadu_si128, _mm_max_epu8, _mm_movemask_epi8, _mm_or_si128, _mm_set1_epi8, _mm_set_epi32,
+    _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128,
+    _mm_testz_si128,
 };
 
 #[cfg(target_arch = "x86_64")]
 use arch::{
-    __m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_max_epu8,
-    _mm_movemask_epi8, _mm_set1_epi8, _mm_set_epi32, _mm_setr_epi8, _mm_setzero_si128,
-    _mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128,
+    __m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_load_si128, _mm_loadu_si128,
+    _mm_max_epu8, _mm_movemask_epi8, _mm_set1_epi8, _mm_set_epi32, _mm_setr_epi8,
+    _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128,
 };
 
 macro_rules! low_nibble_mask {
@@ -45,12 +46,12 @@ impl Stage1Parse for SimdInput {
     #[target_feature(enable = "sse4.2")]
     #[cfg_attr(not(feature = "no-inline"), inline)]
     #[allow(clippy::cast_ptr_alignment)]
-    unsafe fn new(ptr: &[u8]) -> Self {
+    unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
         Self {
-            v0: _mm_loadu_si128(ptr.as_ptr().cast::<arch::__m128i>()),
-            v1: _mm_loadu_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
-            v2: _mm_loadu_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
-            v3: _mm_loadu_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
+            v0: _mm_load_si128(ptr.as_ptr().cast::<arch::__m128i>()),
+            v1: _mm_load_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
+            v2: _mm_load_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
+            v3: _mm_load_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
         }
     }