Skip to content

Commit

Permalink
Merge ce1d7e8 into dd7fc44
Browse files Browse the repository at this point in the history
  • Loading branch information
Licenser authored Sep 13, 2024
2 parents dd7fc44 + ce1d7e8 commit a10a17e
Show file tree
Hide file tree
Showing 8 changed files with 119 additions and 130 deletions.
18 changes: 9 additions & 9 deletions src/impls/avx2/stage1.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#![allow(dead_code)]
use crate::{static_cast_i32, static_cast_i64, static_cast_u32, Stage1Parse};
use crate::{static_cast_i32, static_cast_i64, static_cast_u32, Stage1Parse, SIMDINPUT_LENGTH};
#[cfg(target_arch = "x86")]
use std::arch::x86 as arch;

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as arch;

use arch::{
__m256i, _mm256_add_epi32, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_loadu_si256,
_mm256_max_epu8, _mm256_movemask_epi8, _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8,
_mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi32, _mm256_storeu_si256,
_mm_clmulepi64_si128, _mm_set1_epi8, _mm_set_epi64x,
__m256i, _mm256_add_epi32, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_load_si256,
_mm256_loadu_si256, _mm256_max_epu8, _mm256_movemask_epi8, _mm256_set1_epi8, _mm256_set_epi32,
_mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi32,
_mm256_storeu_si256, _mm_clmulepi64_si128, _mm_set1_epi8, _mm_set_epi64x,
};

macro_rules! low_nibble_mask {
Expand Down Expand Up @@ -41,13 +41,13 @@ impl Stage1Parse for SimdInput {
type Utf8Validator = simdutf8::basic::imp::x86::avx2::ChunkedUtf8ValidatorImp;
type SimdRepresentation = __m256i;
#[cfg_attr(not(feature = "no-inline"), inline)]
// _mm256_loadu_si256 does not need alignment
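// _mm256_load_si256 requires 32-byte alignment; we align our input so that we can use it instead of _mm256_loadu_si256
// NOTE(review): a by-value `[u8; SIMDINPUT_LENGTH]` only guarantees 1-byte alignment - confirm the caller's alignment actually holds here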
#[allow(clippy::cast_ptr_alignment)]
#[target_feature(enable = "avx2")]
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
Self {
v0: _mm256_loadu_si256(ptr.as_ptr().cast::<__m256i>()),
v1: _mm256_loadu_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
v0: _mm256_load_si256(ptr.as_ptr().cast::<__m256i>()),
v1: _mm256_load_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
}
}

Expand Down
12 changes: 6 additions & 6 deletions src/impls/native/stage1.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![allow(clippy::cast_lossless, clippy::cast_sign_loss)]

use crate::{static_cast_i32, Stage1Parse};
use crate::{static_cast_i32, Stage1Parse, SIMDINPUT_LENGTH};

type V128 = [u8; 16];

Expand Down Expand Up @@ -296,12 +296,12 @@ pub(crate) struct SimdInput {
impl Stage1Parse for SimdInput {
type Utf8Validator = super::ChunkedUtf8ValidatorImp;
type SimdRepresentation = V128;
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
SimdInput {
v0: *(ptr.as_ptr().cast::<V128>()),
v1: *(ptr.as_ptr().add(16).cast::<V128>()),
v2: *(ptr.as_ptr().add(32).cast::<V128>()),
v3: *(ptr.as_ptr().add(48).cast::<V128>()),
v0: ptr.as_ptr().cast::<V128>().read(),
v1: ptr.as_ptr().add(16).cast::<V128>().read(),
v2: ptr.as_ptr().add(32).cast::<V128>().read(),
v3: ptr.as_ptr().add(48).cast::<V128>().read(),
}
}

Expand Down
15 changes: 6 additions & 9 deletions src/impls/neon/stage1.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{static_cast_i32, Stage1Parse};
use crate::{static_cast_i32, Stage1Parse, SIMDINPUT_LENGTH};
use std::arch::aarch64::{
int32x4_t, int8x16_t, uint8x16_t, vaddq_s32, vandq_u8, vceqq_u8, vcleq_u8, vdupq_n_s8,
vgetq_lane_u64, vld1q_u8, vmovq_n_u8, vpaddq_u8, vqtbl1q_u8, vreinterpretq_u64_u8,
Expand Down Expand Up @@ -38,9 +38,6 @@ pub unsafe fn neon_movemask_bulk(

// /NEON-SPECIFIC

//pub const SIMDJSON_PADDING: usize = mem::size_of::<uint8x16_t>() * 4;
//pub const SIMDINPUT_LENGTH: usize = 64;

#[derive(Debug)]
pub(crate) struct SimdInput {
v0: uint8x16_t,
Expand All @@ -53,12 +50,12 @@ impl Stage1Parse for SimdInput {
type Utf8Validator = simdutf8::basic::imp::aarch64::neon::ChunkedUtf8ValidatorImp;
type SimdRepresentation = int8x16_t;
#[cfg_attr(not(feature = "no-inline"), inline)]
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
Self {
v0: vld1q_u8(ptr.as_ptr().cast::<u8>()),
v1: vld1q_u8(ptr.as_ptr().add(16).cast::<u8>()),
v2: vld1q_u8(ptr.as_ptr().add(32).cast::<u8>()),
v3: vld1q_u8(ptr.as_ptr().add(48).cast::<u8>()),
v0: vld1q_u8(ptr.as_ptr()),
v1: vld1q_u8(ptr.as_ptr().add(16)),
v2: vld1q_u8(ptr.as_ptr().add(32)),
v3: vld1q_u8(ptr.as_ptr().add(48)),
}
}

Expand Down
6 changes: 3 additions & 3 deletions src/impls/portable/stage1.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::simd::{prelude::*, ToBitMask};

use crate::{static_cast_i32, Stage1Parse};
use crate::{static_cast_i32, Stage1Parse, SIMDINPUT_LENGTH};
#[derive(Debug)]
pub(crate) struct SimdInput {
v: u8x64,
Expand All @@ -10,9 +10,9 @@ impl Stage1Parse for SimdInput {
type Utf8Validator = simdutf8::basic::imp::portable::ChunkedUtf8ValidatorImp;
type SimdRepresentation = u8x64;
#[cfg_attr(not(feature = "no-inline"), inline)]
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
Self {
v: u8x64::from_array(*ptr.as_ptr().cast::<[u8; 64]>()),
v: u8x64::from_array(ptr),
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/impls/simd128/stage1.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::Stage1Parse;
use crate::{Stage1Parse, SIMDINPUT_LENGTH};
use std::arch::wasm32::{
i8x16_splat, u32x4, u32x4_add, u32x4_splat, u8x16, u8x16_bitmask, u8x16_eq, u8x16_le,
u8x16_shr, u8x16_splat, u8x16_swizzle, v128, v128_and, v128_load, v128_store,
Expand All @@ -18,7 +18,7 @@ impl Stage1Parse for SimdInput {

#[cfg_attr(not(feature = "no-inline"), inline)]
#[allow(clippy::cast_ptr_alignment)]
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
Self {
v0: v128_load(ptr.as_ptr().cast::<v128>()),
v1: v128_load(ptr.as_ptr().add(16).cast::<v128>()),
Expand Down
25 changes: 13 additions & 12 deletions src/impls/sse42/stage1.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{static_cast_i32, static_cast_u32, Stage1Parse};
use crate::{static_cast_i32, static_cast_u32, Stage1Parse, SIMDINPUT_LENGTH};
#[cfg(target_arch = "x86")]
use std::arch::x86 as arch;

Expand All @@ -7,16 +7,17 @@ use std::arch::x86_64 as arch;

#[cfg(target_arch = "x86")]
use arch::{
__m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_cmpgt_epi8, _mm_loadu_si128,
_mm_max_epu8, _mm_movemask_epi8, _mm_or_si128, _mm_set1_epi8, _mm_set_epi32, _mm_setr_epi8,
_mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128, _mm_testz_si128,
__m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_cmpgt_epi8, _mm_load_si128,
_mm_loadu_si128, _mm_max_epu8, _mm_movemask_epi8, _mm_or_si128, _mm_set1_epi8, _mm_set_epi32,
_mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128,
_mm_testz_si128,
};

#[cfg(target_arch = "x86_64")]
use arch::{
__m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_max_epu8,
_mm_movemask_epi8, _mm_set1_epi8, _mm_set_epi32, _mm_setr_epi8, _mm_setzero_si128,
_mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128,
__m128i, _mm_add_epi32, _mm_and_si128, _mm_cmpeq_epi8, _mm_load_si128, _mm_loadu_si128,
_mm_max_epu8, _mm_movemask_epi8, _mm_set1_epi8, _mm_set_epi32, _mm_setr_epi8,
_mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi32, _mm_storeu_si128,
};

macro_rules! low_nibble_mask {
Expand Down Expand Up @@ -45,12 +46,12 @@ impl Stage1Parse for SimdInput {
#[target_feature(enable = "sse4.2")]
#[cfg_attr(not(feature = "no-inline"), inline)]
#[allow(clippy::cast_ptr_alignment)]
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
Self {
v0: _mm_loadu_si128(ptr.as_ptr().cast::<arch::__m128i>()),
v1: _mm_loadu_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
v2: _mm_loadu_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
v3: _mm_loadu_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
v0: _mm_load_si128(ptr.as_ptr().cast::<arch::__m128i>()),
v1: _mm_load_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
v2: _mm_load_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
v3: _mm_load_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
}
}

Expand Down
Loading

0 comments on commit a10a17e

Please sign in to comment.