From 378861a989e7a04ccebfeecf48b41df0ec63ccc7 Mon Sep 17 00:00:00 2001 From: DaniPopes <57450786+DaniPopes@users.noreply.github.com> Date: Tue, 28 May 2024 00:26:36 +0300 Subject: [PATCH] feat: implement check for arm and portable-simd (#10) --- .github/workflows/ci.yml | 10 +- README.md | 197 +++++++++++++++++++++----------------- benches/bench/main.rs | 44 +++++++++ src/arch/aarch64.rs | 42 +++++++- src/arch/generic.rs | 11 +-- src/arch/portable_simd.rs | 30 ++++-- src/arch/x86.rs | 56 +++++------ src/lib.rs | 2 +- 8 files changed, 248 insertions(+), 144 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da5b4a3..06283d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,11 +40,11 @@ jobs: - uses: Swatinem/rust-cache@v2 - run: cargo build - - run: cargo test - run: cargo build --no-default-features - - run: cargo test --tests --no-default-features - - run: cargo test --tests --no-default-features --features force-generic - - run: cargo test --tests --no-default-features --features nightly,portable-simd + - run: cargo test + - run: cargo test --no-default-features + - run: cargo test --no-default-features --features force-generic + - run: cargo test --no-default-features --features nightly,portable-simd if: matrix.rust == 'nightly' - run: cargo bench --no-run if: matrix.rust == 'nightly' @@ -65,8 +65,6 @@ jobs: - uses: dtolnay/rust-toolchain@miri with: target: ${{ matrix.target }} - - uses: Swatinem/rust-cache@v2 - - run: cargo miri setup --target ${{ matrix.target }} ${{ matrix.flags }} - run: cargo miri test --target ${{ matrix.target }} ${{ matrix.flags }} fuzz: diff --git a/README.md b/README.md index 2190525..d6dadf7 100644 --- a/README.md +++ b/README.md @@ -27,100 +27,119 @@ This crate is 10 to 50 times faster than [`hex`] in encoding and decoding, and 100+ times faster than `libstd` in formatting. The following benchmarks were ran on an AMD Ryzen 9 7950X, compiled with -`1.78.0-nightly (a4472498d 2024-02-15)` on `x86_64-unknown-linux-gnu`. +`1.80.0-nightly (bdbbb6c6a 2024-05-26)` on `x86_64-unknown-linux-gnu`. You can run these benchmarks with `cargo bench --features std` on a nightly compiler. ```log -test decode::const_hex::bench1_32b ... bench: 16 ns/iter (+/- 5) -test decode::const_hex::bench2_256b ... bench: 37 ns/iter (+/- 0) -test decode::const_hex::bench3_2k ... bench: 232 ns/iter (+/- 2) -test decode::const_hex::bench4_16k ... bench: 1,672 ns/iter (+/- 12) -test decode::const_hex::bench5_128k ... bench: 12,979 ns/iter (+/- 91) -test decode::const_hex::bench6_1m ... bench: 104,751 ns/iter (+/- 2,068) -test decode::faster_hex::bench1_32b ... bench: 15 ns/iter (+/- 0) -test decode::faster_hex::bench2_256b ... bench: 54 ns/iter (+/- 1) -test decode::faster_hex::bench3_2k ... bench: 253 ns/iter (+/- 3) -test decode::faster_hex::bench4_16k ... bench: 1,831 ns/iter (+/- 20) -test decode::faster_hex::bench5_128k ... bench: 14,120 ns/iter (+/- 57) -test decode::faster_hex::bench6_1m ... bench: 115,291 ns/iter (+/- 1,325) -test decode::hex::bench1_32b ... bench: 104 ns/iter (+/- 1) -test decode::hex::bench2_256b ... bench: 697 ns/iter (+/- 7) -test decode::hex::bench3_2k ... bench: 5,189 ns/iter (+/- 86) -test decode::hex::bench4_16k ... bench: 42,355 ns/iter (+/- 21,853) -test decode::hex::bench5_128k ... bench: 765,278 ns/iter (+/- 4,091) -test decode::hex::bench6_1m ... bench: 6,161,416 ns/iter (+/- 64,954) - -test decode_to_slice::const_hex::bench1_32b ... bench: 5 ns/iter (+/- 0) -test decode_to_slice::const_hex::bench2_256b ... bench: 26 ns/iter (+/- 0) -test decode_to_slice::const_hex::bench3_2k ... bench: 210 ns/iter (+/- 10) -test decode_to_slice::const_hex::bench4_16k ... bench: 1,667 ns/iter (+/- 13) -test decode_to_slice::const_hex::bench5_128k ... bench: 13,043 ns/iter (+/- 19) -test decode_to_slice::const_hex::bench6_1m ... bench: 105,883 ns/iter (+/- 1,427) -test decode_to_slice::faster_hex::bench1_32b ... bench: 6 ns/iter (+/- 0) -test decode_to_slice::faster_hex::bench2_256b ... bench: 28 ns/iter (+/- 0) -test decode_to_slice::faster_hex::bench3_2k ... bench: 214 ns/iter (+/- 2) -test decode_to_slice::faster_hex::bench4_16k ... bench: 1,710 ns/iter (+/- 6) -test decode_to_slice::faster_hex::bench5_128k ... bench: 13,304 ns/iter (+/- 37) -test decode_to_slice::faster_hex::bench6_1m ... bench: 110,276 ns/iter (+/- 3,475) -test decode_to_slice::hex::bench1_32b ... bench: 38 ns/iter (+/- 2) -test decode_to_slice::hex::bench2_256b ... bench: 300 ns/iter (+/- 185) -test decode_to_slice::hex::bench3_2k ... bench: 2,717 ns/iter (+/- 64) -test decode_to_slice::hex::bench4_16k ... bench: 19,257 ns/iter (+/- 530) -test decode_to_slice::hex::bench5_128k ... bench: 624,172 ns/iter (+/- 15,725) -test decode_to_slice::hex::bench6_1m ... bench: 5,333,915 ns/iter (+/- 298,093) - -test encode::const_hex::bench1_32b ... bench: 6 ns/iter (+/- 0) -test encode::const_hex::bench2_256b ... bench: 10 ns/iter (+/- 0) -test encode::const_hex::bench3_2k ... bench: 72 ns/iter (+/- 1) -test encode::const_hex::bench4_16k ... bench: 462 ns/iter (+/- 4) -test encode::const_hex::bench5_128k ... bench: 3,600 ns/iter (+/- 28) -test encode::const_hex::bench6_1m ... bench: 29,447 ns/iter (+/- 858) -test encode::faster_hex::bench1_32b ... bench: 17 ns/iter (+/- 0) -test encode::faster_hex::bench2_256b ... bench: 37 ns/iter (+/- 3) -test encode::faster_hex::bench3_2k ... bench: 102 ns/iter (+/- 1) -test encode::faster_hex::bench4_16k ... bench: 614 ns/iter (+/- 6) -test encode::faster_hex::bench5_128k ... bench: 4,764 ns/iter (+/- 12) -test encode::faster_hex::bench6_1m ... bench: 40,894 ns/iter (+/- 1,223) -test encode::hex::bench1_32b ... bench: 112 ns/iter (+/- 0) -test encode::hex::bench2_256b ... bench: 812 ns/iter (+/- 5) -test encode::hex::bench3_2k ... bench: 6,404 ns/iter (+/- 26) -test encode::hex::bench4_16k ... bench: 51,039 ns/iter (+/- 595) -test encode::hex::bench5_128k ... bench: 408,378 ns/iter (+/- 23,022) -test encode::hex::bench6_1m ... bench: 3,571,916 ns/iter (+/- 142,828) - -test encode_to_slice::const_hex::bench1_32b ... bench: 1 ns/iter (+/- 0) -test encode_to_slice::const_hex::bench2_256b ... bench: 6 ns/iter (+/- 0) -test encode_to_slice::const_hex::bench3_2k ... bench: 53 ns/iter (+/- 0) -test encode_to_slice::const_hex::bench4_16k ... bench: 452 ns/iter (+/- 3) -test encode_to_slice::const_hex::bench5_128k ... bench: 3,550 ns/iter (+/- 10) -test encode_to_slice::const_hex::bench6_1m ... bench: 29,605 ns/iter (+/- 916) -test encode_to_slice::faster_hex::bench1_32b ... bench: 4 ns/iter (+/- 0) -test encode_to_slice::faster_hex::bench2_256b ... bench: 7 ns/iter (+/- 0) -test encode_to_slice::faster_hex::bench3_2k ... bench: 47 ns/iter (+/- 0) -test encode_to_slice::faster_hex::bench4_16k ... bench: 402 ns/iter (+/- 5) -test encode_to_slice::faster_hex::bench5_128k ... bench: 3,121 ns/iter (+/- 25) -test encode_to_slice::faster_hex::bench6_1m ... bench: 26,171 ns/iter (+/- 573) -test encode_to_slice::hex::bench1_32b ... bench: 11 ns/iter (+/- 0) -test encode_to_slice::hex::bench2_256b ... bench: 118 ns/iter (+/- 0) -test encode_to_slice::hex::bench3_2k ... bench: 994 ns/iter (+/- 4) -test encode_to_slice::hex::bench4_16k ... bench: 8,065 ns/iter (+/- 31) -test encode_to_slice::hex::bench5_128k ... bench: 63,982 ns/iter (+/- 2,026) -test encode_to_slice::hex::bench6_1m ... bench: 515,171 ns/iter (+/- 2,789) - -test format::const_hex::bench1_32b ... bench: 9 ns/iter (+/- 0) -test format::const_hex::bench2_256b ... bench: 18 ns/iter (+/- 0) -test format::const_hex::bench3_2k ... bench: 119 ns/iter (+/- 1) -test format::const_hex::bench4_16k ... bench: 1,157 ns/iter (+/- 3) -test format::const_hex::bench5_128k ... bench: 9,560 ns/iter (+/- 443) -test format::const_hex::bench6_1m ... bench: 85,479 ns/iter (+/- 1,498) -test format::std::bench1_32b ... bench: 374 ns/iter (+/- 6) -test format::std::bench2_256b ... bench: 2,952 ns/iter (+/- 10) -test format::std::bench3_2k ... bench: 23,767 ns/iter (+/- 61) -test format::std::bench4_16k ... bench: 183,579 ns/iter (+/- 2,078) -test format::std::bench5_128k ... bench: 1,498,391 ns/iter (+/- 8,445) -test format::std::bench6_1m ... bench: 11,965,082 ns/iter (+/- 43,784) +test check::const_hex::bench1_32b ... bench: 9.79 ns/iter (+/- 2.22) +test check::const_hex::bench2_256b ... bench: 20.22 ns/iter (+/- 6.41) +test check::const_hex::bench3_2k ... bench: 124.57 ns/iter (+/- 9.68) +test check::const_hex::bench4_16k ... bench: 927.53 ns/iter (+/- 81.94) +test check::const_hex::bench5_128k ... bench: 7,300.16 ns/iter (+/- 153.57) +test check::const_hex::bench6_1m ... bench: 59,886.62 ns/iter (+/- 1,312.15) +test check::faster_hex::bench1_32b ... bench: 2.93 ns/iter (+/- 0.20) +test check::faster_hex::bench2_256b ... bench: 15.54 ns/iter (+/- 0.77) +test check::faster_hex::bench3_2k ... bench: 121.17 ns/iter (+/- 0.70) +test check::faster_hex::bench4_16k ... bench: 945.17 ns/iter (+/- 12.48) +test check::faster_hex::bench5_128k ... bench: 7,632.07 ns/iter (+/- 172.68) +test check::faster_hex::bench6_1m ... bench: 61,427.03 ns/iter (+/- 732.21) +test check::naive::bench1_32b ... bench: 26.15 ns/iter (+/- 1.04) +test check::naive::bench2_256b ... bench: 225.69 ns/iter (+/- 9.02) +test check::naive::bench3_2k ... bench: 1,952.05 ns/iter (+/- 20.46) +test check::naive::bench4_16k ... bench: 20,329.63 ns/iter (+/- 455.31) +test check::naive::bench5_128k ... bench: 503,460.10 ns/iter (+/- 8,930.00) +test check::naive::bench6_1m ... bench: 4,252,331.00 ns/iter (+/- 191,519.79) + +test decode::const_hex::bench1_32b ... bench: 19.49 ns/iter (+/- 0.82) +test decode::const_hex::bench2_256b ... bench: 41.47 ns/iter (+/- 1.45) +test decode::const_hex::bench3_2k ... bench: 236.69 ns/iter (+/- 2.75) +test decode::const_hex::bench4_16k ... bench: 1,682.25 ns/iter (+/- 18.07) +test decode::const_hex::bench5_128k ... bench: 13,096.94 ns/iter (+/- 138.05) +test decode::const_hex::bench6_1m ... bench: 105,360.67 ns/iter (+/- 2,771.85) +test decode::faster_hex::bench1_32b ... bench: 16.49 ns/iter (+/- 0.12) +test decode::faster_hex::bench2_256b ... bench: 54.92 ns/iter (+/- 1.74) +test decode::faster_hex::bench3_2k ... bench: 246.11 ns/iter (+/- 3.25) +test decode::faster_hex::bench4_16k ... bench: 1,839.88 ns/iter (+/- 22.36) +test decode::faster_hex::bench5_128k ... bench: 14,376.52 ns/iter (+/- 203.97) +test decode::faster_hex::bench6_1m ... bench: 116,345.50 ns/iter (+/- 1,377.06) +test decode::hex::bench1_32b ... bench: 101.38 ns/iter (+/- 3.41) +test decode::hex::bench2_256b ... bench: 655.85 ns/iter (+/- 17.69) +test decode::hex::bench3_2k ... bench: 4,830.90 ns/iter (+/- 73.58) +test decode::hex::bench4_16k ... bench: 37,976.46 ns/iter (+/- 443.65) +test decode::hex::bench5_128k ... bench: 734,983.30 ns/iter (+/- 4,297.49) +test decode::hex::bench6_1m ... bench: 5,901,860.30 ns/iter (+/- 22,806.40) + +test decode_to_slice::const_hex::bench1_32b ... bench: 11.03 ns/iter (+/- 0.50) +test decode_to_slice::const_hex::bench2_256b ... bench: 28.93 ns/iter (+/- 1.38) +test decode_to_slice::const_hex::bench3_2k ... bench: 210.49 ns/iter (+/- 5.65) +test decode_to_slice::const_hex::bench4_16k ... bench: 1,670.55 ns/iter (+/- 17.87) +test decode_to_slice::const_hex::bench5_128k ... bench: 13,094.65 ns/iter (+/- 115.89) +test decode_to_slice::const_hex::bench6_1m ... bench: 105,036.12 ns/iter (+/- 1,722.56) +test decode_to_slice::faster_hex::bench1_32b ... bench: 6.26 ns/iter (+/- 0.10) +test decode_to_slice::faster_hex::bench2_256b ... bench: 28.73 ns/iter (+/- 0.55) +test decode_to_slice::faster_hex::bench3_2k ... bench: 213.70 ns/iter (+/- 2.83) +test decode_to_slice::faster_hex::bench4_16k ... bench: 1,718.21 ns/iter (+/- 23.97) +test decode_to_slice::faster_hex::bench5_128k ... bench: 13,530.96 ns/iter (+/- 97.41) +test decode_to_slice::faster_hex::bench6_1m ... bench: 107,708.79 ns/iter (+/- 1,425.58) +test decode_to_slice::hex::bench1_32b ... bench: 39.07 ns/iter (+/- 1.85) +test decode_to_slice::hex::bench2_256b ... bench: 311.73 ns/iter (+/- 10.92) +test decode_to_slice::hex::bench3_2k ... bench: 2,515.69 ns/iter (+/- 63.09) +test decode_to_slice::hex::bench4_16k ... bench: 20,899.92 ns/iter (+/- 442.95) +test decode_to_slice::hex::bench5_128k ... bench: 634,859.00 ns/iter (+/- 6,609.09) +test decode_to_slice::hex::bench6_1m ... bench: 5,338,354.20 ns/iter (+/- 51,493.30) + +test encode::const_hex::bench1_32b ... bench: 6.92 ns/iter (+/- 0.11) +test encode::const_hex::bench2_256b ... bench: 11.41 ns/iter (+/- 0.14) +test encode::const_hex::bench3_2k ... bench: 73.59 ns/iter (+/- 1.52) +test encode::const_hex::bench4_16k ... bench: 461.00 ns/iter (+/- 5.17) +test encode::const_hex::bench5_128k ... bench: 3,527.28 ns/iter (+/- 46.86) +test encode::const_hex::bench6_1m ... bench: 29,402.78 ns/iter (+/- 1,032.91) +test encode::faster_hex::bench1_32b ... bench: 17.30 ns/iter (+/- 0.30) +test encode::faster_hex::bench2_256b ... bench: 39.07 ns/iter (+/- 0.73) +test encode::faster_hex::bench3_2k ... bench: 102.93 ns/iter (+/- 1.92) +test encode::faster_hex::bench4_16k ... bench: 651.65 ns/iter (+/- 1.55) +test encode::faster_hex::bench5_128k ... bench: 5,074.50 ns/iter (+/- 22.97) +test encode::faster_hex::bench6_1m ... bench: 46,227.11 ns/iter (+/- 945.52) +test encode::hex::bench1_32b ... bench: 100.46 ns/iter (+/- 0.93) +test encode::hex::bench2_256b ... bench: 717.75 ns/iter (+/- 5.91) +test encode::hex::bench3_2k ... bench: 5,660.67 ns/iter (+/- 246.73) +test encode::hex::bench4_16k ... bench: 44,981.79 ns/iter (+/- 340.51) +test encode::hex::bench5_128k ... bench: 359,401.72 ns/iter (+/- 1,689.97) +test encode::hex::bench6_1m ... bench: 2,966,947.20 ns/iter (+/- 165,738.51) + +test encode_to_slice::const_hex::bench1_32b ... bench: 1.55 ns/iter (+/- 0.04) +test encode_to_slice::const_hex::bench2_256b ... bench: 6.73 ns/iter (+/- 0.08) +test encode_to_slice::const_hex::bench3_2k ... bench: 54.23 ns/iter (+/- 2.06) +test encode_to_slice::const_hex::bench4_16k ... bench: 471.90 ns/iter (+/- 21.13) +test encode_to_slice::const_hex::bench5_128k ... bench: 3,730.44 ns/iter (+/- 113.80) +test encode_to_slice::const_hex::bench6_1m ... bench: 29,247.93 ns/iter (+/- 926.38) +test encode_to_slice::faster_hex::bench1_32b ... bench: 3.95 ns/iter (+/- 0.26) +test encode_to_slice::faster_hex::bench2_256b ... bench: 7.49 ns/iter (+/- 0.47) +test encode_to_slice::faster_hex::bench3_2k ... bench: 48.56 ns/iter (+/- 1.95) +test encode_to_slice::faster_hex::bench4_16k ... bench: 424.65 ns/iter (+/- 11.39) +test encode_to_slice::faster_hex::bench5_128k ... bench: 3,317.47 ns/iter (+/- 103.44) +test encode_to_slice::faster_hex::bench6_1m ... bench: 26,079.18 ns/iter (+/- 889.52) +test encode_to_slice::hex::bench1_32b ... bench: 11.99 ns/iter (+/- 0.30) +test encode_to_slice::hex::bench2_256b ... bench: 119.07 ns/iter (+/- 2.85) +test encode_to_slice::hex::bench3_2k ... bench: 999.68 ns/iter (+/- 26.35) +test encode_to_slice::hex::bench4_16k ... bench: 8,049.82 ns/iter (+/- 105.51) +test encode_to_slice::hex::bench5_128k ... bench: 65,186.25 ns/iter (+/- 758.98) +test encode_to_slice::hex::bench6_1m ... bench: 511,447.00 ns/iter (+/- 4,866.41) + +test format::const_hex::bench1_32b ... bench: 9.84 ns/iter (+/- 0.21) +test format::const_hex::bench2_256b ... bench: 17.90 ns/iter (+/- 0.55) +test format::const_hex::bench3_2k ... bench: 119.47 ns/iter (+/- 3.87) +test format::const_hex::bench4_16k ... bench: 1,161.94 ns/iter (+/- 20.11) +test format::const_hex::bench5_128k ... bench: 9,580.23 ns/iter (+/- 188.10) +test format::const_hex::bench6_1m ... bench: 84,316.47 ns/iter (+/- 1,407.10) +test format::std::bench1_32b ... bench: 371.37 ns/iter (+/- 3.63) +test format::std::bench2_256b ... bench: 2,987.01 ns/iter (+/- 41.26) +test format::std::bench3_2k ... bench: 23,989.24 ns/iter (+/- 373.84) +test format::std::bench4_16k ... bench: 192,881.92 ns/iter (+/- 9,266.64) +test format::std::bench5_128k ... bench: 1,554,062.80 ns/iter (+/- 10,998.30) +test format::std::bench6_1m ... bench: 12,378,548.00 ns/iter (+/- 247,626.30) ``` ## Acknowledgements diff --git a/benches/bench/main.rs b/benches/bench/main.rs index cf23706..4a9315c 100644 --- a/benches/bench/main.rs +++ b/benches/bench/main.rs @@ -29,6 +29,50 @@ impl fmt::Display for StdFormat { macro_rules! benches { ($($name:ident($enc:expr, $dec:expr))*) => { + mod check { + use super::*; + + mod const_hex { + use super::*; + + $( + #[bench] + fn $name(b: &mut Bencher) { + b.iter(|| { + ::const_hex::check(black_box($dec)) + }); + } + )* + } + + mod faster_hex { + use super::*; + + $( + #[bench] + fn $name(b: &mut Bencher) { + b.iter(|| { + ::faster_hex::hex_check(black_box($dec.as_bytes())) + }); + } + )* + } + + mod naive { + use super::*; + + $( + #[bench] + fn $name(b: &mut Bencher) { + b.iter(|| { + let dec = black_box($dec.as_bytes()); + dec.iter().all(u8::is_ascii_hexdigit) + }); + } + )* + } + } + #[cfg(feature = "alloc")] mod decode { use super::*; diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs index da56117..80b4fbb 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64.rs @@ -4,7 +4,7 @@ use super::generic; use crate::get_chars_table; use core::arch::aarch64::*; -pub(crate) const USE_CHECK_FN: bool = false; +pub(crate) const USE_CHECK_FN: bool = true; const CHUNK_SIZE: usize = core::mem::size_of::(); cfg_if::cfg_if! { @@ -63,6 +63,44 @@ pub(crate) unsafe fn encode_neon(input: &[u8], output: *mut u } } -pub(crate) use generic::check; +#[inline] +pub(crate) fn check(input: &[u8]) -> bool { + if cfg!(miri) || !has_neon() || input.len() < CHUNK_SIZE { + return generic::check(input); + } + unsafe { check_neon(input) } +} + +#[target_feature(enable = "neon")] +pub(crate) unsafe fn check_neon(input: &[u8]) -> bool { + let ascii_zero = vdupq_n_u8(b'0' - 1); + let ascii_nine = vdupq_n_u8(b'9' + 1); + let ascii_ua = vdupq_n_u8(b'A' - 1); + let ascii_uf = vdupq_n_u8(b'F' + 1); + let ascii_la = vdupq_n_u8(b'a' - 1); + let ascii_lf = vdupq_n_u8(b'f' + 1); + + let (prefix, chunks, suffix) = input.align_to::(); + generic::check(prefix) + && chunks.iter().all(|&chunk| { + let ge0 = vcgtq_u8(chunk, ascii_zero); + let le9 = vcltq_u8(chunk, ascii_nine); + let valid_digit = vandq_u8(ge0, le9); + + let geua = vcgtq_u8(chunk, ascii_ua); + let leuf = vcltq_u8(chunk, ascii_uf); + let valid_upper = vandq_u8(geua, leuf); + + let gela = vcgtq_u8(chunk, ascii_la); + let lelf = vcltq_u8(chunk, ascii_lf); + let valid_lower = vandq_u8(gela, lelf); + + let valid_letter = vorrq_u8(valid_lower, valid_upper); + let valid_mask = vorrq_u8(valid_digit, valid_letter); + vminvq_u8(valid_mask) == 0xFF + }) + && generic::check(suffix) +} + pub(crate) use generic::decode_checked; pub(crate) use generic::decode_unchecked; diff --git a/src/arch/generic.rs b/src/arch/generic.rs index 3cf2caa..b2e40e9 100644 --- a/src/arch/generic.rs +++ b/src/arch/generic.rs @@ -24,8 +24,8 @@ pub(crate) unsafe fn encode(input: &[u8], output: *mut u8) { /// Default check function. #[inline] pub(crate) const fn check(mut input: &[u8]) -> bool { - while let [byte, rest @ ..] = input { - if HEX_DECODE_LUT[*byte as usize] == NIL { + while let &[byte, ref rest @ ..] = input { + if HEX_DECODE_LUT[byte as usize] == NIL { return false; } input = rest; @@ -48,8 +48,9 @@ pub(crate) unsafe fn decode_checked(input: &[u8], output: &mut [u8]) -> bool { /// /// Assumes `output.len() == input.len() / 2` and that the input is valid hex. pub(crate) unsafe fn decode_unchecked(input: &[u8], output: &mut [u8]) { - let r = unsafe { decode_maybe_check::(input, output) }; - debug_assert!(r); + #[allow(unused_braces)] // False positive on older rust versions. + let success = unsafe { decode_maybe_check::<{ cfg!(debug_assertions) }>(input, output) }; + debug_assert!(success); } /// Default decoding function. Checks input validity if `CHECK` is `true`, otherwise assumes it. @@ -67,8 +68,6 @@ unsafe fn decode_maybe_check(input: &[u8], output: &mut [u8]) if $var == NIL { return false; } - } else { - debug_assert_ne!($var, NIL, "invalid hex input"); } }; } diff --git a/src/arch/portable_simd.rs b/src/arch/portable_simd.rs index 8ce9b60..6572467 100644 --- a/src/arch/portable_simd.rs +++ b/src/arch/portable_simd.rs @@ -1,10 +1,12 @@ use super::generic; use crate::get_chars_table; -use core::simd::u8x16; +use core::simd::prelude::*; use core::slice; -pub(crate) const USE_CHECK_FN: bool = false; -const CHUNK_SIZE: usize = core::mem::size_of::(); +type Simd = u8x16; + +pub(crate) const USE_CHECK_FN: bool = true; +const CHUNK_SIZE: usize = core::mem::size_of::(); pub(crate) unsafe fn encode(input: &[u8], output: *mut u8) { let mut i = 0; @@ -14,18 +16,18 @@ pub(crate) unsafe fn encode(input: &[u8], output: *mut u8) { unsafe { generic::encode::(prefix, output) }; i += prefix.len() * 2; - let hex_table = u8x16::from_array(*get_chars_table::()); + let hex_table = Simd::from_array(*get_chars_table::()); for &chunk in chunks { // Load input bytes and mask to nibbles. - let mut lo = chunk & u8x16::splat(15); - let mut hi = chunk >> u8x16::splat(4); + let mut lo = chunk & Simd::splat(15); + let mut hi = chunk >> Simd::splat(4); // Lookup the corresponding ASCII hex digit for each nibble. lo = hex_table.swizzle_dyn(lo); hi = hex_table.swizzle_dyn(hi); // Interleave the nibbles ([hi[0], lo[0], hi[1], lo[1], ...]). - let (hex_lo, hex_hi) = u8x16::interleave(hi, lo); + let (hex_lo, hex_hi) = Simd::interleave(hi, lo); // Store result into the output buffer. // SAFETY: ensured by caller. @@ -41,6 +43,18 @@ pub(crate) unsafe fn encode(input: &[u8], output: *mut u8) { unsafe { generic::encode::(suffix, output.add(i)) }; } -pub(crate) use generic::check; +pub(crate) fn check(input: &[u8]) -> bool { + let (prefix, chunks, suffix) = input.as_simd::(); + generic::check(prefix) + && chunks.iter().all(|&chunk| { + let valid_digit = chunk.simd_ge(Simd::splat(b'0')) & chunk.simd_le(Simd::splat(b'9')); + let valid_upper = chunk.simd_ge(Simd::splat(b'A')) & chunk.simd_le(Simd::splat(b'F')); + let valid_lower = chunk.simd_ge(Simd::splat(b'a')) & chunk.simd_le(Simd::splat(b'f')); + let valid = valid_digit | valid_upper | valid_lower; + valid.all() + }) + && generic::check(suffix) +} + pub(crate) use generic::decode_checked; pub(crate) use generic::decode_unchecked; diff --git a/src/arch/x86.rs b/src/arch/x86.rs index c9c26ac..8c5d180 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -12,8 +12,6 @@ pub(crate) const USE_CHECK_FN: bool = true; const CHUNK_SIZE_SSE: usize = core::mem::size_of::<__m128i>(); const CHUNK_SIZE_AVX: usize = core::mem::size_of::<__m256i>(); -const T_MASK: i32 = 65535; - cfg_if::cfg_if! { if #[cfg(feature = "std")] { #[inline(always)] @@ -58,11 +56,11 @@ unsafe fn encode_ssse3(input: &[u8], output: *mut u8) { let input_remainder = input_chunks.remainder(); let mut i = 0; - for input_chunk in input_chunks { + for chunk in input_chunks { // Load input bytes and mask to nibbles. - let input_bytes = _mm_loadu_si128(input_chunk.as_ptr().cast()); - let mut lo = _mm_and_si128(input_bytes, mask_lo); - let mut hi = _mm_srli_epi32::<4>(_mm_and_si128(input_bytes, mask_hi)); + let chunk = _mm_loadu_si128(chunk.as_ptr().cast()); + let mut lo = _mm_and_si128(chunk, mask_lo); + let mut hi = _mm_srli_epi32::<4>(_mm_and_si128(chunk, mask_hi)); // Lookup the corresponding ASCII hex digit for each nibble. lo = _mm_shuffle_epi8(hex_table, lo); @@ -101,32 +99,26 @@ unsafe fn check_sse2(input: &[u8]) -> bool { let ascii_la = _mm_set1_epi8((b'a' - 1) as i8); let ascii_lf = _mm_set1_epi8((b'f' + 1) as i8); - let input_chunks = input.chunks_exact(CHUNK_SIZE_SSE); - let input_remainder = input_chunks.remainder(); - for input_chunk in input_chunks { - let unchecked = _mm_loadu_si128(input_chunk.as_ptr().cast()); - - let gt0 = _mm_cmpgt_epi8(unchecked, ascii_zero); - let lt9 = _mm_cmplt_epi8(unchecked, ascii_nine); - let valid_digit = _mm_and_si128(gt0, lt9); - - let gtua = _mm_cmpgt_epi8(unchecked, ascii_ua); - let ltuf = _mm_cmplt_epi8(unchecked, ascii_uf); - - let gtla = _mm_cmpgt_epi8(unchecked, ascii_la); - let ltlf = _mm_cmplt_epi8(unchecked, ascii_lf); - - let valid_lower = _mm_and_si128(gtla, ltlf); - let valid_upper = _mm_and_si128(gtua, ltuf); - let valid_letter = _mm_or_si128(valid_lower, valid_upper); - - let ret = _mm_movemask_epi8(_mm_or_si128(valid_digit, valid_letter)); - if ret != T_MASK { - return false; - } - } - - generic::check(input_remainder) + let (prefix, chunks, suffix) = input.align_to::<__m128i>(); + generic::check(prefix) + && chunks.iter().all(|&chunk| { + let ge0 = _mm_cmpgt_epi8(chunk, ascii_zero); + let le9 = _mm_cmplt_epi8(chunk, ascii_nine); + let valid_digit = _mm_and_si128(ge0, le9); + + let geua = _mm_cmpgt_epi8(chunk, ascii_ua); + let leuf = _mm_cmplt_epi8(chunk, ascii_uf); + let valid_upper = _mm_and_si128(geua, leuf); + + let gela = _mm_cmpgt_epi8(chunk, ascii_la); + let lelf = _mm_cmplt_epi8(chunk, ascii_lf); + let valid_lower = _mm_and_si128(gela, lelf); + + let valid_letter = _mm_or_si128(valid_lower, valid_upper); + let valid_mask = _mm_movemask_epi8(_mm_or_si128(valid_digit, valid_letter)); + valid_mask == 0xffff + }) + && generic::check(suffix) } #[inline] diff --git a/src/lib.rs b/src/lib.rs index 8c044b0..208cf62 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ #![cfg_attr( feature = "nightly", feature(core_intrinsics, inline_const), - allow(internal_features) + allow(internal_features, stable_features) )] #![cfg_attr(feature = "portable-simd", feature(portable_simd))] #![warn(