From 4ef54f175d109d2a6d48e740ee3f5a594a9df780 Mon Sep 17 00:00:00 2001 From: DaniPopes <57450786+DaniPopes@users.noreply.github.com> Date: Sun, 29 Sep 2024 15:59:33 +0200 Subject: [PATCH] feat: check with unaligned SIMD chunks (#12) --- README.md | 210 +++++++++++++++++++------------------- benches/bench/main.rs | 2 +- src/arch/aarch64.rs | 39 ++++--- src/arch/generic.rs | 13 +++ src/arch/portable_simd.rs | 17 ++- src/arch/x86.rs | 39 ++++--- src/lib.rs | 2 +- 7 files changed, 163 insertions(+), 159 deletions(-) diff --git a/README.md b/README.md index ad21817..20b7600 100644 --- a/README.md +++ b/README.md @@ -34,113 +34,113 @@ You can run these benchmarks with `cargo bench --features std` on a nightly compiler. ```log -test check::const_hex::bench1_32b ... bench: 7.36 ns/iter (+/- 0.34) -test check::const_hex::bench2_256b ... bench: 19.39 ns/iter (+/- 0.27) -test check::const_hex::bench3_2k ... bench: 121.85 ns/iter (+/- 15.13) -test check::const_hex::bench4_16k ... bench: 903.95 ns/iter (+/- 13.53) -test check::const_hex::bench5_128k ... bench: 7,121.20 ns/iter (+/- 57.48) -test check::const_hex::bench6_1m ... bench: 57,834.53 ns/iter (+/- 1,000.67) -test check::faster_hex::bench1_32b ... bench: 2.75 ns/iter (+/- 0.03) -test check::faster_hex::bench2_256b ... bench: 14.95 ns/iter (+/- 0.45) -test check::faster_hex::bench3_2k ... bench: 123.08 ns/iter (+/- 4.92) -test check::faster_hex::bench4_16k ... bench: 983.89 ns/iter (+/- 18.29) -test check::faster_hex::bench5_128k ... bench: 7,806.75 ns/iter (+/- 234.99) -test check::faster_hex::bench6_1m ... bench: 64,115.09 ns/iter (+/- 754.27) -test check::naive::bench1_32b ... bench: 18.52 ns/iter (+/- 3.59) -test check::naive::bench2_256b ... bench: 187.49 ns/iter (+/- 6.94) -test check::naive::bench3_2k ... bench: 1,953.95 ns/iter (+/- 52.85) -test check::naive::bench4_16k ... bench: 17,243.26 ns/iter (+/- 3,391.35) -test check::naive::bench5_128k ... bench: 493,272.86 ns/iter (+/- 11,374.41) -test check::naive::bench6_1m ... bench: 4,193,959.30 ns/iter (+/- 180,118.90) - -test decode::const_hex::bench1_32b ... bench: 19.77 ns/iter (+/- 0.80) -test decode::const_hex::bench2_256b ... bench: 41.15 ns/iter (+/- 1.48) -test decode::const_hex::bench3_2k ... bench: 235.43 ns/iter (+/- 2.39) -test decode::const_hex::bench4_16k ... bench: 1,703.37 ns/iter (+/- 5.44) -test decode::const_hex::bench5_128k ... bench: 13,097.29 ns/iter (+/- 54.88) -test decode::const_hex::bench6_1m ... bench: 105,834.33 ns/iter (+/- 1,860.67) -test decode::faster_hex::bench1_32b ... bench: 17.09 ns/iter (+/- 0.26) -test decode::faster_hex::bench2_256b ... bench: 55.30 ns/iter (+/- 0.56) -test decode::faster_hex::bench3_2k ... bench: 249.42 ns/iter (+/- 7.53) -test decode::faster_hex::bench4_16k ... bench: 1,867.34 ns/iter (+/- 12.68) -test decode::faster_hex::bench5_128k ... bench: 14,542.82 ns/iter (+/- 114.09) -test decode::faster_hex::bench6_1m ... bench: 118,627.86 ns/iter (+/- 2,471.00) -test decode::hex::bench1_32b ... bench: 111.69 ns/iter (+/- 7.82) -test decode::hex::bench2_256b ... bench: 728.81 ns/iter (+/- 18.34) -test decode::hex::bench3_2k ... bench: 5,263.46 ns/iter (+/- 87.04) -test decode::hex::bench4_16k ... bench: 42,284.40 ns/iter (+/- 2,312.96) -test decode::hex::bench5_128k ... bench: 800,810.80 ns/iter (+/- 7,695.87) -test decode::hex::bench6_1m ... bench: 6,442,642.10 ns/iter (+/- 38,417.90) - -test decode_to_slice::const_hex::bench1_32b ... bench: 9.90 ns/iter (+/- 2.75) -test decode_to_slice::const_hex::bench2_256b ... bench: 29.02 ns/iter (+/- 1.99) -test decode_to_slice::const_hex::bench3_2k ... bench: 210.05 ns/iter (+/- 8.65) -test decode_to_slice::const_hex::bench4_16k ... bench: 1,667.70 ns/iter (+/- 12.13) -test decode_to_slice::const_hex::bench5_128k ... bench: 13,083.20 ns/iter (+/- 96.53) -test decode_to_slice::const_hex::bench6_1m ... bench: 108,756.59 ns/iter (+/- 2,321.92) -test decode_to_slice::faster_hex::bench1_32b ... bench: 6.67 ns/iter (+/- 0.26) -test decode_to_slice::faster_hex::bench2_256b ... bench: 29.25 ns/iter (+/- 0.46) -test decode_to_slice::faster_hex::bench3_2k ... bench: 218.65 ns/iter (+/- 2.40) -test decode_to_slice::faster_hex::bench4_16k ... bench: 1,743.88 ns/iter (+/- 18.52) -test decode_to_slice::faster_hex::bench5_128k ... bench: 13,694.73 ns/iter (+/- 36.07) -test decode_to_slice::faster_hex::bench6_1m ... bench: 110,733.30 ns/iter (+/- 1,679.82) -test decode_to_slice::hex::bench1_32b ... bench: 37.57 ns/iter (+/- 0.85) -test decode_to_slice::hex::bench2_256b ... bench: 287.52 ns/iter (+/- 23.10) -test decode_to_slice::hex::bench3_2k ... bench: 2,705.00 ns/iter (+/- 26.99) -test decode_to_slice::hex::bench4_16k ... bench: 21,850.53 ns/iter (+/- 191.97) -test decode_to_slice::hex::bench5_128k ... bench: 614,217.67 ns/iter (+/- 2,237.99) -test decode_to_slice::hex::bench6_1m ... bench: 5,357,921.20 ns/iter (+/- 240,508.79) - -test encode::const_hex::bench1_32b ... bench: 7.00 ns/iter (+/- 0.37) -test encode::const_hex::bench2_256b ... bench: 11.83 ns/iter (+/- 0.05) -test encode::const_hex::bench3_2k ... bench: 73.28 ns/iter (+/- 0.30) -test encode::const_hex::bench4_16k ... bench: 467.14 ns/iter (+/- 26.32) -test encode::const_hex::bench5_128k ... bench: 3,760.74 ns/iter (+/- 69.40) -test encode::const_hex::bench6_1m ... bench: 29,080.93 ns/iter (+/- 532.47) -test encode::faster_hex::bench1_32b ... bench: 17.25 ns/iter (+/- 0.17) -test encode::faster_hex::bench2_256b ... bench: 39.03 ns/iter (+/- 0.77) -test encode::faster_hex::bench3_2k ... bench: 102.46 ns/iter (+/- 1.27) -test encode::faster_hex::bench4_16k ... bench: 655.39 ns/iter (+/- 2.28) -test encode::faster_hex::bench5_128k ... bench: 5,233.70 ns/iter (+/- 11.75) -test encode::faster_hex::bench6_1m ... bench: 43,802.73 ns/iter (+/- 1,115.53) -test encode::hex::bench1_32b ... bench: 102.98 ns/iter (+/- 0.75) -test encode::hex::bench2_256b ... bench: 721.27 ns/iter (+/- 4.31) -test encode::hex::bench3_2k ... bench: 5,659.67 ns/iter (+/- 18.84) -test encode::hex::bench4_16k ... bench: 45,138.29 ns/iter (+/- 352.13) -test encode::hex::bench5_128k ... bench: 361,400.70 ns/iter (+/- 1,472.30) -test encode::hex::bench6_1m ... bench: 3,210,824.02 ns/iter (+/- 207,640.35) +test check::const_hex::bench1_32b ... bench: 2.85 ns/iter (+/- 0.44) +test check::const_hex::bench2_256b ... bench: 15.36 ns/iter (+/- 0.44) +test check::const_hex::bench3_2k ... bench: 117.55 ns/iter (+/- 1.72) +test check::const_hex::bench4_16k ... bench: 915.78 ns/iter (+/- 56.34) +test check::const_hex::bench5_128k ... bench: 7,269.26 ns/iter (+/- 80.62) +test check::const_hex::bench6_1m ... bench: 58,975.63 ns/iter (+/- 707.02) +test check::faster_hex::bench1_32b ... bench: 2.70 ns/iter (+/- 0.01) +test check::faster_hex::bench2_256b ... bench: 14.45 ns/iter (+/- 1.44) +test check::faster_hex::bench3_2k ... bench: 123.58 ns/iter (+/- 1.09) +test check::faster_hex::bench4_16k ... bench: 960.32 ns/iter (+/- 6.34) +test check::faster_hex::bench5_128k ... bench: 7,709.14 ns/iter (+/- 77.69) +test check::faster_hex::bench6_1m ... bench: 62,165.54 ns/iter (+/- 1,167.78) +test check::naive::bench1_32b ... bench: 16.41 ns/iter (+/- 1.90) +test check::naive::bench2_256b ... bench: 221.25 ns/iter (+/- 3.47) +test check::naive::bench3_2k ... bench: 2,493.23 ns/iter (+/- 154.04) +test check::naive::bench4_16k ... bench: 16,221.81 ns/iter (+/- 384.98) +test check::naive::bench5_128k ... bench: 485,588.59 ns/iter (+/- 10,260.00) +test check::naive::bench6_1m ... bench: 3,895,089.20 ns/iter (+/- 45,589.05) + +test decode::const_hex::bench1_32b ... bench: 17.33 ns/iter (+/- 0.38) +test decode::const_hex::bench2_256b ... bench: 38.17 ns/iter (+/- 1.07) +test decode::const_hex::bench3_2k ... bench: 235.07 ns/iter (+/- 3.27) +test decode::const_hex::bench4_16k ... bench: 1,681.14 ns/iter (+/- 17.25) +test decode::const_hex::bench5_128k ... bench: 13,097.65 ns/iter (+/- 101.14) +test decode::const_hex::bench6_1m ... bench: 105,945.60 ns/iter (+/- 2,703.49) +test decode::faster_hex::bench1_32b ... bench: 17.91 ns/iter (+/- 0.40) +test decode::faster_hex::bench2_256b ... bench: 54.53 ns/iter (+/- 1.41) +test decode::faster_hex::bench3_2k ... bench: 245.35 ns/iter (+/- 3.89) +test decode::faster_hex::bench4_16k ... bench: 1,836.62 ns/iter (+/- 25.01) +test decode::faster_hex::bench5_128k ... bench: 14,471.53 ns/iter (+/- 184.29) +test decode::faster_hex::bench6_1m ... bench: 116,688.27 ns/iter (+/- 1,539.72) +test decode::hex::bench1_32b ... bench: 109.14 ns/iter (+/- 1.88) +test decode::hex::bench2_256b ... bench: 712.92 ns/iter (+/- 14.25) +test decode::hex::bench3_2k ... bench: 5,196.66 ns/iter (+/- 102.67) +test decode::hex::bench4_16k ... bench: 41,308.30 ns/iter (+/- 917.60) +test decode::hex::bench5_128k ... bench: 786,648.00 ns/iter (+/- 6,589.60) +test decode::hex::bench6_1m ... bench: 6,316,271.50 ns/iter (+/- 22,712.18) + +test decode_to_slice::const_hex::bench1_32b ... bench: 5.14 ns/iter (+/- 0.39) +test decode_to_slice::const_hex::bench2_256b ... bench: 26.18 ns/iter (+/- 0.26) +test decode_to_slice::const_hex::bench3_2k ... bench: 206.71 ns/iter (+/- 1.88) +test decode_to_slice::const_hex::bench4_16k ... bench: 1,666.49 ns/iter (+/- 15.67) +test decode_to_slice::const_hex::bench5_128k ... bench: 12,979.03 ns/iter (+/- 80.40) +test decode_to_slice::const_hex::bench6_1m ... bench: 107,213.20 ns/iter (+/- 4,024.91) +test decode_to_slice::faster_hex::bench1_32b ... bench: 6.51 ns/iter (+/- 0.04) +test decode_to_slice::faster_hex::bench2_256b ... bench: 28.66 ns/iter (+/- 0.29) +test decode_to_slice::faster_hex::bench3_2k ... bench: 217.84 ns/iter (+/- 1.03) +test decode_to_slice::faster_hex::bench4_16k ... bench: 1,730.89 ns/iter (+/- 19.36) +test decode_to_slice::faster_hex::bench5_128k ... bench: 13,439.63 ns/iter (+/- 92.19) +test decode_to_slice::faster_hex::bench6_1m ... bench: 109,432.90 ns/iter (+/- 1,526.01) +test decode_to_slice::hex::bench1_32b ... bench: 38.44 ns/iter (+/- 1.30) +test decode_to_slice::hex::bench2_256b ... bench: 290.78 ns/iter (+/- 16.09) +test decode_to_slice::hex::bench3_2k ... bench: 2,663.51 ns/iter (+/- 48.22) +test decode_to_slice::hex::bench4_16k ... bench: 19,016.95 ns/iter (+/- 514.35) +test decode_to_slice::hex::bench5_128k ... bench: 612,840.31 ns/iter (+/- 6,561.70) +test decode_to_slice::hex::bench6_1m ... bench: 5,098,572.75 ns/iter (+/- 120,113.94) + +test encode::const_hex::bench1_32b ... bench: 6.94 ns/iter (+/- 0.06) +test encode::const_hex::bench2_256b ... bench: 11.84 ns/iter (+/- 0.07) +test encode::const_hex::bench3_2k ... bench: 78.36 ns/iter (+/- 0.89) +test encode::const_hex::bench4_16k ... bench: 475.29 ns/iter (+/- 11.56) +test encode::const_hex::bench5_128k ... bench: 3,577.27 ns/iter (+/- 70.48) +test encode::const_hex::bench6_1m ... bench: 29,996.00 ns/iter (+/- 668.44) +test encode::faster_hex::bench1_32b ... bench: 17.31 ns/iter (+/- 0.37) +test encode::faster_hex::bench2_256b ... bench: 39.39 ns/iter (+/- 0.76) +test encode::faster_hex::bench3_2k ... bench: 106.60 ns/iter (+/- 1.41) +test encode::faster_hex::bench4_16k ... bench: 653.21 ns/iter (+/- 5.40) +test encode::faster_hex::bench5_128k ... bench: 5,260.68 ns/iter (+/- 88.46) +test encode::faster_hex::bench6_1m ... bench: 44,520.36 ns/iter (+/- 1,200.74) +test encode::hex::bench1_32b ... bench: 102.77 ns/iter (+/- 0.82) +test encode::hex::bench2_256b ... bench: 720.90 ns/iter (+/- 22.52) +test encode::hex::bench3_2k ... bench: 5,672.44 ns/iter (+/- 287.53) +test encode::hex::bench4_16k ... bench: 38,988.71 ns/iter (+/- 6,457.99) +test encode::hex::bench5_128k ... bench: 364,376.25 ns/iter (+/- 51,416.85) +test encode::hex::bench6_1m ... bench: 2,959,499.88 ns/iter (+/- 410,006.38) test encode_to_slice::const_hex::bench1_32b ... bench: 1.56 ns/iter (+/- 0.00) -test encode_to_slice::const_hex::bench2_256b ... bench: 6.72 ns/iter (+/- 0.03) -test encode_to_slice::const_hex::bench3_2k ... bench: 58.79 ns/iter (+/- 1.45) -test encode_to_slice::const_hex::bench4_16k ... bench: 510.57 ns/iter (+/- 11.70) -test encode_to_slice::const_hex::bench5_128k ... bench: 4,030.22 ns/iter (+/- 76.92) -test encode_to_slice::const_hex::bench6_1m ... bench: 35,273.20 ns/iter (+/- 583.54) -test encode_to_slice::faster_hex::bench1_32b ... bench: 4.52 ns/iter (+/- 0.03) -test encode_to_slice::faster_hex::bench2_256b ... bench: 8.09 ns/iter (+/- 0.02) -test encode_to_slice::faster_hex::bench3_2k ... bench: 53.83 ns/iter (+/- 1.28) -test encode_to_slice::faster_hex::bench4_16k ... bench: 450.39 ns/iter (+/- 6.73) -test encode_to_slice::faster_hex::bench5_128k ... bench: 3,444.01 ns/iter (+/- 17.74) -test encode_to_slice::faster_hex::bench6_1m ... bench: 29,645.36 ns/iter (+/- 535.00) -test encode_to_slice::hex::bench1_32b ... bench: 12.08 ns/iter (+/- 0.11) -test encode_to_slice::hex::bench2_256b ... bench: 119.24 ns/iter (+/- 0.48) -test encode_to_slice::hex::bench3_2k ... bench: 988.01 ns/iter (+/- 11.35) -test encode_to_slice::hex::bench4_16k ... bench: 8,044.36 ns/iter (+/- 54.57) -test encode_to_slice::hex::bench5_128k ... bench: 64,068.07 ns/iter (+/- 954.12) -test encode_to_slice::hex::bench6_1m ... bench: 517,206.80 ns/iter (+/- 4,775.29) - -test format::const_hex::bench1_32b ... bench: 10.15 ns/iter (+/- 0.14) -test format::const_hex::bench2_256b ... bench: 17.32 ns/iter (+/- 1.00) -test format::const_hex::bench3_2k ... bench: 116.15 ns/iter (+/- 5.37) -test format::const_hex::bench4_16k ... bench: 1,102.71 ns/iter (+/- 6.87) -test format::const_hex::bench5_128k ... bench: 8,784.66 ns/iter (+/- 108.90) -test format::const_hex::bench6_1m ... bench: 77,741.10 ns/iter (+/- 2,452.30) -test format::std::bench1_32b ... bench: 385.04 ns/iter (+/- 2.50) -test format::std::bench2_256b ... bench: 2,979.01 ns/iter (+/- 226.14) -test format::std::bench3_2k ... bench: 24,019.65 ns/iter (+/- 118.96) -test format::std::bench4_16k ... bench: 200,691.74 ns/iter (+/- 1,243.46) -test format::std::bench5_128k ... bench: 1,565,830.30 ns/iter (+/- 96,284.89) -test format::std::bench6_1m ... bench: 12,532,954.20 ns/iter (+/- 400,001.89) +test encode_to_slice::const_hex::bench2_256b ... bench: 6.75 ns/iter (+/- 0.03) +test encode_to_slice::const_hex::bench3_2k ... bench: 58.32 ns/iter (+/- 0.23) +test encode_to_slice::const_hex::bench4_16k ... bench: 518.24 ns/iter (+/- 4.91) +test encode_to_slice::const_hex::bench5_128k ... bench: 4,003.77 ns/iter (+/- 28.57) +test encode_to_slice::const_hex::bench6_1m ... bench: 34,519.64 ns/iter (+/- 656.35) +test encode_to_slice::faster_hex::bench1_32b ... bench: 4.54 ns/iter (+/- 0.01) +test encode_to_slice::faster_hex::bench2_256b ... bench: 8.11 ns/iter (+/- 0.05) +test encode_to_slice::faster_hex::bench3_2k ... bench: 52.10 ns/iter (+/- 0.64) +test encode_to_slice::faster_hex::bench4_16k ... bench: 475.81 ns/iter (+/- 6.50) +test encode_to_slice::faster_hex::bench5_128k ... bench: 3,425.49 ns/iter (+/- 15.01) +test encode_to_slice::faster_hex::bench6_1m ... bench: 28,725.82 ns/iter (+/- 839.95) +test encode_to_slice::hex::bench1_32b ... bench: 12.01 ns/iter (+/- 0.12) +test encode_to_slice::hex::bench2_256b ... bench: 121.68 ns/iter (+/- 1.98) +test encode_to_slice::hex::bench3_2k ... bench: 989.33 ns/iter (+/- 3.83) +test encode_to_slice::hex::bench4_16k ... bench: 8,087.93 ns/iter (+/- 25.25) +test encode_to_slice::hex::bench5_128k ... bench: 64,323.94 ns/iter (+/- 249.97) +test encode_to_slice::hex::bench6_1m ... bench: 515,710.80 ns/iter (+/- 2,232.59) + +test format::const_hex::bench1_32b ... bench: 10.26 ns/iter (+/- 0.19) +test format::const_hex::bench2_256b ... bench: 18.28 ns/iter (+/- 0.86) +test format::const_hex::bench3_2k ... bench: 116.95 ns/iter (+/- 2.17) +test format::const_hex::bench4_16k ... bench: 1,122.29 ns/iter (+/- 6.25) +test format::const_hex::bench5_128k ... bench: 8,903.81 ns/iter (+/- 111.29) +test format::const_hex::bench6_1m ... bench: 77,476.15 ns/iter (+/- 1,498.93) +test format::std::bench1_32b ... bench: 370.27 ns/iter (+/- 2.52) +test format::std::bench2_256b ... bench: 2,910.66 ns/iter (+/- 40.35) +test format::std::bench3_2k ... bench: 22,554.52 ns/iter (+/- 263.93) +test format::std::bench4_16k ... bench: 182,692.06 ns/iter (+/- 3,494.64) +test format::std::bench5_128k ... bench: 1,475,988.90 ns/iter (+/- 21,895.90) +test format::std::bench6_1m ... bench: 11,834,234.60 ns/iter (+/- 139,230.20) ``` ## Acknowledgements diff --git a/benches/bench/main.rs b/benches/bench/main.rs index 4a9315c..06275df 100644 --- a/benches/bench/main.rs +++ b/benches/bench/main.rs @@ -39,7 +39,7 @@ macro_rules! benches { #[bench] fn $name(b: &mut Bencher) { b.iter(|| { - ::const_hex::check(black_box($dec)) + ::const_hex::check_raw(black_box($dec)) }); } )* diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs index 80b4fbb..a04bad5 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64.rs @@ -65,7 +65,7 @@ pub(crate) unsafe fn encode_neon(input: &[u8], output: *mut u #[inline] pub(crate) fn check(input: &[u8]) -> bool { - if cfg!(miri) || !has_neon() || input.len() < CHUNK_SIZE { + if cfg!(miri) || !has_neon() { return generic::check(input); } unsafe { check_neon(input) } @@ -80,26 +80,23 @@ pub(crate) unsafe fn check_neon(input: &[u8]) -> bool { let ascii_la = vdupq_n_u8(b'a' - 1); let ascii_lf = vdupq_n_u8(b'f' + 1); - let (prefix, chunks, suffix) = input.align_to::(); - generic::check(prefix) - && chunks.iter().all(|&chunk| { - let ge0 = vcgtq_u8(chunk, ascii_zero); - let le9 = vcltq_u8(chunk, ascii_nine); - let valid_digit = vandq_u8(ge0, le9); - - let geua = vcgtq_u8(chunk, ascii_ua); - let leuf = vcltq_u8(chunk, ascii_uf); - let valid_upper = vandq_u8(geua, leuf); - - let gela = vcgtq_u8(chunk, ascii_la); - let lelf = vcltq_u8(chunk, ascii_lf); - let valid_lower = vandq_u8(gela, lelf); - - let valid_letter = vorrq_u8(valid_lower, valid_upper); - let valid_mask = vorrq_u8(valid_digit, valid_letter); - vminvq_u8(valid_mask) == 0xFF - }) - && generic::check(suffix) + generic::check_unaligned_chunks(input, |chunk| { + let ge0 = vcgtq_u8(chunk, ascii_zero); + let le9 = vcltq_u8(chunk, ascii_nine); + let valid_digit = vandq_u8(ge0, le9); + + let geua = vcgtq_u8(chunk, ascii_ua); + let leuf = vcltq_u8(chunk, ascii_uf); + let valid_upper = vandq_u8(geua, leuf); + + let gela = vcgtq_u8(chunk, ascii_la); + let lelf = vcltq_u8(chunk, ascii_lf); + let valid_lower = vandq_u8(gela, lelf); + + let valid_letter = vorrq_u8(valid_lower, valid_upper); + let valid_mask = vorrq_u8(valid_digit, valid_letter); + vminvq_u8(valid_mask) == 0xFF + }) } pub(crate) use generic::decode_checked; diff --git a/src/arch/generic.rs b/src/arch/generic.rs index b2e40e9..2dfb967 100644 --- a/src/arch/generic.rs +++ b/src/arch/generic.rs @@ -33,6 +33,19 @@ pub(crate) const fn check(mut input: &[u8]) -> bool { true } +/// Runs the given check function on unaligned chunks of `T` in `input`, with the remainder passed +/// to the generic [`check`]. +#[inline] +#[allow(dead_code)] +pub(crate) fn check_unaligned_chunks( + input: &[u8], + mut check_chunk: impl FnMut(T) -> bool, +) -> bool { + let mut chunks = input.chunks_exact(core::mem::size_of::()); + chunks.all(|chunk| check_chunk(unsafe { chunk.as_ptr().cast::().read_unaligned() })) + && check(chunks.remainder()) +} + /// Default checked decoding function. /// /// # Safety diff --git a/src/arch/portable_simd.rs b/src/arch/portable_simd.rs index 6572467..64bd040 100644 --- a/src/arch/portable_simd.rs +++ b/src/arch/portable_simd.rs @@ -44,16 +44,13 @@ pub(crate) unsafe fn encode(input: &[u8], output: *mut u8) { } pub(crate) fn check(input: &[u8]) -> bool { - let (prefix, chunks, suffix) = input.as_simd::(); - generic::check(prefix) - && chunks.iter().all(|&chunk| { - let valid_digit = chunk.simd_ge(Simd::splat(b'0')) & chunk.simd_le(Simd::splat(b'9')); - let valid_upper = chunk.simd_ge(Simd::splat(b'A')) & chunk.simd_le(Simd::splat(b'F')); - let valid_lower = chunk.simd_ge(Simd::splat(b'a')) & chunk.simd_le(Simd::splat(b'f')); - let valid = valid_digit | valid_upper | valid_lower; - valid.all() - }) - && generic::check(suffix) + generic::check_unaligned_chunks::(input, |chunk| { + let valid_digit = chunk.simd_ge(Simd::splat(b'0')) & chunk.simd_le(Simd::splat(b'9')); + let valid_upper = chunk.simd_ge(Simd::splat(b'A')) & chunk.simd_le(Simd::splat(b'F')); + let valid_lower = chunk.simd_ge(Simd::splat(b'a')) & chunk.simd_le(Simd::splat(b'f')); + let valid = valid_digit | valid_upper | valid_lower; + valid.all() + }) } pub(crate) use generic::decode_checked; diff --git a/src/arch/x86.rs b/src/arch/x86.rs index 8c5d180..bdf5082 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -83,7 +83,7 @@ unsafe fn encode_ssse3(input: &[u8], output: *mut u8) { #[inline] pub(crate) fn check(input: &[u8]) -> bool { - if !has_sse2() || input.len() < CHUNK_SIZE_SSE { + if !has_sse2() { return generic::check(input); } unsafe { check_sse2(input) } @@ -99,26 +99,23 @@ unsafe fn check_sse2(input: &[u8]) -> bool { let ascii_la = _mm_set1_epi8((b'a' - 1) as i8); let ascii_lf = _mm_set1_epi8((b'f' + 1) as i8); - let (prefix, chunks, suffix) = input.align_to::<__m128i>(); - generic::check(prefix) - && chunks.iter().all(|&chunk| { - let ge0 = _mm_cmpgt_epi8(chunk, ascii_zero); - let le9 = _mm_cmplt_epi8(chunk, ascii_nine); - let valid_digit = _mm_and_si128(ge0, le9); - - let geua = _mm_cmpgt_epi8(chunk, ascii_ua); - let leuf = _mm_cmplt_epi8(chunk, ascii_uf); - let valid_upper = _mm_and_si128(geua, leuf); - - let gela = _mm_cmpgt_epi8(chunk, ascii_la); - let lelf = _mm_cmplt_epi8(chunk, ascii_lf); - let valid_lower = _mm_and_si128(gela, lelf); - - let valid_letter = _mm_or_si128(valid_lower, valid_upper); - let valid_mask = _mm_movemask_epi8(_mm_or_si128(valid_digit, valid_letter)); - valid_mask == 0xffff - }) - && generic::check(suffix) + generic::check_unaligned_chunks(input, |chunk| { + let ge0 = _mm_cmpgt_epi8(chunk, ascii_zero); + let le9 = _mm_cmplt_epi8(chunk, ascii_nine); + let valid_digit = _mm_and_si128(ge0, le9); + + let geua = _mm_cmpgt_epi8(chunk, ascii_ua); + let leuf = _mm_cmplt_epi8(chunk, ascii_uf); + let valid_upper = _mm_and_si128(geua, leuf); + + let gela = _mm_cmpgt_epi8(chunk, ascii_la); + let lelf = _mm_cmplt_epi8(chunk, ascii_lf); + let valid_lower = _mm_and_si128(gela, lelf); + + let valid_letter = _mm_or_si128(valid_lower, valid_upper); + let valid_mask = _mm_movemask_epi8(_mm_or_si128(valid_digit, valid_letter)); + valid_mask == 0xffff + }) } #[inline] diff --git a/src/lib.rs b/src/lib.rs index 98dca66..e3918f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -666,7 +666,7 @@ const unsafe fn invalid_hex_error(input: &[u8]) -> FromHexError { if cfg!(debug_assertions) { panic!("input was valid but `check` failed") } else { - core::hint::unreachable_unchecked() + unsafe { core::hint::unreachable_unchecked() } } } };