diff --git a/src/aarch64.rs b/src/aarch64.rs new file mode 100644 index 0000000..772f473 --- /dev/null +++ b/src/aarch64.rs @@ -0,0 +1,50 @@ +use crate::default; +use core::arch::aarch64::*; + +const CHUNK_SIZE: usize = core::mem::size_of::(); + +/// Hex encoding function using aarch64 intrisics. +/// +/// # Safety +/// +/// `output` must be a valid pointer to at least `2 * input.len()` bytes. +// SAFETY: this is only compiled when the target feature is enabled. +#[target_feature(enable = "neon")] +pub(super) unsafe fn encode(input: &[u8], output: *mut u8) { + if input.len() < CHUNK_SIZE { + return default::encode::(input, output); + } + + // Load table and construct masks. + let hex_table = vld1q_u8(super::get_chars_table::().as_ptr().cast_const()); + let mask_lo = vdupq_n_u8(0x0F); + let mask_hi = vdupq_n_u8(0xF0); + + let input_chunks = input.chunks_exact(CHUNK_SIZE); + let input_remainder = input_chunks.remainder(); + + let mut i = 0; + for input_chunk in input_chunks { + // Load input bytes and mask to nibbles. + let input_bytes = vld1q_u8(input_chunk.as_ptr() as *const u8); + let mut lo = vandq_u8(input_bytes, mask_lo); + let mut hi = vshrq_n_u8(vandq_u8(input_bytes, mask_hi), 4); + + // Lookup the corresponding ASCII hex digit for each nibble. + lo = vqtbl1q_u8(hex_table, lo); + hi = vqtbl1q_u8(hex_table, hi); + + // Interleave the nibbles ([hi[0], lo[0], hi[1], lo[1], ...]). + let result = vzipq_u8(hi, lo); + + // Store result into the output buffer. + vst2q_u8(output.add(i), result); + i += CHUNK_SIZE * 2; + } + + if !input_remainder.is_empty() { + default::encode::(input_remainder, output.add(i)); + } +} + +pub(super) use default::decode; diff --git a/src/lib.rs b/src/lib.rs index df0a33c..1a4c6d4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,6 +46,9 @@ cfg_if! { } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { mod x86; use x86 as imp; + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + mod aarch64; + use aarch64 as imp; } else { use default as imp; }