DaniPopes · DaniPopes · Aug 28, 2023 · Aug 28, 2023 · Aug 28, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -58,9 +58,8 @@ jobs:
         target: [x86_64-unknown-linux-gnu, aarch64-unknown-linux-gnu]
         flags: [--no-default-features, "", --all-features]
         exclude:
-          # miri doesn't implement some aarch intrinsics (`llvm.aarch64.neon.tbl1.v16i8`)
+          # miri doesn't implement neon intrinsics.
           - target: aarch64-unknown-linux-gnu
-            flags: --all-features
     env:
       MIRIFLAGS: -Zmiri-strict-provenance
     steps:

diff --git a/src/aarch64.rs b/src/aarch64.rs
@@ -0,0 +1,52 @@
+use crate::generic;
+use core::arch::aarch64::*;
+
+const CHUNK_SIZE: usize = core::mem::size_of::<uint8x16_t>();
+
+/// Hex encoding function using aarch64 intrisics.
+///
+/// # Safety
+///
+/// `output` must be a valid pointer to at least `2 * input.len()` bytes.
+// SAFETY: this is only compiled when the target feature is enabled.
+#[target_feature(enable = "neon")]
+pub(super) unsafe fn encode<const UPPER: bool>(input: &[u8], output: *mut u8) {
+    if input.len() < CHUNK_SIZE {
+        return generic::encode::<UPPER>(input, output);
+    }
+
+    // Load table and construct masks.
+    let hex_table = vld1q_u8(super::get_chars_table::<UPPER>().as_ptr());
+    let mask_lo = vdupq_n_u8(0x0F);
+    let mask_hi = vdupq_n_u8(0xF0);
+
+    let input_chunks = input.chunks_exact(CHUNK_SIZE);
+    let input_remainder = input_chunks.remainder();
+
+    let mut i = 0;
+    for input_chunk in input_chunks {
+        // Load input bytes and mask to nibbles.
+        let input_bytes = vld1q_u8(input_chunk.as_ptr() as *const u8);
+        let mut lo = vandq_u8(input_bytes, mask_lo);
+        let mut hi = vshrq_n_u8(vandq_u8(input_bytes, mask_hi), 4);
+
+        // Lookup the corresponding ASCII hex digit for each nibble.
+        lo = vqtbl1q_u8(hex_table, lo);
+        hi = vqtbl1q_u8(hex_table, hi);
+
+        // Interleave the nibbles ([hi[0], lo[0], hi[1], lo[1], ...]).
+        let hex_lo = vzip1q_u8(hi, lo);
+        let hex_hi = vzip2q_u8(hi, lo);
+
+        // Store result into the output buffer.
+        vst1q_u8(output.add(i), hex_lo);
+        vst1q_u8(output.add(i + CHUNK_SIZE), hex_hi);
+        i += CHUNK_SIZE * 2;
+    }
+
+    if !input_remainder.is_empty() {
+        generic::encode::<UPPER>(input_remainder, output.add(i));
+    }
+}
+
+pub(super) use generic::decode;
diff --git a/src/lib.rs b/src/lib.rs
@@ -46,6 +46,9 @@ cfg_if! {
     } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
         mod x86;
         use x86 as imp;
+    } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+        mod aarch64;
+        use aarch64 as imp;
     } else {
         use generic as imp;
     }