Skip to content

Commit

Permalink
Merge pull request #2 from DaniPopes/aarch64
Browse files Browse the repository at this point in the history
`aarch64` backend implementation
  • Loading branch information
DaniPopes authored Aug 28, 2023
2 parents 34710dc + be51e05 commit 26a6b50
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 2 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,8 @@ jobs:
target: [x86_64-unknown-linux-gnu, aarch64-unknown-linux-gnu]
flags: [--no-default-features, "", --all-features]
exclude:
# miri doesn't implement some aarch intrinsics (`llvm.aarch64.neon.tbl1.v16i8`)
# miri doesn't implement neon intrinsics.
- target: aarch64-unknown-linux-gnu
flags: --all-features
env:
MIRIFLAGS: -Zmiri-strict-provenance
steps:
Expand Down
52 changes: 52 additions & 0 deletions src/aarch64.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use crate::generic;
use core::arch::aarch64::*;

const CHUNK_SIZE: usize = core::mem::size_of::<uint8x16_t>();

/// Hex encoding function using aarch64 intrisics.
///
/// # Safety
///
/// `output` must be a valid pointer to at least `2 * input.len()` bytes.
// SAFETY: this is only compiled when the target feature is enabled.
#[target_feature(enable = "neon")]
pub(super) unsafe fn encode<const UPPER: bool>(input: &[u8], output: *mut u8) {
if input.len() < CHUNK_SIZE {
return generic::encode::<UPPER>(input, output);
}

// Load table and construct masks.
let hex_table = vld1q_u8(super::get_chars_table::<UPPER>().as_ptr());
let mask_lo = vdupq_n_u8(0x0F);
let mask_hi = vdupq_n_u8(0xF0);

let input_chunks = input.chunks_exact(CHUNK_SIZE);
let input_remainder = input_chunks.remainder();

let mut i = 0;
for input_chunk in input_chunks {
// Load input bytes and mask to nibbles.
let input_bytes = vld1q_u8(input_chunk.as_ptr() as *const u8);
let mut lo = vandq_u8(input_bytes, mask_lo);
let mut hi = vshrq_n_u8(vandq_u8(input_bytes, mask_hi), 4);

// Lookup the corresponding ASCII hex digit for each nibble.
lo = vqtbl1q_u8(hex_table, lo);
hi = vqtbl1q_u8(hex_table, hi);

// Interleave the nibbles ([hi[0], lo[0], hi[1], lo[1], ...]).
let hex_lo = vzip1q_u8(hi, lo);
let hex_hi = vzip2q_u8(hi, lo);

// Store result into the output buffer.
vst1q_u8(output.add(i), hex_lo);
vst1q_u8(output.add(i + CHUNK_SIZE), hex_hi);
i += CHUNK_SIZE * 2;
}

if !input_remainder.is_empty() {
generic::encode::<UPPER>(input_remainder, output.add(i));
}
}

pub(super) use generic::decode;
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ cfg_if! {
} else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
mod x86;
use x86 as imp;
} else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
mod aarch64;
use aarch64 as imp;
} else {
use generic as imp;
}
Expand Down

0 comments on commit 26a6b50

Please sign in to comment.