From 3014c6928501a8bd6a1aea4e347bec9e7fe6852f Mon Sep 17 00:00:00 2001 From: Coda Hale Date: Sat, 29 Oct 2022 19:01:29 -0600 Subject: [PATCH 1/4] chacha20: Add full NEON backend. Observed performance changes on an Apple M1 Air: ``` name chacha-soft ns/iter chacha-neon ns/iter diff ns/iter diff % speedup chacha12_bench1_16b 34 (470 MB/s) 28 (571 MB/s) -6 -17.65% x 1.21 chacha12_bench2_256b 477 (536 MB/s) 132 (1939 MB/s) -345 -72.33% x 3.61 chacha12_bench3_1kib 1,897 (539 MB/s) 503 (2035 MB/s) -1,394 -73.48% x 3.77 chacha12_bench4_16kib 30,811 (531 MB/s) 7,914 (2070 MB/s) -22,897 -74.31% x 3.89 chacha20_bench1_16b 51 (313 MB/s) 47 (340 MB/s) -4 -7.84% x 1.09 chacha20_bench2_256b 777 (329 MB/s) 212 (1207 MB/s) -565 -72.72% x 3.67 chacha20_bench3_1kib 3,088 (331 MB/s) 821 (1247 MB/s) -2,267 -73.41% x 3.76 chacha20_bench4_16kib 50,251 (326 MB/s) 13,001 (1260 MB/s) -37,250 -74.13% x 3.87 chacha8_bench1_16b 26 (615 MB/s) 19 (842 MB/s) -7 -26.92% x 1.37 chacha8_bench2_256b 335 (764 MB/s) 92 (2782 MB/s) -243 -72.54% x 3.64 chacha8_bench3_1kib 1,328 (771 MB/s) 344 (2976 MB/s) -984 -74.10% x 3.86 chacha8_bench4_16kib 21,184 (773 MB/s) 5,371 (3050 MB/s) -15,813 -74.65% x 3.94 ``` --- chacha20/src/backends.rs | 2 + chacha20/src/backends/neon.rs | 912 ++++++++++++---------------------- chacha20/src/lib.rs | 4 + 3 files changed, 330 insertions(+), 588 deletions(-) diff --git a/chacha20/src/backends.rs b/chacha20/src/backends.rs index 0d49d319..c5bbb226 100644 --- a/chacha20/src/backends.rs +++ b/chacha20/src/backends.rs @@ -15,6 +15,8 @@ cfg_if! { pub(crate) mod sse2; } } + } else if #[cfg(any(target_arch = "aarch64"))] { + pub(crate) mod neon; } else { pub(crate) mod soft; } diff --git a/chacha20/src/backends/neon.rs b/chacha20/src/backends/neon.rs index 4524b2ca..6df6cdaf 100644 --- a/chacha20/src/backends/neon.rs +++ b/chacha20/src/backends/neon.rs @@ -1,619 +1,355 @@ -//! The ChaCha20 core function. Defined in RFC 8439 Section 2.3. -//! -//! -//! -//! NEON-optimized implementation for aarch64 CPUs adapted from the SUPERCOP `dolbeau` -//! backend (public domain). - -use crate::{rounds::Rounds, BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE}; +//! NEON-optimized implementation for aarch64 CPUs adapted from the Crypto++ `chacha_simd` +//! implementation by Jack Lloyd and Jeffrey Walton (public domain). +use crate::{Block, StreamClosure, Unsigned, STATE_WORDS}; +use cipher::{ + consts::{U4, U64}, + BlockSizeUser, ParBlocks, ParBlocksSizeUser, StreamBackend, +}; use core::marker::PhantomData; -/// Size of buffers passed to `generate` and `apply_keystream` for this backend, which -/// operates on four blocks in parallel for optimal performance. -pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE * 4; - use core::arch::aarch64::*; -/// A composite type storing four consecutive ChaCha20 words from a single block. -#[derive(Clone, Copy)] -struct StateWord(uint32x4_t); - -impl StateWord { - #[inline] - #[target_feature(enable = "neon")] - unsafe fn from_bytes(bytes: &[u8]) -> Self { - debug_assert_eq!(bytes.len(), 16); - - // We read the bytes into a [u32; 4] because vld1q_u32 requires aligned memory. 
-        StateWord(vld1q_u32(
-            [
-                u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
-                u32::from_le_bytes(bytes[4..8].try_into().unwrap()),
-                u32::from_le_bytes(bytes[8..12].try_into().unwrap()),
-                u32::from_le_bytes(bytes[12..16].try_into().unwrap()),
-            ]
-            .as_ptr(),
-        ))
-    }
-
-    #[inline]
-    #[target_feature(enable = "neon")]
-    unsafe fn store(self, bytes: &mut [u8]) {
-        debug_assert_eq!(bytes.len(), 16);
-
-        // We write the word into a [u32; 4] because vst1q_u32 requires aligned memory.
-        let mut out = [0u32; 4];
-        vst1q_u32(out.as_mut_ptr(), self.0);
-
-        for (word, chunk) in core::array::IntoIter::new(out).zip(bytes.chunks_exact_mut(4)) {
-            chunk.copy_from_slice(&word.to_le_bytes());
-        }
-    }
-
-    #[inline]
-    #[must_use]
-    #[target_feature(enable = "neon")]
-    unsafe fn xor(self, rhs: Self) -> Self {
-        StateWord(veorq_u32(self.0, rhs.0))
-    }
+#[inline]
+#[target_feature(enable = "neon")]
+pub(crate) unsafe fn inner<R, F>(state: &mut [u32; STATE_WORDS], f: F)
+where
+    R: Unsigned,
+    F: StreamClosure<BlockSize = U64>,
+{
+    let mut backend = Backend::<R> {
+        state: [
+            vld1q_u32(state.as_ptr().offset(0)),
+            vld1q_u32(state.as_ptr().offset(4)),
+            vld1q_u32(state.as_ptr().offset(8)),
+            vld1q_u32(state.as_ptr().offset(12)),
+        ],
+        _pd: PhantomData,
+    };
+
+    f.call(&mut backend);
+
+    vst1q_u32(state.as_mut_ptr().offset(12), backend.state[3]);
 }
 
-/// A composite type storing the equivalent ChaCha20 word from four consecutive blocks.
-#[derive(Clone, Copy)]
-struct QuadWord(uint32x4_t);
-
-impl QuadWord {
-    #[inline]
-    #[target_feature(enable = "neon")]
-    unsafe fn from_u32(value: u32) -> Self {
-        QuadWord(vdupq_n_u32(value))
-    }
-
-    #[inline]
-    #[must_use]
-    #[target_feature(enable = "neon")]
-    unsafe fn add(self, rhs: Self) -> Self {
-        QuadWord(vaddq_u32(self.0, rhs.0))
-    }
-
-    #[inline]
-    #[must_use]
-    #[target_feature(enable = "neon")]
-    unsafe fn xor(self, rhs: Self) -> Self {
-        QuadWord(veorq_u32(self.0, rhs.0))
-    }
-
-    #[inline]
-    #[must_use]
-    #[target_feature(enable = "neon")]
-    unsafe fn rol<const BY: i32, const REST: i32>(self) -> Self {
-        QuadWord(vsliq_n_u32(vshrq_n_u32(self.0, REST), self.0, BY))
-    }
-
-    #[inline]
-    #[must_use]
-    #[target_feature(enable = "neon")]
-    unsafe fn rol_16(self) -> Self {
-        QuadWord(vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(
-            self.0,
-        ))))
-    }
+struct Backend<R: Unsigned> {
+    state: [uint32x4_t; 4],
+    _pd: PhantomData<R>,
 }
 
-/// The state required to process four blocks in parallel.
-///
-/// Unlike the AVX2 backend, where each state word holds a quarter of a single block, here
-/// each quad word holds a specific ChaCha word from all four blocks.
-struct State {
-    x_0: QuadWord,
-    x_1: QuadWord,
-    x_2: QuadWord,
-    x_3: QuadWord,
-    x_4: QuadWord,
-    x_5: QuadWord,
-    x_6: QuadWord,
-    x_7: QuadWord,
-    x_8: QuadWord,
-    x_9: QuadWord,
-    x_10: QuadWord,
-    x_11: QuadWord,
-    x_12: QuadWord,
-    x_13: QuadWord,
-    x_14: QuadWord,
-    x_15: QuadWord,
+impl<R: Unsigned> BlockSizeUser for Backend<R> {
+    type BlockSize = U64;
 }
 
-impl State {
-    /// Stores this state in the given output.
-    #[inline]
-    unsafe fn store(self, output: &mut [u8]) {
-        for (i, (a, b, c, d)) in core::array::IntoIter::new([
-            (self.x_0, self.x_1, self.x_2, self.x_3),
-            (self.x_4, self.x_5, self.x_6, self.x_7),
-            (self.x_8, self.x_9, self.x_10, self.x_11),
-            (self.x_12, self.x_13, self.x_14, self.x_15),
-        ])
-        .enumerate()
-        {
-            let start = 16 * i;
-            let (q_0, q_1, q_2, q_3) = transpose(a, b, c, d);
-
-            for (j, q) in core::array::IntoIter::new([q_0, q_1, q_2, q_3]).enumerate() {
-                let offset = 64 * j;
-                let chunk = &mut output[start + offset..start + offset + 16];
-                q.store(chunk);
-            }
-        }
-    }
-
-    /// XORs this state into the given output.
-    #[inline]
-    unsafe fn xor(self, output: &mut [u8]) {
-        for (i, (a, b, c, d)) in core::array::IntoIter::new([
-            (self.x_0, self.x_1, self.x_2, self.x_3),
-            (self.x_4, self.x_5, self.x_6, self.x_7),
-            (self.x_8, self.x_9, self.x_10, self.x_11),
-            (self.x_12, self.x_13, self.x_14, self.x_15),
-        ])
-        .enumerate()
-        {
-            let start = 16 * i;
-            let (q_0, q_1, q_2, q_3) = transpose(a, b, c, d);
-
-            for (j, q) in core::array::IntoIter::new([q_0, q_1, q_2, q_3]).enumerate() {
-                let offset = 64 * j;
-                let chunk = &mut output[start + offset..start + offset + 16];
-                let m = StateWord::from_bytes(chunk);
-                q.xor(m).store(chunk);
-            }
-        }
-    }
+impl<R: Unsigned> ParBlocksSizeUser for Backend<R> {
+    type ParBlocksSize = U4;
 }
 
-/// The ChaCha20 core function (NEON-optimized implementation for aarch64).
-#[derive(Clone)]
-pub struct Core<R: Rounds> {
-    x_0: QuadWord,
-    x_1: QuadWord,
-    x_2: QuadWord,
-    x_3: QuadWord,
-    x_4: QuadWord,
-    x_5: QuadWord,
-    x_6: QuadWord,
-    x_7: QuadWord,
-    x_8: QuadWord,
-    x_9: QuadWord,
-    x_10: QuadWord,
-    x_11: QuadWord,
-    x_14: QuadWord,
-    x_15: QuadWord,
-    rounds: PhantomData<R>,
+macro_rules! add64 {
+    ($a:expr, $b:expr) => {
+        vreinterpretq_u32_u64(vaddq_u64(
+            vreinterpretq_u64_u32($a),
+            vreinterpretq_u64_u32($b),
+        ))
+    };
 }
 
-impl<R: Rounds> Core<R> {
-    /// Initialize core function with the given key size, IV, and number of rounds.
-    #[inline]
-    pub fn new(key: &[u8; KEY_SIZE], iv: [u8; IV_SIZE]) -> Self {
-        let key = [
-            u32::from_le_bytes(key[0..4].try_into().unwrap()),
-            u32::from_le_bytes(key[4..8].try_into().unwrap()),
-            u32::from_le_bytes(key[8..12].try_into().unwrap()),
-            u32::from_le_bytes(key[12..16].try_into().unwrap()),
-            u32::from_le_bytes(key[16..20].try_into().unwrap()),
-            u32::from_le_bytes(key[20..24].try_into().unwrap()),
-            u32::from_le_bytes(key[24..28].try_into().unwrap()),
-            u32::from_le_bytes(key[28..32].try_into().unwrap()),
-        ];
-        let iv = [
-            u32::from_le_bytes(iv[..4].try_into().unwrap()),
-            u32::from_le_bytes(iv[4..].try_into().unwrap()),
-        ];
-
-        unsafe { Self::from_key_and_iv(key, iv) }
-    }
-
-    #[inline]
-    #[target_feature(enable = "neon")]
-    unsafe fn from_key_and_iv(key: [u32; KEY_SIZE / 4], iv: [u32; IV_SIZE / 4]) -> Self {
-        Self {
-            x_0: QuadWord::from_u32(CONSTANTS[0]),
-            x_1: QuadWord::from_u32(CONSTANTS[1]),
-            x_2: QuadWord::from_u32(CONSTANTS[2]),
-            x_3: QuadWord::from_u32(CONSTANTS[3]),
-            x_4: QuadWord::from_u32(key[0]),
-            x_5: QuadWord::from_u32(key[1]),
-            x_6: QuadWord::from_u32(key[2]),
-            x_7: QuadWord::from_u32(key[3]),
-            x_8: QuadWord::from_u32(key[4]),
-            x_9: QuadWord::from_u32(key[5]),
-            x_10: QuadWord::from_u32(key[6]),
-            x_11: QuadWord::from_u32(key[7]),
-            x_14: QuadWord::from_u32(iv[0]),
-            x_15: QuadWord::from_u32(iv[1]),
-            rounds: PhantomData,
-        }
-    }
-
-    #[inline]
-    pub fn generate(&self, counter: u64, output: &mut [u8]) {
-        debug_assert_eq!(output.len(), BUFFER_SIZE);
-
+impl<R: Unsigned> StreamBackend for Backend<R> {
+    #[inline(always)]
+    fn gen_ks_block(&mut self, block: &mut Block) {
+        let state3 = self.state[3];
+        let mut par = ParBlocks::<Self>::default();
+        self.gen_par_ks_blocks(&mut par);
+        *block = par[0];
         unsafe {
-            let (x_12, x_13) = counter_setup(counter);
-            let state = State {
-                x_0: self.x_0,
-                x_1: self.x_1,
-                x_2: self.x_2,
-                x_3: self.x_3,
-                x_4: self.x_4,
-                x_5: self.x_5,
-                x_6: self.x_6,
-                x_7: self.x_7,
-                x_8: self.x_8,
-                x_9: self.x_9,
-                x_10: self.x_10,
-                x_11: self.x_11,
-                x_12,
-                x_13,
-                x_14: self.x_14,
-                x_15: self.x_15,
-            };
-            let state = self.rounds(state);
-            state.store(output);
+            self.state[3] = add64!(state3, vld1q_u32([1, 0, 0, 0].as_ptr()));
         }
     }
 
-    #[inline]
-    #[cfg(feature = "cipher")]
-    pub fn apply_keystream(&self, counter: u64, output: &mut [u8]) {
-        debug_assert_eq!(output.len(), BUFFER_SIZE);
-
-        unsafe {
-            let (x_12, x_13) = counter_setup(counter);
-            let state = State {
-                x_0: self.x_0,
-                x_1: self.x_1,
-                x_2: self.x_2,
-                x_3: self.x_3,
-                x_4: self.x_4,
-                x_5: self.x_5,
-                x_6: self.x_6,
-                x_7: self.x_7,
-                x_8: self.x_8,
-                x_9: self.x_9,
-                x_10: self.x_10,
-                x_11: self.x_11,
-                x_12,
-                x_13,
-                x_14: self.x_14,
-                x_15: self.x_15,
+    #[inline(always)]
+    fn gen_par_ks_blocks(&mut self, blocks: &mut ParBlocks<Self>) {
+        macro_rules! rotate_left {
+            ($v:ident, 8) => {{
+                let maskb = [3u8, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14];
+                let mask = vld1q_u8(maskb.as_ptr());
+
+                vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32($v), mask))
+            }};
+            ($v:ident, 16) => {
+                vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32($v)))
+            };
+            ($v:ident, $r:literal) => {
+                vorrq_u32(vshlq_n_u32($v, $r), vshrq_n_u32($v, 32 - $r))
+            };
-            let state = self.rounds(state);
-            state.xor(output);
-        }
-    }
-
-    #[inline]
-    #[target_feature(enable = "neon")]
-    unsafe fn rounds(&self, mut state: State) -> State {
-        let x_12_orig = state.x_12;
-        let x_13_orig = state.x_13;
-
-        for _ in 0..(R::COUNT / 2) {
-            state = double_quarter_round(state);
-        }
-
-        State {
-            x_0: state.x_0.add(self.x_0),
-            x_1: state.x_1.add(self.x_1),
-            x_2: state.x_2.add(self.x_2),
-            x_3: state.x_3.add(self.x_3),
-            x_4: state.x_4.add(self.x_4),
-            x_5: state.x_5.add(self.x_5),
-            x_6: state.x_6.add(self.x_6),
-            x_7: state.x_7.add(self.x_7),
-            x_8: state.x_8.add(self.x_8),
-            x_9: state.x_9.add(self.x_9),
-            x_10: state.x_10.add(self.x_10),
-            x_11: state.x_11.add(self.x_11),
-            x_12: state.x_12.add(x_12_orig),
-            x_13: state.x_13.add(x_13_orig),
-            x_14: state.x_14.add(self.x_14),
-            x_15: state.x_15.add(self.x_15),
-        }
-    }
-}
-
-/// Prepares the following structure:
-///
-/// ```text
-/// c_0 = counter + 0
-/// c_1 = counter + 1
-/// c_2 = counter + 2
-/// c_3 = counter + 3
-///
-/// (
-///     [lo(c_0), lo(c_1), lo(c_2), lo(c_3)],
-///     [hi(c_0), hi(c_1), hi(c_2), hi(c_3)],
-/// )
-/// ```
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn counter_setup(counter: u64) -> (QuadWord, QuadWord) {
-    // add_01 = [0, 1]
-    // add_23 = [2, 3]
-    let add_01 = vcombine_u64(vcreate_u64(0), vcreate_u64(1));
-    let add_23 = vcombine_u64(vcreate_u64(2), vcreate_u64(3));
-
-    // c = [counter, counter]
-    let c = vdupq_n_u64(counter);
-
-    // c_01 = [counter + 0, counter + 1]
-    //      = [lo(c_0), hi(c_0), lo(c_1), hi(c_1)]
-    // c_23 = [counter + 2, counter + 3]
-    //      = [lo(c_2), hi(c_2), lo(c_3), hi(c_3)]
-    let c_01 = vreinterpretq_u32_u64(vaddq_u64(c, add_01));
-    let c_23 = vreinterpretq_u32_u64(vaddq_u64(c, add_23));
-
-    (
-        // [lo(c_0), lo(c_1), lo(c_2), lo(c_3)]
-        QuadWord(vuzp1q_u32(c_01, c_23)),
-        // [hi(c_0), hi(c_1), hi(c_2), hi(c_3)]
-        QuadWord(vuzp2q_u32(c_01, c_23)),
-    )
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn double_quarter_round(mut state: State) -> State {
-    vec4_quarter_round(
-        &mut state.x_0,
-        &mut state.x_4,
-        &mut state.x_8,
-        &mut state.x_12,
-    );
-    vec4_quarter_round(
-        &mut state.x_1,
-        &mut state.x_5,
-        &mut state.x_9,
-        &mut state.x_13,
-    );
-    vec4_quarter_round(
-        &mut state.x_2,
-        &mut state.x_6,
-        &mut state.x_10,
-        &mut state.x_14,
-    );
-    vec4_quarter_round(
-        &mut state.x_3,
-        &mut state.x_7,
-        &mut state.x_11,
-        &mut state.x_15,
-    );
-    vec4_quarter_round(
-        &mut state.x_0,
-        &mut state.x_5,
-        &mut state.x_10,
-        &mut state.x_15,
-    );
-    vec4_quarter_round(
-        &mut state.x_1,
-        &mut state.x_6,
-        &mut state.x_11,
-        &mut state.x_12,
-    );
-    vec4_quarter_round(
-        &mut state.x_2,
-        &mut state.x_7,
-        &mut state.x_8,
-        &mut state.x_13,
-    );
-    vec4_quarter_round(
-        &mut state.x_3,
-        &mut state.x_4,
-        &mut state.x_9,
-        &mut state.x_14,
-    );
-
-    state
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vec4_quarter_round(
-    a: &mut QuadWord,
-    b: &mut QuadWord,
-    c: &mut QuadWord,
-    d: &mut QuadWord,
-) {
-    // a += b; d ^= a; d <<<= (16, 16, 16, 16);
-    *a = a.add(*b);
-    *d = d.xor(*a).rol_16();
-
-    // c += d; b ^= c; b <<<= (12, 12, 12, 12);
-    *c = c.add(*d);
-    *b = b.xor(*c).rol::<12, 20>();
-
-    // a += b; d ^= a; d <<<= (8, 8, 8, 8);
-    *a = a.add(*b);
-    *d = d.xor(*a).rol::<8, 24>();
-
-    // c += d; b ^= c; b <<<= (7, 7, 7, 7);
-    *c = c.add(*d);
-    *b = b.xor(*c).rol::<7, 25>();
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn transpose(
-    a: QuadWord,
-    b: QuadWord,
-    c: QuadWord,
-    d: QuadWord,
-) -> (StateWord, StateWord, StateWord, StateWord) {
-    // Input:
-    //
-    // a = [a_0, a_1, a_2, a_3]
-    // b = [b_0, b_1, b_2, b_3]
-    // c = [c_0, c_1, c_2, c_3]
-    // d = [d_0, d_1, d_2, d_3]
-
-    // t0dq = (
-    //     [a_0, b_0, a_2, b_2],
-    //     [a_1, b_1, a_3, b_3],
-    // )
-    // t1dq = (
-    //     [c_0, d_0, c_2, d_2],
-    //     [c_1, d_1, c_3, d_3],
-    // )
-    let t0dq = (vtrn1q_u32(a.0, b.0), vtrn2q_u32(a.0, b.0));
-    let t1dq = (vtrn1q_u32(c.0, d.0), vtrn2q_u32(c.0, d.0));
-
-    // Output:
-    // (
-    //     [a_0, b_0, c_0, d_0],
-    //     [a_1, b_1, c_1, d_1],
-    //     [a_2, b_2, c_2, d_2],
-    //     [a_3, b_3, c_3, d_3],
-    // )
-    (
-        StateWord(vreinterpretq_u32_u64(vcombine_u64(
-            vget_low_u64(vreinterpretq_u64_u32(t0dq.0)),
-            vget_low_u64(vreinterpretq_u64_u32(t1dq.0)),
-        ))),
-        StateWord(vreinterpretq_u32_u64(vcombine_u64(
-            vget_low_u64(vreinterpretq_u64_u32(t0dq.1)),
-            vget_low_u64(vreinterpretq_u64_u32(t1dq.1)),
-        ))),
-        StateWord(vreinterpretq_u32_u64(vcombine_u64(
-            vget_high_u64(vreinterpretq_u64_u32(t0dq.0)),
-            vget_high_u64(vreinterpretq_u64_u32(t1dq.0)),
-        ))),
-        StateWord(vreinterpretq_u32_u64(vcombine_u64(
-            vget_high_u64(vreinterpretq_u64_u32(t0dq.1)),
-            vget_high_u64(vreinterpretq_u64_u32(t1dq.1)),
-        ))),
-    )
-}
-#[cfg(all(test, target_feature = "neon"))]
-mod tests {
-    use super::*;
-    use crate::rounds::R20;
-    use crate::{backend::soft, BLOCK_SIZE};
-
-    // random inputs for testing
-    const R_CNT: u64 = 0x9fe625b6d23a8fa8u64;
-    const R_IV: [u8; IV_SIZE] = [0x2f, 0x96, 0xa8, 0x4a, 0xf8, 0x92, 0xbc, 0x94];
-    const R_KEY: [u8; KEY_SIZE] = [
-        0x11, 0xf2, 0x72, 0x99, 0xe1, 0x79, 0x6d, 0xef, 0xb, 0xdc, 0x6a, 0x58, 0x1f, 0x1, 0x58,
-        0x94, 0x92, 0x19, 0x69, 0x3f, 0xe9, 0x35, 0x16, 0x72, 0x63, 0xd1, 0xd, 0x94, 0x6d, 0x31,
-        0x34, 0x11,
-    ];
-
-    #[test]
-    fn init_and_store() {
-        let core = Core::<R20>::new(&R_KEY, R_IV);
-
-        unsafe {
-            let (x_12, x_13) = counter_setup(R_CNT);
-            let state = State {
-                x_0: core.x_0,
-                x_1: core.x_1,
-                x_2: core.x_2,
-                x_3: core.x_3,
-                x_4: core.x_4,
-                x_5: core.x_5,
-                x_6: core.x_6,
-                x_7: core.x_7,
-                x_8: core.x_8,
-                x_9: core.x_9,
-                x_10: core.x_10,
-                x_11: core.x_11,
-                x_12,
-                x_13,
-                x_14: core.x_14,
-                x_15: core.x_15,
+        macro_rules! extract {
+            ($v:ident, $s:literal) => {
+                vextq_u32($v, $v, $s)
             };
-
-            let mut output = [0u8; BUFFER_SIZE];
-            state.store(&mut output);
-
-            let expected = [
-                0x6170_7865,
-                0x3320_646e,
-                0x7962_2d32,
-                0x6b20_6574,
-                0x9972_f211,
-                0xef6d_79e1,
-                0x586a_dc0b,
-                0x9458_011f,
-                0x3f69_1992,
-                0x7216_35e9,
-                0x940d_d163,
-                0x1134_316d,
-                0xd23a_8fa8,
-                0x9fe6_25b6,
-                0x4aa8_962f,
-                0x94bc_92f8,
-            ];
-
-            for (i, chunk) in output[..BLOCK_SIZE].chunks(4).enumerate() {
-                assert_eq!(expected[i], u32::from_le_bytes(chunk.try_into().unwrap()));
-            }
         }
-    }
-
-    #[test]
-    fn init_and_double_round() {
-        let core = Core::<R20>::new(&R_KEY, R_IV);
 
         unsafe {
-            let (x_12, x_13) = counter_setup(R_CNT);
-            let state = State {
-                x_0: core.x_0,
-                x_1: core.x_1,
-                x_2: core.x_2,
-                x_3: core.x_3,
-                x_4: core.x_4,
-                x_5: core.x_5,
-                x_6: core.x_6,
-                x_7: core.x_7,
-                x_8: core.x_8,
-                x_9: core.x_9,
-                x_10: core.x_10,
-                x_11: core.x_11,
-                x_12,
-                x_13,
-                x_14: core.x_14,
-                x_15: core.x_15,
-            };
-
-            let state = double_quarter_round(state);
-
-            let mut output = [0u8; BUFFER_SIZE];
-            state.store(&mut output);
-
-            let expected = [
-                562456049, 3130322832, 1534507163, 1938142593, 1427879055, 3727017100, 1549525649,
-                2358041203, 1010155040, 657444539, 2865892668, 2826477124, 737507996, 3254278724,
-                3376929372, 928763221,
+            let ctrs = [
+                vld1q_u32([1, 0, 0, 0].as_ptr()),
+                vld1q_u32([2, 0, 0, 0].as_ptr()),
+                vld1q_u32([3, 0, 0, 0].as_ptr()),
+                vld1q_u32([4, 0, 0, 0].as_ptr()),
             ];
 
-            for (i, chunk) in output[..BLOCK_SIZE].chunks(4).enumerate() {
-                assert_eq!(expected[i], u32::from_le_bytes(chunk.try_into().unwrap()));
+            let mut r0_0 = self.state[0];
+            let mut r0_1 = self.state[1];
+            let mut r0_2 = self.state[2];
+            let mut r0_3 = self.state[3];
+
+            let mut r1_0 = self.state[0];
+            let mut r1_1 = self.state[1];
+            let mut r1_2 = self.state[2];
+            let mut r1_3 = add64!(r0_3, ctrs[0]);
+
+            let mut r2_0 = self.state[0];
+            let mut r2_1 = self.state[1];
+            let mut r2_2 = self.state[2];
+            let mut r2_3 = add64!(r0_3, ctrs[1]);
+
+            let mut r3_0 = self.state[0];
+            let mut r3_1 = self.state[1];
+            let mut r3_2 = self.state[2];
+            let mut r3_3 = add64!(r0_3, ctrs[2]);
+
+            for _ in 0..R::USIZE {
+                r0_0 = vaddq_u32(r0_0, r0_1);
+                r1_0 = vaddq_u32(r1_0, r1_1);
+                r2_0 = vaddq_u32(r2_0, r2_1);
+                r3_0 = vaddq_u32(r3_0, r3_1);
+
+                r0_3 = veorq_u32(r0_3, r0_0);
+                r1_3 = veorq_u32(r1_3, r1_0);
+                r2_3 = veorq_u32(r2_3, r2_0);
+                r3_3 = veorq_u32(r3_3, r3_0);
+
+                r0_3 = rotate_left!(r0_3, 16);
+                r1_3 = rotate_left!(r1_3, 16);
+                r2_3 = rotate_left!(r2_3, 16);
+                r3_3 = rotate_left!(r3_3, 16);
+
+                r0_2 = vaddq_u32(r0_2, r0_3);
+                r1_2 = vaddq_u32(r1_2, r1_3);
+                r2_2 = vaddq_u32(r2_2, r2_3);
+                r3_2 = vaddq_u32(r3_2, r3_3);
+
+                r0_1 = veorq_u32(r0_1, r0_2);
+                r1_1 = veorq_u32(r1_1, r1_2);
+                r2_1 = veorq_u32(r2_1, r2_2);
+                r3_1 = veorq_u32(r3_1, r3_2);
+
+                r0_1 = rotate_left!(r0_1, 12);
+                r1_1 = rotate_left!(r1_1, 12);
+                r2_1 = rotate_left!(r2_1, 12);
+                r3_1 = rotate_left!(r3_1, 12);
+
+                r0_0 = vaddq_u32(r0_0, r0_1);
+                r1_0 = vaddq_u32(r1_0, r1_1);
+                r2_0 = vaddq_u32(r2_0, r2_1);
+                r3_0 = vaddq_u32(r3_0, r3_1);
+
+                r0_3 = veorq_u32(r0_3, r0_0);
+                r1_3 = veorq_u32(r1_3, r1_0);
+                r2_3 = veorq_u32(r2_3, r2_0);
+                r3_3 = veorq_u32(r3_3, r3_0);
+
+                r0_3 = rotate_left!(r0_3, 8);
+                r1_3 = rotate_left!(r1_3, 8);
+                r2_3 = rotate_left!(r2_3, 8);
+                r3_3 = rotate_left!(r3_3, 8);
+
+                r0_2 = vaddq_u32(r0_2, r0_3);
+                r1_2 = vaddq_u32(r1_2, r1_3);
+                r2_2 = vaddq_u32(r2_2, r2_3);
+                r3_2 = vaddq_u32(r3_2, r3_3);
+
+                r0_1 = veorq_u32(r0_1, r0_2);
+                r1_1 = veorq_u32(r1_1, r1_2);
+                r2_1 = veorq_u32(r2_1, r2_2);
+                r3_1 = veorq_u32(r3_1, r3_2);
+
+                r0_1 = rotate_left!(r0_1, 7);
+                r1_1 = rotate_left!(r1_1, 7);
+                r2_1 = rotate_left!(r2_1, 7);
+                r3_1 = rotate_left!(r3_1, 7);
+
+                r0_1 = extract!(r0_1, 1);
+                r0_2 = extract!(r0_2, 2);
+                r0_3 = extract!(r0_3, 3);
+
+                r1_1 = extract!(r1_1, 1);
+                r1_2 = extract!(r1_2, 2);
+                r1_3 = extract!(r1_3, 3);
+
+                r2_1 = extract!(r2_1, 1);
+                r2_2 = extract!(r2_2, 2);
+                r2_3 = extract!(r2_3, 3);
+
+                r3_1 = extract!(r3_1, 1);
+                r3_2 = extract!(r3_2, 2);
+                r3_3 = extract!(r3_3, 3);
+
+                r0_0 = vaddq_u32(r0_0, r0_1);
+                r1_0 = vaddq_u32(r1_0, r1_1);
+                r2_0 = vaddq_u32(r2_0, r2_1);
+                r3_0 = vaddq_u32(r3_0, r3_1);
+
+                r0_3 = veorq_u32(r0_3, r0_0);
+                r1_3 = veorq_u32(r1_3, r1_0);
+                r2_3 = veorq_u32(r2_3, r2_0);
+                r3_3 = veorq_u32(r3_3, r3_0);
+
+                r0_3 = rotate_left!(r0_3, 16);
+                r1_3 = rotate_left!(r1_3, 16);
+                r2_3 = rotate_left!(r2_3, 16);
+                r3_3 = rotate_left!(r3_3, 16);
+
+                r0_2 = vaddq_u32(r0_2, r0_3);
+                r1_2 = vaddq_u32(r1_2, r1_3);
+                r2_2 = vaddq_u32(r2_2, r2_3);
+                r3_2 = vaddq_u32(r3_2, r3_3);
+
+                r0_1 = veorq_u32(r0_1, r0_2);
+                r1_1 = veorq_u32(r1_1, r1_2);
+                r2_1 = veorq_u32(r2_1, r2_2);
+                r3_1 = veorq_u32(r3_1, r3_2);
+
+                r0_1 = rotate_left!(r0_1, 12);
+                r1_1 = rotate_left!(r1_1, 12);
+                r2_1 = rotate_left!(r2_1, 12);
+                r3_1 = rotate_left!(r3_1, 12);
+
+                r0_0 = vaddq_u32(r0_0, r0_1);
+                r1_0 = vaddq_u32(r1_0, r1_1);
+                r2_0 = vaddq_u32(r2_0, r2_1);
+                r3_0 = vaddq_u32(r3_0, r3_1);
+
+                r0_3 = veorq_u32(r0_3, r0_0);
+                r1_3 = veorq_u32(r1_3, r1_0);
+                r2_3 = veorq_u32(r2_3, r2_0);
+                r3_3 = veorq_u32(r3_3, r3_0);
+
+                r0_3 = rotate_left!(r0_3, 8);
+                r1_3 = rotate_left!(r1_3, 8);
+                r2_3 = rotate_left!(r2_3, 8);
+                r3_3 = rotate_left!(r3_3, 8);
+
+                r0_2 = vaddq_u32(r0_2, r0_3);
+                r1_2 = vaddq_u32(r1_2, r1_3);
+                r2_2 = vaddq_u32(r2_2, r2_3);
+                r3_2 = vaddq_u32(r3_2, r3_3);
+
+                r0_1 = veorq_u32(r0_1, r0_2);
+                r1_1 = veorq_u32(r1_1, r1_2);
+                r2_1 = veorq_u32(r2_1, r2_2);
+                r3_1 = veorq_u32(r3_1, r3_2);
+
+                r0_1 = rotate_left!(r0_1, 7);
+                r1_1 = rotate_left!(r1_1, 7);
+                r2_1 = rotate_left!(r2_1, 7);
+                r3_1 = rotate_left!(r3_1, 7);
+
+                r0_1 = extract!(r0_1, 3);
+                r0_2 = extract!(r0_2, 2);
+                r0_3 = extract!(r0_3, 1);
+
+                r1_1 = extract!(r1_1, 3);
+                r1_2 = extract!(r1_2, 2);
+                r1_3 = extract!(r1_3, 1);
+
+                r2_1 = extract!(r2_1, 3);
+                r2_2 = extract!(r2_2, 2);
+                r2_3 = extract!(r2_3, 1);
+
+                r3_1 = extract!(r3_1, 3);
+                r3_2 = extract!(r3_2, 2);
+                r3_3 = extract!(r3_3, 1);
+            }
-    }
-
-    #[test]
-    fn generate_vs_scalar_impl() {
-        let mut soft_result = [0u8; soft::BUFFER_SIZE];
-        soft::Core::<R20>::new(&R_KEY, R_IV).generate(R_CNT, &mut soft_result);
-        let mut simd_result = [0u8; BUFFER_SIZE];
-        Core::<R20>::new(&R_KEY, R_IV).generate(R_CNT, &mut simd_result);
-
-        assert_eq!(&soft_result[..], &simd_result[..soft::BUFFER_SIZE])
+            r0_0 = vaddq_u32(r0_0, self.state[0]);
+            r0_1 = vaddq_u32(r0_1, self.state[1]);
+            r0_2 = vaddq_u32(r0_2, self.state[2]);
+            r0_3 = vaddq_u32(r0_3, self.state[3]);
+
+            r1_0 = vaddq_u32(r1_0, self.state[0]);
+            r1_1 = vaddq_u32(r1_1, self.state[1]);
+            r1_2 = vaddq_u32(r1_2, self.state[2]);
+            r1_3 = vaddq_u32(r1_3, self.state[3]);
+            r1_3 = add64!(r1_3, ctrs[0]);
+
+            r2_0 = vaddq_u32(r2_0, self.state[0]);
+            r2_1 = vaddq_u32(r2_1, self.state[1]);
+            r2_2 = vaddq_u32(r2_2, self.state[2]);
+            r2_3 = vaddq_u32(r2_3, self.state[3]);
+            r2_3 = add64!(r2_3, ctrs[1]);
+
+            r3_0 = vaddq_u32(r3_0, self.state[0]);
+            r3_1 = vaddq_u32(r3_1, self.state[1]);
+            r3_2 = vaddq_u32(r3_2, self.state[2]);
+            r3_3 = vaddq_u32(r3_3, self.state[3]);
+            r3_3 = add64!(r3_3, ctrs[2]);
+
+            vst1q_u8(blocks[0].as_mut_ptr().offset(0), vreinterpretq_u8_u32(r0_0));
+            vst1q_u8(
+                blocks[0].as_mut_ptr().offset(16),
+                vreinterpretq_u8_u32(r0_1),
+            );
+            vst1q_u8(
+                blocks[0].as_mut_ptr().offset(2 * 16),
+                vreinterpretq_u8_u32(r0_2),
+            );
+            vst1q_u8(
+                blocks[0].as_mut_ptr().offset(3 * 16),
+                vreinterpretq_u8_u32(r0_3),
+            );
+
+            vst1q_u8(blocks[1].as_mut_ptr().offset(0), vreinterpretq_u8_u32(r1_0));
+            vst1q_u8(
+                blocks[1].as_mut_ptr().offset(16),
+                vreinterpretq_u8_u32(r1_1),
+            );
+            vst1q_u8(
+                blocks[1].as_mut_ptr().offset(2 * 16),
+                vreinterpretq_u8_u32(r1_2),
+            );
+            vst1q_u8(
+                blocks[1].as_mut_ptr().offset(3 * 16),
+                vreinterpretq_u8_u32(r1_3),
+            );
+
+            vst1q_u8(blocks[2].as_mut_ptr().offset(0), vreinterpretq_u8_u32(r2_0));
+            vst1q_u8(
+                blocks[2].as_mut_ptr().offset(16),
+                vreinterpretq_u8_u32(r2_1),
+            );
+            vst1q_u8(
+                blocks[2].as_mut_ptr().offset(2 * 16),
+                vreinterpretq_u8_u32(r2_2),
+            );
+            vst1q_u8(
+                blocks[2].as_mut_ptr().offset(3 * 16),
+                vreinterpretq_u8_u32(r2_3),
+            );
+
+            vst1q_u8(blocks[3].as_mut_ptr().offset(0), vreinterpretq_u8_u32(r3_0));
+            vst1q_u8(
+                blocks[3].as_mut_ptr().offset(16),
+                vreinterpretq_u8_u32(r3_1),
+            );
+            vst1q_u8(
+                blocks[3].as_mut_ptr().offset(2 * 16),
+                vreinterpretq_u8_u32(r3_2),
+            );
+            vst1q_u8(
+                blocks[3].as_mut_ptr().offset(3 * 16),
+                vreinterpretq_u8_u32(r3_3),
+            );
+
+            self.state[3] = add64!(self.state[3], ctrs[3]);
+        }
+    }
 }
diff --git a/chacha20/src/lib.rs b/chacha20/src/lib.rs
index 576bf383..9890c936 100644
--- a/chacha20/src/lib.rs
+++ b/chacha20/src/lib.rs
@@ -282,6 +282,10 @@ impl<R: Unsigned> StreamCipherCore for ChaChaCore<R> {
                 }
             }
         }
+            } else if #[cfg(any(target_arch = "aarch64"))] {
+                unsafe {
+                    backends::neon::inner::<R, _>(&mut self.state, f);
+                }
             } else {
                 f.call(&mut backends::soft::Backend(self));
             }

From 35492bbbd4ba7303080580d9d96062a4d85f1568 Mon Sep 17 00:00:00 2001
From: Coda Hale
Date: Sat, 29 Oct 2022 19:55:24 -0600
Subject: [PATCH 2/4] chacha20: Gate NEON backend behind `neon` feature.

---
 chacha20/Cargo.toml      | 1 +
 chacha20/src/backends.rs | 2 +-
 chacha20/src/lib.rs      | 6 +-----
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/chacha20/Cargo.toml b/chacha20/Cargo.toml
index ad2b6aad..f901b9cd 100644
--- a/chacha20/Cargo.toml
+++ b/chacha20/Cargo.toml
@@ -32,6 +32,7 @@ hex-literal = "0.3.3"
 [features]
 std = ["cipher/std"]
 zeroize = ["cipher/zeroize"]
+neon = []
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/chacha20/src/backends.rs b/chacha20/src/backends.rs
index c5bbb226..44aa77ce 100644
--- a/chacha20/src/backends.rs
+++ b/chacha20/src/backends.rs
@@ -15,7 +15,7 @@ cfg_if! {
                 pub(crate) mod sse2;
             }
         }
-    } else if #[cfg(any(target_arch = "aarch64"))] {
+    } else if #[cfg(all(feature = "neon", target_arch = "aarch64", target_feature = "neon"))] {
         pub(crate) mod neon;
     } else {
         pub(crate) mod soft;
diff --git a/chacha20/src/lib.rs b/chacha20/src/lib.rs
index 9890c936..d40fcfb5 100644
--- a/chacha20/src/lib.rs
+++ b/chacha20/src/lib.rs
@@ -105,10 +105,6 @@
     html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/media/8f1a9894/logo.svg",
     html_root_url = "https://docs.rs/chacha20/0.9.0"
 )]
-#![cfg_attr(
-    all(feature = "neon", target_arch = "aarch64", target_feature = "neon"),
-    feature(stdsimd, aarch64_target_feature)
-)]
 #![warn(missing_docs, rust_2018_idioms, trivial_casts, unused_qualifications)]
 #![allow(clippy::needless_range_loop)]
 
@@ -282,7 +278,7 @@ impl<R: Unsigned> StreamCipherCore for ChaChaCore<R> {
                 }
             }
         }
-            } else if #[cfg(any(target_arch = "aarch64"))] {
+            } else if #[cfg(all(feature = "neon", target_arch = "aarch64", target_feature = "neon"))] {
                 unsafe {
                     backends::neon::inner::<R, _>(&mut self.state, f);
                 }

From 5cb21002a0c24f1479cbbefe7f6dea5f19c4791f Mon Sep 17 00:00:00 2001
From: Coda Hale
Date: Sun, 30 Oct 2022 08:31:09 -0600
Subject: [PATCH 3/4] chacha20: Re-enable CI for NEON backend

---
 .github/workflows/chacha20.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/chacha20.yml b/.github/workflows/chacha20.yml
index af99d0d2..13cfb708 100644
--- a/.github/workflows/chacha20.yml
+++ b/.github/workflows/chacha20.yml
@@ -197,9 +197,9 @@ jobs:
           rust: stable
 
         # ARM64 with NEON backend
-        #- target: aarch64-unknown-linux-gnu
-        #  rust: nightly
-        #  features: --features neon
+        - target: aarch64-unknown-linux-gnu
+          rust: stable
+          features: --features neon
 
         # PPC32
         - target: powerpc-unknown-linux-gnu

From b748d1e0066cd76712a8005afb9c7f13c51f5e34 Mon Sep 17 00:00:00 2001
From: Coda Hale
Date: Sun, 30 Oct 2022 08:35:35 -0600
Subject: [PATCH 4/4] chacha20: Update README to include `neon`

---
 chacha20/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/chacha20/README.md b/chacha20/README.md
index a4450852..2ff7d5bb 100644
--- a/chacha20/README.md
+++ b/chacha20/README.md
@@ -34,6 +34,8 @@ work on stable Rust with the following `RUSTFLAGS`:
 - `x86` / `x86_64`
   - `avx2`: (~1.4cpb) `-Ctarget-cpu=haswell -Ctarget-feature=+avx2`
   - `sse2`: (~2.5cpb) `-Ctarget-feature=+sse2` (on by default on x86 CPUs)
+- `aarch64`
+  - `neon`: (~2-3x faster than `soft`) requires Rust 1.61+ and the `neon` feature enabled
 - Portable
   - `soft`: (~5 cpb on x86/x86_64)
 
@@ -76,8 +78,8 @@ done with a minor version bump.
 
 Licensed under either of:
 
-* [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
-* [MIT license](http://opensource.org/licenses/MIT)
+- [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
+- [MIT license](http://opensource.org/licenses/MIT)
 
 at your option.
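
A quick usage note for anyone trying this series out: backend selection happens at compile time, so existing callers need no code changes — building on aarch64 with `--features neon` is enough to get the NEON path. Below is a minimal sketch of driving `chacha20` through the `cipher` traits it implements; the key and nonce here are placeholder values for illustration, not anything taken from the patches.

```rust
use chacha20::ChaCha20;
// The `chacha20` crate re-exports the `cipher` crate's traits.
use chacha20::cipher::{KeyIvInit, StreamCipher};

fn main() {
    // Placeholder key/nonce; real ones must come from a CSPRNG.
    let key = [0x42u8; 32];
    let nonce = [0x24u8; 12];

    // Encrypt in place; on aarch64 with the `neon` feature enabled,
    // this transparently runs the new NEON backend.
    let mut cipher = ChaCha20::new(&key.into(), &nonce.into());
    let mut buf = *b"an example plaintext";
    cipher.apply_keystream(&mut buf);

    // The keystream is position-dependent, so decryption restarts
    // from the same key/nonce and applies the keystream again.
    let mut cipher = ChaCha20::new(&key.into(), &nonce.into());
    cipher.apply_keystream(&mut buf);
    assert_eq!(&buf, b"an example plaintext");
}
```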