-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
poly1305: AVX2 backend for x86 and x86_64
Originally derived from Goll and Gueron's AVX2 C code. The logic has been extensively rewritten and documented, and several bugs in the original C code were fixed.
- Loading branch information
Showing
3 changed files
with
1,971 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
//! AVX2 implementation of the Poly1305 state machine. | ||
// The State struct and its logic was originally derived from Goll and Gueron's AVX2 C | ||
// code: | ||
// [Vectorization of Poly1305 message authentication code](https://ieeexplore.ieee.org/document/7113463) | ||
// | ||
// which was sourced from Bhattacharyya and Sarkar's modified variant: | ||
// [Improved SIMD Implementation of Poly1305](https://eprint.iacr.org/2019/842) | ||
// https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305 | ||
// | ||
// The logic has been extensively rewritten and documented, and several bugs in the | ||
// original C code were fixed. | ||
// | ||
// Note that State only implements the original Goll-Gueron algorithm, not the | ||
// optimisations provided by Bhattacharyya and Sarkar. The latter require the message | ||
// length to be known, which is incompatible with the streaming API of UniversalHash. | ||
|
||
use universal_hash::generic_array::GenericArray; | ||
|
||
use crate::{Block, Key, Tag, BLOCK_SIZE}; | ||
|
||
mod helpers; | ||
use self::helpers::*; | ||
|
||
const BLOCK_X4_SIZE: usize = BLOCK_SIZE * 4; | ||
|
||
#[derive(Clone)] | ||
struct Initialized { | ||
p: Aligned4x130, | ||
m: SpacedMultiplier4x130, | ||
r4: PrecomputedMultiplier, | ||
} | ||
|
||
#[derive(Clone)] | ||
pub(crate) struct State { | ||
k: AdditionKey, | ||
r1: PrecomputedMultiplier, | ||
r2: PrecomputedMultiplier, | ||
cached_blocks: [u8; BLOCK_X4_SIZE], | ||
num_cached_blocks: usize, | ||
partial_block: Option<Block>, | ||
initialized: Option<Initialized>, | ||
} | ||
|
||
impl State { | ||
/// Initialize Poly1305 state with the given key | ||
pub(crate) fn new(key: &Key) -> Self { | ||
// Prepare addition key and polynomial key. | ||
let (k, r1) = prepare_keys(key); | ||
|
||
// Precompute R^2. | ||
let r2 = (r1 * r1).reduce(); | ||
|
||
State { | ||
k, | ||
r1, | ||
r2: r2.into(), | ||
cached_blocks: [0u8; BLOCK_X4_SIZE], | ||
num_cached_blocks: 0, | ||
partial_block: None, | ||
initialized: None, | ||
} | ||
} | ||
|
||
/// Reset internal state | ||
pub(crate) fn reset(&mut self) { | ||
self.num_cached_blocks = 0; | ||
self.initialized = None; | ||
} | ||
|
||
pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) { | ||
// We can cache a single partial block. | ||
if partial { | ||
assert!(self.partial_block.is_none()); | ||
self.partial_block = Some(*block); | ||
return; | ||
} | ||
|
||
self.cached_blocks | ||
[self.num_cached_blocks * BLOCK_SIZE..(self.num_cached_blocks + 1) * BLOCK_SIZE] | ||
.copy_from_slice(block); | ||
if self.num_cached_blocks < 3 { | ||
self.num_cached_blocks += 1; | ||
return; | ||
} else { | ||
self.num_cached_blocks = 0; | ||
} | ||
|
||
if let Some(inner) = &mut self.initialized { | ||
// P <-- R^4 * P + blocks | ||
inner.p = (&inner.p * inner.r4).reduce() | ||
+ Aligned4x130::from_blocks(&self.cached_blocks[..]); | ||
} else { | ||
// Initialize the polynomial. | ||
let p = Aligned4x130::from_blocks(&self.cached_blocks[..]); | ||
|
||
// Initialize the multiplier (used to merge down the polynomial during | ||
// finalization). | ||
let (m, r4) = SpacedMultiplier4x130::new(self.r1, self.r2); | ||
|
||
self.initialized = Some(Initialized { p, m, r4 }) | ||
} | ||
} | ||
|
||
pub(crate) fn finalize(&mut self) -> Tag { | ||
assert!(self.num_cached_blocks < 4); | ||
let mut data = &self.cached_blocks[..]; | ||
|
||
// T ← R◦T | ||
// P = T_0 + T_1 + T_2 + T_3 | ||
let mut p = self | ||
.initialized | ||
.take() | ||
.map(|inner| (inner.p * inner.m).sum().reduce()); | ||
|
||
if self.num_cached_blocks >= 2 { | ||
// Compute 32 byte block (remaining data < 64 bytes) | ||
let mut c = Aligned2x130::from_blocks(&data[0..BLOCK_SIZE * 2]); | ||
if let Some(p) = p { | ||
c = c + p; | ||
} | ||
p = Some(c.mul_and_sum(self.r1, self.r2).reduce()); | ||
data = &data[BLOCK_SIZE * 2..]; | ||
self.num_cached_blocks -= 2; | ||
} | ||
|
||
if self.num_cached_blocks == 1 { | ||
// Compute 16 byte block (remaining data < 32 bytes) | ||
let mut c = Aligned130::from_block(&data[0..BLOCK_SIZE]); | ||
if let Some(p) = p { | ||
c = c + p; | ||
} | ||
p = Some((c * self.r1).reduce()); | ||
self.num_cached_blocks -= 1; | ||
} | ||
|
||
if let Some(block) = &self.partial_block { | ||
// Compute last block (remaining data < 16 bytes) | ||
let mut c = Aligned130::from_partial_block(block); | ||
if let Some(p) = p { | ||
c = c + p; | ||
} | ||
p = Some((c * self.r1).reduce()); | ||
} | ||
|
||
// Compute tag: p + k mod 2^128 | ||
let mut tag = GenericArray::<u8, _>::default(); | ||
let tag_int = if let Some(p) = p { | ||
self.k + p | ||
} else { | ||
self.k.into() | ||
}; | ||
tag_int.write(tag.as_mut_slice()); | ||
|
||
Tag::new(tag) | ||
} | ||
} |
Oops, something went wrong.