From 432270dcb4af3e29eda3337050452a982a153d3b Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sun, 22 Oct 2017 15:17:41 +0200
Subject: [PATCH 1/4] Split `gen_usize_*` benchmarks into `gen_u32_*` and `gen_u64_*`

---
 benches/generators.rs | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/benches/generators.rs b/benches/generators.rs
index 44adb0e1c2d..6cf86a23611 100644
--- a/benches/generators.rs
+++ b/benches/generators.rs
@@ -36,28 +36,49 @@ gen_bytes!(gen_bytes_chacha, ChaChaRng);
 gen_bytes!(gen_bytes_std, StdRng);
 gen_bytes!(gen_bytes_os, OsRng);

+macro_rules! gen_u32 {
+    ($fnn:ident, $gen:ident) => {
+        #[bench]
+        fn $fnn(b: &mut Bencher) {
+            let mut rng = $gen::new().unwrap();
+            b.iter(|| {
+                for _ in 0..RAND_BENCH_N {
+                    black_box(u32::rand(&mut rng, Default));
+                }
+            });
+            b.bytes = size_of::<u32>() as u64 * RAND_BENCH_N;
+        }
+    }
+}
+
+gen_u32!(gen_u32_xorshift, XorShiftRng);
+gen_u32!(gen_u32_isaac, IsaacRng);
+gen_u32!(gen_u32_isaac64, Isaac64Rng);
+gen_u32!(gen_u32_chacha, ChaChaRng);
+gen_u32!(gen_u32_std, StdRng);
+gen_u32!(gen_u32_os, OsRng);

-macro_rules! gen_usize {
+macro_rules! gen_u64 {
     ($fnn:ident, $gen:ident) => {
         #[bench]
         fn $fnn(b: &mut Bencher) {
             let mut rng = $gen::new().unwrap();
             b.iter(|| {
                 for _ in 0..RAND_BENCH_N {
-                    black_box(usize::rand(&mut rng, Default));
+                    black_box(u64::rand(&mut rng, Default));
                 }
             });
-            b.bytes = size_of::<usize>() as u64 * RAND_BENCH_N;
+            b.bytes = size_of::<u64>() as u64 * RAND_BENCH_N;
         }
     }
 }

-gen_usize!(gen_usize_xorshift, XorShiftRng);
-gen_usize!(gen_usize_isaac, IsaacRng);
-gen_usize!(gen_usize_isaac64, Isaac64Rng);
-gen_usize!(gen_usize_chacha, ChaChaRng);
-gen_usize!(gen_usize_std, StdRng);
-gen_usize!(gen_usize_os, OsRng);
+gen_u64!(gen_u64_xorshift, XorShiftRng);
+gen_u64!(gen_u64_isaac, IsaacRng);
+gen_u64!(gen_u64_isaac64, Isaac64Rng);
+gen_u64!(gen_u64_chacha, ChaChaRng);
+gen_u64!(gen_u64_std, StdRng);
+gen_u64!(gen_u64_os, OsRng);

 macro_rules! init_gen {
     ($fnn:ident, $gen:ident) => {

From 277b804342497dd07db0b9e3440fe7ce67076500 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sun, 22 Oct 2017 15:19:00 +0200
Subject: [PATCH 2/4] Improve performance of `isaac64::next_u32` by 45%

---
 src/prng/isaac64.rs | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/prng/isaac64.rs b/src/prng/isaac64.rs
index 26daa705ea6..1872642bc63 100644
--- a/src/prng/isaac64.rs
+++ b/src/prng/isaac64.rs
@@ -132,6 +132,12 @@ impl Isaac64Rng {
     /// - We maintain one index `i` and add `m` or `m2` as base (m2 for the
     ///   `s[i+128 mod 256]`), relying on the optimizer to turn it into pointer
     ///   arithmetic.
+    /// - In `cnt` we do not store the number of available u64s, but the number
+    ///   of available u32s (i.e. twice as many). This way we can make more
+    ///   efficient use of the generated results in `next_u32`. For `next_u64`
+    ///   the correct index is `cnt >> 1`, which also takes care of any
+    ///   alignment issues that could arise if `next_u64` was called after
+    ///   `next_u32`.
     fn isaac64(&mut self) {
         self.c += w(1);
         // abbreviations
@@ -181,36 +187,45 @@ impl Isaac64Rng {

         self.a = a;
         self.b = b;
-        self.cnt = RAND_SIZE as u32;
+        self.cnt = (RAND_SIZE * 2) as u32;
     }
 }

 impl Rng for Isaac64Rng {
     #[inline]
     fn next_u32(&mut self) -> u32 {
-        self.next_u64() as u32
+        if self.cnt == 0 {
+            // make some more numbers
+            self.isaac64();
+        }
+        self.cnt -= 1;
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [w64; RAND_SIZE]
+                              as *mut [u32; RAND_SIZE * 2]) };
+
+        rsl[self.cnt as usize % (RAND_SIZE * 2)]
     }

     #[inline]
     fn next_u64(&mut self) -> u64 {
-        if self.cnt == 0 {
+        if self.cnt < 2 {
             // make some more numbers
             self.isaac64();
         }
-        self.cnt -= 1;
+        self.cnt -= 2;

-        // self.cnt is at most RAND_SIZE, but that is before the
+        // self.cnt is at most RAND_SIZE * 2, but that is before the
         // subtraction above. We want to index without bounds
         // checking, but this could lead to incorrect code if someone
         // misrefactors, so we check, sometimes.
         //
         // (Changes here should be reflected in IsaacRng.next_u32.)
-        debug_assert!((self.cnt as usize) < RAND_SIZE);
+        debug_assert!(((self.cnt >> 1) as usize) < RAND_SIZE);

         // (the % is cheaply telling the optimiser that we're always
         // in bounds, without unsafe. NB. this is a power of two, so
         // it optimises to a bitwise mask).
-        self.rsl[self.cnt as usize % RAND_SIZE].0
+        self.rsl[(self.cnt >> 1) as usize % RAND_SIZE].0
     }

     #[cfg(feature = "i128_support")]
@@ -219,7 +234,7 @@ impl Rng for Isaac64Rng {
     }

     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        ::rand_core::impls::fill_bytes_via_u32(self, dest);
+        ::rand_core::impls::fill_bytes_via_u64(self, dest);
     }

     fn try_fill(&mut self, dest: &mut [u8]) -> Result<(), Error> {

From 87cf51bbc8e6925b2616de969ceefaa8f3411cb9 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sun, 22 Oct 2017 20:19:57 +0200
Subject: [PATCH 3/4] Improve performance of `isaac*::fill_bytes` by 45%

---
 src/prng/isaac.rs   | 47 ++++++++++++++++++++++++++++++-----
 src/prng/isaac64.rs | 56 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/src/prng/isaac.rs b/src/prng/isaac.rs
index 9e5ab2d91e9..2bf98f6c82e 100644
--- a/src/prng/isaac.rs
+++ b/src/prng/isaac.rs
@@ -14,6 +14,7 @@ use core::slice;
 use core::iter::repeat;
 use core::num::Wrapping as w;
 use core::fmt;
+use core::cmp::min;

 use {Rng, SeedFromRng, SeedableRng, Error};

@@ -87,7 +88,7 @@ const RAND_SIZE: usize = 1 << RAND_SIZE_LEN;
 /// [3]: Jean-Philippe Aumasson, [*On the pseudo-random generator ISAAC*]
 /// (http://eprint.iacr.org/2006/438)
 pub struct IsaacRng {
-    rsl: [w32; RAND_SIZE],
+    rsl: [u32; RAND_SIZE],
     mem: [w32; RAND_SIZE],
     a: w32,
     b: w32,
@@ -175,7 +176,7 @@ impl IsaacRng {
             let y = *a + *b + ind(&ctx.mem, x, 2);
             ctx.mem[base + m] = y;
             *b = x + ind(&ctx.mem, y, 2 + RAND_SIZE_LEN);
-            ctx.rsl[base + m] = *b;
+            ctx.rsl[base + m] = (*b).0;
         }

         let mut m = 0;
@@ -200,12 +201,42 @@ impl IsaacRng {
         self.b = b;
         self.cnt = RAND_SIZE as u32;
     }
+
+    fn fill_chunk(&mut self, dest: &mut [u8]) -> usize {
+        if self.cnt < 1 {
+            self.isaac();
+        }
+
+        let mut index_u32 = self.cnt as usize;
+        let available = index_u32 * 4;
+        let chunk_size_u8 = min(available, dest.len());
+        let chunk_size_u32 = (chunk_size_u8 + 3) / 4;
+
+        index_u32 -= chunk_size_u32;
+        let index_u8 = index_u32 * 4;
+
+        // convert to LE:
+        if cfg!(target_endian = "big") {
+            for ref mut x in self.rsl[index_u32..(index_u32 + chunk_size_u32)].iter_mut() {
+                **x = (*x).to_le();
+            }
+        }
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [u32; RAND_SIZE]
+                              as *mut [u8; RAND_SIZE * 4]) };
+
+        let copy = &mut dest[0..chunk_size_u8];
+        copy.copy_from_slice(&rsl[index_u8..(index_u8 + chunk_size_u8)]);
+
+        self.cnt = index_u32 as u32;
+        chunk_size_u8
+    }
 }

 impl Rng for IsaacRng {
     #[inline]
     fn next_u32(&mut self) -> u32 {
-        if self.cnt == 0 {
+        if self.cnt < 1 {
             // make some more numbers
             self.isaac();
         }
@@ -222,7 +253,7 @@ impl Rng for IsaacRng {
         // (the % is cheaply telling the optimiser that we're always
         // in bounds, without unsafe. NB. this is a power of two, so
         // it optimises to a bitwise mask).
-        self.rsl[self.cnt as usize % RAND_SIZE].0
+        self.rsl[self.cnt as usize % RAND_SIZE]
     }

     fn next_u64(&mut self) -> u64 {
@@ -235,7 +266,11 @@ impl Rng for IsaacRng {
     }

     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        ::rand_core::impls::fill_bytes_via_u32(self, dest);
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            let chunk_len = self.fill_chunk(&mut dest[read_len..]);
+            read_len += chunk_len;
+        }
     }

     fn try_fill(&mut self, dest: &mut [u8]) -> Result<(), Error> {
@@ -300,7 +335,7 @@ fn init(mut mem: [w32; RAND_SIZE], rounds: u32) -> IsaacRng {
     }

     let mut rng = IsaacRng {
-        rsl: [w(0); RAND_SIZE],
+        rsl: [0; RAND_SIZE],
         mem: mem,
         a: w(0),
         b: w(0),
diff --git a/src/prng/isaac64.rs b/src/prng/isaac64.rs
index 1872642bc63..ddd804f5c26 100644
--- a/src/prng/isaac64.rs
+++ b/src/prng/isaac64.rs
@@ -14,6 +14,7 @@ use core::slice;
 use core::iter::repeat;
 use core::num::Wrapping as w;
 use core::fmt;
+use core::cmp::min;

 use {Rng, SeedFromRng, SeedableRng, Error};

@@ -71,7 +72,7 @@ const RAND_SIZE: usize = 1 << RAND_SIZE_LEN;
 /// [1]: Bob Jenkins, [*ISAAC and RC4*]
 /// (http://burtleburtle.net/bob/rand/isaac.html)
 pub struct Isaac64Rng {
-    rsl: [w64; RAND_SIZE],
+    rsl: [u64; RAND_SIZE],
     mem: [w64; RAND_SIZE],
     a: w64,
     b: w64,
@@ -164,7 +165,7 @@ impl Isaac64Rng {
             let y = *a + *b + ind(&ctx.mem, x, 3);
             ctx.mem[base + m] = y;
             *b = x + ind(&ctx.mem, y, 3 + RAND_SIZE_LEN);
-            ctx.rsl[base + m] = *b;
+            ctx.rsl[base + m] = (*b).0;
         }

         let mut m = 0;
@@ -189,18 +190,55 @@ impl Isaac64Rng {
         self.b = b;
         self.cnt = (RAND_SIZE * 2) as u32;
     }
+
+    fn fill_chunk(&mut self, dest: &mut [u8]) -> usize {
+        if self.cnt < 2 {
+            self.isaac64();
+        }
+
+        let mut index_u64 = (self.cnt >> 1) as usize;
+        let available = index_u64 * 8;
+        let chunk_size_u8 = min(available, dest.len());
+        let chunk_size_u64 = (chunk_size_u8 + 7) / 8;
+
+        index_u64 -= chunk_size_u64;
+        let index_u8 = index_u64 * 8;
+
+        // convert to LE:
+        if cfg!(target_endian = "big") {
+            for ref mut x in self.rsl[index_u64..(index_u64 + chunk_size_u64)].iter_mut() {
+                **x = (*x).to_le();
+            }
+        }
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [u64; RAND_SIZE]
+                              as *mut [u8; RAND_SIZE * 8]) };
+
+        let copy = &mut dest[0..chunk_size_u8];
+        copy.copy_from_slice(&rsl[index_u8..(index_u8 + chunk_size_u8)]);
+
+        self.cnt = (index_u64 << 1) as u32;
+        chunk_size_u8
+    }
 }

 impl Rng for Isaac64Rng {
     #[inline]
     fn next_u32(&mut self) -> u32 {
-        if self.cnt == 0 {
+        if self.cnt < 1 {
             // make some more numbers
             self.isaac64();
         }
         self.cnt -= 1;

-        let rsl = unsafe { &*(&mut self.rsl as *mut [w64; RAND_SIZE]
+        // If this is the first u32 that we read from what actually is a u64,
+        // convert the whole u64 to little-endian (a no-op on little-endian
+        // architectures).
+        if self.cnt & 1 == 1 {
+            self.rsl[(self.cnt >> 1) as usize % RAND_SIZE] = self.rsl[(self.cnt >> 1) as usize % RAND_SIZE].to_le();
+        }
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [u64; RAND_SIZE]
                               as *mut [u32; RAND_SIZE * 2]) };

         rsl[self.cnt as usize % (RAND_SIZE * 2)]
@@ -225,7 +263,7 @@ impl Rng for Isaac64Rng {
         // (the % is cheaply telling the optimiser that we're always
         // in bounds, without unsafe. NB. this is a power of two, so
         // it optimises to a bitwise mask).
-        self.rsl[(self.cnt >> 1) as usize % RAND_SIZE].0
+        self.rsl[(self.cnt >> 1) as usize % RAND_SIZE]
     }

     #[cfg(feature = "i128_support")]
@@ -234,7 +272,11 @@ impl Rng for Isaac64Rng {
     }

     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        ::rand_core::impls::fill_bytes_via_u64(self, dest);
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            let chunk_len = self.fill_chunk(&mut dest[read_len..]);
+            read_len += chunk_len;
+        }
     }

     fn try_fill(&mut self, dest: &mut [u8]) -> Result<(), Error> {
@@ -274,7 +316,7 @@ fn init(mut mem: [w64; RAND_SIZE], rounds: u32) -> Isaac64Rng {
     }

     let mut rng = Isaac64Rng {
-        rsl: [w(0); RAND_SIZE],
+        rsl: [0; RAND_SIZE],
         mem: mem,
         a: w(0),
         b: w(0),

From 74d6b18a556abd9e413bae2d005815be3fd451be Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Mon, 23 Oct 2017 18:38:16 +0200
Subject: [PATCH 4/4] Make `ChaChaRng::fill_bytes` similar to Isaac.

This does not change benchmark results, just makes the code similar.
---
 src/prng/chacha.rs | 64 ++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/src/prng/chacha.rs b/src/prng/chacha.rs
index d066e652c29..b5bb207cec6 100644
--- a/src/prng/chacha.rs
+++ b/src/prng/chacha.rs
@@ -12,6 +12,7 @@

 use core::num::Wrapping as w;
 use core::fmt;
+use core::cmp::min;
 use {Rng, CryptoRng, SeedFromRng, SeedableRng, Error};

 #[allow(bad_style)]
@@ -34,7 +35,7 @@ const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of
 pub struct ChaChaRng {
     buffer:  [w32; STATE_WORDS], // Internal buffer of output
     state:   [w32; STATE_WORDS], // Initial state
-    index:   usize, // Index into state
+    index:   usize,              // Index into state
 }

 // Custom Debug implementation that does not expose the internal state
@@ -189,6 +190,31 @@ impl ChaChaRng {
         if self.state[14] != w(0) { return };
         self.state[15] = self.state[15] + w(1);
     }
+
+    fn fill_chunk(&mut self, dest: &mut [u8]) -> usize {
+        if self.index == STATE_WORDS {
+            self.update();
+        }
+
+        let available = (STATE_WORDS - self.index) * 4;
+        let chunk_size_u8 = min(available, dest.len());
+        let chunk_size_u32 = (chunk_size_u8 + 3) / 4;
+
+        // convert to LE:
+        for ref mut x in self.buffer[self.index..self.index+chunk_size_u32].iter_mut() {
+            **x = w((*x).0.to_le());
+        }
+
+        let buf = unsafe { &*(&mut self.buffer as *mut [w32; STATE_WORDS]
+                              as *mut [u8; STATE_WORDS * 4]) };
+
+        let index = self.index * 4;
+        let copy = &mut dest[0..chunk_size_u8];
+        copy.copy_from_slice(&buf[index..index+chunk_size_u8]);
+
+        self.index += chunk_size_u32;
+        chunk_size_u8
+    }
 }

 impl Rng for ChaChaRng {
@@ -211,39 +237,11 @@ impl Rng for ChaChaRng {
         ::rand_core::impls::next_u128_via_u64(self)
     }

-    // Custom implementation allowing larger reads from buffer is about 8%
-    // faster than default implementation in my tests
     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        use core::cmp::min;
-        use core::intrinsics::{transmute, copy_nonoverlapping};
-
-        let mut left = dest;
-        while left.len() >= 4 {
-            if self.index == STATE_WORDS {
-                self.update();
-            }
-
-            let words = min(left.len() / 4, STATE_WORDS - self.index);
-            let (l, r) = {left}.split_at_mut(4 * words);
-            left = r;
-
-            // convert to LE:
-            for ref mut x in self.buffer[self.index..self.index+words].iter_mut() {
-                **x = w((*x).0.to_le());
-            }
-
-            unsafe{ copy_nonoverlapping(
-                &self.buffer[self.index].0 as *const u32 as *const u8,
-                l.as_mut_ptr(),
-                words) };
-            self.index += words;
-        }
-        let n = left.len();
-        if n > 0 {
-            let chunk: [u8; 4] = unsafe {
-                transmute(self.next_u32().to_le())
-            };
-            left.copy_from_slice(&chunk[..n]);
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            let chunk_len = self.fill_chunk(&mut dest[read_len..]);
+            read_len += chunk_len;
         }
     }
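
The bookkeeping trick shared by patches 2 and 3 is that `cnt` counts the remaining u32 half-words of the u64 result buffer, so `next_u32` can hand out one half per call while `next_u64` consumes two and indexes with `cnt >> 1`. The following standalone sketch is not part of the patches; the `Toy` type and `refill` method are hypothetical and only mimic the counter handling, not ISAAC itself, assuming a little-endian interpretation of each u64:

// Minimal sketch of the half-word counting scheme used above.
struct Toy {
    rsl: [u64; 4], // pretend these are freshly generated results
    cnt: u32,      // number of u32 *halves* still available
}

impl Toy {
    fn refill(&mut self) {
        // A real generator would overwrite `rsl` here.
        self.cnt = (self.rsl.len() * 2) as u32;
    }

    fn next_u64(&mut self) -> u64 {
        if self.cnt < 2 {
            self.refill();
        }
        self.cnt -= 2;
        // `cnt >> 1` converts the half-word count back into a u64 index.
        self.rsl[(self.cnt >> 1) as usize]
    }

    fn next_u32(&mut self) -> u32 {
        if self.cnt == 0 {
            self.refill();
        }
        self.cnt -= 1;
        let word = self.rsl[(self.cnt >> 1) as usize];
        // An even count selects the low half, an odd count the high half,
        // matching the little-endian reinterpretation used in the patches.
        if self.cnt & 1 == 0 { word as u32 } else { (word >> 32) as u32 }
    }
}

fn main() {
    let mut t = Toy { rsl: [1, 2, 3, 4], cnt: 0 };
    // The first call refills; results are consumed from the top of the buffer down.
    println!("{} {} {}", t.next_u64(), t.next_u32(), t.next_u32());
}

The `fill_chunk` methods in patches 3 and 4 build on the same counter: instead of handing out one value at a time, they copy as many whole buffered words as fit into `dest` in a single byte copy, which is where the reported `fill_bytes` speedup comes from.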