From 432270dcb4af3e29eda3337050452a982a153d3b Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sun, 22 Oct 2017 15:17:41 +0200
Subject: [PATCH 1/4] Split `gen_usize_*` benchmarks into `gen_u32_*` and `gen_u64_*`

---
 benches/generators.rs | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/benches/generators.rs b/benches/generators.rs
index 44adb0e1c2d..6cf86a23611 100644
--- a/benches/generators.rs
+++ b/benches/generators.rs
@@ -36,28 +36,49 @@ gen_bytes!(gen_bytes_chacha, ChaChaRng);
 gen_bytes!(gen_bytes_std, StdRng);
 gen_bytes!(gen_bytes_os, OsRng);

+macro_rules! gen_u32 {
+    ($fnn:ident, $gen:ident) => {
+        #[bench]
+        fn $fnn(b: &mut Bencher) {
+            let mut rng = $gen::new().unwrap();
+            b.iter(|| {
+                for _ in 0..RAND_BENCH_N {
+                    black_box(u32::rand(&mut rng, Default));
+                }
+            });
+            b.bytes = size_of::<u32>() as u64 * RAND_BENCH_N;
+        }
+    }
+}
+
+gen_u32!(gen_u32_xorshift, XorShiftRng);
+gen_u32!(gen_u32_isaac, IsaacRng);
+gen_u32!(gen_u32_isaac64, Isaac64Rng);
+gen_u32!(gen_u32_chacha, ChaChaRng);
+gen_u32!(gen_u32_std, StdRng);
+gen_u32!(gen_u32_os, OsRng);

-macro_rules! gen_usize {
+macro_rules! gen_u64 {
     ($fnn:ident, $gen:ident) => {
         #[bench]
         fn $fnn(b: &mut Bencher) {
             let mut rng = $gen::new().unwrap();
             b.iter(|| {
                 for _ in 0..RAND_BENCH_N {
-                    black_box(usize::rand(&mut rng, Default));
+                    black_box(u64::rand(&mut rng, Default));
                 }
             });
-            b.bytes = size_of::<usize>() as u64 * RAND_BENCH_N;
+            b.bytes = size_of::<u64>() as u64 * RAND_BENCH_N;
         }
     }
 }

-gen_usize!(gen_usize_xorshift, XorShiftRng);
-gen_usize!(gen_usize_isaac, IsaacRng);
-gen_usize!(gen_usize_isaac64, Isaac64Rng);
-gen_usize!(gen_usize_chacha, ChaChaRng);
-gen_usize!(gen_usize_std, StdRng);
-gen_usize!(gen_usize_os, OsRng);
+gen_u64!(gen_u64_xorshift, XorShiftRng);
+gen_u64!(gen_u64_isaac, IsaacRng);
+gen_u64!(gen_u64_isaac64, Isaac64Rng);
+gen_u64!(gen_u64_chacha, ChaChaRng);
+gen_u64!(gen_u64_std, StdRng);
+gen_u64!(gen_u64_os, OsRng);

 macro_rules! init_gen {
     ($fnn:ident, $gen:ident) => {

From 277b804342497dd07db0b9e3440fe7ce67076500 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sun, 22 Oct 2017 15:19:00 +0200
Subject: [PATCH 2/4] Improve performance of `isaac64::next_u32` by 45%

---
 src/prng/isaac64.rs | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/prng/isaac64.rs b/src/prng/isaac64.rs
index 26daa705ea6..1872642bc63 100644
--- a/src/prng/isaac64.rs
+++ b/src/prng/isaac64.rs
@@ -132,6 +132,12 @@ impl Isaac64Rng {
     /// - We maintain one index `i` and add `m` or `m2` as base (m2 for the
     ///   `s[i+128 mod 256]`), relying on the optimizer to turn it into pointer
     ///   arithmetic.
+    /// - In `cnt` we do not store the number of available u64s, but the number
+    ///   of available u32s (i.e. twice as many). This way we can make more
+    ///   efficient use of the generated results in `next_u32`. For `next_u64`
+    ///   the correct index is `cnt >> 1`, which also takes care of any
+    ///   alignment issues that could arise if `next_u64` was called after
+    ///   `next_u32`.
     fn isaac64(&mut self) {
         self.c += w(1);
         // abbreviations
@@ -181,36 +187,45 @@ impl Isaac64Rng {

         self.a = a;
         self.b = b;
-        self.cnt = RAND_SIZE as u32;
+        self.cnt = (RAND_SIZE * 2) as u32;
     }
 }

 impl Rng for Isaac64Rng {
     #[inline]
     fn next_u32(&mut self) -> u32 {
-        self.next_u64() as u32
+        if self.cnt == 0 {
+            // make some more numbers
+            self.isaac64();
+        }
+        self.cnt -= 1;
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [w64; RAND_SIZE]
+                              as *mut [u32; RAND_SIZE * 2]) };
+
+        rsl[self.cnt as usize % (RAND_SIZE * 2)]
     }

     #[inline]
     fn next_u64(&mut self) -> u64 {
-        if self.cnt == 0 {
+        if self.cnt < 2 {
             // make some more numbers
             self.isaac64();
         }
-        self.cnt -= 1;
+        self.cnt -= 2;

-        // self.cnt is at most RAND_SIZE, but that is before the
+        // self.cnt is at most RAND_SIZE * 2, but that is before the
         // subtraction above. We want to index without bounds
         // checking, but this could lead to incorrect code if someone
         // misrefactors, so we check, sometimes.
         //
         // (Changes here should be reflected in IsaacRng.next_u32.)
-        debug_assert!((self.cnt as usize) < RAND_SIZE);
+        debug_assert!(((self.cnt >> 1) as usize) < RAND_SIZE);

         // (the % is cheaply telling the optimiser that we're always
         // in bounds, without unsafe. NB. this is a power of two, so
         // it optimises to a bitwise mask).
-        self.rsl[self.cnt as usize % RAND_SIZE].0
+        self.rsl[(self.cnt >> 1) as usize % RAND_SIZE].0
     }

     #[cfg(feature = "i128_support")]
@@ -219,7 +234,7 @@ impl Rng for Isaac64Rng {
     }

     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        ::rand_core::impls::fill_bytes_via_u32(self, dest);
+        ::rand_core::impls::fill_bytes_via_u64(self, dest);
     }

     fn try_fill(&mut self, dest: &mut [u8]) -> Result<(), Error> {

From 87cf51bbc8e6925b2616de969ceefaa8f3411cb9 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sun, 22 Oct 2017 20:19:57 +0200
Subject: [PATCH 3/4] Improve performance of `isaac*::fill_bytes` by 45%

---
 src/prng/isaac.rs   | 47 ++++++++++++++++++++++++++++++-----
 src/prng/isaac64.rs | 56 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 90 insertions(+), 13 deletions(-)

diff --git a/src/prng/isaac.rs b/src/prng/isaac.rs
index 9e5ab2d91e9..2bf98f6c82e 100644
--- a/src/prng/isaac.rs
+++ b/src/prng/isaac.rs
@@ -14,6 +14,7 @@ use core::slice;
 use core::iter::repeat;
 use core::num::Wrapping as w;
 use core::fmt;
+use core::cmp::min;

 use {Rng, SeedFromRng, SeedableRng, Error};

@@ -87,7 +88,7 @@ const RAND_SIZE: usize = 1 << RAND_SIZE_LEN;
 /// [3]: Jean-Philippe Aumasson, [*On the pseudo-random generator ISAAC*]
 /// (http://eprint.iacr.org/2006/438)
 pub struct IsaacRng {
-    rsl: [w32; RAND_SIZE],
+    rsl: [u32; RAND_SIZE],
     mem: [w32; RAND_SIZE],
     a: w32,
     b: w32,
@@ -175,7 +176,7 @@ impl IsaacRng {
             let y = *a + *b + ind(&ctx.mem, x, 2);
             ctx.mem[base + m] = y;
             *b = x + ind(&ctx.mem, y, 2 + RAND_SIZE_LEN);
-            ctx.rsl[base + m] = *b;
+            ctx.rsl[base + m] = (*b).0;
         }

         let mut m = 0;
@@ -200,12 +201,42 @@ impl IsaacRng {
         self.b = b;
         self.cnt = RAND_SIZE as u32;
     }
+
+    fn fill_chunk(&mut self, dest: &mut [u8]) -> usize {
+        if self.cnt < 1 {
+            self.isaac();
+        }
+
+        let mut index_u32 = self.cnt as usize;
+        let available = index_u32 * 4;
+        let chunk_size_u8 = min(available, dest.len());
+        let chunk_size_u32 = (chunk_size_u8 + 3) / 4;
+
+        index_u32 -= chunk_size_u32;
+        let index_u8 = index_u32 * 4;
+
+        // convert to LE:
+        if cfg!(target_endian = "big") {
+            for ref mut x in self.rsl[index_u32..(index_u32 + chunk_size_u32)].iter_mut() {
+                **x = (*x).to_le();
+            }
+        }
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [u32; RAND_SIZE]
+                              as *mut [u8; RAND_SIZE * 4]) };
+
+        let copy = &mut dest[0..chunk_size_u8];
+        copy.copy_from_slice(&rsl[index_u8..(index_u8 + chunk_size_u8)]);
+
+        self.cnt = index_u32 as u32;
+        chunk_size_u8
+    }
 }

 impl Rng for IsaacRng {
     #[inline]
     fn next_u32(&mut self) -> u32 {
-        if self.cnt == 0 {
+        if self.cnt < 1 {
             // make some more numbers
             self.isaac();
         }
@@ -222,7 +253,7 @@ impl Rng for IsaacRng {
         // (the % is cheaply telling the optimiser that we're always
         // in bounds, without unsafe. NB. this is a power of two, so
         // it optimises to a bitwise mask).
-        self.rsl[self.cnt as usize % RAND_SIZE].0
+        self.rsl[self.cnt as usize % RAND_SIZE]
     }

     fn next_u64(&mut self) -> u64 {
@@ -235,7 +266,11 @@ impl Rng for IsaacRng {
     }

     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        ::rand_core::impls::fill_bytes_via_u32(self, dest);
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            let chunk_len = self.fill_chunk(&mut dest[read_len..]);
+            read_len += chunk_len;
+        }
     }

     fn try_fill(&mut self, dest: &mut [u8]) -> Result<(), Error> {
@@ -300,7 +335,7 @@ fn init(mut mem: [w32; RAND_SIZE], rounds: u32) -> IsaacRng {
     }

     let mut rng = IsaacRng {
-        rsl: [w(0); RAND_SIZE],
+        rsl: [0; RAND_SIZE],
         mem: mem,
         a: w(0),
         b: w(0),
diff --git a/src/prng/isaac64.rs b/src/prng/isaac64.rs
index 1872642bc63..ddd804f5c26 100644
--- a/src/prng/isaac64.rs
+++ b/src/prng/isaac64.rs
@@ -14,6 +14,7 @@ use core::slice;
 use core::iter::repeat;
 use core::num::Wrapping as w;
 use core::fmt;
+use core::cmp::min;

 use {Rng, SeedFromRng, SeedableRng, Error};

@@ -71,7 +72,7 @@ const RAND_SIZE: usize = 1 << RAND_SIZE_LEN;
 /// [1]: Bob Jenkins, [*ISAAC and RC4*]
 /// (http://burtleburtle.net/bob/rand/isaac.html)
 pub struct Isaac64Rng {
-    rsl: [w64; RAND_SIZE],
+    rsl: [u64; RAND_SIZE],
     mem: [w64; RAND_SIZE],
     a: w64,
     b: w64,
@@ -164,7 +165,7 @@ impl Isaac64Rng {
             let y = *a + *b + ind(&ctx.mem, x, 3);
             ctx.mem[base + m] = y;
             *b = x + ind(&ctx.mem, y, 3 + RAND_SIZE_LEN);
-            ctx.rsl[base + m] = *b;
+            ctx.rsl[base + m] = (*b).0;
         }

         let mut m = 0;
@@ -189,18 +190,55 @@ impl Isaac64Rng {
         self.b = b;
         self.cnt = (RAND_SIZE * 2) as u32;
     }
+
+    fn fill_chunk(&mut self, dest: &mut [u8]) -> usize {
+        if self.cnt < 2 {
+            self.isaac64();
+        }
+
+        let mut index_u64 = (self.cnt >> 1) as usize;
+        let available = index_u64 * 8;
+        let chunk_size_u8 = min(available, dest.len());
+        let chunk_size_u64 = (chunk_size_u8 + 7) / 8;
+
+        index_u64 -= chunk_size_u64;
+        let index_u8 = index_u64 * 8;
+
+        // convert to LE:
+        if cfg!(target_endian = "big") {
+            for ref mut x in self.rsl[index_u64..(index_u64 + chunk_size_u64)].iter_mut() {
+                **x = (*x).to_le();
+            }
+        }
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [u64; RAND_SIZE]
+                              as *mut [u8; RAND_SIZE * 8]) };
+
+        let copy = &mut dest[0..chunk_size_u8];
+        copy.copy_from_slice(&rsl[index_u8..(index_u8 + chunk_size_u8)]);
+
+        self.cnt = (index_u64 << 1) as u32;
+        chunk_size_u8
+    }
 }

 impl Rng for Isaac64Rng {
     #[inline]
     fn next_u32(&mut self) -> u32 {
-        if self.cnt == 0 {
+        if self.cnt < 1 {
             // make some more numbers
             self.isaac64();
         }
         self.cnt -= 1;

-        let rsl = unsafe { &*(&mut self.rsl as *mut [w64; RAND_SIZE]
+        // If this is the first u32 that we read from what actually is a u64,
+        // convert the whole u64 to little-endian (a no-op on little-endian
+        // architectures).
+        if self.cnt & 1 == 1 {
+            self.rsl[(self.cnt >> 1) as usize % RAND_SIZE] = self.rsl[(self.cnt >> 1) as usize % RAND_SIZE].to_le();
+        }
+
+        let rsl = unsafe { &*(&mut self.rsl as *mut [u64; RAND_SIZE]
                               as *mut [u32; RAND_SIZE * 2]) };

         rsl[self.cnt as usize % (RAND_SIZE * 2)]
@@ -225,7 +263,7 @@ impl Rng for Isaac64Rng {
         // (the % is cheaply telling the optimiser that we're always
         // in bounds, without unsafe. NB. this is a power of two, so
         // it optimises to a bitwise mask).
-        self.rsl[(self.cnt >> 1) as usize % RAND_SIZE].0
+        self.rsl[(self.cnt >> 1) as usize % RAND_SIZE]
     }

     #[cfg(feature = "i128_support")]
@@ -234,7 +272,11 @@ impl Rng for Isaac64Rng {
     }

     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        ::rand_core::impls::fill_bytes_via_u64(self, dest);
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            let chunk_len = self.fill_chunk(&mut dest[read_len..]);
+            read_len += chunk_len;
+        }
     }

     fn try_fill(&mut self, dest: &mut [u8]) -> Result<(), Error> {
@@ -274,7 +316,7 @@ fn init(mut mem: [w64; RAND_SIZE], rounds: u32) -> Isaac64Rng {
     }

     let mut rng = Isaac64Rng {
-        rsl: [w(0); RAND_SIZE],
+        rsl: [0; RAND_SIZE],
         mem: mem,
         a: w(0),
         b: w(0),

From 74d6b18a556abd9e413bae2d005815be3fd451be Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Mon, 23 Oct 2017 18:38:16 +0200
Subject: [PATCH 4/4] Make `ChaChaRng::fill_bytes` similar to Isaac.

This does not change benchmark results, just makes the code similar.
---
 src/prng/chacha.rs | 64 ++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/src/prng/chacha.rs b/src/prng/chacha.rs
index d066e652c29..b5bb207cec6 100644
--- a/src/prng/chacha.rs
+++ b/src/prng/chacha.rs
@@ -12,6 +12,7 @@

 use core::num::Wrapping as w;
 use core::fmt;
+use core::cmp::min;
 use {Rng, CryptoRng, SeedFromRng, SeedableRng, Error};

 #[allow(bad_style)]
@@ -34,7 +35,7 @@ const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of
 pub struct ChaChaRng {
     buffer:  [w32; STATE_WORDS], // Internal buffer of output
     state:   [w32; STATE_WORDS], // Initial state
-    index:   usize, // Index into state
+    index:   usize,              // Index into state
 }

 // Custom Debug implementation that does not expose the internal state
@@ -189,6 +190,31 @@ impl ChaChaRng {
         if self.state[14] != w(0) { return };
         self.state[15] = self.state[15] + w(1);
     }
+
+    fn fill_chunk(&mut self, dest: &mut [u8]) -> usize {
+        if self.index == STATE_WORDS {
+            self.update();
+        }
+
+        let available = (STATE_WORDS - self.index) * 4;
+        let chunk_size_u8 = min(available, dest.len());
+        let chunk_size_u32 = (chunk_size_u8 + 3) / 4;
+
+        // convert to LE:
+        for ref mut x in self.buffer[self.index..self.index+chunk_size_u32].iter_mut() {
+            **x = w((*x).0.to_le());
+        }
+
+        let buf = unsafe { &*(&mut self.buffer as *mut [w32; STATE_WORDS]
+                              as *mut [u8; STATE_WORDS * 4]) };
+
+        let index = self.index * 4;
+        let copy = &mut dest[0..chunk_size_u8];
+        copy.copy_from_slice(&buf[index..index+chunk_size_u8]);
+
+        self.index += chunk_size_u32;
+        chunk_size_u8
+    }
 }

 impl Rng for ChaChaRng {
@@ -211,39 +237,11 @@ impl Rng for ChaChaRng {
         ::rand_core::impls::next_u128_via_u64(self)
     }

-    // Custom implementation allowing larger reads from buffer is about 8%
-    // faster than default implementation in my tests
     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        use core::cmp::min;
-        use core::intrinsics::{transmute, copy_nonoverlapping};
-
-        let mut left = dest;
-        while left.len() >= 4 {
-            if self.index == STATE_WORDS {
-                self.update();
-            }
-
-            let words = min(left.len() / 4, STATE_WORDS - self.index);
-            let (l, r) = {left}.split_at_mut(4 * words);
-            left = r;
-
-            // convert to LE:
-            for ref mut x in self.buffer[self.index..self.index+words].iter_mut() {
-                **x = w((*x).0.to_le());
-            }
-
-            unsafe{ copy_nonoverlapping(
-                &self.buffer[self.index].0 as *const u32 as *const u8,
-                l.as_mut_ptr(),
-                words) };
-            self.index += words;
-        }
-        let n = left.len();
-        if n > 0 {
-            let chunk: [u8; 4] = unsafe {
-                transmute(self.next_u32().to_le())
-            };
-            left.copy_from_slice(&chunk[..n]);
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            let chunk_len = self.fill_chunk(&mut dest[read_len..]);
+            read_len += chunk_len;
         }
     }
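
The bookkeeping trick shared by patches 2 and 3 is that `cnt` counts the remaining u32 half-words of the u64 result buffer, so `next_u32` can hand out one half per call while `next_u64` consumes two and indexes with `cnt >> 1`. The following standalone sketch is not part of the patches; the `Toy` type and `refill` method are hypothetical and only mimic the counter handling, not ISAAC itself, assuming a little-endian interpretation of each u64:

// Minimal sketch of the half-word counting scheme used above.
struct Toy {
    rsl: [u64; 4], // pretend these are freshly generated results
    cnt: u32,      // number of u32 *halves* still available
}

impl Toy {
    fn refill(&mut self) {
        // A real generator would overwrite `rsl` here.
        self.cnt = (self.rsl.len() * 2) as u32;
    }

    fn next_u64(&mut self) -> u64 {
        if self.cnt < 2 {
            self.refill();
        }
        self.cnt -= 2;
        // `cnt >> 1` converts the half-word count back into a u64 index.
        self.rsl[(self.cnt >> 1) as usize]
    }

    fn next_u32(&mut self) -> u32 {
        if self.cnt == 0 {
            self.refill();
        }
        self.cnt -= 1;
        let word = self.rsl[(self.cnt >> 1) as usize];
        // An even count selects the low half, an odd count the high half,
        // matching the little-endian reinterpretation used in the patches.
        if self.cnt & 1 == 0 { word as u32 } else { (word >> 32) as u32 }
    }
}

fn main() {
    let mut t = Toy { rsl: [1, 2, 3, 4], cnt: 0 };
    // The first call refills; results are consumed from the top of the buffer down.
    println!("{} {} {}", t.next_u64(), t.next_u32(), t.next_u32());
}

The `fill_chunk` methods in patches 3 and 4 build on the same counter: instead of handing out one value at a time, they copy as many whole buffered words as fit into `dest` in a single byte copy, which is where the reported `fill_bytes` speedup comes from.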