Use unbiased ranges in shuffle, move to separate function

pitdicker · pitdicker · commit 88c88e09a428 · 2018-03-21T20:30:15.000+01:00
diff --git a/src/lib.rs b/src/lib.rs
@@ -626,68 +626,7 @@ pub trait Rng: RngCore {
     /// println!("{:?}", y);
     /// ```
     fn shuffle<T>(&mut self, values: &mut [T]) {
-        use distributions::range::WideningMultiply;
-        // In theory this function is nothing more then:
-        // ```
-        // while i >= 2 {
-        //     i -= 1;
-        //     values.swap(i, self.gen_range(0, i + 1));`
-        // }
-        // ```
-        //
-        // `gen_range` samples completely unbiased from the range. It can be 4x
-        // as fast if only we could sample with a tiny bias. Using a modulus to
-        // reduce the range will always show a bias towards the lower numbers
-        // in the range.
-        //
-        // But we use the widening multiply method. Here the bias is evenly
-        // distributed over all numbers in the range. Also for every
-        // iteration the range is 1 less then the previous iteration, and
-        // the numbers that might have a bias are different. So besides the
-        // biases being very tiny, they also partly cancel out each other.
-        //
-        // Said differently, we only have to worry about biases when a whole
-        // group together shows a bias, as in the modulus method.
-
-        // We also optimize for slices of different length: huge slices with
-        // more than 2^32 elements (usize indexes), large slices with more than
-        // 2^16 elements, and medium to small slices. This allows us to make
-        // better use of the bits generated by the RNG.
-        let mut i = values.len();
-        while i > core::u32::MAX as usize || i & 1 == 1 {
-            // Invariant: elements with index >= i have been locked in place.
-            i -= 1;
-            // Lock element i in place, by swapping it with a random element in
-            // the range [0, i] (inclusive).
-            let (j, _) = self.gen::<u64>().wmul((i + 1) as u64);
-            values.swap(i, j as usize);
-        }
-        let mut i = i as u32;
-
-        while i > core::u16::MAX as u32 || i & 2 == 2 {
-            let r = self.gen::<u64>();
-            let r1 = r as u32;
-            let r2 = (r >> 32) as u32;
-
-            i -= 1;
-            let (j, _) = r1.wmul(i + 1);
-            values.swap(i as usize, j as usize);
-
-            i -= 1;
-            let (j, _) = r2.wmul(i + 1);
-            values.swap(i as usize, j as usize);
-        }
-
-        let mut i = i as u16;
-        while i >= 2 {
-            let mut r = self.gen::<u64>();
-            for _ in 0..4 {
-                i -= 1;
-                let (j, _) = (r as u16).wmul(i + 1);
-                values.swap(i as usize, j as usize);
-                r = r >> 16;
-            }
-        }
+        seq::shuffle(self, values)
     }
 }
 
diff --git a/src/seq.rs b/src/seq.rs
@@ -10,7 +10,8 @@
 
 //! Functions for randomly accessing and sampling sequences.
 
-use super::Rng;
+use Rng;
+use distributions::range::WideningMultiply;
 
 // This crate is only enabled when either std or alloc is available.
 // BTreeMap is not as fast in tests, but better than nothing.
@@ -224,6 +225,57 @@ fn sample_indices_cache<R>(
     out
 }
 
+pub(crate) fn shuffle<R, T>(rng: &mut R, values: &mut [T])
+where R: Rng + ?Sized {
+    // In theory this function is nothing more than:
+    // ```
+    // while i > 1 {
+    //     // invariant: elements with index >= i have been locked in place.
+    //     i -= 1;
+    //     // lock element i in place.
+    //     values.swap(i, rng.gen_range(0, i + 1));
+    // }
+    // ```
+    //
+    // We optimize for slices of different lengths, because generating ranges
+    // is faster for smaller integers: fewer bits from the RNG are necessary,
+    // and multiplies are faster.
+    //
+    // We don't switch exactly at the boundary between integer sizes,
+    // because right below the integer boundary a very large fraction of the
+    // generated values (25~50%) has to be rejected to avoid bias.
+    let mut i = values.len() as u64;
+    while i > (1 << 31) {
+        i -= 1;
+        values.swap(i as usize, rng.gen_range(0, i + 1) as usize);
+    }
+    let mut i = i as u32;
+    while i > (1 << 15) {
+        i -= 1;
+        values.swap(i as usize, rng.gen_range(0, i + 1) as usize);
+    }
+    let mut i = i as u16;
+    while i > 4 {
+        // Reimplement the range reduction here, because we can do better
+        // than generating 32 bits and throwing away half of them.
+        let mut value: u64 = rng.gen();
+        for _ in 0..4 {
+            let val = value as u16;
+            value = value >> 16;
+            let (hi, lo) = val.wmul(i);
+            let zone = ::core::u16::MAX - (::core::u16::MAX - i + 1) % i;
+            if lo <= zone {
+                i -= 1;
+                values.swap(i as usize, hi as usize);
+            }
+        }
+    }
+    while i > 1 {
+        i -= 1;
+        values.swap(i as usize, rng.gen_range(0, i + 1) as usize);
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;