From b8e9c068661cda79f27a031a71b0b9ce2bc18972 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 8 Jul 2024 11:26:06 +0200 Subject: [PATCH] Document bias and behavior when running out of entropy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `choose` and `choose_iter` incorrectly claimed to return `Error::NotEnoughData` when they in fact default to the first choice. This also documents that default in various other APIs. Additionally, `int_in_range` (and APIs that rely on it) has bias for non-power-of-two ranges. `u.int_in_range(0..=170)` for example will consume one byte of entropy, and take its value modulo 171 (the size of the range) to generate the returned integer. As a result, values in `0..=84` (the first ~half of the range) are twice as likely to get chosen as the rest (assuming the underlying bytes are uniform). In general, the result distribution is only uniform if the range size is a power of two (where the modulo just masks some bits). It would be accurate to document that return values are biased towards lower values when the range size is not a power of two, but do we want this much detail in the documented “contract” of this method? Similarly, I just called `ratio` “approximate”. `u.ratio(5, 7)` returns true for 184 out of 256 possible underlying byte values, ~0.6% too often. In the worst case, `u.ratio(84, 170)` returns true ~33% too often. Notably, `#[derive(Arbitrary)]` chooses enum variants not with `choose_index` (although that seems most appropriate from reading `Unstructured` docs) but by always consuming 4 bytes of entropy: ```rust // Use a multiply + shift to generate a ranged random number // with slight bias. For details, see: // https://lemire.me/blog/2016/06/30/fast-random-shuffling Ok(match (u64::from(<u32 as arbitrary::Arbitrary>::arbitrary(u)?) 
* #count) >> 32 { #(#variants,)* _ => unreachable!() }) ``` `int_in_range` tries to minimize consumption based on the range size but that contributes to having more bias than multiply + shift. Is this a real trade-off worth having two methods? --- src/error.rs | 4 ++++ src/lib.rs | 27 +++++++++++++++++++++++++++ src/unstructured.rs | 31 +++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/src/error.rs b/src/error.rs index 6ca8f19..8cdf39b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -8,6 +8,10 @@ pub enum Error { EmptyChoose, /// There was not enough underlying data to fulfill some request for raw /// bytes. + /// + /// Note that outside of [`Unstructured::bytes`][crate::Unstructured::bytes], + /// most APIs do *not* return this error when running out of underlying arbitrary bytes + /// but silently return some default value instead. NotEnoughData, /// The input bytes were not of the right format IncorrectFormat, diff --git a/src/lib.rs b/src/lib.rs index 6ff98f3..7da6f1b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -321,6 +321,7 @@ impl<'a> Arbitrary<'a> for () { } } +/// Returns false, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for bool { fn arbitrary(u: &mut Unstructured<'a>) -> Result { Ok(>::arbitrary(u)? & 1 == 1) @@ -335,6 +336,8 @@ impl<'a> Arbitrary<'a> for bool { macro_rules! impl_arbitrary_for_integers { ( $( $ty:ty; )* ) => { $( + /// Returns zero, not an error, + /// if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for $ty { fn arbitrary(u: &mut Unstructured<'a>) -> Result { let mut buf = [0; mem::size_of::<$ty>()]; @@ -369,6 +372,7 @@ impl_arbitrary_for_integers! { // Note: We forward Arbitrary for i/usize to i/u64 in order to simplify corpus // compatibility between 32-bit and 64-bit builds. This introduces dead space in // 32-bit builds but keeps the input layout independent of the build platform. 
+/// Returns zero, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for usize { fn arbitrary(u: &mut Unstructured<'a>) -> Result { u.arbitrary::().map(|x| x as usize) @@ -380,6 +384,7 @@ impl<'a> Arbitrary<'a> for usize { } } +/// Returns zero, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for isize { fn arbitrary(u: &mut Unstructured<'a>) -> Result { u.arbitrary::().map(|x| x as isize) @@ -394,6 +399,8 @@ impl<'a> Arbitrary<'a> for isize { macro_rules! impl_arbitrary_for_floats { ( $( $ty:ident : $unsigned:ty; )* ) => { $( + /// Returns zero, not an error, + /// if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for $ty { fn arbitrary(u: &mut Unstructured<'a>) -> Result { Ok(Self::from_bits(<$unsigned as Arbitrary<'a>>::arbitrary(u)?)) @@ -413,6 +420,7 @@ impl_arbitrary_for_floats! { f64: u64; } +/// Returns '\0', not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for char { fn arbitrary(u: &mut Unstructured<'a>) -> Result { use std::char; @@ -437,6 +445,7 @@ impl<'a> Arbitrary<'a> for char { } } +/// Returns false, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for AtomicBool { fn arbitrary(u: &mut Unstructured<'a>) -> Result { Arbitrary::arbitrary(u).map(Self::new) @@ -448,6 +457,7 @@ impl<'a> Arbitrary<'a> for AtomicBool { } } +/// Returns zero, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for AtomicIsize { fn arbitrary(u: &mut Unstructured<'a>) -> Result { Arbitrary::arbitrary(u).map(Self::new) @@ -459,6 +469,7 @@ impl<'a> Arbitrary<'a> for AtomicIsize { } } +/// Returns zero, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. 
impl<'a> Arbitrary<'a> for AtomicUsize { fn arbitrary(u: &mut Unstructured<'a>) -> Result { Arbitrary::arbitrary(u).map(Self::new) @@ -559,6 +570,7 @@ where cb(bound) } +/// Returns zero, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a> Arbitrary<'a> for Duration { fn arbitrary(u: &mut Unstructured<'a>) -> Result { Ok(Self::new( @@ -576,6 +588,7 @@ impl<'a> Arbitrary<'a> for Duration { } } +/// Returns `None`, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. impl<'a, A: Arbitrary<'a>> Arbitrary<'a> for Option { fn arbitrary(u: &mut Unstructured<'a>) -> Result { Ok(if >::arbitrary(u)? { @@ -1621,6 +1634,20 @@ mod test { ); assert_eq!((1, None), <(u8, Vec) as Arbitrary>::size_hint(0)); } + + #[test] + fn exhausted_entropy() { + let mut u = Unstructured::new(&[]); + assert_eq!(u.arbitrary::().unwrap(), false); + assert_eq!(u.arbitrary::().unwrap(), 0); + assert_eq!(u.arbitrary::().unwrap(), 0); + assert_eq!(u.arbitrary::().unwrap(), 0.0); + assert_eq!(u.arbitrary::().unwrap(), 0.0); + assert_eq!(u.arbitrary::>().unwrap(), None); + assert_eq!(u.int_in_range(4..=100).unwrap(), 4); + assert_eq!(u.choose_index(10).unwrap(), 0); + assert_eq!(u.ratio(5, 7).unwrap(), true); + } } /// Multiple conflicting arbitrary attributes are used on the same field: diff --git a/src/unstructured.rs b/src/unstructured.rs index 639a1fc..0a045e5 100644 --- a/src/unstructured.rs +++ b/src/unstructured.rs @@ -273,6 +273,11 @@ impl<'a> Unstructured<'a> { /// Do not use this to generate the size of a collection. Use /// `arbitrary_len` instead. /// + /// The probability distribution of the return value is not necessarily uniform. + /// + /// Returns `range.start()`, not an error, + /// if this `Unstructured` [is empty][Unstructured::is_empty]. + /// /// # Panics /// /// Panics if `range.start > range.end`. 
That is, the given range must be @@ -376,8 +381,12 @@ impl<'a> Unstructured<'a> { /// /// This should only be used inside of `Arbitrary` implementations. /// - /// Returns an error if there is not enough underlying data to make a - /// choice or if no choices are provided. + /// The probability distribution of choices is not necessarily uniform. + /// + /// Returns the first choice, not an error, + /// if this `Unstructured` [is empty][Unstructured::is_empty]. + /// + /// Returns an error if no choices are provided. /// /// # Examples /// @@ -415,8 +424,12 @@ impl<'a> Unstructured<'a> { /// /// This should only be used inside of `Arbitrary` implementations. /// - /// Returns an error if there is not enough underlying data to make a - /// choice or if no choices are provided. + /// The probability distribution of choices is not necessarily uniform. + /// + /// Returns the first choice, not an error, + /// if this `Unstructured` [is empty][Unstructured::is_empty]. + /// + /// Returns an error if no choices are provided. /// /// # Examples /// @@ -448,6 +461,10 @@ impl<'a> Unstructured<'a> { /// Choose a value in `0..len`. /// + /// The probability distribution of return values is not necessarily uniform. + /// + /// Returns zero, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. + /// /// Returns an error if the `len` is zero. /// /// # Examples @@ -491,7 +508,9 @@ impl<'a> Unstructured<'a> { Ok(idx) } - /// Generate a boolean according to the given ratio. + /// Generate a boolean which is true with probability approximately the given ratio. + /// + /// Returns true, not an error, if this `Unstructured` [is empty][Unstructured::is_empty]. /// /// # Panics /// @@ -511,7 +530,7 @@ impl<'a> Unstructured<'a> { /// let mut u = Unstructured::new(&my_data); /// /// if u.ratio(5, 7)? { - /// // Take this branch 5/7 of the time. + /// // Take this branch approximately 5/7 of the time. /// } /// # Ok(()) /// # }