Auto merge of #94598 - scottmcm:prefix-free-hasher-methods, r=Amanieu

bors · bors · commit 8c4fc9d9a45b · 2022-05-06T09:43:57.000Z
Add a dedicated length-prefixing method to `Hasher` This accomplishes two main goals: - Make it clear who is responsible for prefix-freedom, including how they should do it - Make it feasible for a `Hasher` that *doesn't* care about Hash-DoS resistance to get better performance by not hashing lengths This does not change rustc-hash, since that's in an external crate, but that could potentially use it in future. Fixes #94026 r? rust-lang/libs --- The core of this change is the following two new methods on `Hasher`: ```rust pub trait Hasher { /// Writes a length prefix into this hasher, as part of being prefix-free. /// /// If you're implementing [`Hash`] for a custom collection, call this before /// writing its contents to this `Hasher`. That way /// `(collection![1, 2, 3], collection![4, 5])` and /// `(collection![1, 2], collection![3, 4, 5])` will provide different /// sequences of values to the `Hasher` /// /// The `impl<T> Hash for [T]` includes a call to this method, so if you're /// hashing a slice (or array or vector) via its `Hash::hash` method, /// you should **not** call this yourself. /// /// This method is only for providing domain separation. If you want to /// hash a `usize` that represents part of the *data*, then it's important /// that you pass it to [`Hasher::write_usize`] instead of to this method. /// /// # Examples /// /// ``` /// #![feature(hasher_prefixfree_extras)] /// # // Stubs to make the `impl` below pass the compiler /// # struct MyCollection<T>(Option<T>); /// # impl<T> MyCollection<T> { /// # fn len(&self) -> usize { todo!() } /// # } /// # impl<'a, T> IntoIterator for &'a MyCollection<T> { /// # type Item = T; /// # type IntoIter = std::iter::Empty<T>; /// # fn into_iter(self) -> Self::IntoIter { todo!() } /// # } /// /// use std::hash::{Hash, Hasher}; /// impl<T: Hash> Hash for MyCollection<T> { /// fn hash<H: Hasher>(&self, state: &mut H) { /// state.write_length_prefix(self.len()); /// for elt in self { /// elt.hash(state); /// } /// } /// } /// ``` /// /// # Note to Implementers /// /// If you've decided that your `Hasher` is willing to be susceptible to /// Hash-DoS attacks, then you might consider skipping hashing some or all /// of the `len` provided in the name of increased performance. #[inline] #[unstable(feature = "hasher_prefixfree_extras", issue = "88888888")] fn write_length_prefix(&mut self, len: usize) { self.write_usize(len); } /// Writes a single `str` into this hasher. /// /// If you're implementing [`Hash`], you generally do not need to call this, /// as the `impl Hash for str` does, so you can just use that. /// /// This includes the domain separator for prefix-freedom, so you should /// **not** call `Self::write_length_prefix` before calling this. /// /// # Note to Implementers /// /// The default implementation of this method includes a call to /// [`Self::write_length_prefix`], so if your implementation of `Hasher` /// doesn't care about prefix-freedom and you've thus overridden /// that method to do nothing, there's no need to override this one. /// /// This method is available to be overridden separately from the others /// as `str` being UTF-8 means that it never contains `0xFF` bytes, which /// can be used to provide prefix-freedom cheaper than hashing a length. /// /// For example, if your `Hasher` works byte-by-byte (perhaps by accumulating /// them into a buffer), then you can hash the bytes of the `str` followed /// by a single `0xFF` byte. /// /// If your `Hasher` works in chunks, you can also do this by being careful /// about how you pad partial chunks. If the chunks are padded with `0x00` /// bytes then just hashing an extra `0xFF` byte doesn't necessarily /// provide prefix-freedom, as `"ab"` and `"ab\u{0}"` would likely hash /// the same sequence of chunks. But if you pad with `0xFF` bytes instead, /// ensuring at least one padding byte, then it can often provide /// prefix-freedom cheaper than hashing the length would. #[inline] #[unstable(feature = "hasher_prefixfree_extras", issue = "88888888")] fn write_str(&mut self, s: &str) { self.write_length_prefix(s.len()); self.write(s.as_bytes()); } } ``` With updates to the `Hash` implementations for slices and containers to call `write_length_prefix` instead of `write_usize`. `write_str` defaults to using `write_length_prefix` since, as was pointed out in the issue, the `write_u8(0xFF)` approach is insufficient for hashers that work in chunks, as those would hash `"a\u{0}"` and `"a"` to the same thing. But since `SipHash` works byte-wise (there's an internal buffer to accumulate bytes until a full chunk is available) it overrides `write_str` to continue to use the add-non-UTF-8-byte approach. --- Compatibility: Because the default implementation of `write_length_prefix` calls `write_usize`, the changed hash implementation for slices will do the same thing the old one did on existing `Hasher`s.
diff --git a/compiler/rustc_data_structures/src/lib.rs b/compiler/rustc_data_structures/src/lib.rs
@@ -17,6 +17,7 @@
 #![feature(generators)]
 #![feature(let_else)]
 #![feature(hash_raw_entry)]
+#![feature(hasher_prefixfree_extras)]
 #![feature(maybe_uninit_uninit_array)]
 #![feature(min_specialization)]
 #![feature(never_type)]
diff --git a/compiler/rustc_data_structures/src/sip128.rs b/compiler/rustc_data_structures/src/sip128.rs
@@ -462,6 +462,14 @@ impl Hasher for SipHasher128 {
         self.slice_write(msg);
     }
 
+    #[inline]
+    fn write_str(&mut self, s: &str) {
+        // This hasher works byte-wise, and `0xFF` cannot show up in a `str`,
+        // so just hashing the one extra byte is enough to be prefix-free.
+        self.write(s.as_bytes());
+        self.write_u8(0xFF);
+    }
+
     fn finish(&self) -> u64 {
         panic!("SipHasher128 cannot provide valid 64 bit hashes")
     }
diff --git a/compiler/rustc_data_structures/src/stable_hasher.rs b/compiler/rustc_data_structures/src/stable_hasher.rs
@@ -73,6 +73,17 @@ impl Hasher for StableHasher {
         self.state.write(bytes);
     }
 
+    #[inline]
+    fn write_str(&mut self, s: &str) {
+        self.state.write_str(s);
+    }
+
+    #[inline]
+    fn write_length_prefix(&mut self, len: usize) {
+        // Our impl for `usize` will extend it if needed.
+        self.write_usize(len);
+    }
+
     #[inline]
     fn write_u8(&mut self, i: u8) {
         self.state.write_u8(i);
diff --git a/library/alloc/src/boxed.rs b/library/alloc/src/boxed.rs
@@ -1369,6 +1369,12 @@ impl<T: ?Sized + Hasher, A: Allocator> Hasher for Box<T, A> {
     fn write_isize(&mut self, i: isize) {
         (**self).write_isize(i)
     }
+    fn write_length_prefix(&mut self, len: usize) {
+        (**self).write_length_prefix(len)
+    }
+    fn write_str(&mut self, s: &str) {
+        (**self).write_str(s)
+    }
 }
 
 #[cfg(not(no_global_oom_handling))]
diff --git a/library/alloc/src/collections/btree/map.rs b/library/alloc/src/collections/btree/map.rs
@@ -1990,7 +1990,7 @@ impl<'a, K: Ord + Copy, V: Copy> Extend<(&'a K, &'a V)> for BTreeMap<K, V> {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<K: Hash, V: Hash> Hash for BTreeMap<K, V> {
     fn hash<H: Hasher>(&self, state: &mut H) {
-        self.len().hash(state);
+        state.write_length_prefix(self.len());
         for elt in self {
             elt.hash(state);
         }
diff --git a/library/alloc/src/collections/linked_list.rs b/library/alloc/src/collections/linked_list.rs
@@ -1944,7 +1944,7 @@ impl<T: fmt::Debug> fmt::Debug for LinkedList<T> {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<T: Hash> Hash for LinkedList<T> {
     fn hash<H: Hasher>(&self, state: &mut H) {
-        self.len().hash(state);
+        state.write_length_prefix(self.len());
         for elt in self {
             elt.hash(state);
         }
diff --git a/library/alloc/src/collections/vec_deque/mod.rs b/library/alloc/src/collections/vec_deque/mod.rs
@@ -2899,7 +2899,7 @@ impl<T: Ord, A: Allocator> Ord for VecDeque<T, A> {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<T: Hash, A: Allocator> Hash for VecDeque<T, A> {
     fn hash<H: Hasher>(&self, state: &mut H) {
-        self.len().hash(state);
+        state.write_length_prefix(self.len());
         // It's not possible to use Hash::hash_slice on slices
         // returned by as_slices method as their length can vary
         // in otherwise identical deques.
diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs
@@ -117,6 +117,7 @@
 #![feature(extend_one)]
 #![feature(fmt_internals)]
 #![feature(fn_traits)]
+#![feature(hasher_prefixfree_extras)]
 #![feature(inplace_iteration)]
 #![feature(iter_advance_by)]
 #![feature(layout_for_ptr)]
diff --git a/library/core/src/hash/mod.rs b/library/core/src/hash/mod.rs
@@ -333,6 +333,12 @@ pub trait Hasher {
     ///
     /// println!("Hash is {:x}!", hasher.finish());
     /// ```
+    ///
+    /// # Note to Implementers
+    ///
+    /// You generally should not do length-prefixing as part of implementing
+    /// this method.  It's up to the [`Hash`] implementation to call
+    /// [`Hasher::write_length_prefix`] before sequences that need it.
     #[stable(feature = "rust1", since = "1.0.0")]
     fn write(&mut self, bytes: &[u8]);
 
@@ -409,6 +415,127 @@ pub trait Hasher {
     fn write_isize(&mut self, i: isize) {
         self.write_usize(i as usize)
     }
+
+    /// Writes a length prefix into this hasher, as part of being prefix-free.
+    ///
+    /// If you're implementing [`Hash`] for a custom collection, call this before
+    /// writing its contents to this `Hasher`.  That way
+    /// `(collection![1, 2, 3], collection![4, 5])` and
+    /// `(collection![1, 2], collection![3, 4, 5])` will provide different
+    /// sequences of values to the `Hasher`
+    ///
+    /// The `impl<T> Hash for [T]` includes a call to this method, so if you're
+    /// hashing a slice (or array or vector) via its `Hash::hash` method,
+    /// you should **not** call this yourself.
+    ///
+    /// This method is only for providing domain separation.  If you want to
+    /// hash a `usize` that represents part of the *data*, then it's important
+    /// that you pass it to [`Hasher::write_usize`] instead of to this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(hasher_prefixfree_extras)]
+    /// # // Stubs to make the `impl` below pass the compiler
+    /// # struct MyCollection<T>(Option<T>);
+    /// # impl<T> MyCollection<T> {
+    /// #     fn len(&self) -> usize { todo!() }
+    /// # }
+    /// # impl<'a, T> IntoIterator for &'a MyCollection<T> {
+    /// #     type Item = T;
+    /// #     type IntoIter = std::iter::Empty<T>;
+    /// #     fn into_iter(self) -> Self::IntoIter { todo!() }
+    /// # }
+    ///
+    /// use std::hash::{Hash, Hasher};
+    /// impl<T: Hash> Hash for MyCollection<T> {
+    ///     fn hash<H: Hasher>(&self, state: &mut H) {
+    ///         state.write_length_prefix(self.len());
+    ///         for elt in self {
+    ///             elt.hash(state);
+    ///         }
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// # Note to Implementers
+    ///
+    /// If you've decided that your `Hasher` is willing to be susceptible to
+    /// Hash-DoS attacks, then you might consider skipping hashing some or all
+    /// of the `len` provided in the name of increased performance.
+    #[inline]
+    #[unstable(feature = "hasher_prefixfree_extras", issue = "96762")]
+    fn write_length_prefix(&mut self, len: usize) {
+        self.write_usize(len);
+    }
+
+    /// Writes a single `str` into this hasher.
+    ///
+    /// If you're implementing [`Hash`], you generally do not need to call this,
+    /// as the `impl Hash for str` does, so you should prefer that instead.
+    ///
+    /// This includes the domain separator for prefix-freedom, so you should
+    /// **not** call `Self::write_length_prefix` before calling this.
+    ///
+    /// # Note to Implementers
+    ///
+    /// There are at least two reasonable default ways to implement this.
+    /// Which one will be the default is not yet decided, so for now
+    /// you probably want to override it specifically.
+    ///
+    /// ## The general answer
+    ///
+    /// It's always correct to implement this with a length prefix:
+    ///
+    /// ```
+    /// # #![feature(hasher_prefixfree_extras)]
+    /// # struct Foo;
+    /// # impl std::hash::Hasher for Foo {
+    /// # fn finish(&self) -> u64 { unimplemented!() }
+    /// # fn write(&mut self, _bytes: &[u8]) { unimplemented!() }
+    /// fn write_str(&mut self, s: &str) {
+    ///     self.write_length_prefix(s.len());
+    ///     self.write(s.as_bytes());
+    /// }
+    /// # }
+    /// ```
+    ///
+    /// And, if your `Hasher` works in `usize` chunks, this is likely a very
+    /// efficient way to do it, as anything more complicated may well end up
+    /// slower than just running the round with the length.
+    ///
+    /// ## If your `Hasher` works byte-wise
+    ///
+    /// One nice thing about `str` being UTF-8 is that the `b'\xFF'` byte
+    /// never happens.  That means that you can append that to the byte stream
+    /// being hashed and maintain prefix-freedom:
+    ///
+    /// ```
+    /// # #![feature(hasher_prefixfree_extras)]
+    /// # struct Foo;
+    /// # impl std::hash::Hasher for Foo {
+    /// # fn finish(&self) -> u64 { unimplemented!() }
+    /// # fn write(&mut self, _bytes: &[u8]) { unimplemented!() }
+    /// fn write_str(&mut self, s: &str) {
+    ///     self.write(s.as_bytes());
+    ///     self.write_u8(0xff);
+    /// }
+    /// # }
+    /// ```
+    ///
+    /// This does require that your implementation not add extra padding, and
+    /// thus generally requires that you maintain a buffer, running a round
+    /// only once that buffer is full (or `finish` is called).
+    ///
+    /// That's because if `write` pads data out to a fixed chunk size, it's
+    /// likely that it does it in such a way that `"a"` and `"a\x00"` would
+    /// end up hashing the same sequence of things, introducing conflicts.
+    #[inline]
+    #[unstable(feature = "hasher_prefixfree_extras", issue = "96762")]
+    fn write_str(&mut self, s: &str) {
+        self.write(s.as_bytes());
+        self.write_u8(0xff);
+    }
 }
 
 #[stable(feature = "indirect_hasher_impl", since = "1.22.0")]
@@ -455,6 +582,12 @@ impl<H: Hasher + ?Sized> Hasher for &mut H {
     fn write_isize(&mut self, i: isize) {
         (**self).write_isize(i)
     }
+    fn write_length_prefix(&mut self, len: usize) {
+        (**self).write_length_prefix(len)
+    }
+    fn write_str(&mut self, s: &str) {
+        (**self).write_str(s)
+    }
 }
 
 /// A trait for creating instances of [`Hasher`].
@@ -709,8 +842,7 @@ mod impls {
     impl Hash for str {
         #[inline]
         fn hash<H: Hasher>(&self, state: &mut H) {
-            state.write(self.as_bytes());
-            state.write_u8(0xff)
+            state.write_str(self);
         }
     }
 
@@ -767,7 +899,7 @@ mod impls {
     impl<T: Hash> Hash for [T] {
         #[inline]
         fn hash<H: Hasher>(&self, state: &mut H) {
-            self.len().hash(state);
+            state.write_length_prefix(self.len());
             Hash::hash_slice(self, state)
         }
     }
diff --git a/library/core/src/hash/sip.rs b/library/core/src/hash/sip.rs
@@ -233,6 +233,11 @@ impl super::Hasher for SipHasher {
         self.0.hasher.write(msg)
     }
 
+    #[inline]
+    fn write_str(&mut self, s: &str) {
+        self.0.hasher.write_str(s);
+    }
+
     #[inline]
     fn finish(&self) -> u64 {
         self.0.hasher.finish()
@@ -246,6 +251,11 @@ impl super::Hasher for SipHasher13 {
         self.hasher.write(msg)
     }
 
+    #[inline]
+    fn write_str(&mut self, s: &str) {
+        self.hasher.write_str(s);
+    }
+
     #[inline]
     fn finish(&self) -> u64 {
         self.hasher.finish()
@@ -307,6 +317,14 @@ impl<S: Sip> super::Hasher for Hasher<S> {
         self.ntail = left;
     }
 
+    #[inline]
+    fn write_str(&mut self, s: &str) {
+        // This hasher works byte-wise, and `0xFF` cannot show up in a `str`,
+        // so just hashing the one extra byte is enough to be prefix-free.
+        self.write(s.as_bytes());
+        self.write_u8(0xFF);
+    }
+
     #[inline]
     fn finish(&self) -> u64 {
         let mut state = self.state;
diff --git a/library/core/tests/hash/mod.rs b/library/core/tests/hash/mod.rs
@@ -20,6 +20,10 @@ impl Hasher for MyHasher {
             self.hash += *byte as u64;
         }
     }
+    fn write_str(&mut self, s: &str) {
+        self.write(s.as_bytes());
+        self.write_u8(0xFF);
+    }
     fn finish(&self) -> u64 {
         self.hash
     }
diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs
@@ -36,6 +36,7 @@
 #![feature(future_join)]
 #![feature(future_poll_fn)]
 #![feature(array_from_fn)]
+#![feature(hasher_prefixfree_extras)]
 #![feature(hashmap_internals)]
 #![feature(try_find)]
 #![feature(inline_const)]
diff --git a/library/std/src/collections/hash/map.rs b/library/std/src/collections/hash/map.rs
@@ -3006,11 +3006,19 @@ impl Default for DefaultHasher {
 
 #[stable(feature = "hashmap_default_hasher", since = "1.13.0")]
 impl Hasher for DefaultHasher {
+    // The underlying `SipHasher13` doesn't override the other
+    // `write_*` methods, so it's ok not to forward them here.
+
     #[inline]
     fn write(&mut self, msg: &[u8]) {
         self.0.write(msg)
     }
 
+    #[inline]
+    fn write_str(&mut self, s: &str) {
+        self.0.write_str(s);
+    }
+
     #[inline]
     fn finish(&self) -> u64 {
         self.0.finish()
diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs
@@ -271,6 +271,7 @@
 #![feature(exact_size_is_empty)]
 #![feature(extend_one)]
 #![feature(float_minimum_maximum)]
+#![feature(hasher_prefixfree_extras)]
 #![feature(hashmap_internals)]
 #![feature(int_error_internals)]
 #![feature(maybe_uninit_slice)]

Original file line number	Diff line number	Diff line change
`@@ -462,6 +462,14 @@ impl Hasher for SipHasher128 {`
`462`	`462`	`self.slice_write(msg);`
`463`	`463`	`}`
`464`	`464`
	`465`	`+ #[inline]`
	`466`	`+ fn write_str(&mut self, s: &str) {`
	`467`	+ // This hasher works byte-wise, and `0xFF` cannot show up in a `str`,
	`468`	`+ // so just hashing the one extra byte is enough to be prefix-free.`
	`469`	`+ self.write(s.as_bytes());`
	`470`	`+ self.write_u8(0xFF);`
	`471`	`+ }`
	`472`	`+`
`465`	`473`	`fn finish(&self) -> u64 {`
`466`	`474`	`panic!("SipHasher128 cannot provide valid 64 bit hashes")`
`467`	`475`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1369,6 +1369,12 @@ impl<T: ?Sized + Hasher, A: Allocator> Hasher for Box<T, A> {`
`1369`	`1369`	`fn write_isize(&mut self, i: isize) {`
`1370`	`1370`	`(**self).write_isize(i)`
`1371`	`1371`	`}`
	`1372`	`+ fn write_length_prefix(&mut self, len: usize) {`
	`1373`	`+ (**self).write_length_prefix(len)`
	`1374`	`+ }`
	`1375`	`+ fn write_str(&mut self, s: &str) {`
	`1376`	`+ (**self).write_str(s)`
	`1377`	`+ }`
`1372`	`1378`	`}`
`1373`	`1379`
`1374`	`1380`	`#[cfg(not(no_global_oom_handling))]`
Original file line number	Diff line number	Diff line change
`@@ -1990,7 +1990,7 @@ impl<'a, K: Ord + Copy, V: Copy> Extend<(&'a K, &'a V)> for BTreeMap<K, V> {`
`1990`	`1990`	`#[stable(feature = "rust1", since = "1.0.0")]`
`1991`	`1991`	`impl<K: Hash, V: Hash> Hash for BTreeMap<K, V> {`
`1992`	`1992`	`fn hash<H: Hasher>(&self, state: &mut H) {`
`1993`		`- self.len().hash(state);`
	`1993`	`+ state.write_length_prefix(self.len());`
`1994`	`1994`	`for elt in self {`
`1995`	`1995`	`elt.hash(state);`
`1996`	`1996`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1944,7 +1944,7 @@ impl<T: fmt::Debug> fmt::Debug for LinkedList<T> {`
`1944`	`1944`	`#[stable(feature = "rust1", since = "1.0.0")]`
`1945`	`1945`	`impl<T: Hash> Hash for LinkedList<T> {`
`1946`	`1946`	`fn hash<H: Hasher>(&self, state: &mut H) {`
`1947`		`- self.len().hash(state);`
	`1947`	`+ state.write_length_prefix(self.len());`
`1948`	`1948`	`for elt in self {`
`1949`	`1949`	`elt.hash(state);`
`1950`	`1950`	`}`
Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,10 @@ impl Hasher for MyHasher {`
`20`	`20`	`self.hash += *byte as u64;`
`21`	`21`	`}`
`22`	`22`	`}`
	`23`	`+ fn write_str(&mut self, s: &str) {`
	`24`	`+ self.write(s.as_bytes());`
	`25`	`+ self.write_u8(0xFF);`
	`26`	`+ }`
`23`	`27`	`fn finish(&self) -> u64 {`
`24`	`28`	`self.hash`
`25`	`29`	`}`