From fef5be9c9587974584bafa40c54a375ce6fbbc9a Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Sun, 6 Nov 2022 17:34:59 -0800 Subject: [PATCH] Fix Hash impl on Rope and RopeSlice. The previous impl misunderstood the Hasher API, and assumed that as long as the same data was passed in the same order, the hash output would be the same. But in fact, the data must be split among the `write()` calls exactly the same as well. This new impl fixes that by always passing the data in exact fixed-sized blocks, regardless of the underlying chunks boundaries of the rope. Co-authored-by: Pascal Kuthe --- Cargo.toml | 6 ++ benches/hash.rs | 116 +++++++++++++++++++++++++++++++++++++++ src/rope.rs | 9 +-- src/slice.rs | 45 ++++++++++++++- tests/hash.rs | 125 ++++++++++++++++++++++++++++++++++++++++++ tests/small_ascii.txt | 24 ++++++++ 6 files changed, 316 insertions(+), 9 deletions(-) create mode 100644 benches/hash.rs create mode 100644 tests/hash.rs create mode 100644 tests/small_ascii.txt diff --git a/Cargo.toml b/Cargo.toml index 9381831b..e4c92ab2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,8 @@ rand = "0.8" proptest = "1.0" criterion = { version = "0.3", features = ["html_reports"] } unicode-segmentation = "1.3" +fnv = "1" +fxhash = "0.2" #----------------------------------------- @@ -37,6 +39,10 @@ harness = false name = "insert" harness = false +[[bench]] +name = "hash" +harness = false + [[bench]] name = "remove" harness = false diff --git a/benches/hash.rs b/benches/hash.rs new file mode 100644 index 00000000..6bcc8b16 --- /dev/null +++ b/benches/hash.rs @@ -0,0 +1,116 @@ +extern crate criterion; +extern crate fnv; +extern crate fxhash; +extern crate ropey; + +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use fnv::FnvHasher; +use fxhash::FxHasher; +use ropey::Rope; + +const TEXT: &str = include_str!("large.txt"); +const TEXT_SMALL: &str = include_str!("small.txt"); +const TEXT_TINY: &str = "hello"; + +//---- + +fn hash_large(c: &mut Criterion) { + let mut group = c.benchmark_group("hash_large"); + + group.bench_function("default", |bench| { + let r = Rope::from_str(TEXT); + bench.iter(|| { + let mut hasher = DefaultHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fnv", |bench| { + let r = Rope::from_str(TEXT); + bench.iter(|| { + let mut hasher = FnvHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fxhash", |bench| { + let r = Rope::from_str(TEXT); + bench.iter(|| { + let mut hasher = FxHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); +} + +fn hash_small(c: &mut Criterion) { + let mut group = c.benchmark_group("hash_small"); + + group.bench_function("default", |bench| { + let r = Rope::from_str(TEXT_SMALL); + bench.iter(|| { + let mut hasher = DefaultHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fnv", |bench| { + let r = Rope::from_str(TEXT_SMALL); + bench.iter(|| { + let mut hasher = FnvHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fxhash", |bench| { + let r = Rope::from_str(TEXT_SMALL); + bench.iter(|| { + let mut hasher = FxHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); +} + +fn hash_tiny(c: &mut Criterion) { + let mut group = c.benchmark_group("hash_tiny"); + + group.bench_function("default", |bench| { + let r = Rope::from_str(TEXT_TINY); + bench.iter(|| { + let mut hasher = DefaultHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fnv", |bench| { + let r = Rope::from_str(TEXT_TINY); + bench.iter(|| { + let mut hasher = FnvHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fxhash", |bench| { + let r = Rope::from_str(TEXT_TINY); + bench.iter(|| { + let mut hasher = FxHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); +} + +//---- + +criterion_group!(benches, hash_large, hash_small, hash_tiny,); +criterion_main!(benches); diff --git a/src/rope.rs b/src/rope.rs index 011ab211..7be18b95 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -2052,14 +2052,7 @@ impl std::cmp::PartialOrd for Rope { impl std::hash::Hash for Rope { fn hash(&self, state: &mut H) { - for chunk in self.chunks() { - state.write(chunk.as_bytes()); - } - - // Same strategy as `&str` in stdlib, so that e.g. two adjacent - // fields in a `#[derive(Hash)]` struct with "Hi " and "there" - // vs "Hi t" and "here" give the struct a different hash. - state.write_u8(0xff) + self.slice(..).hash(state) } } diff --git a/src/slice.rs b/src/slice.rs index 0b151777..84bcd32a 100644 --- a/src/slice.rs +++ b/src/slice.rs @@ -1974,8 +1974,51 @@ impl<'a, 'b> std::cmp::PartialOrd> for RopeSlice<'a> { impl<'a> std::hash::Hash for RopeSlice<'a> { fn hash(&self, state: &mut H) { + // `std::hash::Hasher` only guarantees the same hash output for + // exactly the same calls to `Hasher::write()`. Just submitting + // the same data in the same order isn't enough--it also has to + // be split the same between calls. So we go to some effort here + // to ensure that we always submit the text data in the same + // fixed-size blocks, even if those blocks don't align with chunk + // boundaries at all. + // + // The naive approach is to always copy to a fixed-size buffer + // and submit the buffer whenever it fills up. We conceptually + // follow that approach here, but we do a little better by + // skipping the buffer and directly passing the data without + // copying when possible. + const BLOCK_SIZE: usize = 256; + + let mut buffer = [0u8; BLOCK_SIZE]; + let mut buffer_len = 0; + for chunk in self.chunks() { - state.write(chunk.as_bytes()); + let mut data = chunk.as_bytes(); + + while !data.is_empty() { + if buffer_len == 0 && data.len() >= BLOCK_SIZE { + // Process data directly, skipping the buffer. + let (head, tail) = data.split_at(BLOCK_SIZE); + state.write(head); + data = tail; + } else if buffer_len == BLOCK_SIZE { + // Process the filled buffer. + state.write(&buffer[..]); + buffer_len = 0; + } else { + // Append to the buffer. + let n = (BLOCK_SIZE - buffer_len).min(data.len()); + let (head, tail) = data.split_at(n); + (&mut buffer[buffer_len..(buffer_len + n)]).copy_from_slice(head); + buffer_len += n; + data = tail; + } + } + } + + // Write any remaining unprocessed data in the buffer. + if buffer_len > 0 { + state.write(&buffer[..buffer_len]); } // Same strategy as `&str` in stdlib, so that e.g. two adjacent diff --git a/tests/hash.rs b/tests/hash.rs new file mode 100644 index 00000000..79d96ca3 --- /dev/null +++ b/tests/hash.rs @@ -0,0 +1,125 @@ +extern crate ropey; + +use std::hash::{Hash, Hasher}; + +use ropey::{Rope, RopeBuilder}; + +const SMALL_TEXT: &str = include_str!("small_ascii.txt"); + +/// This is an example `Hasher` to demonstrate a property guaranteed by +/// the documentation that is not exploited by the default `Hasher` (SipHash) +/// Relevant excerpt from the `Hasher` documentation: +/// > Nor can you assume that adjacent +/// > `write` calls are merged, so it's possible, for example, that +/// > ``` +/// > # fn foo(hasher: &mut impl std::hash::Hasher) { +/// > hasher.write(&[1, 2]); +/// > hasher.write(&[3, 4, 5, 6]); +/// > # } +/// > ``` +/// > and +/// > ``` +/// > # fn foo(hasher: &mut impl std::hash::Hasher) { +/// > hasher.write(&[1, 2, 3, 4]); +/// > hasher.write(&[5, 6]); +/// > # } +/// > ``` +/// > end up producing different hashes. +/// +/// This dummy hasher simply collects all bytes and inserts a separator byte (0xFF) at the end of `write`. +/// While this hasher might seem a little silly, it is perfectly inline with the std documentation. +/// Many other commonly used high performance `Hasher`s (fxhash, ahash, fnvhash) exploit the same property +/// to improve the performance of `write`, so violating this property will cause issues in practice. +#[derive(Default)] +struct TestHasher(std::collections::hash_map::DefaultHasher); +impl Hasher for TestHasher { + fn finish(&self) -> u64 { + self.0.finish() + } + + fn write(&mut self, bytes: &[u8]) { + self.0.write(bytes); + self.0.write_u8(0xFF); + } +} + +#[test] +#[cfg_attr(miri, ignore)] +fn hash_1() { + // Build two ropes with the same contents but different chunk boundaries. + let r1 = { + let mut b = RopeBuilder::new(); + b._append_chunk("Hello w"); + b._append_chunk("orld"); + b._finish_no_fix() + }; + let r2 = { + let mut b = RopeBuilder::new(); + b._append_chunk("Hell"); + b._append_chunk("o world"); + b._finish_no_fix() + }; + + let mut hasher1 = TestHasher::default(); + let mut hasher2 = TestHasher::default(); + r1.hash(&mut hasher1); + r2.hash(&mut hasher2); + + assert_eq!(hasher1.finish(), hasher2.finish()); +} + +#[test] +#[cfg_attr(miri, ignore)] +fn hash_2() { + // Build two ropes with the same contents but different chunk boundaries. + let r1 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(5) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + let r2 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(7) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + + for (l1, l2) in r1.lines().zip(r2.lines()) { + let mut hasher1 = TestHasher::default(); + let mut hasher2 = TestHasher::default(); + l1.hash(&mut hasher1); + l2.hash(&mut hasher2); + + assert_eq!(hasher1.finish(), hasher2.finish()); + } +} + +#[test] +#[cfg_attr(miri, ignore)] +fn hash_3() { + // Build two ropes with the same contents but different chunk boundaries. + let r1 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(521) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + let r2 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(547) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + + let mut hasher1 = TestHasher::default(); + let mut hasher2 = TestHasher::default(); + r1.hash(&mut hasher1); + r2.hash(&mut hasher2); + + assert_eq!(hasher1.finish(), hasher2.finish()); +} diff --git a/tests/small_ascii.txt b/tests/small_ascii.txt new file mode 100644 index 00000000..8f5157d1 --- /dev/null +++ b/tests/small_ascii.txt @@ -0,0 +1,24 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas sit amet tellus +nec turpis feugiat semper. Nam at nulla laoreet, finibus eros sit amet, fringilla +mauris. Fusce vestibulum nec ligula efficitur laoreet. Nunc orci leo, varius eget +ligula vulputate, consequat eleifend nisi. Cras justo purus, imperdiet a augue +malesuada, convallis cursus libero. Fusce pretium arcu in elementum laoreet. Duis +mauris nulla, suscipit at est nec, malesuada pellentesque eros. Quisque semper porta +malesuada. Nunc hendrerit est ac faucibus mollis. Nam fermentum id libero sed +egestas. Duis a accumsan sapien. Nam neque diam, congue non erat et, porta sagittis +turpis. Vivamus vitae mauris sit amet massa mollis molestie. Morbi scelerisque, +augue id congue imperdiet, felis lacus euismod dui, vitae facilisis massa dui quis +sapien. Vivamus hendrerit a urna a lobortis. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas sit amet tellus +nec turpis feugiat semper. Nam at nulla laoreet, finibus eros sit amet, fringilla +mauris. Fusce vestibulum nec ligula efficitur laoreet. Nunc orci leo, varius eget +ligula vulputate, consequat eleifend nisi. Cras justo purus, imperdiet a augue +malesuada, convallis cursus libero. Fusce pretium arcu in elementum laoreet. Duis +mauris nulla, suscipit at est nec, malesuada pellentesque eros. Quisque semper porta +malesuada. Nunc hendrerit est ac faucibus mollis. Nam fermentum id libero sed +egestas. Duis a accumsan sapien. Nam neque diam, congue non erat et, porta sagittis +turpis. Vivamus vitae mauris sit amet massa mollis molestie. Morbi scelerisque, +augue id congue imperdiet, felis lacus euismod dui, vitae facilisis massa dui quis +sapien. Vivamus hendrerit a urna a lobortis. +