diff --git a/Cargo.toml b/Cargo.toml index 9381831b..e4c92ab2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,8 @@ rand = "0.8" proptest = "1.0" criterion = { version = "0.3", features = ["html_reports"] } unicode-segmentation = "1.3" +fnv = "1" +fxhash = "0.2" #----------------------------------------- @@ -37,6 +39,10 @@ harness = false name = "insert" harness = false +[[bench]] +name = "hash" +harness = false + [[bench]] name = "remove" harness = false diff --git a/benches/hash.rs b/benches/hash.rs new file mode 100644 index 00000000..6bcc8b16 --- /dev/null +++ b/benches/hash.rs @@ -0,0 +1,116 @@ +extern crate criterion; +extern crate fnv; +extern crate fxhash; +extern crate ropey; + +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use fnv::FnvHasher; +use fxhash::FxHasher; +use ropey::Rope; + +const TEXT: &str = include_str!("large.txt"); +const TEXT_SMALL: &str = include_str!("small.txt"); +const TEXT_TINY: &str = "hello"; + +//---- + +fn hash_large(c: &mut Criterion) { + let mut group = c.benchmark_group("hash_large"); + + group.bench_function("default", |bench| { + let r = Rope::from_str(TEXT); + bench.iter(|| { + let mut hasher = DefaultHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fnv", |bench| { + let r = Rope::from_str(TEXT); + bench.iter(|| { + let mut hasher = FnvHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fxhash", |bench| { + let r = Rope::from_str(TEXT); + bench.iter(|| { + let mut hasher = FxHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); +} + +fn hash_small(c: &mut Criterion) { + let mut group = c.benchmark_group("hash_small"); + + group.bench_function("default", |bench| { + let r = Rope::from_str(TEXT_SMALL); + bench.iter(|| { + let mut hasher = DefaultHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fnv", |bench| { + let r = Rope::from_str(TEXT_SMALL); + bench.iter(|| { + let mut hasher = FnvHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fxhash", |bench| { + let r = Rope::from_str(TEXT_SMALL); + bench.iter(|| { + let mut hasher = FxHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); +} + +fn hash_tiny(c: &mut Criterion) { + let mut group = c.benchmark_group("hash_tiny"); + + group.bench_function("default", |bench| { + let r = Rope::from_str(TEXT_TINY); + bench.iter(|| { + let mut hasher = DefaultHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fnv", |bench| { + let r = Rope::from_str(TEXT_TINY); + bench.iter(|| { + let mut hasher = FnvHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); + + group.bench_function("fxhash", |bench| { + let r = Rope::from_str(TEXT_TINY); + bench.iter(|| { + let mut hasher = FxHasher::default(); + r.hash(black_box(&mut hasher)); + black_box(hasher.finish()); + }) + }); +} + +//---- + +criterion_group!(benches, hash_large, hash_small, hash_tiny,); +criterion_main!(benches); diff --git a/src/rope.rs b/src/rope.rs index 011ab211..7be18b95 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -2052,14 +2052,7 @@ impl std::cmp::PartialOrd for Rope { impl std::hash::Hash for Rope { fn hash(&self, state: &mut H) { - for chunk in self.chunks() { - state.write(chunk.as_bytes()); - } - - // Same strategy as `&str` in stdlib, so that e.g. two adjacent - // fields in a `#[derive(Hash)]` struct with "Hi " and "there" - // vs "Hi t" and "here" give the struct a different hash. - state.write_u8(0xff) + self.slice(..).hash(state) } } diff --git a/src/slice.rs b/src/slice.rs index 0b151777..84bcd32a 100644 --- a/src/slice.rs +++ b/src/slice.rs @@ -1974,8 +1974,51 @@ impl<'a, 'b> std::cmp::PartialOrd> for RopeSlice<'a> { impl<'a> std::hash::Hash for RopeSlice<'a> { fn hash(&self, state: &mut H) { + // `std::hash::Hasher` only guarantees the same hash output for + // exactly the same calls to `Hasher::write()`. Just submitting + // the same data in the same order isn't enough--it also has to + // be split the same between calls. So we go to some effort here + // to ensure that we always submit the text data in the same + // fixed-size blocks, even if those blocks don't align with chunk + // boundaries at all. + // + // The naive approach is to always copy to a fixed-size buffer + // and submit the buffer whenever it fills up. We conceptually + // follow that approach here, but we do a little better by + // skipping the buffer and directly passing the data without + // copying when possible. + const BLOCK_SIZE: usize = 256; + + let mut buffer = [0u8; BLOCK_SIZE]; + let mut buffer_len = 0; + for chunk in self.chunks() { - state.write(chunk.as_bytes()); + let mut data = chunk.as_bytes(); + + while !data.is_empty() { + if buffer_len == 0 && data.len() >= BLOCK_SIZE { + // Process data directly, skipping the buffer. + let (head, tail) = data.split_at(BLOCK_SIZE); + state.write(head); + data = tail; + } else if buffer_len == BLOCK_SIZE { + // Process the filled buffer. + state.write(&buffer[..]); + buffer_len = 0; + } else { + // Append to the buffer. + let n = (BLOCK_SIZE - buffer_len).min(data.len()); + let (head, tail) = data.split_at(n); + (&mut buffer[buffer_len..(buffer_len + n)]).copy_from_slice(head); + buffer_len += n; + data = tail; + } + } + } + + // Write any remaining unprocessed data in the buffer. + if buffer_len > 0 { + state.write(&buffer[..buffer_len]); } // Same strategy as `&str` in stdlib, so that e.g. two adjacent diff --git a/tests/hash.rs b/tests/hash.rs new file mode 100644 index 00000000..79d96ca3 --- /dev/null +++ b/tests/hash.rs @@ -0,0 +1,125 @@ +extern crate ropey; + +use std::hash::{Hash, Hasher}; + +use ropey::{Rope, RopeBuilder}; + +const SMALL_TEXT: &str = include_str!("small_ascii.txt"); + +/// This is an example `Hasher` to demonstrate a property guaranteed by +/// the documentation that is not exploited by the default `Hasher` (SipHash) +/// Relevant excerpt from the `Hasher` documentation: +/// > Nor can you assume that adjacent +/// > `write` calls are merged, so it's possible, for example, that +/// > ``` +/// > # fn foo(hasher: &mut impl std::hash::Hasher) { +/// > hasher.write(&[1, 2]); +/// > hasher.write(&[3, 4, 5, 6]); +/// > # } +/// > ``` +/// > and +/// > ``` +/// > # fn foo(hasher: &mut impl std::hash::Hasher) { +/// > hasher.write(&[1, 2, 3, 4]); +/// > hasher.write(&[5, 6]); +/// > # } +/// > ``` +/// > end up producing different hashes. +/// +/// This dummy hasher simply collects all bytes and inserts a separator byte (0xFF) at the end of `write`. +/// While this hasher might seem a little silly, it is perfectly inline with the std documentation. +/// Many other commonly used high performance `Hasher`s (fxhash, ahash, fnvhash) exploit the same property +/// to improve the performance of `write`, so violating this property will cause issues in practice. +#[derive(Default)] +struct TestHasher(std::collections::hash_map::DefaultHasher); +impl Hasher for TestHasher { + fn finish(&self) -> u64 { + self.0.finish() + } + + fn write(&mut self, bytes: &[u8]) { + self.0.write(bytes); + self.0.write_u8(0xFF); + } +} + +#[test] +#[cfg_attr(miri, ignore)] +fn hash_1() { + // Build two ropes with the same contents but different chunk boundaries. + let r1 = { + let mut b = RopeBuilder::new(); + b._append_chunk("Hello w"); + b._append_chunk("orld"); + b._finish_no_fix() + }; + let r2 = { + let mut b = RopeBuilder::new(); + b._append_chunk("Hell"); + b._append_chunk("o world"); + b._finish_no_fix() + }; + + let mut hasher1 = TestHasher::default(); + let mut hasher2 = TestHasher::default(); + r1.hash(&mut hasher1); + r2.hash(&mut hasher2); + + assert_eq!(hasher1.finish(), hasher2.finish()); +} + +#[test] +#[cfg_attr(miri, ignore)] +fn hash_2() { + // Build two ropes with the same contents but different chunk boundaries. + let r1 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(5) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + let r2 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(7) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + + for (l1, l2) in r1.lines().zip(r2.lines()) { + let mut hasher1 = TestHasher::default(); + let mut hasher2 = TestHasher::default(); + l1.hash(&mut hasher1); + l2.hash(&mut hasher2); + + assert_eq!(hasher1.finish(), hasher2.finish()); + } +} + +#[test] +#[cfg_attr(miri, ignore)] +fn hash_3() { + // Build two ropes with the same contents but different chunk boundaries. + let r1 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(521) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + let r2 = { + let mut b = RopeBuilder::new(); + for chunk in SMALL_TEXT.as_bytes().chunks(547) { + b._append_chunk(std::str::from_utf8(chunk).unwrap()); + } + b._finish_no_fix() + }; + + let mut hasher1 = TestHasher::default(); + let mut hasher2 = TestHasher::default(); + r1.hash(&mut hasher1); + r2.hash(&mut hasher2); + + assert_eq!(hasher1.finish(), hasher2.finish()); +} diff --git a/tests/small_ascii.txt b/tests/small_ascii.txt new file mode 100644 index 00000000..8f5157d1 --- /dev/null +++ b/tests/small_ascii.txt @@ -0,0 +1,24 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas sit amet tellus +nec turpis feugiat semper. Nam at nulla laoreet, finibus eros sit amet, fringilla +mauris. Fusce vestibulum nec ligula efficitur laoreet. Nunc orci leo, varius eget +ligula vulputate, consequat eleifend nisi. Cras justo purus, imperdiet a augue +malesuada, convallis cursus libero. Fusce pretium arcu in elementum laoreet. Duis +mauris nulla, suscipit at est nec, malesuada pellentesque eros. Quisque semper porta +malesuada. Nunc hendrerit est ac faucibus mollis. Nam fermentum id libero sed +egestas. Duis a accumsan sapien. Nam neque diam, congue non erat et, porta sagittis +turpis. Vivamus vitae mauris sit amet massa mollis molestie. Morbi scelerisque, +augue id congue imperdiet, felis lacus euismod dui, vitae facilisis massa dui quis +sapien. Vivamus hendrerit a urna a lobortis. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas sit amet tellus +nec turpis feugiat semper. Nam at nulla laoreet, finibus eros sit amet, fringilla +mauris. Fusce vestibulum nec ligula efficitur laoreet. Nunc orci leo, varius eget +ligula vulputate, consequat eleifend nisi. Cras justo purus, imperdiet a augue +malesuada, convallis cursus libero. Fusce pretium arcu in elementum laoreet. Duis +mauris nulla, suscipit at est nec, malesuada pellentesque eros. Quisque semper porta +malesuada. Nunc hendrerit est ac faucibus mollis. Nam fermentum id libero sed +egestas. Duis a accumsan sapien. Nam neque diam, congue non erat et, porta sagittis +turpis. Vivamus vitae mauris sit amet massa mollis molestie. Morbi scelerisque, +augue id congue imperdiet, felis lacus euismod dui, vitae facilisis massa dui quis +sapien. Vivamus hendrerit a urna a lobortis. +