Skip to content

Commit

Permalink
Fix Hash impl on Rope and RopeSlice.
Browse files Browse the repository at this point in the history
The previous impl misunderstood the Hasher API, and assumed that
as long as the same data was passed in the same order, the hash
output would be the same.  But in fact, the data must be split
among the `write()` calls exactly the same as well.  This new impl
fixes that by always passing the data in exact fixed-sized blocks,
regardless of the underlying chunk boundaries of the rope.

Co-authored-by: Pascal Kuthe <pascal.kuthe@semimod.de>
  • Loading branch information
cessen and pascalkuthe committed Nov 7, 2022
1 parent 69cfb1e commit fef5be9
Show file tree
Hide file tree
Showing 6 changed files with 316 additions and 9 deletions.
6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ rand = "0.8"
proptest = "1.0"
criterion = { version = "0.3", features = ["html_reports"] }
unicode-segmentation = "1.3"
fnv = "1"
fxhash = "0.2"

#-----------------------------------------

Expand All @@ -37,6 +39,10 @@ harness = false
name = "insert"
harness = false

[[bench]]
name = "hash"
harness = false

[[bench]]
name = "remove"
harness = false
Expand Down
116 changes: 116 additions & 0 deletions benches/hash.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
extern crate criterion;
extern crate fnv;
extern crate fxhash;
extern crate ropey;

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use fnv::FnvHasher;
use fxhash::FxHasher;
use ropey::Rope;

const TEXT: &str = include_str!("large.txt");
const TEXT_SMALL: &str = include_str!("small.txt");
const TEXT_TINY: &str = "hello";

//----

/// Benchmarks hashing a large rope with three commonly used hashers
/// (SipHash via `DefaultHasher`, FNV, and FxHash).
fn hash_large(c: &mut Criterion) {
    // Build the rope once; only the hashing itself is timed inside `iter`.
    let rope = Rope::from_str(TEXT);
    let mut group = c.benchmark_group("hash_large");

    group.bench_function("default", |bench| {
        bench.iter(|| {
            let mut hasher = DefaultHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });

    group.bench_function("fnv", |bench| {
        bench.iter(|| {
            let mut hasher = FnvHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });

    group.bench_function("fxhash", |bench| {
        bench.iter(|| {
            let mut hasher = FxHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });
}

/// Benchmarks hashing a small rope with three commonly used hashers
/// (SipHash via `DefaultHasher`, FNV, and FxHash).
fn hash_small(c: &mut Criterion) {
    // Build the rope once; only the hashing itself is timed inside `iter`.
    let rope = Rope::from_str(TEXT_SMALL);
    let mut group = c.benchmark_group("hash_small");

    group.bench_function("default", |bench| {
        bench.iter(|| {
            let mut hasher = DefaultHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });

    group.bench_function("fnv", |bench| {
        bench.iter(|| {
            let mut hasher = FnvHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });

    group.bench_function("fxhash", |bench| {
        bench.iter(|| {
            let mut hasher = FxHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });
}

/// Benchmarks hashing a tiny (single-chunk) rope with three commonly used
/// hashers (SipHash via `DefaultHasher`, FNV, and FxHash).
fn hash_tiny(c: &mut Criterion) {
    // Build the rope once; only the hashing itself is timed inside `iter`.
    let rope = Rope::from_str(TEXT_TINY);
    let mut group = c.benchmark_group("hash_tiny");

    group.bench_function("default", |bench| {
        bench.iter(|| {
            let mut hasher = DefaultHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });

    group.bench_function("fnv", |bench| {
        bench.iter(|| {
            let mut hasher = FnvHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });

    group.bench_function("fxhash", |bench| {
        bench.iter(|| {
            let mut hasher = FxHasher::default();
            rope.hash(black_box(&mut hasher));
            black_box(hasher.finish());
        })
    });
}

//----

// Register the three hash benchmarks with criterion's test harness.
criterion_group!(benches, hash_large, hash_small, hash_tiny);
criterion_main!(benches);
9 changes: 1 addition & 8 deletions src/rope.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2052,14 +2052,7 @@ impl std::cmp::PartialOrd<Rope> for Rope {

impl std::hash::Hash for Rope {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
for chunk in self.chunks() {
state.write(chunk.as_bytes());
}

// Same strategy as `&str` in stdlib, so that e.g. two adjacent
// fields in a `#[derive(Hash)]` struct with "Hi " and "there"
// vs "Hi t" and "here" give the struct a different hash.
state.write_u8(0xff)
self.slice(..).hash(state)
}
}

Expand Down
45 changes: 44 additions & 1 deletion src/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1974,8 +1974,51 @@ impl<'a, 'b> std::cmp::PartialOrd<RopeSlice<'b>> for RopeSlice<'a> {

impl<'a> std::hash::Hash for RopeSlice<'a> {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
// `std::hash::Hasher` only guarantees the same hash output for
// exactly the same calls to `Hasher::write()`. Just submitting
// the same data in the same order isn't enough--it also has to
// be split the same between calls. So we go to some effort here
// to ensure that we always submit the text data in the same
// fixed-size blocks, even if those blocks don't align with chunk
// boundaries at all.
//
// The naive approach is to always copy to a fixed-size buffer
// and submit the buffer whenever it fills up. We conceptually
// follow that approach here, but we do a little better by
// skipping the buffer and directly passing the data without
// copying when possible.
const BLOCK_SIZE: usize = 256;

let mut buffer = [0u8; BLOCK_SIZE];
let mut buffer_len = 0;

for chunk in self.chunks() {
state.write(chunk.as_bytes());
let mut data = chunk.as_bytes();

while !data.is_empty() {
if buffer_len == 0 && data.len() >= BLOCK_SIZE {
// Process data directly, skipping the buffer.
let (head, tail) = data.split_at(BLOCK_SIZE);
state.write(head);
data = tail;
} else if buffer_len == BLOCK_SIZE {
// Process the filled buffer.
state.write(&buffer[..]);
buffer_len = 0;
} else {
// Append to the buffer.
let n = (BLOCK_SIZE - buffer_len).min(data.len());
let (head, tail) = data.split_at(n);
(&mut buffer[buffer_len..(buffer_len + n)]).copy_from_slice(head);
buffer_len += n;
data = tail;
}
}
}

// Write any remaining unprocessed data in the buffer.
if buffer_len > 0 {
state.write(&buffer[..buffer_len]);
}

// Same strategy as `&str` in stdlib, so that e.g. two adjacent
Expand Down
125 changes: 125 additions & 0 deletions tests/hash.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
extern crate ropey;

use std::hash::{Hash, Hasher};

use ropey::{Rope, RopeBuilder};

const SMALL_TEXT: &str = include_str!("small_ascii.txt");

/// A `Hasher` that deliberately makes the final hash depend on how the
/// input bytes are split across `write()` calls.
///
/// The `std::hash::Hasher` documentation explicitly permits this: adjacent
/// `write` calls are not guaranteed to be merged, so
/// `write(&[1, 2]); write(&[3, 4, 5, 6])` may legitimately hash
/// differently from `write(&[1, 2, 3, 4]); write(&[5, 6])`. The stdlib's
/// default hasher (SipHash) happens not to exploit that freedom, but many
/// popular high-performance hashers (fxhash, ahash, fnv) do — so a `Hash`
/// impl that feeds identical data through differently split `write` calls
/// is broken in practice even though it passes with `DefaultHasher`.
///
/// This wrapper makes the property explicit by appending a 0xFF separator
/// byte at the end of every `write` call.
#[derive(Default)]
struct TestHasher {
    inner: std::collections::hash_map::DefaultHasher,
}

impl Hasher for TestHasher {
    fn finish(&self) -> u64 {
        self.inner.finish()
    }

    fn write(&mut self, bytes: &[u8]) {
        self.inner.write(bytes);
        // Sentinel marking the end of this particular `write` call.
        self.inner.write_u8(0xFF);
    }
}

#[test]
#[cfg_attr(miri, ignore)]
fn hash_1() {
    // Two ropes with identical text but different chunk boundaries must
    // hash identically, even under a hasher that is sensitive to how the
    // data is split across `write` calls.
    let mut builder = RopeBuilder::new();
    builder._append_chunk("Hello w");
    builder._append_chunk("orld");
    let r1 = builder._finish_no_fix();

    let mut builder = RopeBuilder::new();
    builder._append_chunk("Hell");
    builder._append_chunk("o world");
    let r2 = builder._finish_no_fix();

    let mut h1 = TestHasher::default();
    let mut h2 = TestHasher::default();
    r1.hash(&mut h1);
    r2.hash(&mut h2);

    assert_eq!(h1.finish(), h2.finish());
}

#[test]
#[cfg_attr(miri, ignore)]
fn hash_2() {
    // Build the same text with two different (small) chunk sizes, then
    // check that corresponding line slices hash identically under a
    // split-sensitive hasher.
    let build = |chunk_size: usize| {
        let mut builder = RopeBuilder::new();
        for chunk in SMALL_TEXT.as_bytes().chunks(chunk_size) {
            builder._append_chunk(std::str::from_utf8(chunk).unwrap());
        }
        builder._finish_no_fix()
    };
    let r1 = build(5);
    let r2 = build(7);

    for (l1, l2) in r1.lines().zip(r2.lines()) {
        let mut h1 = TestHasher::default();
        let mut h2 = TestHasher::default();
        l1.hash(&mut h1);
        l2.hash(&mut h2);

        assert_eq!(h1.finish(), h2.finish());
    }
}

#[test]
#[cfg_attr(miri, ignore)]
fn hash_3() {
    // Same as hash_2, but with chunk sizes larger than the 256-byte
    // hashing block, and comparing whole-rope hashes.
    let build = |chunk_size: usize| {
        let mut builder = RopeBuilder::new();
        for chunk in SMALL_TEXT.as_bytes().chunks(chunk_size) {
            builder._append_chunk(std::str::from_utf8(chunk).unwrap());
        }
        builder._finish_no_fix()
    };
    let r1 = build(521);
    let r2 = build(547);

    let mut h1 = TestHasher::default();
    let mut h2 = TestHasher::default();
    r1.hash(&mut h1);
    r2.hash(&mut h2);

    assert_eq!(h1.finish(), h2.finish());
}
24 changes: 24 additions & 0 deletions tests/small_ascii.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas sit amet tellus
nec turpis feugiat semper. Nam at nulla laoreet, finibus eros sit amet, fringilla
mauris. Fusce vestibulum nec ligula efficitur laoreet. Nunc orci leo, varius eget
ligula vulputate, consequat eleifend nisi. Cras justo purus, imperdiet a augue
malesuada, convallis cursus libero. Fusce pretium arcu in elementum laoreet. Duis
mauris nulla, suscipit at est nec, malesuada pellentesque eros. Quisque semper porta
malesuada. Nunc hendrerit est ac faucibus mollis. Nam fermentum id libero sed
egestas. Duis a accumsan sapien. Nam neque diam, congue non erat et, porta sagittis
turpis. Vivamus vitae mauris sit amet massa mollis molestie. Morbi scelerisque,
augue id congue imperdiet, felis lacus euismod dui, vitae facilisis massa dui quis
sapien. Vivamus hendrerit a urna a lobortis.

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas sit amet tellus
nec turpis feugiat semper. Nam at nulla laoreet, finibus eros sit amet, fringilla
mauris. Fusce vestibulum nec ligula efficitur laoreet. Nunc orci leo, varius eget
ligula vulputate, consequat eleifend nisi. Cras justo purus, imperdiet a augue
malesuada, convallis cursus libero. Fusce pretium arcu in elementum laoreet. Duis
mauris nulla, suscipit at est nec, malesuada pellentesque eros. Quisque semper porta
malesuada. Nunc hendrerit est ac faucibus mollis. Nam fermentum id libero sed
egestas. Duis a accumsan sapien. Nam neque diam, congue non erat et, porta sagittis
turpis. Vivamus vitae mauris sit amet massa mollis molestie. Morbi scelerisque,
augue id congue imperdiet, felis lacus euismod dui, vitae facilisis massa dui quis
sapien. Vivamus hendrerit a urna a lobortis.

0 comments on commit fef5be9

Please sign in to comment.