//! This is a copy of `core::hash::sip` adapted to providing 128 bit hashes.

-use std::cmp;
use std::hash::Hasher;
-use std::mem;
+use std::mem::{self, MaybeUninit};
use std::ptr;

#[cfg(test)]
mod tests;

+// The SipHash algorithm operates on 8-byte chunks.
+const ELEM_SIZE: usize = mem::size_of::<u64>();
+
+// Size of the buffer in number of elements, not including the spill.
+//
+// The selection of this size was guided by rustc-perf benchmark comparisons of
+// different buffer sizes. It should be periodically reevaluated as the compiler
+// implementation and input characteristics change.
+//
+// Using the same-sized buffer for everything we hash is a performance versus
+// complexity tradeoff. The ideal buffer size, and whether buffering should even
+// be used, depends on what is being hashed. It may be worth it to size the
+// buffer appropriately (perhaps by making SipHasher128 generic over the buffer
+// size) or disable buffering depending on what is being hashed. But at this
+// time, we use the same buffer size for everything.
+const BUFFER_CAPACITY: usize = 8;
+
+// Size of the buffer in bytes, not including the spill.
+const BUFFER_SIZE: usize = BUFFER_CAPACITY * ELEM_SIZE;
+
+// Size of the buffer in number of elements, including the spill.
+const BUFFER_WITH_SPILL_CAPACITY: usize = BUFFER_CAPACITY + 1;
+
+// Size of the buffer in bytes, including the spill.
+const BUFFER_WITH_SPILL_SIZE: usize = BUFFER_WITH_SPILL_CAPACITY * ELEM_SIZE;
+
+// Index of the spill element in the buffer.
+const BUFFER_SPILL_INDEX: usize = BUFFER_WITH_SPILL_CAPACITY - 1;
+
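For concreteness, here is a small sketch (not part of the patch) of the sizes these constants work out to, given the 8-element buffer chosen above:

```rust
// Values implied by the constants above; shown only for illustration.
fn main() {
    let elem_size = std::mem::size_of::<u64>(); // ELEM_SIZE
    assert_eq!(elem_size, 8);
    assert_eq!(8 * elem_size, 64); // BUFFER_SIZE: 8 usable elements = 64 bytes
    assert_eq!((8 + 1) * elem_size, 72); // BUFFER_WITH_SPILL_SIZE: one extra spill element
    assert_eq!(8 + 1 - 1, 8); // BUFFER_SPILL_INDEX: the spill is the ninth element (index 8)
}
```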
#[derive(Debug, Clone)]
+#[repr(C)]
pub struct SipHasher128 {
-    k0: u64,
-    k1: u64,
-    length: usize, // how many bytes we've processed
-    state: State,  // hash State
-    tail: u64,     // unprocessed bytes le
-    ntail: usize,  // how many bytes in tail are valid
+    // The access pattern during hashing consists of accesses to `nbuf` and
+    // `buf` until the buffer is full, followed by accesses to `state` and
+    // `processed`, and then repetition of that pattern until hashing is done.
+    // This is the basis for the ordering of fields below. However, in practice
+    // the cache miss-rate for data access is extremely low regardless of order.
+    nbuf: usize,                                         // how many bytes in buf are valid
+    buf: [MaybeUninit<u64>; BUFFER_WITH_SPILL_CAPACITY], // unprocessed bytes le
+    state: State,                                        // hash State
+    processed: usize,                                    // how many bytes we've processed
}

#[derive(Debug, Clone, Copy)]
@@ -51,271 +83,386 @@ macro_rules! compress {
    }};
}

-/// Loads an integer of the desired type from a byte stream, in LE order. Uses
-/// `copy_nonoverlapping` to let the compiler generate the most efficient way
-/// to load it from a possibly unaligned address.
-///
-/// Unsafe because: unchecked indexing at i..i+size_of(int_ty)
-macro_rules! load_int_le {
-    ($buf:expr, $i:expr, $int_ty:ident) => {{
-        debug_assert!($i + mem::size_of::<$int_ty>() <= $buf.len());
-        let mut data = 0 as $int_ty;
-        ptr::copy_nonoverlapping(
-            $buf.get_unchecked($i),
-            &mut data as *mut _ as *mut u8,
-            mem::size_of::<$int_ty>(),
-        );
-        data.to_le()
-    }};
-}
-
-/// Loads a u64 using up to 7 bytes of a byte slice. It looks clumsy but the
-/// `copy_nonoverlapping` calls that occur (via `load_int_le!`) all have fixed
-/// sizes and avoid calling `memcpy`, which is good for speed.
-///
-/// Unsafe because: unchecked indexing at start..start+len
+// Copies up to 8 bytes from source to destination. This performs better than
+// `ptr::copy_nonoverlapping` on microbenchmarks and may perform better on real
+// workloads since all of the copies have fixed sizes and avoid calling memcpy.
+//
+// This is specifically designed for copies of up to 8 bytes, because that's the
+// maximum number of bytes needed to fill an 8-byte-sized element on which
+// SipHash operates. Note that for variable-sized copies which are known to be
+// less than 8 bytes, this function will perform more work than necessary unless
+// the compiler is able to optimize the extra work away.
#[inline]
-unsafe fn u8to64_le(buf: &[u8], start: usize, len: usize) -> u64 {
-    debug_assert!(len < 8);
-    let mut i = 0; // current byte index (from LSB) in the output u64
-    let mut out = 0;
-    if i + 3 < len {
-        out = load_int_le!(buf, start + i, u32) as u64;
+unsafe fn copy_nonoverlapping_small(src: *const u8, dst: *mut u8, count: usize) {
+    debug_assert!(count <= 8);
+
+    if count == 8 {
+        ptr::copy_nonoverlapping(src, dst, 8);
+        return;
+    }
+
+    let mut i = 0;
+    if i + 3 < count {
+        ptr::copy_nonoverlapping(src.add(i), dst.add(i), 4);
        i += 4;
    }
-    if i + 1 < len {
-        out |= (load_int_le!(buf, start + i, u16) as u64) << (i * 8);
+
+    if i + 1 < count {
+        ptr::copy_nonoverlapping(src.add(i), dst.add(i), 2);
        i += 2
    }
-    if i < len {
-        out |= (*buf.get_unchecked(start + i) as u64) << (i * 8);
+
+    if i < count {
+        *dst.add(i) = *src.add(i);
        i += 1;
    }
-    debug_assert_eq!(i, len);
-    out
+
+    debug_assert_eq!(i, count);
}
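As a quick sanity check of the helper above, here is a minimal sketch (my own, assuming `copy_nonoverlapping_small` is in scope) comparing it against a plain `ptr::copy_nonoverlapping` for every count it accepts:

```rust
fn main() {
    let src: [u8; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
    for count in 0..=8 {
        let mut a = [0u8; 8];
        let mut b = [0u8; 8];
        unsafe {
            // The helper should produce exactly the same bytes as a plain copy.
            copy_nonoverlapping_small(src.as_ptr(), a.as_mut_ptr(), count);
            std::ptr::copy_nonoverlapping(src.as_ptr(), b.as_mut_ptr(), count);
        }
        assert_eq!(a, b);
    }
}
```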

+// # Implementation
+//
+// This implementation uses buffering to reduce the hashing cost for inputs
+// consisting of many small integers. Buffering simplifies the integration of
+// integer input--the integer write function typically just appends to the
+// buffer with a statically sized write, updates metadata, and returns.
+//
+// Buffering also prevents alternating between writes that do and do not trigger
+// the hashing process. Only when the entire buffer is full do we transition
+// into hashing. This allows us to keep the hash state in registers for longer,
+// instead of loading and storing it before and after processing each element.
+//
+// When a write fills the buffer, a buffer processing function is invoked to
+// hash all of the buffered input. The buffer processing functions are marked
+// `#[inline(never)]` so that they aren't inlined into the append functions,
+// which ensures the more frequently called append functions remain inlineable
+// and don't include register pushing/popping that would only be made necessary
+// by inclusion of the complex buffer processing path which uses those
+// registers.
+//
+// The buffer includes a "spill"--an extra element at the end--which simplifies
+// the integer write buffer processing path. The value that fills the buffer can
+// be written with a statically sized write that may spill over into the spill.
+// After the buffer is processed, the part of the value that spilled over can be
+// written from the spill to the beginning of the buffer with another statically
+// sized write. This write may copy more bytes than actually spilled over, but
+// we maintain the metadata such that any extra copied bytes will be ignored by
+// subsequent processing. Due to the static sizes, this scheme performs better
+// than copying the exact number of bytes needed into the end and beginning of
+// the buffer.
+//
+// The buffer is uninitialized, which improves performance, but may preclude
+// efficient implementation of alternative approaches. The improvement is not so
+// large that an alternative approach should be disregarded because it cannot be
+// efficiently implemented with an uninitialized buffer. On the other hand, an
+// uninitialized buffer may become more important should a larger one be used.
+//
+// # Platform Dependence
+//
+// The SipHash algorithm operates on byte sequences. It parses the input stream
+// as 8-byte little-endian integers. Therefore, given the same byte sequence, it
+// produces the same result on big- and little-endian hardware.
+//
+// However, the Hasher trait has methods which operate on multi-byte integers.
+// How they are converted into byte sequences can be endian-dependent (by using
+// native byte order) or independent (by consistently using either LE or BE byte
+// order). It can also be `isize` and `usize` size dependent (by using the
+// native size), or independent (by converting to a common size), supposing the
+// values can be represented in 32 bits.
+//
+// In order to make `SipHasher128` consistent with `SipHasher` in libstd, we
+// choose to do the integer to byte sequence conversion in the platform-
+// dependent way. Clients can achieve platform-independent hashing by widening
+// `isize` and `usize` integers to 64 bits on 32-bit systems and byte-swapping
+// integers on big-endian systems before passing them to the writing functions.
+// This causes the input byte sequence to look identical on big- and little-
+// endian systems (supposing `isize` and `usize` values can be represented in 32
+// bits), which ensures platform-independent results.
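To illustrate the last paragraph, here is a minimal sketch (my own, not part of the patch, assuming `SipHasher128` is in scope) of the widening and byte-swapping a client would do to get platform-independent results out of the platform-dependent write methods:

```rust
use std::hash::Hasher;

// Hypothetical helper: feed a usize to SipHasher128 so that the byte stream it
// hashes is identical on 32-/64-bit and big-/little-endian targets.
fn write_usize_stable(hasher: &mut SipHasher128, value: usize) {
    // Widen to 64 bits, then byte-swap on big-endian targets (`to_le` is a
    // no-op on little-endian ones) before handing the value to the
    // native-endian write method.
    hasher.write_u64((value as u64).to_le());
}
```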
impl SipHasher128 {
    #[inline]
    pub fn new_with_keys(key0: u64, key1: u64) -> SipHasher128 {
-        let mut state = SipHasher128 {
-            k0: key0,
-            k1: key1,
-            length: 0,
-            state: State { v0: 0, v1: 0, v2: 0, v3: 0 },
-            tail: 0,
-            ntail: 0,
+        let mut hasher = SipHasher128 {
+            nbuf: 0,
+            buf: MaybeUninit::uninit_array(),
+            state: State {
+                v0: key0 ^ 0x736f6d6570736575,
+                // The XOR with 0xee is only done on 128-bit algorithm version.
+                v1: key1 ^ (0x646f72616e646f6d ^ 0xee),
+                v2: key0 ^ 0x6c7967656e657261,
+                v3: key1 ^ 0x7465646279746573,
+            },
+            processed: 0,
        };
-        state.reset();
-        state
+
+        unsafe {
+            // Initialize spill because we read from it in `short_write_process_buffer`.
+            *hasher.buf.get_unchecked_mut(BUFFER_SPILL_INDEX) = MaybeUninit::zeroed();
+        }
+
+        hasher
    }
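An aside on the magic numbers above (my note, not part of the patch): they are the standard SipHash initialization constants, which are the ASCII string "somepseudorandomlygeneratedbytes" split into four 8-byte words:

```rust
fn main() {
    // Each constant is one 8-character chunk of the string, read as big-endian bytes.
    assert_eq!(u64::from_be_bytes(*b"somepseu"), 0x736f6d6570736575);
    assert_eq!(u64::from_be_bytes(*b"dorandom"), 0x646f72616e646f6d);
    assert_eq!(u64::from_be_bytes(*b"lygenera"), 0x6c7967656e657261);
    assert_eq!(u64::from_be_bytes(*b"tedbytes"), 0x7465646279746573);
}
```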

+    // A specialized write function for values with size <= 8.
    #[inline]
-    fn reset(&mut self) {
-        self.length = 0;
-        self.state.v0 = self.k0 ^ 0x736f6d6570736575;
-        self.state.v1 = self.k1 ^ 0x646f72616e646f6d;
-        self.state.v2 = self.k0 ^ 0x6c7967656e657261;
-        self.state.v3 = self.k1 ^ 0x7465646279746573;
-        self.ntail = 0;
-
-        // This is only done in the 128 bit version:
-        self.state.v1 ^= 0xee;
+    fn short_write<T>(&mut self, x: T) {
+        let size = mem::size_of::<T>();
+        let nbuf = self.nbuf;
+        debug_assert!(size <= 8);
+        debug_assert!(nbuf < BUFFER_SIZE);
+        debug_assert!(nbuf + size < BUFFER_WITH_SPILL_SIZE);
+
+        if nbuf + size < BUFFER_SIZE {
+            unsafe {
+                // The memcpy call is optimized away because the size is known.
+                let dst = (self.buf.as_mut_ptr() as *mut u8).add(nbuf);
+                ptr::copy_nonoverlapping(&x as *const _ as *const u8, dst, size);
+            }
+
+            self.nbuf = nbuf + size;
+
+            return;
+        }
+
+        unsafe { self.short_write_process_buffer(x) }
    }

-    // A specialized write function for values with size <= 8.
-    //
-    // The input must be zero-extended to 64-bits by the caller. This extension
-    // isn't hashed, but the implementation requires it for correctness.
+    // A specialized write function for values with size <= 8 that should only
+    // be called when the write would cause the buffer to fill.
    //
-    // This function, given the same integer size and value, has the same effect
-    // on both little- and big-endian hardware. It operates on values without
-    // depending on their sequence in memory, so is independent of endianness.
-    //
-    // However, we want SipHasher128 to be platform-dependent, in order to be
-    // consistent with the platform-dependent SipHasher in libstd. In other
-    // words, we want:
-    //
-    // - little-endian: `write_u32(0xDDCCBBAA)` == `write([0xAA, 0xBB, 0xCC, 0xDD])`
-    // - big-endian:    `write_u32(0xDDCCBBAA)` == `write([0xDD, 0xCC, 0xBB, 0xAA])`
-    //
-    // Therefore, in order to produce endian-dependent results, SipHasher128's
-    // `write_xxx` Hasher trait methods byte-swap `x` prior to zero-extending.
-    //
-    // If clients of SipHasher128 itself want platform-independent results, they
-    // *also* must byte-swap integer inputs before invoking the `write_xxx`
-    // methods on big-endian hardware (that is, two byte-swaps must occur--one
-    // in the client, and one in SipHasher128). Additionally, they must extend
-    // `usize` and `isize` types to 64 bits on 32-bit systems.
-    #[inline]
-    fn short_write<T>(&mut self, _x: T, x: u64) {
+    // SAFETY: the write of `x` into `self.buf` starting at byte offset
+    // `self.nbuf` must cause `self.buf` to become fully initialized (and not
+    // overflow) if it wasn't already.
+    #[inline(never)]
+    unsafe fn short_write_process_buffer<T>(&mut self, x: T) {
        let size = mem::size_of::<T>();
-        self.length += size;
-
-        // The original number must be zero-extended, not sign-extended.
-        debug_assert!(if size < 8 { x >> (8 * size) == 0 } else { true });
-
-        // The number of bytes needed to fill `self.tail`.
-        let needed = 8 - self.ntail;
-
-        // SipHash parses the input stream as 8-byte little-endian integers.
-        // Inputs are put into `self.tail` until 8 bytes of data have been
-        // collected, and then that word is processed.
-        //
-        // For example, imagine that `self.tail` is 0x0000_00EE_DDCC_BBAA,
-        // `self.ntail` is 5 (because 5 bytes have been put into `self.tail`),
-        // and `needed` is therefore 3.
-        //
-        // - Scenario 1, `self.write_u8(0xFF)`: we have already zero-extended
-        //   the input to 0x0000_0000_0000_00FF. We now left-shift it five
-        //   bytes, giving 0x0000_FF00_0000_0000. We then bitwise-OR that value
-        //   into `self.tail`, resulting in 0x0000_FFEE_DDCC_BBAA.
-        //   (Zero-extension of the original input is critical in this scenario
-        //   because we don't want the high two bytes of `self.tail` to be
-        //   touched by the bitwise-OR.) `self.tail` is not yet full, so we
-        //   return early, after updating `self.ntail` to 6.
-        //
-        // - Scenario 2, `self.write_u32(0xIIHH_GGFF)`: we have already
-        //   zero-extended the input to 0x0000_0000_IIHH_GGFF. We now
-        //   left-shift it five bytes, giving 0xHHGG_FF00_0000_0000. We then
-        //   bitwise-OR that value into `self.tail`, resulting in
-        //   0xHHGG_FFEE_DDCC_BBAA. `self.tail` is now full, and we can use it
-        //   to update `self.state`. (As mentioned above, this assumes a
-        //   little-endian machine; on a big-endian machine we would have
-        //   byte-swapped 0xIIHH_GGFF in the caller, giving 0xFFGG_HHII, and we
-        //   would then end up bitwise-ORing 0xGGHH_II00_0000_0000 into
-        //   `self.tail`).
-        //
-        self.tail |= x << (8 * self.ntail);
-        if size < needed {
-            self.ntail += size;
+        let nbuf = self.nbuf;
+        debug_assert!(size <= 8);
+        debug_assert!(nbuf < BUFFER_SIZE);
+        debug_assert!(nbuf + size >= BUFFER_SIZE);
+        debug_assert!(nbuf + size < BUFFER_WITH_SPILL_SIZE);
+
+        // Copy first part of input into end of buffer, possibly into spill
+        // element. The memcpy call is optimized away because the size is known.
+        let dst = (self.buf.as_mut_ptr() as *mut u8).add(nbuf);
+        ptr::copy_nonoverlapping(&x as *const _ as *const u8, dst, size);
+
+        // Process buffer.
+        for i in 0..BUFFER_CAPACITY {
+            let elem = self.buf.get_unchecked(i).assume_init().to_le();
+            self.state.v3 ^= elem;
+            Sip24Rounds::c_rounds(&mut self.state);
+            self.state.v0 ^= elem;
+        }
+
+        // Copy remaining input into start of buffer by copying size - 1
+        // bytes from the spill (at most size - 1 bytes could have overflowed
+        // into the spill). The memcpy call is optimized away because the size
+        // is known. And the whole copy is optimized away for size == 1.
+        let src = self.buf.get_unchecked(BUFFER_SPILL_INDEX) as *const _ as *const u8;
+        ptr::copy_nonoverlapping(src, self.buf.as_mut_ptr() as *mut u8, size - 1);
+
+        // This function should only be called when the write fills the buffer.
+        // Therefore, when size == 1, the new `self.nbuf` must be zero. The size
+        // is statically known, so the branch is optimized away.
+        self.nbuf = if size == 1 { 0 } else { nbuf + size - BUFFER_SIZE };
+        self.processed += BUFFER_SIZE;
+    }
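A worked example of the path above, with numbers of my own choosing: suppose `nbuf == 60` and a `u64` is written. Four of its eight bytes land in the last buffer element and four land in the spill; after the eight full elements are hashed, `size - 1 == 7` bytes are copied from the spill back to the buffer start, but the new `nbuf` guarantees only the four meaningful ones are ever read again.

```rust
fn main() {
    const ELEM_SIZE: usize = 8;
    const BUFFER_SIZE: usize = 8 * ELEM_SIZE; // 64 bytes, not counting the spill
    let (nbuf, size) = (60usize, 8usize);
    assert!(nbuf + size >= BUFFER_SIZE); // the case handled by short_write_process_buffer
    assert!(nbuf + size < BUFFER_SIZE + ELEM_SIZE); // ...which never overruns the spill
    // Bytes are written at offsets 60..68: 4 in the final element, 4 in the spill.
    let spilled = nbuf + size - BUFFER_SIZE;
    assert_eq!(spilled, 4);
    // This is exactly the value `self.nbuf` takes after the buffer is processed.
    assert_eq!(if size == 1 { 0 } else { nbuf + size - BUFFER_SIZE }, 4);
}
```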
+
+    // A write function for byte slices.
+    #[inline]
+    fn slice_write(&mut self, msg: &[u8]) {
+        let length = msg.len();
+        let nbuf = self.nbuf;
+        debug_assert!(nbuf < BUFFER_SIZE);
+
+        if nbuf + length < BUFFER_SIZE {
+            unsafe {
+                let dst = (self.buf.as_mut_ptr() as *mut u8).add(nbuf);
+
+                if length <= 8 {
+                    copy_nonoverlapping_small(msg.as_ptr(), dst, length);
+                } else {
+                    // This memcpy is *not* optimized away.
+                    ptr::copy_nonoverlapping(msg.as_ptr(), dst, length);
+                }
+            }
+
+            self.nbuf = nbuf + length;
+
            return;
        }

-        // `self.tail` is full, process it.
-        self.state.v3 ^= self.tail;
-        Sip24Rounds::c_rounds(&mut self.state);
-        self.state.v0 ^= self.tail;
-
-        // Continuing scenario 2: we have one byte left over from the input. We
-        // set `self.ntail` to 1 and `self.tail` to `0x0000_0000_IIHH_GGFF >>
-        // 8*3`, which is 0x0000_0000_0000_00II. (Or on a big-endian machine
-        // the prior byte-swapping would leave us with 0x0000_0000_0000_00FF.)
-        //
-        // The `if` is needed to avoid shifting by 64 bits, which Rust
-        // complains about.
-        self.ntail = size - needed;
-        self.tail = if needed < 8 { x >> (8 * needed) } else { 0 };
+        unsafe { self.slice_write_process_buffer(msg) }
+    }
+
+    // A write function for byte slices that should only be called when the
+    // write would cause the buffer to fill.
+    //
+    // SAFETY: `self.buf` must be initialized up to the byte offset `self.nbuf`,
+    // and `msg` must contain enough bytes to initialize the rest of the element
+    // containing the byte offset `self.nbuf`.
+    #[inline(never)]
+    unsafe fn slice_write_process_buffer(&mut self, msg: &[u8]) {
+        let length = msg.len();
+        let nbuf = self.nbuf;
+        debug_assert!(nbuf < BUFFER_SIZE);
+        debug_assert!(nbuf + length >= BUFFER_SIZE);
+
+        // Always copy first part of input into current element of buffer.
+        // This function should only be called when the write fills the buffer,
+        // so we know that there is enough input to fill the current element.
+        let valid_in_elem = nbuf % ELEM_SIZE;
+        let needed_in_elem = ELEM_SIZE - valid_in_elem;
+
+        let src = msg.as_ptr();
+        let dst = (self.buf.as_mut_ptr() as *mut u8).add(nbuf);
+        copy_nonoverlapping_small(src, dst, needed_in_elem);
+
+        // Process buffer.
+
+        // Using `nbuf / ELEM_SIZE + 1` rather than `(nbuf + needed_in_elem) /
+        // ELEM_SIZE` to show the compiler that this loop's upper bound is > 0.
+        // We know that is true, because the last step ensured we have a full
+        // element in the buffer.
+        let last = nbuf / ELEM_SIZE + 1;
+
+        for i in 0..last {
+            let elem = self.buf.get_unchecked(i).assume_init().to_le();
+            self.state.v3 ^= elem;
+            Sip24Rounds::c_rounds(&mut self.state);
+            self.state.v0 ^= elem;
+        }
+
+        // Process the remaining element-sized chunks of input.
+        let mut processed = needed_in_elem;
+        let input_left = length - processed;
+        let elems_left = input_left / ELEM_SIZE;
+        let extra_bytes_left = input_left % ELEM_SIZE;
+
+        for _ in 0..elems_left {
+            let elem = (msg.as_ptr().add(processed) as *const u64).read_unaligned().to_le();
+            self.state.v3 ^= elem;
+            Sip24Rounds::c_rounds(&mut self.state);
+            self.state.v0 ^= elem;
+            processed += ELEM_SIZE;
+        }
+
+        // Copy remaining input into start of buffer.
+        let src = msg.as_ptr().add(processed);
+        let dst = self.buf.as_mut_ptr() as *mut u8;
+        copy_nonoverlapping_small(src, dst, extra_bytes_left);
+
+        self.nbuf = extra_bytes_left;
+        self.processed += nbuf + processed;
    }
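Similarly, a worked example for the slice path, again with numbers of my own choosing: with `nbuf == 60` and a 20-byte `msg`, 4 bytes complete the current element, 2 whole elements are hashed straight from the input, and nothing is left over to re-buffer.

```rust
fn main() {
    const ELEM_SIZE: usize = 8;
    const BUFFER_SIZE: usize = 8 * ELEM_SIZE;
    let (nbuf, length) = (60usize, 20usize);
    assert!(nbuf + length >= BUFFER_SIZE); // slice_write_process_buffer path
    let needed_in_elem = ELEM_SIZE - nbuf % ELEM_SIZE; // 4 bytes finish the current element
    let input_left = length - needed_in_elem; // 16 bytes remain
    let elems_left = input_left / ELEM_SIZE; // 2 full elements hashed directly from `msg`
    let extra_bytes_left = input_left % ELEM_SIZE; // 0 bytes copied back to the buffer start
    assert_eq!((needed_in_elem, elems_left, extra_bytes_left), (4, 2, 0));
}
```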

    #[inline]
    pub fn finish128(mut self) -> (u64, u64) {
-        let b: u64 = ((self.length as u64 & 0xff) << 56) | self.tail;
+        debug_assert!(self.nbuf < BUFFER_SIZE);

-        self.state.v3 ^= b;
-        Sip24Rounds::c_rounds(&mut self.state);
-        self.state.v0 ^= b;
+        // Process full elements in buffer.
+        let last = self.nbuf / ELEM_SIZE;

-        self.state.v2 ^= 0xee;
-        Sip24Rounds::d_rounds(&mut self.state);
-        let _0 = self.state.v0 ^ self.state.v1 ^ self.state.v2 ^ self.state.v3;
+        // Since we're consuming self, avoid updating members for a potential
+        // performance gain.
+        let mut state = self.state;
+
+        for i in 0..last {
+            let elem = unsafe { self.buf.get_unchecked(i).assume_init().to_le() };
+            state.v3 ^= elem;
+            Sip24Rounds::c_rounds(&mut state);
+            state.v0 ^= elem;
+        }
+
+        // Get remaining partial element.
+        let elem = if self.nbuf % ELEM_SIZE != 0 {
+            unsafe {
+                // Ensure element is initialized by writing zero bytes. At most
+                // `ELEM_SIZE - 1` are required given the above check. It's safe
+                // to write this many because we have the spill and we maintain
+                // `self.nbuf` such that this write will start before the spill.
+                let dst = (self.buf.as_mut_ptr() as *mut u8).add(self.nbuf);
+                ptr::write_bytes(dst, 0, ELEM_SIZE - 1);
+                self.buf.get_unchecked(last).assume_init().to_le()
+            }
+        } else {
+            0
+        };
+
+        // Finalize the hash.
+        let length = self.processed + self.nbuf;
+        let b: u64 = ((length as u64 & 0xff) << 56) | elem;
+
+        state.v3 ^= b;
+        Sip24Rounds::c_rounds(&mut state);
+        state.v0 ^= b;
+
+        state.v2 ^= 0xee;
+        Sip24Rounds::d_rounds(&mut state);
+        let _0 = state.v0 ^ state.v1 ^ state.v2 ^ state.v3;
+
+        state.v1 ^= 0xdd;
+        Sip24Rounds::d_rounds(&mut state);
+        let _1 = state.v0 ^ state.v1 ^ state.v2 ^ state.v3;

-        self.state.v1 ^= 0xdd;
-        Sip24Rounds::d_rounds(&mut self.state);
-        let _1 = self.state.v0 ^ self.state.v1 ^ self.state.v2 ^ self.state.v3;
        (_0, _1)
    }
}
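For reference, a minimal usage sketch of the public API touched here (my own example; keys and inputs are arbitrary, and `SipHasher128` is assumed to be in scope):

```rust
use std::hash::Hasher;

fn main() {
    let mut hasher = SipHasher128::new_with_keys(0x0123_4567_89ab_cdef, 0xfedc_ba98_7654_3210);
    hasher.write_u32(0xDDCC_BBAA); // platform-dependent, as discussed above
    hasher.write(b"some bytes");
    let (h0, h1) = hasher.finish128(); // consumes the hasher
    println!("{:016x}{:016x}", h0, h1);
}
```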

impl Hasher for SipHasher128 {
    #[inline]
    fn write_u8(&mut self, i: u8) {
-        self.short_write(i, i as u64);
+        self.short_write(i);
    }

    #[inline]
    fn write_u16(&mut self, i: u16) {
-        self.short_write(i, i.to_le() as u64);
+        self.short_write(i);
    }

    #[inline]
    fn write_u32(&mut self, i: u32) {
-        self.short_write(i, i.to_le() as u64);
+        self.short_write(i);
    }

    #[inline]
    fn write_u64(&mut self, i: u64) {
-        self.short_write(i, i.to_le() as u64);
+        self.short_write(i);
    }

    #[inline]
    fn write_usize(&mut self, i: usize) {
-        self.short_write(i, i.to_le() as u64);
+        self.short_write(i);
    }

    #[inline]
    fn write_i8(&mut self, i: i8) {
-        self.short_write(i, i as u8 as u64);
+        self.short_write(i as u8);
    }

    #[inline]
    fn write_i16(&mut self, i: i16) {
-        self.short_write(i, (i as u16).to_le() as u64);
+        self.short_write(i as u16);
    }

    #[inline]
    fn write_i32(&mut self, i: i32) {
-        self.short_write(i, (i as u32).to_le() as u64);
+        self.short_write(i as u32);
    }

    #[inline]
    fn write_i64(&mut self, i: i64) {
-        self.short_write(i, (i as u64).to_le() as u64);
+        self.short_write(i as u64);
    }

    #[inline]
    fn write_isize(&mut self, i: isize) {
-        self.short_write(i, (i as usize).to_le() as u64);
+        self.short_write(i as usize);
    }

    #[inline]
    fn write(&mut self, msg: &[u8]) {
-        let length = msg.len();
-        self.length += length;
-
-        let mut needed = 0;
-
-        if self.ntail != 0 {
-            needed = 8 - self.ntail;
-            self.tail |= unsafe { u8to64_le(msg, 0, cmp::min(length, needed)) } << (8 * self.ntail);
-            if length < needed {
-                self.ntail += length;
-                return;
-            } else {
-                self.state.v3 ^= self.tail;
-                Sip24Rounds::c_rounds(&mut self.state);
-                self.state.v0 ^= self.tail;
-                self.ntail = 0;
-            }
-        }
-
-        // Buffered tail is now flushed, process new input.
-        let len = length - needed;
-        let left = len & 0x7;
-
-        let mut i = needed;
-        while i < len - left {
-            let mi = unsafe { load_int_le!(msg, i, u64) };
-
-            self.state.v3 ^= mi;
-            Sip24Rounds::c_rounds(&mut self.state);
-            self.state.v0 ^= mi;
-
-            i += 8;
-        }
-
-        self.tail = unsafe { u8to64_le(msg, i, left) };
-        self.ntail = left;
+        self.slice_write(msg);
    }

    fn finish(&self) -> u64 {