Skip to content

Commit

Permalink
Merge cba393e into 535cd20
Browse files Browse the repository at this point in the history
  • Loading branch information
saethlin authored Jun 19, 2022
2 parents 535cd20 + cba393e commit 5d6bd7c
Showing 1 changed file with 12 additions and 21 deletions.
33 changes: 12 additions & 21 deletions parquet/src/util/hash_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,27 +107,18 @@ unsafe fn crc32_hash(bytes: &[u8], seed: u32) -> u32 {
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

let u32_num_bytes = std::mem::size_of::<u32>();
let mut num_bytes = bytes.len();
let num_words = num_bytes / u32_num_bytes;
num_bytes %= u32_num_bytes;

let bytes_u32: &[u32] = std::slice::from_raw_parts(
&bytes[0..num_words * u32_num_bytes] as *const [u8] as *const u32,
num_words,
);

let mut offset = 0;
let mut hash = seed;
while offset < num_words {
hash = _mm_crc32_u32(hash, bytes_u32[offset]);
offset += 1;
for chunk in bytes
.chunks_exact(4)
.map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap()))
{
hash = _mm_crc32_u32(hash, chunk);
}

offset = num_words * u32_num_bytes;
while offset < num_bytes {
hash = _mm_crc32_u8(hash, bytes[offset]);
offset += 1;
let remainder = bytes.len() % 4;

for byte in &bytes[bytes.len() - remainder..] {
hash = _mm_crc32_u8(hash, *byte);
}

// The lower half of the CRC hash has poor uniformity, so swap the halves
Expand Down Expand Up @@ -158,13 +149,13 @@ mod tests {
if is_x86_feature_detected!("sse4.2") {
unsafe {
let result = crc32_hash(b"hello", 123);
assert_eq!(result, 2927487359);
assert_eq!(result, 3359043980);

let result = crc32_hash(b"helloworld", 123);
assert_eq!(result, 314229527);
assert_eq!(result, 3971745255);

let result = crc32_hash(b"helloworldparquet", 123);
assert_eq!(result, 667078870);
assert_eq!(result, 1124504676);
}
}
}
Expand Down

0 comments on commit 5d6bd7c

Please sign in to comment.