From 31172285f3555e73288ea15509b98bafcf9421a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 6 Jun 2021 15:57:15 +0200 Subject: [PATCH 1/2] Fix out of bounds read in bit chunk iterator --- arrow/benches/boolean_kernels.rs | 19 ++++++++++++++ arrow/src/util/bit_chunk_iterator.rs | 38 ++++++++++++++++++---------- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/arrow/benches/boolean_kernels.rs b/arrow/benches/boolean_kernels.rs index 6559c4e4caf5..e9394f7a0231 100644 --- a/arrow/benches/boolean_kernels.rs +++ b/arrow/benches/boolean_kernels.rs @@ -45,6 +45,25 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2))); c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2))); c.bench_function("not", |b| b.iter(|| bench_not(&array1))); + + let array1_slice = array1.slice(1, size - 1); + let array1_slice = array1_slice + .as_any() + .downcast_ref::() + .unwrap(); + let array2_slice = array2.slice(1, size - 1); + let array2_slice = array2_slice + .as_any() + .downcast_ref::() + .unwrap(); + + c.bench_function("and_sliced", |b| { + b.iter(|| bench_and(&array1_slice, &array2_slice)) + }); + c.bench_function("or_sliced", |b| { + b.iter(|| bench_or(&array1_slice, &array2_slice)) + }); + c.bench_function("not_sliced", |b| b.iter(|| bench_not(&array1_slice))); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs index c57d0212d290..e3f1e37d7487 100644 --- a/arrow/src/util/bit_chunk_iterator.rs +++ b/arrow/src/util/bit_chunk_iterator.rs @@ -35,10 +35,10 @@ impl<'a> BitChunks<'a> { let byte_offset = offset / 8; let bit_offset = offset % 8; - let chunk_bits = 8 * std::mem::size_of::(); - - let chunk_len = len / chunk_bits; - let remainder_len = len & (chunk_bits - 1); + // number of complete u64 chunks + let chunk_len = len / 64; + // number of remaining bits + let remainder_len = len % 64; BitChunks::<'a> { buffer: &buffer[byte_offset..], @@ -137,14 +137,16 @@ impl Iterator for BitChunkIterator<'_> { // so when reading as u64 on a big-endian machine, the bytes need to be swapped let current = unsafe { std::ptr::read_unaligned(raw_data.add(index)).to_le() }; - let combined = if self.bit_offset == 0 { + let bit_offset = self.bit_offset; + + let combined = if bit_offset == 0 { current } else { - let next = - unsafe { std::ptr::read_unaligned(raw_data.add(index + 1)).to_le() }; + let next = unsafe { + std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64 + }; - current >> self.bit_offset - | (next & ((1 << self.bit_offset) - 1)) << (64 - self.bit_offset) + (current >> bit_offset) | (next << (64 - bit_offset)) }; self.index = index + 1; @@ -184,7 +186,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned() { let input: &[u8] = &[ 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, @@ -206,7 +207,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned_remainder_1_byte() { let input: &[u8] = &[ 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, @@ -228,7 +228,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned_remainder_bits_across_bytes() { let input: &[u8] = &[0b00111111, 0b11111100]; let buffer: Buffer = Buffer::from(input); @@ -242,7 +241,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory fn test_iter_unaligned_remainder_bits_large() { let input: &[u8] = &[ 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000, @@ -258,4 +256,18 @@ mod tests { bitchunks.remainder_bits() ); } + + #[test] + fn test_iter_remainder_out_of_bounds() { + // allocating a full page should trigger a fault when reading out of bounds + const ALLOC_SIZE: usize = 4 * 1024; + let input = vec![0xFF_u8; ALLOC_SIZE]; + + let buffer: Buffer = Buffer::from(input); + + let bitchunks = buffer.bit_chunks(57, ALLOC_SIZE * 8 - 57); + + assert_eq!(u64::MAX, bitchunks.iter().last().unwrap()); + assert_eq!(0x7F, bitchunks.remainder_bits()); + } } From cfd813254a0cce4189131ed03a1f39ecf4d3aa72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Mon, 7 Jun 2021 22:43:23 +0200 Subject: [PATCH 2/2] Add comment why reading one additional byte is enough --- arrow/src/util/bit_chunk_iterator.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs index e3f1e37d7487..ea9280cee3f1 100644 --- a/arrow/src/util/bit_chunk_iterator.rs +++ b/arrow/src/util/bit_chunk_iterator.rs @@ -142,6 +142,8 @@ impl Iterator for BitChunkIterator<'_> { let combined = if bit_offset == 0 { current } else { + // the constructor ensures that bit_offset is in 0..8 + // that means we need to read at most one additional byte to fill in the high bits let next = unsafe { std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64 };