From 584802093c20c940b76bb9c7f9163d100617efd5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 10 Mar 2022 12:55:19 +0000 Subject: [PATCH] Consistent DeltaBitPackEncoder bit width padding (#1416) Ignore non-zero padded bit widths in DeltaBitPackDecoder (#1417) --- parquet/src/encodings/decoding.rs | 20 +++++++++++++++----- parquet/src/encodings/encoding.rs | 15 +++++++++------ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 128659bdf98e..1b552cbb9aa0 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -433,7 +433,6 @@ pub struct DeltaBitPackDecoder { initialized: bool, // Header info - /// The number of values in each block block_size: usize, /// The number of values that remain to be read in the current page @@ -444,7 +443,6 @@ pub struct DeltaBitPackDecoder { values_per_mini_block: usize, // Per block info - /// The minimum delta in the block min_delta: T::T, /// The byte offset of the end of the current block @@ -488,7 +486,7 @@ where /// Returns the current offset pub fn get_offset(&self) -> usize { assert!(self.initialized, "Bit reader is not initialized"); - match self.values_left { + let offset = match self.values_left { // If we've exhausted this page report the end of the current block // as we may not have consumed the trailing padding // @@ -496,7 +494,9 @@ where // one value and therefore have no blocks, but still contain a page header 0 => self.bit_reader.get_byte_offset().max(self.block_end_offset), _ => self.bit_reader.get_byte_offset(), - } + }; + + offset } /// Initializes the next block and the first mini block within it @@ -639,6 +639,7 @@ where self.last_value = value; buffer[0] = value; read += 1; + self.values_left -= 1; } while read != to_read { @@ -668,9 +669,9 @@ where read += batch_read; self.mini_block_remaining -= batch_read; + self.values_left -= batch_read; } - self.values_left -= to_read; Ok(to_read) } @@ -1276,6 +1277,15 @@ mod tests { test_delta_bit_packed_decode::(data); } + #[test] + fn test_delta_bit_packed_int32_with_truncated_block() { + let data = vec![ + Int32Type::gen_vec(-1, 1023), + Int32Type::gen_vec(-1, 16), + ]; + test_delta_bit_packed_decode::(data); + } + #[test] fn test_delta_bit_packed_int64_empty() { let data = vec![vec![0; 0]]; diff --git a/parquet/src/encodings/encoding.rs b/parquet/src/encodings/encoding.rs index b0893434a2c7..385bd3ab886d 100644 --- a/parquet/src/encodings/encoding.rs +++ b/parquet/src/encodings/encoding.rs @@ -565,7 +565,7 @@ impl DeltaBitPackEncoder { return Ok(()); } - let mut min_delta = i64::max_value(); + let mut min_delta = i64::MAX; for i in 0..self.values_in_block { min_delta = cmp::min(min_delta, self.deltas[i]); } @@ -581,6 +581,13 @@ impl DeltaBitPackEncoder { // values left let n = cmp::min(self.mini_block_size, self.values_in_block); if n == 0 { + // Decoders should be agnostic to the padding value, we therefore use 0xFF + // when running tests. However, not all implementations may handle this correctly + // so pad with 0 when not running tests + let pad_value = cfg!(test).then(|| 0xFF).unwrap_or(0); + for j in i..self.num_mini_blocks { + self.bit_writer.write_at(offset + j, pad_value); + } break; } @@ -610,11 +617,7 @@ impl DeltaBitPackEncoder { self.values_in_block -= n; } - assert!( - self.values_in_block == 0, - "Expected 0 values in block, found {}", - self.values_in_block - ); + assert_eq!(self.values_in_block, 0); Ok(()) } }