Skip to content

Commit

Permalink
Consistent DeltaBitPackEncoder bit width padding (apache#1416)
Browse files Browse the repository at this point in the history
Ignore non-zero padded bit widths in DeltaBitPackDecoder (apache#1417)
  • Loading branch information
tustvold committed Mar 10, 2022
1 parent f19d1ed commit 5848020
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
20 changes: 15 additions & 5 deletions parquet/src/encodings/decoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,6 @@ pub struct DeltaBitPackDecoder<T: DataType> {
initialized: bool,

// Header info

/// The number of values in each block
block_size: usize,
/// The number of values that remain to be read in the current page
Expand All @@ -444,7 +443,6 @@ pub struct DeltaBitPackDecoder<T: DataType> {
values_per_mini_block: usize,

// Per block info

/// The minimum delta in the block
min_delta: T::T,
/// The byte offset of the end of the current block
Expand Down Expand Up @@ -488,15 +486,17 @@ where
/// Returns the current offset
pub fn get_offset(&self) -> usize {
assert!(self.initialized, "Bit reader is not initialized");
match self.values_left {
let offset = match self.values_left {
// If we've exhausted this page report the end of the current block
// as we may not have consumed the trailing padding
//
// The max is necessary to handle pages which don't contain more than
// one value and therefore have no blocks, but still contain a page header
0 => self.bit_reader.get_byte_offset().max(self.block_end_offset),
_ => self.bit_reader.get_byte_offset(),
}
};

offset
}

/// Initializes the next block and the first mini block within it
Expand Down Expand Up @@ -639,6 +639,7 @@ where
self.last_value = value;
buffer[0] = value;
read += 1;
self.values_left -= 1;
}

while read != to_read {
Expand Down Expand Up @@ -668,9 +669,9 @@ where

read += batch_read;
self.mini_block_remaining -= batch_read;
self.values_left -= batch_read;
}

self.values_left -= to_read;
Ok(to_read)
}

Expand Down Expand Up @@ -1276,6 +1277,15 @@ mod tests {
test_delta_bit_packed_decode::<Int32Type>(data);
}

#[test]
fn test_delta_bit_packed_int32_with_truncated_block() {
let data = vec![
Int32Type::gen_vec(-1, 1023),
Int32Type::gen_vec(-1, 16),
];
test_delta_bit_packed_decode::<Int32Type>(data);
}

#[test]
fn test_delta_bit_packed_int64_empty() {
let data = vec![vec![0; 0]];
Expand Down
15 changes: 9 additions & 6 deletions parquet/src/encodings/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ impl<T: DataType> DeltaBitPackEncoder<T> {
return Ok(());
}

let mut min_delta = i64::max_value();
let mut min_delta = i64::MAX;
for i in 0..self.values_in_block {
min_delta = cmp::min(min_delta, self.deltas[i]);
}
Expand All @@ -581,6 +581,13 @@ impl<T: DataType> DeltaBitPackEncoder<T> {
// values left
let n = cmp::min(self.mini_block_size, self.values_in_block);
if n == 0 {
// Decoders should be agnostic to the padding value, we therefore use 0xFF
// when running tests. However, not all implementations may handle this correctly
// so pad with 0 when not running tests
let pad_value = cfg!(test).then(|| 0xFF).unwrap_or(0);
for j in i..self.num_mini_blocks {
self.bit_writer.write_at(offset + j, pad_value);
}
break;
}

Expand Down Expand Up @@ -610,11 +617,7 @@ impl<T: DataType> DeltaBitPackEncoder<T> {
self.values_in_block -= n;
}

assert!(
self.values_in_block == 0,
"Expected 0 values in block, found {}",
self.values_in_block
);
assert_eq!(self.values_in_block, 0);
Ok(())
}
}
Expand Down

0 comments on commit 5848020

Please sign in to comment.