Skip to content

Commit 68be0f1

Browse files
tustvold and anliakho authored
fix infinite loop in not fully packed bit-packed runs (#1555)
* fix infinite loop in not fully packed bit-packed runs
* Add test and also fix get_batch_with_dict

Co-authored-by: Andrei Liakhovich <anliakho@microsoft.com>
1 parent 544ec05 commit 68be0f1

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

parquet/src/encodings/rle.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,11 @@ impl RleDecoder {
419419
&mut buffer[values_read..values_read + num_values],
420420
self.bit_width as usize,
421421
);
422+
if num_values == 0 {
423+
// Handle writers which truncate the final block
424+
self.bit_packed_left = 0;
425+
continue;
426+
}
422427
self.bit_packed_left -= num_values as u32;
423428
values_read += num_values;
424429
} else if !self.reload() {
@@ -467,6 +472,11 @@ impl RleDecoder {
467472
&mut index_buf[..num_values],
468473
self.bit_width as usize,
469474
);
475+
if num_values == 0 {
476+
// Handle writers which truncate the final block
477+
self.bit_packed_left = 0;
478+
break;
479+
}
470480
for i in 0..num_values {
471481
buffer[values_read + i].clone_from(&dict[index_buf[i] as usize])
472482
}
@@ -743,6 +753,42 @@ mod tests {
743753
}
744754
}
745755

756+
#[test]
757+
fn test_truncated_rle() {
758+
// The final bit-packed run within a page may not contain a multiple of 8 values.
759+
// Unfortunately the specification stores `(bit-packed-run-len) / 8` in the run header,
760+
// so the decoder cannot necessarily know how many values are actually present,
761+
// and some writers may not add padding to compensate for this ambiguity.
762+
763+
// Bit-pack 20 values at a bit width of 8: the header declares 24, but only 20 bytes follow
764+
let mut data: Vec<u8> = vec![
765+
(3 << 1) | 1, // header: bit-packed run of 3 groups * 8 values
766+
];
767+
data.extend(std::iter::repeat(0xFF).take(20));
768+
let data = ByteBufferPtr::new(data);
769+
770+
let mut decoder = RleDecoder::new(8);
771+
decoder.set_data(data.clone());
772+
773+
let mut output = vec![0_u16; 100];
774+
let read = decoder.get_batch(&mut output).unwrap();
775+
776+
assert_eq!(read, 20);
777+
assert!(output.iter().take(20).all(|x| *x == 255));
778+
779+
// Reset the decoder with the same data and repeat the check via the dictionary path
780+
decoder.set_data(data);
781+
782+
let dict: Vec<u16> = (0..256).collect(); // identity dictionary: entry i holds value i
783+
let mut output = vec![0_u16; 100];
784+
let read = decoder
785+
.get_batch_with_dict(&dict, &mut output, 100)
786+
.unwrap();
787+
788+
assert_eq!(read, 20);
789+
assert!(output.iter().take(20).all(|x| *x == 255));
790+
}
791+
746792
#[test]
747793
fn test_rle_specific_roundtrip() {
748794
let bit_width = 1;

0 commit comments

Comments
 (0)