From 102873be9cbdc392044a70007c0277961bb80cc5 Mon Sep 17 00:00:00 2001 From: Kornelijus Survila Date: Sun, 23 May 2021 19:00:42 -0600 Subject: [PATCH] parquet: Speed up `BitReader`/`DeltaBitPackDecoder` (#325) * parquet: Avoid temporary `BufferPtr`s in `BitReader` From a quick test, this speeds up reading delta-packed int columns by over 30%. * parquet: Avoid some allocations in `DeltaBitPackDecoder` From a quick test, it seems to decode around 10% faster overall. --- parquet/src/encodings/decoding.rs | 7 +++---- parquet/src/util/bit_util.rs | 13 +++---------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index b73ebf0285c6..e83e2772ecb0 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -395,16 +395,15 @@ impl DeltaBitPackDecoder { .get_zigzag_vlq_int() .ok_or_else(|| eof_err!("Not enough data to decode 'min_delta'"))?; - let mut widths = vec![]; + self.delta_bit_widths.clear(); for _ in 0..self.num_mini_blocks { let w = self .bit_reader .get_aligned::<u8>(1) .ok_or_else(|| eof_err!("Not enough data to decode 'width'"))?; - widths.push(w); + self.delta_bit_widths.push(w); } - self.delta_bit_widths.set_data(widths); self.mini_block_idx = 0; self.delta_bit_width = self.delta_bit_widths.data()[0]; self.values_current_mini_block = self.values_per_mini_block; @@ -417,7 +416,6 @@ impl DeltaBitPackDecoder { where T::T: FromBytes, { - self.deltas_in_mini_block.clear(); if self.use_batch { self.deltas_in_mini_block .resize(self.values_current_mini_block, T::T::default()); @@ -427,6 +425,7 @@ impl DeltaBitPackDecoder { ); assert!(loaded == self.values_current_mini_block); } else { + self.deltas_in_mini_block.clear(); for _ in 0..self.values_current_mini_block { // TODO: load one batch at a time similar to int32 let delta = self diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 677b669287bd..8dfb63122bcc 100644 --- 
a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -603,11 +603,7 @@ impl BitReader { // Advance byte_offset to next unread byte and read num_bytes self.byte_offset += bytes_read; - let v = read_num_bytes!( - T, - num_bytes, - self.buffer.start_from(self.byte_offset).as_ref() - ); + let v = read_num_bytes!(T, num_bytes, self.buffer.data()[self.byte_offset..]); self.byte_offset += num_bytes; // Reset buffered_values @@ -657,11 +653,8 @@ impl BitReader { fn reload_buffer_values(&mut self) { let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8); - self.buffered_values = read_num_bytes!( - u64, - bytes_to_read, - self.buffer.start_from(self.byte_offset).as_ref() - ); + self.buffered_values = + read_num_bytes!(u64, bytes_to_read, self.buffer.data()[self.byte_offset..]); } }