From 6a21ad2c79960d59bdc1c66769b7e63823368b6c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Sat, 1 Jan 2022 19:52:03 +0000 Subject: [PATCH] Further doc tweaks --- parquet/src/arrow/record_reader.rs | 2 +- parquet/src/arrow/record_reader/buffer.rs | 22 ++++++++++++---- .../arrow/record_reader/definition_levels.rs | 3 ++- parquet/src/column/reader/decoder.rs | 25 +++++++++++++++++++ 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/record_reader.rs b/parquet/src/arrow/record_reader.rs index 9ebac432dd68..5edb59477eee 100644 --- a/parquet/src/arrow/record_reader.rs +++ b/parquet/src/arrow/record_reader.rs @@ -236,7 +236,7 @@ where ) })?; - let iter = def_levels.valid_position_iter( + let iter = def_levels.rev_valid_positions_iter( self.values_written..self.values_written + levels_read, ); diff --git a/parquet/src/arrow/record_reader/buffer.rs b/parquet/src/arrow/record_reader/buffer.rs index 7f223393bdba..55b672abdaab 100644 --- a/parquet/src/arrow/record_reader/buffer.rs +++ b/parquet/src/arrow/record_reader/buffer.rs @@ -29,7 +29,12 @@ pub trait BufferQueue: Sized { type Slice: ?Sized; - /// Split out the first `len` committed items + /// Split out the first `len` items + /// + /// # Panics + /// + /// Implementations must panic if `len` is beyond the length of [`BufferQueue`] + /// fn split_off(&mut self, len: usize) -> Self::Output; /// Returns a [`Self::Slice`] with at least `batch_size` capacity that can be used @@ -59,7 +64,7 @@ pub trait BufferQueue: Sized { fn set_len(&mut self, len: usize); } -/// A typed buffer similar to [`Vec`] but making use of [`MutableBuffer`] +/// A typed buffer similar to [`Vec`] but using [`MutableBuffer`] for storage pub struct TypedBuffer { buffer: MutableBuffer, @@ -152,11 +157,18 @@ impl BufferQueue for TypedBuffer { } } +/// A [`BufferQueue`] capable of storing column values pub trait ValuesBuffer: BufferQueue { + /// Iterate through the indexes in `range` in reverse order, moving the value at each + /// index to the next index returned by `rev_valid_position_iter` + /// + /// It is guaranteed that the `i`th index returned by `rev_valid_position_iter` is greater + /// than or equal to `range.end - i - 1` + /// fn pad_nulls( &mut self, range: Range, - rev_position_iter: impl Iterator, + rev_valid_position_iter: impl Iterator, ); } @@ -164,11 +176,11 @@ impl ValuesBuffer for TypedBuffer { fn pad_nulls( &mut self, range: Range, - rev_position_iter: impl Iterator, + rev_valid_position_iter: impl Iterator, ) { let slice = self.as_slice_mut(); - for (value_pos, level_pos) in range.rev().zip(rev_position_iter) { + for (value_pos, level_pos) in range.rev().zip(rev_valid_position_iter) { debug_assert!(level_pos >= value_pos); if level_pos <= value_pos { break; diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index e1daea6b3dc6..98559b2fc252 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -86,7 +86,8 @@ impl DefinitionLevelBuffer { old_bitmap } - pub fn valid_position_iter( + /// Returns an iterator of the valid positions in `range` in descending order + pub fn rev_valid_positions_iter( &self, range: Range, ) -> impl Iterator + '_ { diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index 854d8af1abf7..b501140c90b0 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ -29,8 +29,10 @@ use crate::util::bit_util::BitReader; /// A slice of levels buffer data that is written to by a [`ColumnLevelDecoder`] pub trait LevelsBufferSlice { + /// Returns the capacity of this slice or `usize::MAX` if no limit fn capacity(&self) -> usize; + /// Count the number of levels in `range` not equal to `max_level` fn count_nulls(&self, range: Range, max_level: i16) -> usize; } @@ -46,6 +48,7 @@ impl LevelsBufferSlice for [i16] { /// A slice of values buffer data that is written to by a [`ColumnValueDecoder`] pub trait ValuesBufferSlice { + /// Returns the capacity of this slice or `usize::MAX` if no limit fn capacity(&self) -> usize; } @@ -59,8 +62,18 @@ impl ValuesBufferSlice for [T] { pub trait ColumnLevelDecoder { type Slice: LevelsBufferSlice + ?Sized; + /// Create a new [`ColumnLevelDecoder`] fn new(max_level: i16, encoding: Encoding, data: ByteBufferPtr) -> Self; + /// Read level data into `out[range]` returning the number of levels read + /// + /// `range` is provided by the caller to allow for types such as default-initialized `[T]` + /// that only track capacity and not length + /// + /// # Panics + /// + /// Implementations may panic if `range` overlaps with already written data + /// fn read(&mut self, out: &mut Self::Slice, range: Range) -> Result; } @@ -68,8 +81,10 @@ pub trait ColumnLevelDecoder { pub trait ColumnValueDecoder { type Slice: ValuesBufferSlice + ?Sized; + /// Create a new [`ColumnValueDecoder`] fn new(col: &ColumnDescPtr) -> Self; + /// Set the current dictionary page fn set_dict( &mut self, buf: ByteBufferPtr, @@ -78,6 +93,7 @@ pub trait ColumnValueDecoder { is_sorted: bool, ) -> Result<()>; + /// Set the current data page fn set_data( &mut self, encoding: Encoding, @@ -85,6 +101,15 @@ pub trait ColumnValueDecoder { num_values: usize, ) -> Result<()>; + /// Read values data into `out[range]` returning the number of values read + /// + /// `range` is provided by the caller to allow for types such as default-initialized `[T]` + /// that only track capacity and not length + /// + /// # Panics + /// + /// Implementations may panic if `range` overlaps with already written data + /// fn read(&mut self, out: &mut Self::Slice, range: Range) -> Result; }