Skip to content

Commit

Permalink
parquet: Optimized ByteArrayReader, Add UTF-8 Validation (#1040) (#1082)
Browse files Browse the repository at this point in the history

* Optimized ByteArrayReader (#1040)

UTF-8 Validation (#786)

* Fix arrow_array_reader benchmark

* Allow running subset of arrow_array_reader benchmarks

* Faster UTF-8 validation

* Tweak null handling

* Add license

* Refine `ValuesBuffer::pad_nulls`

* Tweak error handling

* Use page null count if available

* Doc comments

* Test DELTA_BYTE_ARRAY encoding

* Support legacy Encoding::PLAIN_DICTIONARY

* Add OffsetBuffer unit tests

Review feedback

* More tests

* Fix lint

* Review feedback
  • Loading branch information
tustvold authored Jan 18, 2022
1 parent 4f1064e commit 0cc0c05
Show file tree
Hide file tree
Showing 10 changed files with 1,299 additions and 162 deletions.
146 changes: 72 additions & 74 deletions parquet/benches/arrow_array_reader.rs

Large diffs are not rendered by default.

92 changes: 41 additions & 51 deletions parquet/src/arrow/array_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ use crate::arrow::converter::{
DecimalConverter, FixedLenBinaryConverter, FixedSizeArrayConverter,
Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter,
IntervalDayTimeConverter, IntervalYearMonthArrayConverter,
IntervalYearMonthConverter, LargeBinaryArrayConverter, LargeBinaryConverter,
LargeUtf8ArrayConverter, LargeUtf8Converter,
IntervalYearMonthConverter, Utf8ArrayConverter, Utf8Converter,
};
use crate::arrow::record_reader::buffer::{ScalarValue, ValuesBuffer};
use crate::arrow::record_reader::{GenericRecordReader, RecordReader};
Expand All @@ -81,6 +80,11 @@ use crate::schema::types::{
};
use crate::schema::visitor::TypeVisitor;

mod byte_array;
mod offset_buffer;

pub use byte_array::make_byte_array_reader;

/// Array reader reads parquet data into arrow array.
pub trait ArrayReader {
fn as_any(&self) -> &dyn Any;
Expand Down Expand Up @@ -1778,57 +1782,43 @@ impl<'a> ArrayReaderBuilder {
null_mask_only,
)?,
)),
PhysicalType::BYTE_ARRAY => {
if cur_type.get_basic_info().converted_type() == ConvertedType::UTF8 {
if let Some(ArrowType::LargeUtf8) = arrow_type {
let converter =
LargeUtf8Converter::new(LargeUtf8ArrayConverter {});
Ok(Box::new(ComplexObjectArrayReader::<
ByteArrayType,
LargeUtf8Converter,
>::new(
page_iterator,
column_desc,
converter,
arrow_type,
)?))
} else {
use crate::arrow::arrow_array_reader::{
ArrowArrayReader, StringArrayConverter,
};
let converter = StringArrayConverter::new();
Ok(Box::new(ArrowArrayReader::try_new(
*page_iterator,
column_desc,
converter,
arrow_type,
)?))
PhysicalType::BYTE_ARRAY => match arrow_type {
// TODO: Replace with optimised dictionary reader (#171)
Some(ArrowType::Dictionary(_, _)) => {
match cur_type.get_basic_info().converted_type() {
ConvertedType::UTF8 => {
let converter = Utf8Converter::new(Utf8ArrayConverter {});
Ok(Box::new(ComplexObjectArrayReader::<
ByteArrayType,
Utf8Converter,
>::new(
page_iterator,
column_desc,
converter,
arrow_type,
)?))
}
_ => {
let converter = BinaryConverter::new(BinaryArrayConverter {});
Ok(Box::new(ComplexObjectArrayReader::<
ByteArrayType,
BinaryConverter,
>::new(
page_iterator,
column_desc,
converter,
arrow_type,
)?))
}
}
} else if let Some(ArrowType::LargeBinary) = arrow_type {
let converter =
LargeBinaryConverter::new(LargeBinaryArrayConverter {});
Ok(Box::new(ComplexObjectArrayReader::<
ByteArrayType,
LargeBinaryConverter,
>::new(
page_iterator,
column_desc,
converter,
arrow_type,
)?))
} else {
let converter = BinaryConverter::new(BinaryArrayConverter {});
Ok(Box::new(ComplexObjectArrayReader::<
ByteArrayType,
BinaryConverter,
>::new(
page_iterator,
column_desc,
converter,
arrow_type,
)?))
}
}
_ => make_byte_array_reader(
page_iterator,
column_desc,
arrow_type,
null_mask_only,
),
},
PhysicalType::FIXED_LEN_BYTE_ARRAY
if cur_type.get_basic_info().converted_type()
== ConvertedType::DECIMAL =>
Expand Down
Loading

0 comments on commit 0cc0c05

Please sign in to comment.