Skip to content

Commit 77ca6dc

Browse files
fix: bug when struct nullability determined from Dict<_, ByteArray> column (#8573)
# Which issue does this PR close? - Closes #8404 # Rationale for this change A regression was reported in issue #8404 which was introduced in #7585. This PR resolves the issue. # What changes are included in this PR? The root cause of the issue was that the behaviour of `ByteArrayDictionaryReader` is to return a new empty array of values if the record reader has already been consumed. The problem was that the repetition and definition level buffers were not being advanced in this early-return case. https://github.com/apache/arrow-rs/blob/521f219e308613811aeae11300bf7a7b0fb5ec29/parquet/src/arrow/array_reader/byte_array_dictionary.rs#L167-L183 The `StructArrayReader` reads the repetition and definition levels from the first child to determine the nullability of the struct array. When we returned the empty values buffer for the child without advancing the repetition and definition buffers, the `StructArrayReader` encountered a length mismatch between the empty child array and the non-empty nullability bitmask, and this produced the error. https://github.com/apache/arrow-rs/blob/521f219e308613811aeae11300bf7a7b0fb5ec29/parquet/src/arrow/array_reader/struct_array.rs#L137-L170 The fix is simple: always have `ByteArrayDictionaryReader` advance the repetition and definition level buffers when `consume_batch` is called. # Are these changes tested? Yes, a new unit test was added, `test_read_nullable_structs_with_binary_dict_as_first_child_column`, which before the changes introduced in this PR would replicate the issue. # Are there any user-facing changes? No --------- Co-authored-by: Ed Seidl <etseidl@live.com> Co-authored-by: Ed Seidl <etseidl@users.noreply.github.com>
1 parent 615a144 commit 77ca6dc

File tree

2 files changed

+62
-4
lines changed

2 files changed

+62
-4
lines changed

parquet/src/arrow/array_reader/byte_array_dictionary.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ where
165165
}
166166

167167
fn consume_batch(&mut self) -> Result<ArrayRef> {
168+
// advance the def & rep level buffers
169+
self.def_levels_buffer = self.record_reader.consume_def_levels();
170+
self.rep_levels_buffer = self.record_reader.consume_rep_levels();
171+
168172
if self.record_reader.num_values() == 0 {
169173
// once the record_reader has been consumed, we've replaced its values with the default
170174
// variant of DictionaryBuffer (Offset). If `consume_batch` then gets called again, we
@@ -175,9 +179,6 @@ where
175179
let buffer = self.record_reader.consume_record_data();
176180
let null_buffer = self.record_reader.consume_bitmap_buffer();
177181
let array = buffer.into_array(null_buffer, &self.data_type)?;
178-
179-
self.def_levels_buffer = self.record_reader.consume_def_levels();
180-
self.rep_levels_buffer = self.record_reader.consume_rep_levels();
181182
self.record_reader.reset();
182183

183184
Ok(array)

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1160,7 +1160,7 @@ mod tests {
11601160
Time64MicrosecondType,
11611161
};
11621162
use arrow_array::*;
1163-
use arrow_buffer::{ArrowNativeType, Buffer, IntervalDayTime, i256};
1163+
use arrow_buffer::{ArrowNativeType, Buffer, IntervalDayTime, NullBuffer, i256};
11641164
use arrow_data::{ArrayData, ArrayDataBuilder};
11651165
use arrow_schema::{
11661166
ArrowError, DataType as ArrowDataType, Field, Fields, Schema, SchemaRef, TimeUnit,
@@ -2360,6 +2360,63 @@ mod tests {
23602360
assert_eq!(&batch, &read[0])
23612361
}
23622362

2363+
#[test]
2364+
fn test_read_nullable_structs_with_binary_dict_as_first_child_column() {
2365+
// the `StructArrayReader` will check the definition and repetition levels of the first
2366+
// child column in the struct to determine nullability for the struct. If the first
2367+
// column is being read by `ByteArrayDictionaryReader` we need to ensure that the
2368+
// nullability is interpreted correctly from the rep/def level
2369+
// buffers managed by this array reader.
2370+
2371+
let struct_fields = Fields::from(vec![
2372+
Field::new(
2373+
"city",
2374+
ArrowDataType::Dictionary(
2375+
Box::new(ArrowDataType::UInt8),
2376+
Box::new(ArrowDataType::Utf8),
2377+
),
2378+
true,
2379+
),
2380+
Field::new("name", ArrowDataType::Utf8, true),
2381+
]);
2382+
let schema = Arc::new(Schema::new(vec![Field::new(
2383+
"items",
2384+
ArrowDataType::Struct(struct_fields.clone()),
2385+
true,
2386+
)]));
2387+
2388+
let items_arr = StructArray::new(
2389+
struct_fields,
2390+
vec![
2391+
Arc::new(DictionaryArray::new(
2392+
UInt8Array::from_iter_values(vec![0, 1, 1, 0, 2]),
2393+
Arc::new(StringArray::from_iter_values(vec![
2394+
"quebec",
2395+
"fredericton",
2396+
"halifax",
2397+
])),
2398+
)),
2399+
Arc::new(StringArray::from_iter_values(vec![
2400+
"albert", "terry", "lance", "", "tim",
2401+
])),
2402+
],
2403+
Some(NullBuffer::from_iter(vec![true, true, true, false, true])),
2404+
);
2405+
2406+
let batch = RecordBatch::try_new(schema, vec![Arc::new(items_arr)]).unwrap();
2407+
let mut buffer = Vec::with_capacity(1024);
2408+
let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
2409+
writer.write(&batch).unwrap();
2410+
writer.close().unwrap();
2411+
let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 8)
2412+
.unwrap()
2413+
.collect::<Result<Vec<_>, _>>()
2414+
.unwrap();
2415+
2416+
assert_eq!(read.len(), 1);
2417+
assert_eq!(&batch, &read[0])
2418+
}
2419+
23632420
/// Parameters for single_column_reader_test
23642421
#[derive(Clone)]
23652422
struct TestOptions {

0 commit comments

Comments
 (0)