From 97af22a94093d379ba4b37d24e48ecd7d1bba698 Mon Sep 17 00:00:00 2001 From: Navin Keswani Date: Mon, 20 Dec 2021 11:57:58 +1100 Subject: [PATCH 1/2] Failing test to reproduce bug. https://github.com/apache/arrow-rs/issues/1036 --- parquet/src/arrow/arrow_writer.rs | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index 643f5a29f488..20fdad8416dc 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -1546,6 +1546,38 @@ mod tests { one_column_roundtrip("list_single_column", values, true, Some(SMALL_SIZE / 2)); } + #[test] + fn null_list_single_column() { + let null_field = Field::new("item", DataType::Null, true); + let list_field = Field::new( + "emptylist", + DataType::List(Box::new(null_field)), + true, + ); + + let schema = Schema::new(vec![list_field]); + + // Build a ListArray[NullArray(0)] + + let a_values = NullArray::new(0); + let a_value_offsets = + arrow::buffer::Buffer::from(&[0, 0].to_byte_slice()); + let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + "item", + DataType::Null, true + )))) + .len(1) + .add_buffer(a_value_offsets) + .null_bit_buffer(Buffer::from(vec![0b00011011])) + .add_child_data(a_values.data().clone()) + .build() + .unwrap(); + + let a = ListArray::from(a_list_data); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); + roundtrip("test_null_list_single_column.parquet", batch, None); + } + #[test] fn large_list_single_column() { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); From d41c93c81ac580b7218d1619a2480292e7df9dc7 Mon Sep 17 00:00:00 2001 From: Navin Keswani Date: Thu, 23 Dec 2021 09:47:16 +1100 Subject: [PATCH 2/2] Read fails -- Nulls are UNKNOWN? --- parquet/src/arrow/array_reader.rs | 9 ++++++++- parquet/src/arrow/arrow_writer.rs | 19 +++++++++---------- parquet/src/arrow/levels.rs | 4 +++- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/array_reader.rs b/parquet/src/arrow/array_reader.rs index ef8cf701090b..82607f49d47d 100644 --- a/parquet/src/arrow/array_reader.rs +++ b/parquet/src/arrow/array_reader.rs @@ -1277,7 +1277,7 @@ struct ArrayReaderBuilder { } /// Used in type visitor. -#[derive(Clone)] +#[derive(Clone, Debug)] struct ArrayReaderBuilderContext { def_level: i16, rep_level: i16, @@ -1491,6 +1491,11 @@ impl<'a> TypeVisitor>, &'a ArrayReaderBuilderContext } _ => (), } + + dbg!(&list_type); + dbg!(&context); + dbg!(&item_type); + dbg!(&new_context); let item_reader = self .dispatch(item_type.clone(), &new_context) @@ -1836,6 +1841,8 @@ impl<'a> ArrayReaderBuilder { for child in cur_type.get_fields() { let mut struct_context = context.clone(); + dbg!(&context); + dbg!(&child); if let Some(child_reader) = self.dispatch(child.clone(), context)? { // TODO: this results in calling get_arrow_field twice, it could be reused // from child_reader above, by making child_reader carry its `Field` diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index 20fdad8416dc..50684c88c8a2 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -1549,22 +1549,18 @@ mod tests { #[test] fn null_list_single_column() { let null_field = Field::new("item", DataType::Null, true); - let list_field = Field::new( - "emptylist", - DataType::List(Box::new(null_field)), - true, - ); + let list_field = + Field::new("emptylist", DataType::List(Box::new(null_field)), true); let schema = Schema::new(vec![list_field]); // Build a ListArray[NullArray(0)] - - let a_values = NullArray::new(0); - let a_value_offsets = - arrow::buffer::Buffer::from(&[0, 0].to_byte_slice()); + let a_values = NullArray::new(SMALL_SIZE); + let a_value_offsets = arrow::buffer::Buffer::from(&[0, 0].to_byte_slice()); let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( "item", - DataType::Null, true + DataType::Null, + true, )))) .len(1) .add_buffer(a_value_offsets) @@ -1574,6 +1570,9 @@ mod tests { .unwrap(); let a = ListArray::from(a_list_data); + // let values = Arc::new(a); + // one_column_roundtrip("null_list_single_column", values, true, Some(SMALL_SIZE / 2)); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); roundtrip("test_null_list_single_column.parquet", batch, None); } diff --git a/parquet/src/arrow/levels.rs b/parquet/src/arrow/levels.rs index c9b6052aeb87..7e376461e459 100644 --- a/parquet/src/arrow/levels.rs +++ b/parquet/src/arrow/levels.rs @@ -751,9 +751,10 @@ impl LevelInfo { /// Given a level's information, calculate the offsets required to index an array correctly. pub(crate) fn filter_array_indices(&self) -> Vec { - // happy path if not dealing with lists + dbg!(&self); let is_nullable = match self.level_type { LevelType::Primitive(is_nullable) => is_nullable, + LevelType::List(is_nullable) => is_nullable, _ => panic!( "Cannot filter indices on a non-primitive array, found {:?}", self.level_type @@ -784,6 +785,7 @@ impl LevelInfo { index += 1; } }); + dbg!(&filtered); filtered } }