Skip to content

Commit

Permalink
Fix reading/writing nested null arrays (apache#1480) (apache#1036) (a…
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Mar 24, 2022
1 parent e778c10 commit a1e37df
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 9 deletions.
7 changes: 3 additions & 4 deletions parquet/src/arrow/array_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,10 @@ where

/// Reads at most `batch_size` records into array.
fn next_batch(&mut self, batch_size: usize) -> Result<ArrayRef> {
let records_read =
read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?;
read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?;

// convert to arrays
let array = arrow::array::NullArray::new(records_read);
let array = arrow::array::NullArray::new(self.record_reader.num_values());

// save definition and repetition buffers
self.def_levels_buffer = self.record_reader.consume_def_levels()?;
Expand Down Expand Up @@ -887,7 +886,7 @@ fn remove_indices(
Ok(Arc::new(StructArray::from((new_columns, valid.finish()))))
}
}
ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len()))),
ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len() - indices.len()))),
_ => Err(ParquetError::General(format!(
"ListArray of type List({:?}) is not supported by array_reader",
item_type
Expand Down
56 changes: 56 additions & 0 deletions parquet/src/arrow/arrow_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1513,6 +1513,43 @@ mod tests {
required_and_optional::<LargeStringArray, _>(raw_strs);
}

#[test]
fn null_list_single_column() {
    // A single nullable `List<Null>` column named "emptylist". The same list
    // type is used for the schema field and for the array data built below.
    let list_type = DataType::List(Box::new(Field::new("item", DataType::Null, true)));
    let schema = Schema::new(vec![Field::new("emptylist", list_type.clone(), true)]);

    // Target contents: [[], null, [null, null]]
    // Offsets [0, 0, 0, 2] give the three slots 0, 0 and 2 child values;
    // validity bits 0b101 mark slots 0 and 2 as valid and slot 1 as null.
    let child = NullArray::new(2);
    let offsets = Buffer::from(&[0, 0, 0, 2].to_byte_slice());
    let validity = Buffer::from(vec![0b00000101]);
    let list_data = ArrayData::builder(list_type)
        .len(3)
        .add_buffer(offsets)
        .null_bit_buffer(validity)
        .add_child_data(child.data().clone())
        .build()
        .unwrap();
    let list = ListArray::from(list_data);

    // Sanity-check slot validity against the bitmap before the roundtrip.
    assert!(list.is_valid(0));
    assert!(!list.is_valid(1));
    assert!(list.is_valid(2));

    // The first list is empty; the last holds two values, both null.
    let first = list.value(0);
    assert_eq!(first.len(), 0);
    let last = list.value(2);
    assert_eq!(last.len(), 2);
    assert_eq!(last.null_count(), 2);

    // Hand the single-column batch to the `roundtrip` helper.
    let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(list)]).unwrap();
    roundtrip(batch, None);
}

#[test]
fn list_single_column() {
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
Expand Down Expand Up @@ -1564,6 +1601,25 @@ mod tests {
one_column_roundtrip(values, true, Some(SMALL_SIZE / 2));
}

#[test]
fn list_nested_nulls() {
    use arrow::datatypes::Int32Type;

    // Six list slots mixing a null list (slot 2) with null elements inside
    // otherwise valid lists (slots 3 and 4).
    let rows: Vec<Option<Vec<Option<i32>>>> = vec![
        Some(vec![Some(1)]),
        Some(vec![Some(2), Some(3)]),
        None,
        Some(vec![Some(4), Some(5), None]),
        Some(vec![None]),
        Some(vec![Some(6), Some(7)]),
    ];

    // Build the same logical data as both a `ListArray` and a `LargeListArray`.
    let small = ListArray::from_iter_primitive::<Int32Type, _, _>(rows.clone());
    let large = LargeListArray::from_iter_primitive::<Int32Type, _, _>(rows);

    // Pass each array through `one_column_roundtrip` with identical settings.
    one_column_roundtrip(Arc::new(small), true, Some(SMALL_SIZE / 2));
    one_column_roundtrip(Arc::new(large), true, Some(SMALL_SIZE / 2));
}

#[test]
fn struct_single_column() {
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
Expand Down
10 changes: 5 additions & 5 deletions parquet/src/arrow/levels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,8 @@ impl LevelInfo {
);

match child_array.data_type() {
// TODO: The behaviour of a <list<null>> is untested
DataType::Null => vec![list_level],
DataType::Boolean
DataType::Null
| DataType::Boolean
| DataType::Int8
| DataType::Int16
| DataType::Int32
Expand Down Expand Up @@ -677,8 +676,9 @@ impl LevelInfo {
len: usize,
) -> (Vec<i64>, Vec<bool>) {
match array.data_type() {
DataType::Null
| DataType::Boolean
// A NullArray is entirely nulls, despite not containing a null buffer
DataType::Null => ((0..=(len as i64)).collect(), vec![false; len]),
DataType::Boolean
| DataType::Int8
| DataType::Int16
| DataType::Int32
Expand Down

0 comments on commit a1e37df

Please sign in to comment.