Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implemented with_field() for FixedSizeListBuilder #5541

Merged
282 changes: 249 additions & 33 deletions arrow-array/src/builder/fixed_size_list_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::builder::ArrayBuilder;
use crate::{ArrayRef, FixedSizeListArray};
use arrow_buffer::NullBufferBuilder;
use arrow_data::ArrayData;
use arrow_schema::{DataType, Field};
use arrow_schema::{DataType, Field, FieldRef};
use std::any::Any;
use std::sync::Arc;

Expand Down Expand Up @@ -67,6 +67,7 @@ pub struct FixedSizeListBuilder<T: ArrayBuilder> {
null_buffer_builder: NullBufferBuilder,
values_builder: T,
list_len: i32,
field: Option<FieldRef>,
}

impl<T: ArrayBuilder> FixedSizeListBuilder<T> {
Expand All @@ -89,6 +90,20 @@ impl<T: ArrayBuilder> FixedSizeListBuilder<T> {
null_buffer_builder: NullBufferBuilder::new(capacity),
values_builder,
list_len: value_length,
field: None,
}
}

/// Override the field passed to [`ArrayData::builder`]
///
/// By default a nullable field is created with the name `item`
///
/// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the
/// field's data type does not match that of `T`
pub fn with_field(self, field: impl Into<FieldRef>) -> Self {
Self {
field: Some(field.into()),
..self
}
}
}
Expand Down Expand Up @@ -166,13 +181,40 @@ where
);

let nulls = self.null_buffer_builder.finish();
let array_data = ArrayData::builder(DataType::FixedSizeList(
Arc::new(Field::new("item", values_data.data_type().clone(), true)),
self.list_len,
))
.len(len)
.add_child_data(values_data)
.nulls(nulls);

let field = match &self.field {
Some(f) => {
let size = self.value_length();
assert_eq!(
f.data_type(),
values_data.data_type(),
"DataType of field ({}) should be the same as the values_builder DataType ({})",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

f.data_type(),
values_data.data_type()
);

if let Some(a) = values_arr.logical_nulls() {
let nulls_valid = f.is_nullable()
|| nulls
.as_ref()
.map(|n| n.expand(size as _).contains(&a))
.unwrap_or_default();

assert!(
nulls_valid,
"Found unmasked nulls for non-nullable FixedSizeListBuilder field {:?}",
f.name()
);
}
f.clone()
}
None => Arc::new(Field::new("item", values_data.data_type().clone(), true)),
};

let array_data = ArrayData::builder(DataType::FixedSizeList(field, self.list_len))
.len(len)
.add_child_data(values_data)
.nulls(nulls);

let array_data = unsafe { array_data.build_unchecked() };

Expand All @@ -194,13 +236,39 @@ where
);

let nulls = self.null_buffer_builder.finish_cloned();
let array_data = ArrayData::builder(DataType::FixedSizeList(
Arc::new(Field::new("item", values_data.data_type().clone(), true)),
self.list_len,
))
.len(len)
.add_child_data(values_data)
.nulls(nulls);

let field = match &self.field {
Some(f) => {
let size = self.value_length();
assert_eq!(
f.data_type(),
values_data.data_type(),
"DataType of field ({}) should be the same as the values_builder DataType ({})",
f.data_type(),
values_data.data_type()
);
if let Some(a) = values_arr.logical_nulls() {
let nulls_valid = f.is_nullable()
|| nulls
.as_ref()
.map(|n| n.expand(size as _).contains(&a))
.unwrap_or_default();

assert!(
nulls_valid,
"Found unmasked nulls for non-nullable FixedSizeListBuilder field {:?}",
f.name()
);
}
f.clone()
}
None => Arc::new(Field::new("item", values_data.data_type().clone(), true)),
};

let array_data = ArrayData::builder(DataType::FixedSizeList(field, self.list_len))
.len(len)
.add_child_data(values_data)
.nulls(nulls);

let array_data = unsafe { array_data.build_unchecked() };
alamb marked this conversation as resolved.
Show resolved Hide resolved

Expand All @@ -216,28 +284,54 @@ mod tests {
use crate::Array;
use crate::Int32Array;

#[test]
fn test_fixed_size_list_array_builder() {
fn make_list_builder(
include_null_element: bool,
include_null_in_values: bool,
) -> FixedSizeListBuilder<crate::builder::PrimitiveBuilder<crate::types::Int32Type>> {
let values_builder = Int32Builder::new();
let mut builder = FixedSizeListBuilder::new(values_builder, 3);

// [[0, 1, 2], null, [3, null, 5], [6, 7, null]]
builder.values().append_value(0);
builder.values().append_value(1);
builder.values().append_value(2);
builder.append(true);
builder.values().append_null();
builder.values().append_null();
builder.values().append_null();
builder.append(false);

builder.values().append_value(2);
builder.values().append_value(3);
builder.values().append_null();
builder.values().append_value(5);
builder.append(true);
builder.values().append_value(6);
builder.values().append_value(7);
builder.values().append_null();
builder.values().append_value(4);
builder.append(true);

if include_null_element {
builder.values().append_null();
builder.values().append_null();
builder.values().append_null();
builder.append(false);
} else {
builder.values().append_value(2);
builder.values().append_value(3);
builder.values().append_value(4);
builder.append(true);
}

if include_null_in_values {
builder.values().append_value(3);
builder.values().append_null();
builder.values().append_value(5);
builder.append(true);
} else {
builder.values().append_value(3);
builder.values().append_value(4);
builder.values().append_value(5);
builder.append(true);
}

builder
}

#[test]
fn test_fixed_size_list_array_builder() {
let mut builder = make_list_builder(true, true);

let list_array = builder.finish();

assert_eq!(DataType::Int32, list_array.value_type());
Expand All @@ -248,9 +342,48 @@ mod tests {
}

#[test]
fn test_fixed_size_list_array_builder_finish_cloned() {
fn test_fixed_size_list_array_builder_with_field() {
let builder = make_list_builder(false, false);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alamb , I had a question about this test in particular. With the new capability to set the field as a non-nullable we have the situation where the builder has a null element built like this:

 builder.values().append_null();
 builder.values().append_null();
 builder.values().append_null();
 builder.append(false);

If the field is set to non-nullable, this element will fail the null check. I wanted to check with you if this is logically right or not? Are we making a distinction between nulls in the values builder, where let's say 1 of the 3 elements of a particular fixed size list is null (which would be a valid fail on non-nullability) or nulls in the main builder (which is built up as 3 nulls + a append(false) in the values builder). To me the second one is questionable and comes down to how you want this to work. If this test should not fail with the second complete null case, then I have to go back and revise the null check of the values builder and somehow control for null elements of the main builder.

Copy link
Contributor

@tustvold tustvold Apr 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The logic we apply for FixedSizeList and StructArray elsewhere is to permit "masked" nulls, this is because a null must always take up space. I am not sure if we want to replicate this logic here.

See https://docs.rs/arrow-array/latest/arrow_array/array/struct.FixedSizeListArray.html#method.try_new

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the field is set to non-nullable, this element will fail the null check. I wanted to check with you if this is logically right or not?

I think I would expect it to be consistent with whatever FixedSlizeListArray::try_new did

let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
let list_array = builder.finish();

assert_eq!(DataType::Int32, list_array.value_type());
assert_eq!(4, list_array.len());
assert_eq!(0, list_array.null_count());
assert_eq!(6, list_array.value_offset(2));
assert_eq!(3, list_array.value_length());
}

#[test]
fn test_fixed_size_list_array_builder_with_field_and_null() {
let builder = make_list_builder(true, false);
let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
let list_array = builder.finish();
alamb marked this conversation as resolved.
Show resolved Hide resolved

assert_eq!(DataType::Int32, list_array.value_type());
assert_eq!(4, list_array.len());
assert_eq!(1, list_array.null_count());
assert_eq!(6, list_array.value_offset(2));
assert_eq!(3, list_array.value_length());
}

#[test]
#[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListBuilder field")]
fn test_fixed_size_list_array_builder_with_field_null_panic() {
let builder = make_list_builder(true, true);
let mut builder = builder.with_field(Field::new("list_item", DataType::Int32, false));

builder.finish();
}

#[test]
#[should_panic(
expected = "DataType of field (Int64) should be the same as the values_builder DataType (Int32)"
)]
fn test_fixed_size_list_array_builder_with_field_type_panic() {
let values_builder = Int32Builder::new();
let mut builder = FixedSizeListBuilder::new(values_builder, 3);
let builder = FixedSizeListBuilder::new(values_builder, 3);
let mut builder = builder.with_field(Field::new("list_item", DataType::Int64, true));

// [[0, 1, 2], null, [3, null, 5], [6, 7, null]]
builder.values().append_value(0);
Expand All @@ -262,13 +395,68 @@ mod tests {
builder.values().append_null();
builder.append(false);
builder.values().append_value(3);
builder.values().append_null();
builder.values().append_value(4);
builder.values().append_value(5);
builder.append(true);

builder.finish();
}

#[test]
fn test_fixed_size_list_array_builder_cloned_with_field() {
let builder = make_list_builder(true, true);
let builder = builder.with_field(Field::new("list_element", DataType::Int32, true));

let list_array = builder.finish_cloned();

assert_eq!(DataType::Int32, list_array.value_type());
assert_eq!(4, list_array.len());
assert_eq!(1, list_array.null_count());
assert_eq!(6, list_array.value_offset(2));
assert_eq!(3, list_array.value_length());
}

#[test]
#[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListBuilder field")]
fn test_fixed_size_list_array_builder_cloned_with_field_null_panic() {
let builder = make_list_builder(true, true);
let builder = builder.with_field(Field::new("list_item", DataType::Int32, false));

builder.finish_cloned();
}

#[test]
fn test_fixed_size_list_array_builder_cloned_with_field_and_null() {
let builder = make_list_builder(true, false);
let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
let list_array = builder.finish();

assert_eq!(DataType::Int32, list_array.value_type());
assert_eq!(4, list_array.len());
assert_eq!(1, list_array.null_count());
assert_eq!(6, list_array.value_offset(2));
assert_eq!(3, list_array.value_length());
}

#[test]
#[should_panic(
expected = "DataType of field (Int64) should be the same as the values_builder DataType (Int32)"
)]
fn test_fixed_size_list_array_builder_cloned_with_field_type_panic() {
let builder = make_list_builder(false, false);
let builder = builder.with_field(Field::new("list_item", DataType::Int64, true));

builder.finish_cloned();
}

#[test]
fn test_fixed_size_list_array_builder_finish_cloned() {
let mut builder = make_list_builder(true, true);

let mut list_array = builder.finish_cloned();

assert_eq!(DataType::Int32, list_array.value_type());
assert_eq!(3, list_array.len());
assert_eq!(4, list_array.len());
assert_eq!(1, list_array.null_count());
assert_eq!(3, list_array.value_length());

Expand All @@ -283,12 +471,40 @@ mod tests {
list_array = builder.finish();

assert_eq!(DataType::Int32, list_array.value_type());
assert_eq!(5, list_array.len());
assert_eq!(6, list_array.len());
assert_eq!(2, list_array.null_count());
assert_eq!(6, list_array.value_offset(2));
assert_eq!(3, list_array.value_length());
}

#[test]
fn test_fixed_size_list_array_builder_with_field_empty() {
let values_builder = Int32Array::builder(0);
let mut builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new(
"list_item",
DataType::Int32,
false,
));
assert!(builder.is_empty());
let arr = builder.finish();
assert_eq!(0, arr.len());
assert_eq!(0, builder.len());
}

#[test]
fn test_fixed_size_list_array_builder_cloned_with_field_empty() {
let values_builder = Int32Array::builder(0);
let builder = FixedSizeListBuilder::new(values_builder, 3).with_field(Field::new(
"list_item",
DataType::Int32,
false,
));
assert!(builder.is_empty());
let arr = builder.finish_cloned();
assert_eq!(0, arr.len());
assert_eq!(0, builder.len());
}

#[test]
fn test_fixed_size_list_array_builder_empty() {
let values_builder = Int32Array::builder(5);
Expand Down
Loading