From 0e597ab1ac62f12a4cf020994b2097643fdb9657 Mon Sep 17 00:00:00 2001 From: LucasG0 <44552904+LucasG0@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:12:24 +0100 Subject: [PATCH] GH-34316: [Python] FixedSizeListArray.from_arrays supports mask parameter (#39396) ### What changes are included in this PR? Add `mask` / `null_bitmap` parameters in corresponding Cython / C++ `FixedSizeListArray` methods, and propagate this bitmap instead of using the current dummy `validity_buf`. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes, `mask` parameter has been added to `FixedSizeListArray.from_arrays` * Closes: #34316 Authored-by: LucasG0 Signed-off-by: Will Jones --- cpp/src/arrow/array/array_nested.cc | 16 ++++++++-------- cpp/src/arrow/array/array_nested.h | 16 ++++++++++++---- python/pyarrow/array.pxi | 13 +++++++++---- python/pyarrow/includes/libarrow.pxd | 8 ++++++-- python/pyarrow/tests/test_array.py | 10 ++++++++++ 5 files changed, 45 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index acdd0a0742468..0b0e340a67d4e 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -894,7 +894,8 @@ const std::shared_ptr<DataType>& FixedSizeListArray::value_type() const { const std::shared_ptr<Array>& FixedSizeListArray::values() const { return values_; } Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays( - const std::shared_ptr<Array>& values, int32_t list_size) { + const std::shared_ptr<Array>& values, int32_t list_size, + std::shared_ptr<Buffer> null_bitmap, int64_t null_count) { if (list_size <= 0) { return Status::Invalid("list_size needs to be a strict positive integer"); } @@ -905,14 +906,14 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays( } int64_t length = values->length() / list_size; auto list_type = std::make_shared<FixedSizeListType>(values->type(), list_size); - std::shared_ptr<Buffer> validity_buf; - return std::make_shared<FixedSizeListArray>(list_type, length, values, validity_buf, - /*null_count=*/0, /*offset=*/0); + return 
std::make_shared<FixedSizeListArray>(list_type, length, values, null_bitmap, + null_count); } Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays( - const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type) { + const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type, + std::shared_ptr<Buffer> null_bitmap, int64_t null_count) { if (type->id() != Type::FIXED_SIZE_LIST) { return Status::TypeError("Expected fixed size list type, got ", type->ToString()); } @@ -926,10 +927,9 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays( "The length of the values Array needs to be a multiple of the list size"); } int64_t length = values->length() / list_type.list_size(); - std::shared_ptr<Buffer> validity_buf; - return std::make_shared<FixedSizeListArray>(type, length, values, validity_buf, - /*null_count=*/0, /*offset=*/0); + return std::make_shared<FixedSizeListArray>(type, length, values, null_bitmap, + null_count); } Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten( diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 61606e1592d61..768a630e0af54 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -599,17 +599,25 @@ class ARROW_EXPORT FixedSizeListArray : public Array { /// /// \param[in] values Array containing list values /// \param[in] list_size The fixed length of each list + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap /// \return Will have length equal to values.length() / list_size - static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values, - int32_t list_size); + static Result<std::shared_ptr<Array>> FromArrays( + const std::shared_ptr<Array>& values, int32_t list_size, + std::shared_ptr<Buffer> null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); /// \brief Construct FixedSizeListArray from child value array and type /// /// \param[in] values Array containing list values /// \param[in] type The fixed sized list type + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap /// \return Will have length equal 
to values.length() / type.list_size() - static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values, - std::shared_ptr<DataType> type); + static Result<std::shared_ptr<Array>> FromArrays( + const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type, + std::shared_ptr<Buffer> null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); protected: void SetData(const std::shared_ptr<ArrayData>& data); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 74a196002bfa6..751dfbcce4342 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2484,7 +2484,7 @@ cdef class MapArray(ListArray): Examples -------- - First, let's understand the structure of our dataset when viewed in a rectangular data model. + First, let's understand the structure of our dataset when viewed in a rectangular data model. The total of 5 respondents answered the question "How much did you like the movie x?". The value -1 in the integer array means that the value is missing. The boolean array represents the null bitmask corresponding to the missing values in the integer array. @@ -2590,7 +2590,7 @@ cdef class FixedSizeListArray(BaseListArray): """ @staticmethod - def from_arrays(values, list_size=None, DataType type=None): + def from_arrays(values, list_size=None, DataType type=None, mask=None): """ Construct FixedSizeListArray from array of values and a list length. @@ -2602,6 +2602,9 @@ cdef class FixedSizeListArray(BaseListArray): type : DataType, optional If not specified, a default ListType with the values' type and `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ Returns ------- @@ -2652,19 +2655,21 @@ cdef class FixedSizeListArray(BaseListArray): _values = asarray(values) + c_mask = c_mask_inverted_from_obj(mask, None) + if type is not None: if list_size is not None: raise ValueError("Cannot specify both list_size and type") with nogil: c_result = CFixedSizeListArray.FromArraysAndType( - _values.sp_array, type.sp_type) + _values.sp_array, type.sp_type, c_mask) else: if list_size is None: raise ValueError("Should specify one of list_size and type") _list_size = list_size with nogil: c_result = CFixedSizeListArray.FromArrays( - _values.sp_array, _list_size) + _values.sp_array, _list_size, c_mask) cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) result.validate() return result diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index bad5ec606c268..82b888f584813 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -673,11 +673,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeListArray" arrow::FixedSizeListArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( - const shared_ptr[CArray]& values, int32_t list_size) + const shared_ptr[CArray]& values, + int32_t list_size, + shared_ptr[CBuffer] null_bitmap) @staticmethod CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( - const shared_ptr[CArray]& values, shared_ptr[CDataType]) + const shared_ptr[CArray]& values, + shared_ptr[CDataType], + shared_ptr[CBuffer] null_bitmap) int64_t value_offset(int i) int64_t value_length(int i) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 599d15d023a55..d598630dc2103 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1091,6 +1091,16 @@ def test_fixed_size_list_from_arrays(): assert result.type.equals(typ) assert result.type.value_field.name == "name" + result = pa.FixedSizeListArray.from_arrays(values, + 
type=typ, + mask=pa.array([False, True, False])) + assert result.to_pylist() == [[0, 1, 2, 3], None, [8, 9, 10, 11]] + + result = pa.FixedSizeListArray.from_arrays(values, + list_size=4, + mask=pa.array([False, True, False])) + assert result.to_pylist() == [[0, 1, 2, 3], None, [8, 9, 10, 11]] + # raise on invalid values / list_size with pytest.raises(ValueError): pa.FixedSizeListArray.from_arrays(values, -4)