From c400b4625143d97cc3a83efc5970e2d0fb391440 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui Date: Thu, 18 Apr 2024 22:53:35 +0800 Subject: [PATCH] 1. Expose recursive flatten for logical lists on list_flatten kernel function 2. Support [Large]ListView for some kernel functions: list_flatten, list_value_length, list_element 3. Support recursive flatten for pyarrow bindings and simplify [Large]ListView's pyarrow bindings 4. Refactor vector_nested_test.cc to better support [Large]ListView types. --- cpp/src/arrow/compute/api_vector.cc | 7 + cpp/src/arrow/compute/api_vector.h | 12 + .../arrow/compute/kernels/codegen_internal.cc | 10 +- .../arrow/compute/kernels/scalar_nested.cc | 49 +++- .../compute/kernels/scalar_nested_test.cc | 17 +- .../arrow/compute/kernels/vector_nested.cc | 50 +++- .../compute/kernels/vector_nested_test.cc | 187 +++++++++++---- python/pyarrow/_compute.pyx | 20 ++ python/pyarrow/array.pxi | 215 +++++++----------- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 + python/pyarrow/tests/test_array.py | 14 +- python/pyarrow/tests/test_compute.py | 1 + 13 files changed, 381 insertions(+), 207 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d47ee42ebf239..6bf0fac429f94 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -153,6 +153,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); +static auto kListFlattenOptionsType = GetFunctionOptionsType( + DataMember("recursively", &ListFlattenOptions::recursively)); } // namespace } // namespace internal @@ -224,6 +226,10 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; 
+ListFlattenOptions::ListFlattenOptions(bool recursively) + : FunctionOptions(internal::kListFlattenOptionsType), recursively(recursively) {} +constexpr char ListFlattenOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -237,6 +243,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 919572f16ee69..ff88203ed22ed 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -245,6 +245,18 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { int64_t periods = 1; }; +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursively = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// Control the version of 'Flatten' that keeps recursively flattening + /// until an array of non-list values is reached. 
+ bool recursively = false; +}; + /// @} /// \brief Filter with a boolean selection filter diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 00a833742f957..abde2175de849 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -57,8 +57,14 @@ Result LastType(KernelContext*, const std::vector& types } Result ListValuesType(KernelContext*, const std::vector& args) { - const auto& list_type = checked_cast(*args[0].type); - return list_type.value_type().get(); + auto list_type = checked_cast(args[0].type); + auto value_type = list_type->value_type().get(); + for (auto value_kind = value_type->id(); + is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { + list_type = checked_cast(list_type->value_type().get()); + value_type = list_type->value_type().get(); + } + return value_type; } void EnsureDictionaryDecoded(std::vector* types) { diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 733ab9c0dc287..35320e285b928 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -39,12 +39,26 @@ namespace { template Status ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& arr = batch[0].array; + const auto kind = arr.type->id(); ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - const offset_type* offsets = arr.GetValues(1); - // Offsets are always well-defined and monotonic, even for null values - for (int64_t i = 0; i < arr.length; ++i) { - *out_values++ = offsets[i + 1] - offsets[i]; + if (is_list_view(kind)) { + // [Large]ListView's buffer layout: + // buffer1 : valid bitmap + // buffer2 : elements' start offset in current array + // buffer3 : elements' size + // + // It's unnecessary to calculate according 
offsets. + const auto* sizes = arr.GetValues(2); + for (int64_t i = 0; i < arr.length; i++) { + *out_values++ = sizes[i]; + } + } else { + const offset_type* offsets = arr.GetValues(1); + // Offsets are always well-defined and monotonic, even for null values + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = offsets[i + 1] - offsets[i]; + } } return Status::OK(); } @@ -59,6 +73,24 @@ Status FixedSizeListValueLength(KernelContext* ctx, const ExecSpan& batch, return Status::OK(); } +template +void AddListValueLengthKernel(ScalarFunction* func, + const std::shared_ptr& out_type) { + auto in_type = {InputType(InListType::type_id)}; + ScalarKernel kernel(in_type, out_type, ListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddListValueLengthKernels(ScalarFunction* func) { + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + + DCHECK_OK(func->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), + FixedSizeListValueLength)); +} + const FunctionDoc list_value_length_doc{ "Compute list lengths", ("`lists` must have a list-like type.\n" @@ -399,6 +431,8 @@ void AddListElementKernels(ScalarFunction* func) { void AddListElementKernels(ScalarFunction* func) { AddListElementKernels(func); AddListElementKernels(func); + AddListElementKernels(func); + AddListElementKernels(func); AddListElementKernels(func); } @@ -824,12 +858,7 @@ const FunctionDoc map_lookup_doc{ void RegisterScalarNested(FunctionRegistry* registry) { auto list_value_length = std::make_shared( "list_value_length", Arity::Unary(), list_value_length_doc); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - 
ListValueLength)); + AddListValueLengthKernels(list_value_length.get()); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); auto list_element = diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b82..32bea8246954d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -30,11 +30,21 @@ namespace arrow { namespace compute { static std::shared_ptr GetOffsetType(const DataType& type) { - return type.id() == Type::LIST ? int32() : int64(); + switch (type.id()) { + case Type::LIST: + case Type::LIST_VIEW: + return int32(); + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return int64(); + default: + Unreachable("Unexpected type"); + } } TEST(TestScalarNested, ListValueLength) { - for (auto ty : {list(int32()), large_list(int32())}) { + for (auto ty : {list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32())}) { CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } @@ -47,7 +57,8 @@ TEST(TestScalarNested, ListValueLength) { TEST(TestScalarNested, ListElementNonFixedListWithNulls) { auto sample = "[[7, 5, 81], [6, null, 4, 7, 8], [3, 12, 2, 0], [1, 9], null]"; for (auto ty : NumericTypes()) { - for (auto list_type : {list(ty), large_list(ty)}) { + for (auto list_type : + {list(ty), large_list(ty), list_view(ty), large_list_view(ty)}) { auto input = ArrayFromJSON(list_type, sample); auto null_input = ArrayFromJSON(list_type, "[null]"); for (auto index_type : IntTypes()) { diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 08930e589f7b4..53ceec41d6a80 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -18,6 +18,7 @@ // Vector kernels involving nested types #include 
"arrow/array/array_base.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/visit_type_inline.h" @@ -29,8 +30,16 @@ namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + auto recursively = OptionsWrapper::Get(ctx).recursively; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool())); + + std::shared_ptr result; + if (!recursively) { + ARROW_ASSIGN_OR_RAISE(result, list_array.Flatten(ctx->memory_pool())); + } else { + ARROW_ASSIGN_OR_RAISE(result, list_array.FlattenRecursively(ctx->memory_pool())); + } + out->value = std::move(result->data()); return Status::OK(); } @@ -70,6 +79,10 @@ struct ListParentIndicesArray { Status Visit(const LargeListType& type) { return VisitList(type); } + Status Visit(const ListViewType& type) { return VisitList(type); } + + Status Visit(const LargeListViewType& type) { return VisitList(type); } + Status Visit(const FixedSizeListType& type) { using offset_type = typename FixedSizeListType::offset_type; const offset_type slot_length = type.list_size(); @@ -110,7 +123,7 @@ const FunctionDoc list_flatten_doc( ("`lists` must have a list-like type.\n" "Return an array with the top list level flattened.\n" "Top-level null values in `lists` do not emit anything in the input."), - {"lists"}); + {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", @@ -153,17 +166,34 @@ class ListParentIndicesFunction : public MetaFunction { } }; +const ListFlattenOptions* GetDefaultListFlattenOptions() { + static const auto kDefaultListFlattenOptions = ListFlattenOptions::Defaults(); + return &kDefaultListFlattenOptions; +} + +template +void AddBaseListFlattenKernels(VectorFunction* func) { + auto in_type = {InputType(InListType::type_id)}; + auto out_type = 
OutputType(ListValuesType); + VectorKernel kernel(in_type, out_type, ListFlatten, + OptionsWrapper::Init); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddBaseListFlattenKernels(VectorFunction* func) { + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); +} + } // namespace void RegisterVectorNested(FunctionRegistry* registry) { - auto flatten = - std::make_shared("list_flatten", Arity::Unary(), list_flatten_doc); - DCHECK_OK(flatten->AddKernel({Type::LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::FIXED_SIZE_LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::LARGE_LIST}, OutputType(ListValuesType), - ListFlatten)); + auto flatten = std::make_shared( + "list_flatten", Arity::Unary(), list_flatten_doc, GetDefaultListFlattenOptions()); + AddBaseListFlattenKernels(flatten.get()); DCHECK_OK(registry->AddFunction(std::move(flatten))); DCHECK_OK(registry->AddFunction(std::make_shared())); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index eef1b6835ffb5..a03883d145aa1 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -19,6 +19,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" @@ -29,41 +30,143 @@ namespace compute { using arrow::internal::checked_cast; -TEST(TestVectorNested, ListFlatten) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); - auto expected = ArrayFromJSON(int16(), "[0, null, 1, 2, 3]"); +using ListAndListViewTypes = + ::testing::Types; + +// 
---------------------------------------------------------------------- +// [Large]List and [Large]ListView tests +template +class TestVectorLogicalList : public ::testing::Test { + public: + using TypeClass = T; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + } + + public: + void TestListFlatten() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(value_type_, "[0, null, 1, 2, 3]"); CheckVectorUnary("list_flatten", input, expected); // Construct a list with a non-empty null slot auto tweaked = TweakValidityBit(input, 0, false); - expected = ArrayFromJSON(int16(), "[2, 3]"); + expected = ArrayFromJSON(value_type_, "[2, 3]"); CheckVectorUnary("list_flatten", tweaked, expected); } -} -TEST(TestVectorNested, ListFlattenNulls) { - const auto ty = list(int32()); - auto input = ArrayFromJSON(ty, "[null, null]"); - auto expected = ArrayFromJSON(int32(), "[]"); - CheckVectorUnary("list_flatten", input, expected); -} + void TestListFlattenNulls() { + value_type_ = int32(); + type_ = std::make_shared(value_type_); + auto input = ArrayFromJSON(type_, "[null, null]"); + auto expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected); + } -TEST(TestVectorNested, ListFlattenChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - ARROW_SCOPED_TRACE(ty->ToString()); - auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); - auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); + void TestListFlattenChunkedArray() { + ARROW_SCOPED_TRACE(type_->ToString()); + auto input = ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], []]"}); + auto expected = ChunkedArrayFromJSON(value_type_, {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); ARROW_SCOPED_TRACE("empty"); - input = ChunkedArrayFromJSON(ty, {}); - expected = 
ChunkedArrayFromJSON(int16(), {}); + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(value_type_, {}); CheckVectorUnary("list_flatten", input, expected); } + + void TestListFlattenRecursively() { + auto inner_type = std::make_shared(value_type_); + type_ = std::make_shared(inner_type); + + ListFlattenOptions opts; + opts.recursively = true; + + // List types with two nested level: list> + auto input = ArrayFromJSON(type_, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])"); + auto expected = ArrayFromJSON(value_type_, "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // Empty nested list should flatten until non-list type is reached + input = ArrayFromJSON(type_, R"([null])"); + expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // List types with three nested level: list>> + type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); + input = ArrayFromJSON(type_, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])"); + expected = ArrayFromJSON(value_type_, "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + } + + void TestListParentIndices() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", input, expected); + + // Construct a list with a non-empty null slot + input = ArrayFromJSON(type_, "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + auto tweaked = TweakValidityBit(input, 1, false); + expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", tweaked, expected); + } + + void TestListParentIndicesChunkedArray() { + auto input = + ChunkedArrayFromJSON(type_, {"[[0, null, 1], 
null]", "[[2, 3], [], [4, 5]]"}); + auto expected = ChunkedArrayFromJSON(int64(), {"[0, 0, 0]", "[2, 2, 4, 4]"}); + CheckVectorUnary("list_parent_indices", input, expected); + + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(int64(), {}); + CheckVectorUnary("list_parent_indices", input, expected); + } + + protected: + std::shared_ptr type_; + std::shared_ptr value_type_; +}; + +TYPED_TEST_SUITE(TestVectorLogicalList, ListAndListViewTypes); + +TYPED_TEST(TestVectorLogicalList, ListFlatten) { this->TestListFlatten(); } + +TYPED_TEST(TestVectorLogicalList, ListFlattenNulls) { this->TestListFlattenNulls(); } + +TYPED_TEST(TestVectorLogicalList, ListFlattenChunkedArray) { + this->TestListFlattenChunkedArray(); } -TEST(TestVectorNested, ListFlattenFixedSizeList) { +TYPED_TEST(TestVectorLogicalList, ListFlattenRecursively) { + this->TestListFlattenRecursively(); +} + +TYPED_TEST(TestVectorLogicalList, ListParentIndices) { this->TestListParentIndices(); } + +TYPED_TEST(TestVectorLogicalList, ListParentIndicesChunkedArray) { + this->TestListParentIndicesChunkedArray(); +} + +TEST(TestVectorFixedSizeList, ListFlattenFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { const auto& out_ty = checked_cast(*ty).value_type(); { @@ -85,43 +188,29 @@ TEST(TestVectorNested, ListFlattenFixedSizeList) { } } -TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { +TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListNulls) { const auto ty = fixed_size_list(int32(), 1); auto input = ArrayFromJSON(ty, "[null, null]"); auto expected = ArrayFromJSON(int32(), "[]"); CheckVectorUnary("list_flatten", input, expected); } -TEST(TestVectorNested, ListParentIndices) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); - - auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); - CheckVectorUnary("list_parent_indices", input, expected); 
- } - - // Construct a list with a non-empty null slot - auto input = ArrayFromJSON(list(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); - auto tweaked = TweakValidityBit(input, 1, false); - auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); - CheckVectorUnary("list_parent_indices", tweaked, expected); -} - -TEST(TestVectorNested, ListParentIndicesChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = - ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); - - auto expected = ChunkedArrayFromJSON(int64(), {"[0, 0, 0]", "[2, 2, 4, 4]"}); - CheckVectorUnary("list_parent_indices", input, expected); - - input = ChunkedArrayFromJSON(ty, {}); - expected = ChunkedArrayFromJSON(int64(), {}); - CheckVectorUnary("list_parent_indices", input, expected); - } +TEST(TestVectorFixedSizeList, ListFlattenFixedSizeListRecursively) { + ListFlattenOptions opts; + opts.recursively = true; + + auto inner_type = fixed_size_list(int32(), 2); + auto type = fixed_size_list(inner_type, 2); + auto input = ArrayFromJSON(type, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])"); + auto expected = ArrayFromJSON(int32(), "[0, 1, null, 3, 7, null, 2, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); } -TEST(TestVectorNested, ListParentIndicesFixedSizeList) { +TEST(TestVectorFixedSizeList, ListParentIndicesFixedSizeList) { for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) { { auto input = ArrayFromJSON(ty, "[[0, null], null, [1, 2], [3, 4], [null, 5]]"); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a267d53599436..d0fc93fc9af5d 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2035,6 +2035,26 @@ class PairwiseOptions(_PairwiseOptions): self._set_options(period) +cdef class _ListFlattenOptions(FunctionOptions): + def _set_options(self, recursively): + self.wrapped.reset(new 
CListFlattenOptions(recursively)) + + +class ListFlattenOptions(_ListFlattenOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursively : bool, default False + When true, do list flatten recursively until an array of + non-list values is reached. + """ + + def __init__(self, recursively=False): + self._set_options(recursively) + + cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): self.wrapped.reset(new CArraySortOptions( diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 60fc09ea861b6..2c762cc2c6cf3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2141,22 +2141,99 @@ cdef class Decimal256Array(FixedSizeBinaryArray): cdef class BaseListArray(Array): - def flatten(self): + def flatten(self, recursively=False): """ - Unnest this ListArray/LargeListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. + Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursively'. Note that this method is different from ``self.values`` in that it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + recursively : bool, default False, optional + When true, flatten this logical list-array recursively until an + array of non-list values is reached. 
+ When false, flatten this logical list-array by one level + Returns ------- result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + If a logical list-array is nested with multiple levels, the array will + be flattened recursively until an array of non-list values is reached + if we enable recursively=True. + + >>> array = pa.array([ + None, + [ + [1, None, 2], + None, + [3, 4] + ], + [], + [ + [], + [5, 6], + None + ], + [ + [7, 8] + ] + ], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] """ - return _pc().list_flatten(self) + options = _pc().ListFlattenOptions(recursively) + return _pc().list_flatten(self, options) def value_parent_indices(self): """ @@ -2527,7 +2604,7 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a list view data type. """ @@ -2747,69 +2824,8 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this ListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. 
- - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a large list view data type. @@ -3037,67 +3053,6 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this LargeListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. 
- - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - cdef class MapArray(ListArray): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 205ab393b8b09..83612f66d21e2 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -44,6 +44,7 @@ IndexOptions, JoinOptions, ListSliceOptions, + ListFlattenOptions, MakeStructOptions, MapLookupOptions, MatchSubstringOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6dae45ab80b1c..180a0e116bb19 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2589,6 +2589,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CPairwiseOptions(int64_t period) int64_t period + cdef cppclass CListFlattenOptions\ + "arrow::compute::ListFlattenOptions"(CFunctionOptions): + CListFlattenOptions(bool recursively) + bool recursively + cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): CArraySortOptions(CSortOrder, CNullPlacement) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 156d58326b961..83e35d7b603af 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2704,7 +2704,8 @@ def test_invalid_tensor_construction(): @pytest.mark.parametrize(('offset_type', 
'list_type_factory'), - [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list), + (pa.int32(), pa.list_view), (pa.int64(), pa.large_list_view)]) def test_list_array_flatten(offset_type, list_type_factory): typ2 = list_type_factory( list_type_factory( @@ -2757,12 +2758,15 @@ def test_list_array_flatten(offset_type, list_type_factory): assert arr1.values.equals(arr0) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) @pytest.mark.parametrize('list_type', [ pa.list_(pa.int32()), pa.list_(pa.int32(), list_size=2), - pa.large_list(pa.int32())]) + pa.large_list(pa.int32()), + pa.list_view(pa.int32()), + pa.large_list_view(pa.int32())]) def test_list_value_parent_indices(list_type): arr = pa.array( [ @@ -2778,7 +2782,9 @@ def test_list_value_parent_indices(list_type): @pytest.mark.parametrize(('offset_type', 'list_type'), [(pa.int32(), pa.list_(pa.int32())), (pa.int32(), pa.list_(pa.int32(), list_size=2)), - (pa.int64(), pa.large_list(pa.int32()))]) + (pa.int64(), pa.large_list(pa.int32())), + (pa.int32(), pa.list_view(pa.int32())), + (pa.int64(), pa.large_list_view(pa.int32()))]) def test_list_value_lengths(offset_type, list_type): # FixedSizeListArray needs fixed list sizes @@ -2876,6 +2882,7 @@ def test_fixed_size_list_array_flatten(): assert arr0.type.equals(typ0) assert arr1.flatten().equals(arr0) assert arr2.flatten().flatten().equals(arr0) + assert arr2.flatten(True).equals(arr0) def test_fixed_size_list_array_flatten_with_slice(): @@ -3844,6 +3851,7 @@ def test_list_view_flatten(list_array_type, list_type_factory, offset_type): assert arr2.values.equals(arr1) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) # test out of order offsets values = [1, 2, 3, 4] diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 
98cbd920b509b..0453dbf34e637 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -152,6 +152,7 @@ def test_option_class_equality(): pc.IndexOptions(pa.scalar(1)), pc.JoinOptions(), pc.ListSliceOptions(0, -1, 1, True), + pc.ListFlattenOptions(recursively=False), pc.MakeStructOptions(["field", "names"], field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}),