Skip to content

Commit

Permalink
GH-42235: [C++] list_parent_indices: Add support for list-view types (#…
Browse files Browse the repository at this point in the history
…42236)

### Rationale for this change

Completing the support of list-views.

### What changes are included in this PR?

Implementation of the `list_parent_indices` function for `ListViewType` and `LargeListViewType`.

### Are these changes tested?

Yes, by new unit tests.

### Are there any user-facing changes?

More complete support of list-views.
* GitHub Issue: #42235

Lead-authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
felipecrv and pitrou authored Jun 26, 2024
1 parent 9dec272 commit 508bdaa
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 6 deletions.
67 changes: 66 additions & 1 deletion cpp/src/arrow/compute/kernels/vector_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,16 @@
#include "arrow/compute/api_vector.h"
#include "arrow/compute/kernels/common_internal.h"
#include "arrow/result.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/list_util.h"
#include "arrow/visit_type_inline.h"

namespace arrow {

using internal::CountSetBits;
using list_util::internal::RangeOfValuesUsed;

namespace compute {
namespace internal {
namespace {
Expand Down Expand Up @@ -76,6 +83,63 @@ struct ListParentIndicesArray {

Status Visit(const LargeListType& type) { return VisitList(type); }

template <typename Type, typename offset_type = typename Type::offset_type>
Status VisitListView(const Type&) {
ArraySpan list_view{*input};

const offset_type* offsets = list_view.GetValues<offset_type>(1);
const offset_type* sizes = list_view.GetValues<offset_type>(2);
int64_t values_offset;
int64_t values_length;
ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length),
RangeOfValuesUsed(list_view));

ARROW_ASSIGN_OR_RAISE(auto indices_validity,
AllocateEmptyBitmap(values_length, ctx->memory_pool()));
auto* out_indices_validity = indices_validity->mutable_data();
int64_t total_pop_count = 0;

ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t)));
auto* out_indices = indices->template mutable_data_as<int64_t>();
memset(out_indices, -1, values_length * sizeof(int64_t));

const auto* validity = list_view.GetValues<uint8_t>(0, 0);
RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
validity, list_view.offset, list_view.length,
[this, offsets, sizes, out_indices, out_indices_validity, values_offset,
&total_pop_count](int64_t run_start, int64_t run_length) {
for (int64_t i = run_start; i < run_start + run_length; ++i) {
auto validity_offset = offsets[i] - values_offset;
const int64_t pop_count =
CountSetBits(out_indices_validity, validity_offset, sizes[i]);
if (ARROW_PREDICT_FALSE(pop_count > 0)) {
return Status::Invalid(
"Function 'list_parent_indices' cannot produce parent indices for "
"values used by more than one list-view array element.");
}
bit_util::SetBitmap(out_indices_validity, validity_offset, sizes[i]);
total_pop_count += sizes[i];
for (auto j = static_cast<int64_t>(offsets[i]);
j < static_cast<int64_t>(offsets[i]) + sizes[i]; ++j) {
out_indices[j - values_offset] = i + base_output_offset;
}
}
return Status::OK();
}));

DCHECK_LE(total_pop_count, values_length);
const int64_t null_count = values_length - total_pop_count;
BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr,
std::move(indices)};
out = std::make_shared<ArrayData>(int64(), values_length, std::move(buffers),
null_count);
return Status::OK();
}

Status Visit(const ListViewType& type) { return VisitListView(type); }

Status Visit(const LargeListViewType& type) { return VisitListView(type); }

Status Visit(const FixedSizeListType& type) {
using offset_type = typename FixedSizeListType::offset_type;
const offset_type slot_length = type.list_size();
Expand Down Expand Up @@ -125,7 +189,7 @@ const FunctionDoc list_flatten_doc(

const FunctionDoc list_parent_indices_doc(
"Compute parent indices of nested list values",
("`lists` must have a list-like type.\n"
("`lists` must have a list-like or list-view type.\n"
"For each value in each list of `lists`, the top-level list index\n"
"is emitted."),
{"lists"});
Expand All @@ -147,6 +211,7 @@ class ListParentIndicesFunction : public MetaFunction {

int64_t base_output_offset = 0;
ArrayVector out_chunks;
out_chunks.reserve(input->num_chunks());
for (const auto& chunk : input->chunks()) {
ARROW_ASSIGN_OR_RAISE(auto out_chunk,
ListParentIndicesArray::Exec(&kernel_ctx, chunk->data(),
Expand Down
83 changes: 81 additions & 2 deletions cpp/src/arrow/compute/kernels/vector_nested_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,50 @@ TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) {
CheckVectorUnary("list_flatten", input, expected, &opts);
}

template <typename T = ListViewType, typename offset_type = typename T::offset_type>
void SwapListView(ArrayData* array, int64_t i, int64_t j) {
ASSERT_TRUE(is_list_view(array->type->id()));
ASSERT_EQ(array->type->id(), T::type_id);
ASSERT_LT(i, array->length);
ASSERT_LT(j, array->length);
auto* validity = array->GetMutableValues<uint8_t>(0);
if (validity) {
const bool is_valid_i = bit_util::GetBit(validity, array->offset + i);
const bool is_valid_j = bit_util::GetBit(validity, array->offset + j);
if (is_valid_i ^ is_valid_j) {
bit_util::SetBitTo(validity, array->offset + i, is_valid_j);
bit_util::SetBitTo(validity, array->offset + j, is_valid_i);
}
}
auto* offsets = array->GetMutableValues<offset_type>(1);
auto* sizes = array->GetMutableValues<offset_type>(2);
std::swap(offsets[i], offsets[j]);
std::swap(sizes[i], sizes[j]);
}

template <typename T = ListViewType, typename offset_type = typename T::offset_type>
void SetListView(ArrayData* array, int64_t i, offset_type offset, offset_type size) {
ASSERT_TRUE(is_list_view(array->type->id()));
ASSERT_EQ(array->type->id(), T::type_id);
ASSERT_LT(i, array->length);
auto* validity = array->GetMutableValues<uint8_t>(0);
if (validity) {
bit_util::SetBit(validity, array->offset + i);
}
auto* offsets = array->GetMutableValues<offset_type>(1);
auto* sizes = array->GetMutableValues<offset_type>(2);
offsets[i] = offset;
sizes[i] = size;
}

TEST(TestVectorNested, ListParentIndices) {
for (auto ty : {list(int16()), large_list(int16())}) {
const auto types = {
list(int16()),
large_list(int16()),
list_view(int16()),
large_list_view(int16()),
};
for (auto ty : types) {
auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");

auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]");
Expand All @@ -196,10 +238,47 @@ TEST(TestVectorNested, ListParentIndices) {
auto tweaked = TweakValidityBit(input, 1, false);
auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]");
CheckVectorUnary("list_parent_indices", tweaked, expected);

{
// Construct a list-view with a non-empty null slot
auto input =
ArrayFromJSON(list_view(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]");
auto tweaked = TweakValidityBit(input, 1, false);
auto expected = ArrayFromJSON(int64(), "[0, 0, 0, null, null, 2, 2, 4, 4]");
CheckVectorUnary("list_parent_indices", tweaked, expected);

// Swap some list-view entries
auto swapped = tweaked->data()->Copy();
SwapListView(swapped.get(), 0, 2);
SwapListView(swapped.get(), 1, 4);
AssertDatumsEqual(
swapped,
ArrayFromJSON(list_view(int16()), "[[2, 3], [4, 5], [0, null, 1], [], null]"),
/*verbose=*/true);
expected = ArrayFromJSON(int64(), "[2, 2, 2, null, null, 0, 0, 1, 1]");
CheckVectorUnary("list_parent_indices", swapped, expected);

// Make one view use values that are used by other list-views
SetListView(swapped.get(), 3, 1, 4);
AssertDatumsEqual(
swapped,
ArrayFromJSON(list_view(int16()),
"[[2, 3], [4, 5], [0, null, 1], [null, 1, 0, 0], null]"),
/*verbose=*/true);
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr("values used by more than one list-view"),
CallFunction("list_parent_indices", {input}));
}
}

TEST(TestVectorNested, ListParentIndicesChunkedArray) {
for (auto ty : {list(int16()), large_list(int16())}) {
const auto types = {
list(int16()),
large_list(int16()),
list_view(int16()),
large_list_view(int16()),
};
for (auto ty : types) {
auto input =
ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"});

Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ using internal::checked_cast;
constexpr Type::type NullType::type_id;
constexpr Type::type ListType::type_id;
constexpr Type::type LargeListType::type_id;
constexpr Type::type ListViewType::type_id;
constexpr Type::type LargeListViewType::type_id;

constexpr Type::type MapType::type_id;

Expand Down
9 changes: 6 additions & 3 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ of general type categories:

* "String-like": String, LargeString.

* "List-like": List, LargeList, sometimes also FixedSizeList.
* "List-like": List, LargeList, ListView, LargeListView, and sometimes also
FixedSizeList.

* "Nested": List-likes (including FixedSizeList), Struct, Union, and
related types like Map.
Expand Down Expand Up @@ -1813,8 +1814,10 @@ Structural transforms
list array are discarded.

* \(3) For each value in the list child array, the index at which it is found
in the list array is appended to the output. Nulls in the parent list array
are discarded.
in the list-like array is appended to the output. Indices of null lists in the
parent array might still be present in the output if they are non-empty null
lists. If the parent is a list-view, child array values that are not used by
any non-null list-view are null in the output.

* \(4) For each list element, compute the slice of that list element, then
return another list-like array of those slices. Can return either a
Expand Down

0 comments on commit 508bdaa

Please sign in to comment.