GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls #35036

Open
wants to merge 17 commits into main
38 changes: 35 additions & 3 deletions cpp/src/arrow/array/data.cc
@@ -236,6 +236,18 @@ void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_siz
span->buffers[buffer_index].size = 2 * sizeof(offset_type);
}

template <typename RunEndType>
void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) {
Member:

Are these changes related to this PR?

Member:

If so, can you perhaps add a test for FillFromScalar on a run-end-encoded scalar? For example in arrow/array/array_run_end_test.cc.

Member Author:

Yes, the tests for scalar kernels automatically run the kernel on actual scalars as well, and that ends up creating an array from the scalar. So these changes to FillFromScalar are needed to be able to run any kernel on a REE scalar.

At the moment we don't have any direct test for FillFromScalar (not for other types either); it is only exercised indirectly by testing the kernels on scalars.
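
For illustration, a minimal Python-level sketch of the code path being discussed (an assumption-laden sketch, not part of this PR): indexing a run-end-encoded array is assumed to yield a RunEndEncodedScalar, and running a scalar kernel on that value is what ends up going through ArraySpan::FillFromScalar.

```python
import pyarrow as pa
import pyarrow.compute as pc

decoded = pa.array([1, 1, None, 2])
ree = pc.run_end_encode(decoded)

scalar = ree[2]            # assumed: yields a RunEndEncodedScalar
# Running a scalar kernel on this value internally expands the scalar
# back into a one-element ArraySpan (ArraySpan::FillFromScalar).
print(pc.is_null(scalar))  # expected: true, since the decoded element is null
```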

using RunEndCType = typename RunEndType::c_type;
auto buffer = reinterpret_cast<RunEndCType*>(span->scratch_space);
buffer[0] = static_cast<RunEndCType>(1);
span->type = run_end_type;
span->length = 1;
span->null_count = 0;
span->buffers[1].data = reinterpret_cast<uint8_t*>(buffer);
span->buffers[1].size = sizeof(RunEndCType);
}

int GetNumBuffers(const DataType& type) {
switch (type.id()) {
case Type::NA:
@@ -304,9 +316,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {

Type::type type_id = value.type->id();

// Populate null count and validity bitmap (only for non-union/null types)
this->null_count = value.is_valid ? 0 : 1;
if (!is_union(type_id) && type_id != Type::NA) {
// Populate null count and validity bitmap
if (type_id == Type::NA) {
this->null_count = 1;
} else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
Member:

Suggested change
} else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
} else if (!HasValidityBitmap(type_id)) {

this->null_count = 0;
} else {
this->null_count = value.is_valid ? 0 : 1;
Member:

Do we have dictionary scalars?

Member:

We do, but they have the same semantics as dictionary arrays: the top-level validity refers to the indices, regardless of the underlying dictionary values.
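
To make the dictionary semantics concrete, a small illustrative pyarrow sketch (not part of this PR): a null index is reported by is_null, while an index that points at a null dictionary value is not.

```python
import pyarrow as pa
import pyarrow.compute as pc

indices = pa.array([0, 1, None], type=pa.int8())
dictionary = pa.array(["a", None])
arr = pa.DictionaryArray.from_arrays(indices, dictionary)

# Element 1 points at a null dictionary value, element 2 has a null index;
# only the null index counts as a top-level null.
print(pc.is_null(arr))  # -> [false, false, true]
```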

this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit;
this->buffers[0].size = 1;
}
@@ -422,6 +438,22 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
this->child_data[i].FillFromScalar(*scalar.value[i]);
}
}
} else if (type_id == Type::RUN_END_ENCODED) {
const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
this->child_data.resize(2);
auto& run_end_type = scalar.run_end_type();
Member:

Nit: style

Suggested change
auto& run_end_type = scalar.run_end_type();
const auto& run_end_type = scalar.run_end_type();

switch (run_end_type->id()) {
case Type::INT16:
FillRunEndsArrayForScalar<Int16Type>(&this->child_data[0], run_end_type.get());
break;
case Type::INT32:
FillRunEndsArrayForScalar<Int32Type>(&this->child_data[0], run_end_type.get());
break;
default:
DCHECK_EQ(run_end_type->id(), Type::INT64);
FillRunEndsArrayForScalar<Int64Type>(&this->child_data[0], run_end_type.get());
}
this->child_data[1].FillFromScalar(*scalar.value);
} else if (type_id == Type::EXTENSION) {
// Pass through storage
const auto& scalar = checked_cast<const ExtensionScalar&>(value);
78 changes: 78 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -22,6 +22,8 @@

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/ree_util.h"

namespace arrow {

@@ -82,6 +84,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
}
}

static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto* sparse_union_type =
::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);

const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = sparse_union_type->child_ids()[types[i]];
if (span.child_data[child_id].IsNull(i + span.offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}

static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto* dense_union_type =
::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);

const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
const int32_t* offsets = span.GetValues<int32_t>(2); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = dense_union_type->child_ids()[types[i]];
const int32_t offset = offsets[i];
if (span.child_data[child_id].IsNull(offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}

template <typename RunEndCType>
void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto& values = arrow::ree_util::ValuesArray(span);
DCHECK(!is_nested(values.type->id()));
Member:

AFAIK, it is not forbidden to have nested REE values. So this should be turned into an error return (perhaps Status::NotImplemented) at a higher level.

However, instead of refusing to implement this, you could use another strategy:

  • call IsNull on the values array
  • REE-decode the REE array comprised of (run ends, run values IsNull)

Contributor:

I've been meaning to send a PR forbidding nested REEs because it's pointless to run-end encode a values array: all the runs already have length 1, and the run-end array contains a strictly increasing sequence of indexes.

Member:

@felipecrv I think you're talking about something else? "Nested" here means any type with child fields (e.g. list, struct...).

Member Author:

> you could use another strategy:

That would indeed be a good alternative and would be more robust for whatever type is used for the REE values. I did a quick benchmark comparing both strategies in Python:

In [2]: run_lengths = np.random.randint(1, 10, 100_000)

In [3]: run_values = [1, 2, 3, 4, None] * 20000

In [4]: arr = pa.RunEndEncodedArray.from_arrays(run_lengths.cumsum(), run_values)

In [5]: res1 = pc.is_null(arr)

In [6]: res2 = pc.run_end_decode(pa.RunEndEncodedArray.from_arrays(np.asarray(arr.run_ends), pc.is_null(arr.values)))

In [7]: res1.equals(res2)
Out[7]: True

In [8]: %timeit pc.is_null(arr)
309 µs ± 843 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In [9]: %timeit pc.run_end_decode(pa.RunEndEncodedArray.from_arrays(np.asarray(arr.run_ends), pc.is_null(arr.values)))
1.07 ms ± 17.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

This is running with this branch (in release mode), so pc.is_null is using this PR's implementation, and the other is the Python equivalent of what you propose (IIUC).
The alternative seems significantly slower, although I don't know how much of that is due to the overhead of going through Python several times.

Member:

No idea, but the proposal here is to make things correct first ;-) It's also possible that run-end-decoding could be optimized...

Member Author:

Based on a quick profile, a significant part of the time is spent in the actual RunEndDecode implementation, indicating it's not just Python overhead. But it's also true that this implementation is not necessarily optimized ;)

Now maybe the DCHECK(!is_nested(values.type->id())); isn't actually correct. What it is protecting against is that I take the first buffer of the data and assume it is a validity bitmap. But that assumption only breaks for REE/union, not for "nested" types in general.

Although for supporting nan_is_null, it will still be easier to do that by recursively calling the is_null kernel. Will take a look at doing it that way.

Contributor:

@pitrou I misunderstood "nested REE" as a REE inside another REE.

const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;

if (!values_bitmap) {
return;
}

arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
auto end = ree_span.end();
for (auto it = ree_span.begin(); it != end; ++it) {
if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(),
true);
}
}
Comment on lines +130 to +137
Member:

Just checking my understanding here. If the function IsNull is given an array of type REE<X> then the return type is bool (and not REE<bool>) and so here we are exploding the (smaller) bitmap of the values child into a larger output bitmap?

Contributor:

Yes. For each run, a range of output bits is set to 1 when the single bit at values.offset + it.index_into_array() is 0.

Member Author:

> If the function IsNull is given an array of type REE<X> then the return type is bool (and not REE<bool>)

We could actually also have a function that preserves the REE structure, but currently the "is_null" kernel only has "any -> bool" signatures. In general I think people will expect a plain boolean result, and as long as REE<bool> isn't really treated as a boolean type (e.g. in filters), we should certainly return plain bool here, I think.
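
As a concrete illustration of that behaviour (mirroring the Python test added in this PR): is_null on an REE input produces a plain boolean array at the logical (decoded) length, with each run's bit repeated.

```python
import pyarrow as pa
import pyarrow.compute as pc

decoded = pa.array([1, 1, None, None, None, 2])
ree = pc.run_end_encode(decoded)   # 3 runs: 1, null, 2
result = pc.is_null(ree)

print(len(result))  # 6: the logical length, not the number of runs
print(result)       # -> [false, false, true, true, true, false]
```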

Contributor:

I have an open PR implementing REE-based filters which can help make this possible.

But note that each run in a REE<Boolean> costs 8 * sizeof(RunEndCType) + 1 bits of memory. Boolean runs need to be longer than 17, 33, or 65 elements (for int16, int32 and int64 run ends respectively) before REE<Boolean> starts being better than the plain bitmap encoding.
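
A quick back-of-the-envelope check of that break-even point (a sketch assuming one run-end integer plus one value bit per run, versus one bit per logical element in a plain bitmap):

```python
# Per-run cost of REE<bool>: one run-end integer plus one boolean value bit.
# A plain bitmap costs 1 bit per logical element, so a run only pays off
# once its length exceeds the per-run cost expressed in bits.
for run_end_bits in (16, 32, 64):
    per_run_bits = run_end_bits + 1
    print(f"int{run_end_bits} run ends: runs must be longer than "
          f"{per_run_bits} elements to beat the bitmap")
```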

}

void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto type_id = arrow::ree_util::RunEndsArray(span).type->id();
if (type_id == Type::INT16) {
SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset);
} else if (type_id == Type::INT32) {
SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset);
} else {
DCHECK_EQ(type_id, Type::INT64);
SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset);
}
}

Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& arr = batch[0].array;
ArraySpan* out_span = out->array_span_mutable();
@@ -100,6 +168,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
} else {
// Input has no nulls => output is entirely false.
bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
// Except for union/ree which never has physical nulls, but can have logical
// nulls from the child arrays -> set those bits to true
const auto t = arr.type->id();
if (t == Type::SPARSE_UNION) {
SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::DENSE_UNION) {
SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::RUN_END_ENCODED) {
SetREELogicalNullBits(arr, out_bitmap, out_span->offset);
}
}

if (is_floating(arr.type->id()) && options.nan_is_null) {
28 changes: 28 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity_test.cc
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
"[true, false, false, true]", &nan_is_null_options);
}

TEST_F(TestBooleanValidityKernels, IsNullUnion) {
auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
ASSERT_OK_AND_ASSIGN(auto arr1,
SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
CheckScalarUnary("is_null", arr1, expected);

auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
ASSERT_OK_AND_ASSIGN(auto arr2,
DenseUnionArray::Make(*type_ids, *value_offsets,
{dense_field_i64, dense_field_str}));
CheckScalarUnary("is_null", arr2, expected);
}
Member:

Can you add floating-point child union values and test with/without the nan_is_null option set?


TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
ASSERT_OK(ree_array->ValidateFull());
auto expected =
ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
CheckScalarUnary("is_null", ree_array, expected);
}
Member:

Can you add a test with floating-point run end values and clearing/setting the nan_is_null option?


TEST(TestValidityKernels, IsFinite) {
for (const auto& ty : IntTypes()) {
CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")},
42 changes: 42 additions & 0 deletions python/pyarrow/tests/test_compute.py
@@ -1637,6 +1637,48 @@ def test_is_null():
assert result.equals(expected)


def test_is_null_union():
Member:

Is it necessary to add these tests on the Python side? You have similar tests in C++ already.

arr = pa.UnionArray.from_sparse(
pa.array([0, 1, 0, 0, 1], type=pa.int8()),
[
pa.array([0.0, 1.1, None, 3.3, 4.4]),
pa.array([True, None, False, True, False]),
]
)
assert arr.to_pylist() == [0.0, None, None, 3.3, False]
result = arr.is_null()
expected = pa.array([False, True, True, False, False])
assert result.equals(expected)
result = arr.slice(1, 2).is_null()
assert result.equals(expected.slice(1, 2))

arr = pa.UnionArray.from_dense(
pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()),
pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()),
[
pa.array([0.0, 1.1, None, 3.3]),
pa.array([True, None, False])
]
)
assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
result = arr.is_null()
expected = pa.array([False, False, False, True, False, True, False])
assert result.equals(expected)
result = arr.slice(1, 3).is_null()
assert result.equals(expected.slice(1, 3))


@pytest.mark.parametrize("typ", ["int16", "int32", "int64"])
def test_is_null_run_end_encoded(typ):
decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
arr = pc.run_end_encode(decoded, run_end_type=typ)
result = arr.is_null()
expected = pa.array([False, False, False, True, False, False, True, True, False])
assert result.equals(expected)
result = arr.slice(2, 5).is_null()
assert result.equals(expected.slice(2, 5))


def test_is_nan():
arr = pa.array([1, 2, 3, None, np.nan])
result = arr.is_nan()