Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls #35036

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {

Type::type type_id = value.type->id();

// Populate null count and validity bitmap (only for non-union/null types)
this->null_count = value.is_valid ? 0 : 1;
if (!is_union(type_id) && type_id != Type::NA) {
// Populate null count and validity bitmap
if (type_id == Type::NA) {
this->null_count = 1;
} else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
} else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
} else if (!HasValidityBitmap(type_id)) {

this->null_count = 0;
} else {
this->null_count = value.is_valid ? 0 : 1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have dictionary scalars?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do, but they have the same semantics as dictionary arrays: the top-level validity refers to the indices, regardless of the underlying dictionary values.

this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit;
this->buffers[0].size = 1;
}
Expand Down Expand Up @@ -422,6 +426,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
this->child_data[i].FillFromScalar(*scalar.value[i]);
}
}
} else if (type_id == Type::RUN_END_ENCODED) {
const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
this->child_data.resize(2);
auto& run_end_type = scalar.run_end_type();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: style

Suggested change
auto& run_end_type = scalar.run_end_type();
const auto& run_end_type = scalar.run_end_type();

auto run_end = MakeScalar(run_end_type, 1).ValueOrDie();
this->child_data[0].FillFromScalar(*run_end);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I am setting a child ArraySpan for the run ends from a newly created scalar. Similarly to SetOffsetsForScalar for list-like types, I should be creating this child array using the scratch space?
(the main difference is that here I need to create a child ArraySpan from the scratch space, instead of just setting the scratch space to some value and assigning it to one of the span's buffers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I pushed an attempt to fix this, it feels a bit complicated though, see the last commit

Copy link
Contributor

@felipecrv felipecrv May 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it feels a bit complicated though

It's faster than going through the switch of MakeScalar, so let's go with your new solution.

this->child_data[1].FillFromScalar(*scalar.value);
} else if (type_id == Type::EXTENSION) {
// Pass through storage
const auto& scalar = checked_cast<const ExtensionScalar&>(value);
Expand Down
78 changes: 78 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/ree_util.h"

namespace arrow {

Expand Down Expand Up @@ -82,6 +84,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
}
}

static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto* sparse_union_type =
::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);

const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = sparse_union_type->child_ids()[types[i]];
if (span.child_data[child_id].IsNull(i + span.offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}

static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto* dense_union_type =
::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);

const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
const int32_t* offsets = span.GetValues<int32_t>(2); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = dense_union_type->child_ids()[types[i]];
const int32_t offset = offsets[i];
if (span.child_data[child_id].IsNull(offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}

template <typename RunEndCType>
void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
DCHECK(!is_nested(span.type->id()));
const auto& values = arrow::ree_util::ValuesArray(span);
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

if (!values_bitmap) {
return;
}

arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
auto end = ree_span.end();
for (auto it = ree_span.begin(); it != end; ++it) {
if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(),
true);
}
}
Comment on lines +130 to +137
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking my understanding here. If the function IsNull is given an array of type REE<X> then the return type is bool (and not REE<bool>) and so here we are exploding the (smaller) bitmap of the values child into a larger output bitmap?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. For each run, a bunch of bits is set to 1, when the single bit at values.offset + it.index_into_array() is 0.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the function IsNull is given an array of type REE<X> then the return type is bool (and not REE<bool>)

We could actually also have a function that preserves the REE structure, but currently the "is_null" kernel only has signatures of "any -> bool" (and in general I think people will expect a bool result, and as long as REE isn't really considered as a bool type (eg in filters), we should certainly return plain bool here, I think)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have an open PR implementing REE-based filters which can help make this possible.

But note that each run in a REE<Boolean> costs sizeof(RunEndCType) + 1 bit of memory. We need boolean runs of lengths higher than 17, 33, 65, for REE<Boolean> to start being better than the bitmap encoding.

}

void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto type_id = arrow::ree_util::RunEndsArray(span).type->id();
if (type_id == Type::INT16) {
SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset);
} else if (type_id == Type::INT32) {
SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset);
} else {
DCHECK_EQ(type_id, Type::INT64);
SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset);
}
}

Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& arr = batch[0].array;
ArraySpan* out_span = out->array_span_mutable();
Expand All @@ -100,6 +168,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
} else {
// Input has no nulls => output is entirely false.
bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
// Except for union/ree which never has physical nulls, but can have logical
// nulls from the child arrays -> set those bits to true
const auto t = arr.type->id();
if (t == Type::SPARSE_UNION) {
SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::DENSE_UNION) {
SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::RUN_END_ENCODED) {
SetREELogicalNullBits(arr, out_bitmap, out_span->offset);
}
}

if (is_floating(arr.type->id()) && options.nan_is_null) {
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
"[true, false, false, true]", &nan_is_null_options);
}

TEST_F(TestBooleanValidityKernels, IsNullUnion) {
auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
ASSERT_OK_AND_ASSIGN(auto arr1,
SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
CheckScalarUnary("is_null", arr1, expected);

auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
ASSERT_OK_AND_ASSIGN(auto arr2,
DenseUnionArray::Make(*type_ids, *value_offsets,
{dense_field_i64, dense_field_str}));
CheckScalarUnary("is_null", arr2, expected);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add floating-point child union values and test with/without the nan_is_null option set?


TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
ASSERT_OK(ree_array->ValidateFull());
auto expected =
ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
CheckScalarUnary("is_null", ree_array, expected);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test with floating-point run end values and clearing/setting the nan_is_null option?


TEST(TestValidityKernels, IsFinite) {
for (const auto& ty : IntTypes()) {
CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")},
Expand Down
42 changes: 42 additions & 0 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1637,6 +1637,48 @@ def test_is_null():
assert result.equals(expected)


def test_is_null_union():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to add these tests on the Python side? You have similar tests in C++ already.

arr = pa.UnionArray.from_sparse(
pa.array([0, 1, 0, 0, 1], type=pa.int8()),
[
pa.array([0.0, 1.1, None, 3.3, 4.4]),
pa.array([True, None, False, True, False]),
]
)
assert arr.to_pylist() == [0.0, None, None, 3.3, False]
result = arr.is_null()
expected = pa.array([False, True, True, False, False])
assert result.equals(expected)
result = arr.slice(1, 2).is_null()
assert result.equals(expected.slice(1, 2))

arr = pa.UnionArray.from_dense(
pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()),
pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()),
[
pa.array([0.0, 1.1, None, 3.3]),
pa.array([True, None, False])
]
)
assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
result = arr.is_null()
expected = pa.array([False, False, False, True, False, True, False])
assert result.equals(expected)
result = arr.slice(1, 3).is_null()
assert result.equals(expected.slice(1, 3))


@pytest.mark.parametrize("typ", ["int16", "int32", "int64"])
def test_is_null_run_end_encoded(typ):
decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
arr = pc.run_end_encode(decoded, run_end_type=typ)
result = arr.is_null()
expected = pa.array([False, False, False, True, False, False, True, True, False])
assert result.equals(expected)
result = arr.slice(2, 5).is_null()
assert result.equals(expected.slice(2, 5))


def test_is_nan():
arr = pa.array([1, 2, 3, None, np.nan])
result = arr.is_nan()
Expand Down