GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls #35036
base: main
@@ -236,6 +236,18 @@ void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_siz
   span->buffers[buffer_index].size = 2 * sizeof(offset_type);
 }
 
+template <typename RunEndType>
+void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) {
+  using RunEndCType = typename RunEndType::c_type;
+  auto buffer = reinterpret_cast<RunEndCType*>(span->scratch_space);
+  buffer[0] = static_cast<RunEndCType>(1);
+  span->type = run_end_type;
+  span->length = 1;
+  span->null_count = 0;
+  span->buffers[1].data = reinterpret_cast<uint8_t*>(buffer);
+  span->buffers[1].size = sizeof(RunEndCType);
+}
+
 int GetNumBuffers(const DataType& type) {
   switch (type.id()) {
     case Type::NA:
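Aside (not part of the diff): the scratch-space buffer filled above is what lets a run-end-encoded scalar be presented as a length-1 REE array whose run-ends child is just `[1]`. A rough equivalent using the public Array API would look like the sketch below; the helper name `ReeArrayFromScalar` is hypothetical, int32 run ends are chosen for simplicity, and `ArrayFromJSON` is the Arrow testing utility also used in the tests further down.

```cpp
#include "arrow/api.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON (test-only utility)

// Sketch only: build the length-1 REE array that FillFromScalar effectively
// describes in ArraySpan form for a run-end-encoded scalar.
arrow::Result<std::shared_ptr<arrow::Array>> ReeArrayFromScalar(
    const std::shared_ptr<arrow::Scalar>& value) {
  // A single run of length 1, so the run-ends child is just [1].
  auto run_ends = arrow::ArrayFromJSON(arrow::int32(), "[1]");
  // Broadcast the scalar to a length-1 values child.
  ARROW_ASSIGN_OR_RAISE(auto values, arrow::MakeArrayFromScalar(*value, 1));
  ARROW_ASSIGN_OR_RAISE(auto ree,
                        arrow::RunEndEncodedArray::Make(1, run_ends, values));
  return ree;
}
```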
@@ -304,9 +316,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
 
   Type::type type_id = value.type->id();
 
-  // Populate null count and validity bitmap (only for non-union/null types)
-  this->null_count = value.is_valid ? 0 : 1;
-  if (!is_union(type_id) && type_id != Type::NA) {
+  // Populate null count and validity bitmap
+  if (type_id == Type::NA) {
+    this->null_count = 1;
+  } else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
+    this->null_count = 0;
+  } else {
+    this->null_count = value.is_valid ? 0 : 1;
     this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit;
     this->buffers[0].size = 1;
   }

Review comment: Do we have dictionary scalars?

Reply: We do, but they have the same semantics as dictionary arrays: the top-level validity refers to the indices, regardless of the underlying dictionary values.
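To illustrate the dictionary semantics from the reply above, a sketch (not part of the diff; `DictionaryArray::FromArrays`, `ArrayFromJSON`, and `CallFunction` are existing APIs, the values are illustrative): only the top-level index validity feeds `is_null`, even when an index points at a null dictionary value.

```cpp
#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON

// Sketch: dictionary validity comes from the indices, not the dictionary values.
arrow::Result<arrow::Datum> DictionaryIsNullExample() {
  auto indices = arrow::ArrayFromJSON(arrow::int32(), "[0, 1, null]");
  auto dictionary = arrow::ArrayFromJSON(arrow::utf8(), R"(["a", null])");
  ARROW_ASSIGN_OR_RAISE(auto dict_arr,
                        arrow::DictionaryArray::FromArrays(indices, dictionary));
  // Element 1 points at a null dictionary value but is still valid at the top
  // level, so the expected result is [false, false, true].
  return arrow::compute::CallFunction("is_null", {dict_arr});
}
```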
@@ -422,6 +438,22 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
         this->child_data[i].FillFromScalar(*scalar.value[i]);
       }
     }
+  } else if (type_id == Type::RUN_END_ENCODED) {
+    const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
+    this->child_data.resize(2);
+    auto& run_end_type = scalar.run_end_type();
+    switch (run_end_type->id()) {
+      case Type::INT16:
+        FillRunEndsArrayForScalar<Int16Type>(&this->child_data[0], run_end_type.get());
+        break;
+      case Type::INT32:
+        FillRunEndsArrayForScalar<Int32Type>(&this->child_data[0], run_end_type.get());
+        break;
+      default:
+        DCHECK_EQ(run_end_type->id(), Type::INT64);
+        FillRunEndsArrayForScalar<Int64Type>(&this->child_data[0], run_end_type.get());
+    }
+    this->child_data[1].FillFromScalar(*scalar.value);
   } else if (type_id == Type::EXTENSION) {
     // Pass through storage
     const auto& scalar = checked_cast<const ExtensionScalar&>(value);

Review comment on the `run_end_type` line: Nit: style.
@@ -22,6 +22,8 @@
 
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/ree_util.h"
 
 namespace arrow {
 
@@ -82,6 +84,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
   }
 }
 
+static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                          int64_t out_offset) {
+  const auto* sparse_union_type =
+      ::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = sparse_union_type->child_ids()[types[i]];
+    if (span.child_data[child_id].IsNull(i + span.offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                         int64_t out_offset) {
+  const auto* dense_union_type =
+      ::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);      // NOLINT
+  const int32_t* offsets = span.GetValues<int32_t>(2);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = dense_union_type->child_ids()[types[i]];
+    const int32_t offset = offsets[i];
+    if (span.child_data[child_id].IsNull(offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+template <typename RunEndCType>
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto& values = arrow::ree_util::ValuesArray(span);
+  DCHECK(!is_nested(values.type->id()));

Review thread on the `DCHECK(!is_nested(...))` above:

Comment: AFAIK, it is not forbidden to have nested REE values, so this should be turned into an error return rather than a DCHECK. However, instead of refusing to implement this, you could use another strategy.

Reply: I've been meaning to send a PR forbidding nested REEs, because it's pointless to run-end encode a values array: all the runs already have length 1 and the run-ends array contains a strictly increasing sequence of indexes.

Reply: @felipecrv I think you're talking about something else? "Nested" here means any type with child fields (e.g. list, struct, ...).

Reply: That would indeed be a good alternative and would be more robust for whatever type is used for the REE values. I did a quick benchmark comparing both strategies in Python, running with this branch (in release mode).

Reply: No idea, but the proposal here is to make things correct first ;-) It's also possible that run-end-decoding could be optimized...

Reply: Based on a quick profile, a significant part of the time is spent in the actual RunEndDecode impl, indicating it's not just Python overhead. But it's also true that this implementation is not necessarily optimized ;)

Reply: @pitrou I misunderstood "nested REE" as a REE inside another REE.
+
+  const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;
+
+  if (!values_bitmap) {
+    return;
+  }
+
+  arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
+  auto end = ree_span.end();
+  for (auto it = ree_span.begin(); it != end; ++it) {
+    if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
+      bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset,
+                          it.run_length(), true);
+    }
+  }

Review thread on the run-end loop above (new lines 130 to 137):

Comment: Just checking my understanding here: for each run, this sets the corresponding output bits when the run's value is null?

Reply: Yes. For each run, a bunch of bits is set to true if the run's value is null.

Reply: We could actually also have a function that preserves the REE structure, but currently the "is_null" kernel only has signatures of "any -> bool" (and in general I think people will expect a bool result; as long as REE isn't really considered as a bool type, e.g. in filters, we should certainly return plain bool here, I think).

Reply: I have an open PR implementing REE-based filters which can help make this possible.
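As an aside, a concrete picture of the REE-preserving variant mentioned in the thread above (purely hypothetical; nothing in this PR produces it): for the input used in the C++ test further down (run ends `[2, 3, 5, 7]`, values `[1, 2, null, 3]`), such a result could keep the run structure and map each run's value to a boolean.

```cpp
#include "arrow/api.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON

// Hypothetical shape of an REE-preserving is_null result (not implemented here);
// the kernel in this PR returns a plain boolean array instead.
arrow::Result<std::shared_ptr<arrow::Array>> HypotheticalReeIsNull() {
  auto run_ends = arrow::ArrayFromJSON(arrow::int32(), "[2, 3, 5, 7]");
  auto per_run = arrow::ArrayFromJSON(arrow::boolean(), "[false, false, true, false]");
  ARROW_ASSIGN_OR_RAISE(auto ree, arrow::RunEndEncodedArray::Make(7, run_ends, per_run));
  return ree;  // ree<int32, bool>, same runs as the input
}
```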
+}
+
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto type_id = arrow::ree_util::RunEndsArray(span).type->id();
+  if (type_id == Type::INT16) {
+    SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset);
+  } else if (type_id == Type::INT32) {
+    SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset);
+  } else {
+    DCHECK_EQ(type_id, Type::INT64);
+    SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset);
+  }
+}
+
 Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   const ArraySpan& arr = batch[0].array;
   ArraySpan* out_span = out->array_span_mutable();
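For reference, a sketch of the decode-first strategy benchmarked in the review thread above, as I read it (the exact benchmark code is not shown on this page; `run_end_decode` and `is_null` are existing compute function names):

```cpp
#include "arrow/api.h"
#include "arrow/compute/api.h"

// Sketch: the alternative strategy, which materializes the decoded array and
// then runs the ordinary is_null kernel on it. The direct kernel added in this
// PR avoids the intermediate allocation.
arrow::Result<arrow::Datum> IsNullViaRunEndDecode(
    const std::shared_ptr<arrow::Array>& ree_array) {
  ARROW_ASSIGN_OR_RAISE(arrow::Datum decoded,
                        arrow::compute::CallFunction("run_end_decode", {ree_array}));
  return arrow::compute::CallFunction("is_null", {decoded});
}
```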
@@ -100,6 +168,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   } else {
     // Input has no nulls => output is entirely false.
     bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
+    // Except for union/REE, which never have physical nulls but can have logical
+    // nulls from the child arrays -> set those bits to true
+    const auto t = arr.type->id();
+    if (t == Type::SPARSE_UNION) {
+      SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
+    } else if (t == Type::DENSE_UNION) {
+      SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
+    } else if (t == Type::RUN_END_ENCODED) {
+      SetREELogicalNullBits(arr, out_bitmap, out_span->offset);
+    }
   }
 
   if (is_floating(arr.type->id()) && options.nan_is_null) {
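A quick usage sketch of the corrected dispatch above (an assumed example; the construction APIs are the same ones used in the tests below, and the expected output follows from the new union handling):

```cpp
#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON

// Sketch: is_null on a dense union now reports logical nulls from the children.
arrow::Result<arrow::Datum> DenseUnionIsNullExample() {
  auto type_ids = arrow::ArrayFromJSON(arrow::int8(), "[0, 1, 0]");
  auto offsets = arrow::ArrayFromJSON(arrow::int32(), "[0, 0, 1]");
  auto ints = arrow::ArrayFromJSON(arrow::int64(), "[1, null]");
  auto strs = arrow::ArrayFromJSON(arrow::utf8(), R"(["a"])");
  ARROW_ASSIGN_OR_RAISE(
      auto arr, arrow::DenseUnionArray::Make(*type_ids, *offsets, {ints, strs}));
  // Expected result: [false, false, true] (the third element selects a null child).
  return arrow::compute::CallFunction("is_null", {arr});
}
```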
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
                    "[true, false, false, true]", &nan_is_null_options);
 }
 
+TEST_F(TestBooleanValidityKernels, IsNullUnion) {
+  auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
+  auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
+  auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
+  ASSERT_OK_AND_ASSIGN(auto arr1,
+                       SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
+  auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
+  CheckScalarUnary("is_null", arr1, expected);
+
+  auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
+  auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
+  auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
+  ASSERT_OK_AND_ASSIGN(auto arr2,
+                       DenseUnionArray::Make(*type_ids, *value_offsets,
+                                             {dense_field_i64, dense_field_str}));
+  CheckScalarUnary("is_null", arr2, expected);
+}

Review comment: Can you add floating-point child union values and test with/without the `nan_is_null` option?

+TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
+  auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
+  auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
+  ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
+  ASSERT_OK(ree_array->ValidateFull());
+  auto expected =
+      ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
+  CheckScalarUnary("is_null", ree_array, expected);
+}

Review comment: Can you add a test with floating-point run values and clearing/setting the `nan_is_null` option?

 TEST(TestValidityKernels, IsFinite) {
   for (const auto& ty : IntTypes()) {
     CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")},
@@ -1637,6 +1637,48 @@ def test_is_null():
     assert result.equals(expected)
 
 
Review comment (on the new test_is_null_union below): Is it necessary to add these tests on the Python side? You have similar tests in C++ already.

+def test_is_null_union():
+    arr = pa.UnionArray.from_sparse(
+        pa.array([0, 1, 0, 0, 1], type=pa.int8()),
+        [
+            pa.array([0.0, 1.1, None, 3.3, 4.4]),
+            pa.array([True, None, False, True, False]),
+        ]
+    )
+    assert arr.to_pylist() == [0.0, None, None, 3.3, False]
+    result = arr.is_null()
+    expected = pa.array([False, True, True, False, False])
+    assert result.equals(expected)
+    result = arr.slice(1, 2).is_null()
+    assert result.equals(expected.slice(1, 2))
+
+    arr = pa.UnionArray.from_dense(
+        pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()),
+        pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()),
+        [
+            pa.array([0.0, 1.1, None, 3.3]),
+            pa.array([True, None, False])
+        ]
+    )
+    assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
+    result = arr.is_null()
+    expected = pa.array([False, False, False, True, False, True, False])
+    assert result.equals(expected)
+    result = arr.slice(1, 3).is_null()
+    assert result.equals(expected.slice(1, 3))
+
+
+@pytest.mark.parametrize("typ", ["int16", "int32", "int64"])
+def test_is_null_run_end_encoded(typ):
+    decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
+    arr = pc.run_end_encode(decoded, run_end_type=typ)
+    result = arr.is_null()
+    expected = pa.array([False, False, False, True, False, False, True, True, False])
+    assert result.equals(expected)
+    result = arr.slice(2, 5).is_null()
+    assert result.equals(expected.slice(2, 5))
+
+
 def test_is_nan():
     arr = pa.array([1, 2, 3, None, np.nan])
     result = arr.is_nan()

Review comment: Are these changes related to this PR? If so, can you perhaps add a test for FillFromScalar on a run-end-encoded scalar? For example in arrow/array/array_run_end_test.cc.

Reply: Yes, the tests for scalar kernels automatically run the kernel also on actual scalars, and then this ends up creating an array from the scalar. So those changes to FillFromScalar are needed to be able to run any kernel on a REE scalar. At the moment we don't actually have any test for FillFromScalar directly (also not for other types), only through testing the kernels on scalars.
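To make the last point concrete, here is a sketch of "running a kernel on a REE scalar", the path that exercises the FillFromScalar changes. It is an assumption-laden example: it presumes `Array::GetScalar` yields a `RunEndEncodedScalar` for REE arrays, and the specific values are illustrative.

```cpp
#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON

// Sketch: calling is_null on a run-end-encoded *scalar*; internally the kernel
// machinery turns the scalar into a length-1 ArraySpan via FillFromScalar.
arrow::Result<arrow::Datum> IsNullOnReeScalar() {
  auto run_ends = arrow::ArrayFromJSON(arrow::int32(), "[2, 3]");
  auto values = arrow::ArrayFromJSON(arrow::int64(), "[1, null]");
  ARROW_ASSIGN_OR_RAISE(auto ree, arrow::RunEndEncodedArray::Make(3, run_ends, values));
  ARROW_ASSIGN_OR_RAISE(auto scalar, ree->GetScalar(2));  // a RunEndEncodedScalar
  return arrow::compute::CallFunction("is_null", {scalar});  // expected: true
}
```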