-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls #35036
base: main
Are you sure you want to change the base?
Changes from 10 commits
0ded622
6fa5eeb
d661a16
197005a
45a1cdb
f93ecae
621f024
0cb0d8b
07a9c58
0535a01
9a6e4b1
e836da6
3e086be
9ee9820
bd34e35
3f174b0
c89f43d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -304,9 +304,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) { | |||||
|
||||||
Type::type type_id = value.type->id(); | ||||||
|
||||||
// Populate null count and validity bitmap (only for non-union/null types) | ||||||
this->null_count = value.is_valid ? 0 : 1; | ||||||
if (!is_union(type_id) && type_id != Type::NA) { | ||||||
// Populate null count and validity bitmap | ||||||
if (type_id == Type::NA) { | ||||||
this->null_count = 1; | ||||||
} else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) { | ||||||
this->null_count = 0; | ||||||
} else { | ||||||
this->null_count = value.is_valid ? 0 : 1; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we have dictionary scalars? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We do, but they have the same semantics as dictionary arrays: the top-level validity refers to the indices, regardless of the underlying dictionary values. |
||||||
this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit; | ||||||
this->buffers[0].size = 1; | ||||||
} | ||||||
|
@@ -422,6 +426,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) { | |||||
this->child_data[i].FillFromScalar(*scalar.value[i]); | ||||||
} | ||||||
} | ||||||
} else if (type_id == Type::RUN_END_ENCODED) { | ||||||
const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value); | ||||||
this->child_data.resize(2); | ||||||
auto& run_end_type = scalar.run_end_type(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: style
Suggested change
|
||||||
auto run_end = MakeScalar(run_end_type, 1).ValueOrDie(); | ||||||
this->child_data[0].FillFromScalar(*run_end); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I am setting a child ArraySpan for the run ends from a newly created scalar. Similarly to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I pushed an attempt to fix this, it feels a bit complicated though, see the last commit There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's faster than going through the switch of |
||||||
this->child_data[1].FillFromScalar(*scalar.value); | ||||||
} else if (type_id == Type::EXTENSION) { | ||||||
// Pass through storage | ||||||
const auto& scalar = checked_cast<const ExtensionScalar&>(value); | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,8 @@ | |
|
||
#include "arrow/util/bit_util.h" | ||
#include "arrow/util/bitmap_ops.h" | ||
#include "arrow/util/checked_cast.h" | ||
#include "arrow/util/ree_util.h" | ||
|
||
namespace arrow { | ||
|
||
|
@@ -82,6 +84,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of | |
} | ||
} | ||
|
||
static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, | ||
int64_t out_offset) { | ||
const auto* sparse_union_type = | ||
::arrow::internal::checked_cast<const SparseUnionType*>(span.type); | ||
DCHECK_LE(span.child_data.size(), 128); | ||
|
||
const int8_t* types = span.GetValues<int8_t>(1); // NOLINT | ||
for (int64_t i = 0; i < span.length; i++) { | ||
const int8_t child_id = sparse_union_type->child_ids()[types[i]]; | ||
if (span.child_data[child_id].IsNull(i + span.offset)) { | ||
bit_util::SetBit(out_bitmap, i + out_offset); | ||
} | ||
} | ||
} | ||
|
||
static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, | ||
int64_t out_offset) { | ||
const auto* dense_union_type = | ||
::arrow::internal::checked_cast<const DenseUnionType*>(span.type); | ||
DCHECK_LE(span.child_data.size(), 128); | ||
|
||
const int8_t* types = span.GetValues<int8_t>(1); // NOLINT | ||
const int32_t* offsets = span.GetValues<int32_t>(2); // NOLINT | ||
for (int64_t i = 0; i < span.length; i++) { | ||
const int8_t child_id = dense_union_type->child_ids()[types[i]]; | ||
const int32_t offset = offsets[i]; | ||
if (span.child_data[child_id].IsNull(offset)) { | ||
bit_util::SetBit(out_bitmap, i + out_offset); | ||
} | ||
} | ||
} | ||
|
||
template <typename RunEndCType> | ||
void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, | ||
int64_t out_offset) { | ||
DCHECK(!is_nested(span.type->id())); | ||
const auto& values = arrow::ree_util::ValuesArray(span); | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR; | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if (!values_bitmap) { | ||
return; | ||
} | ||
|
||
arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span); | ||
auto end = ree_span.end(); | ||
for (auto it = ree_span.begin(); it != end; ++it) { | ||
if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) { | ||
bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(), | ||
true); | ||
} | ||
} | ||
Comment on lines
+130
to
+137
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just checking my understanding here. If the function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. For each run, a bunch of bits is set to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We could actually also have a function that preserves the REE structure, but currently the "is_null" kernel only has signatures of "any -> bool" (and in general I think people will expect a bool result, and as long as REE isn't really considered as a bool type (eg in filters), we should certainly return plain bool here, I think) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have an open PR implementing REE-based filters which can help make this possible. But note that each run in a |
||
} | ||
|
||
void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, | ||
int64_t out_offset) { | ||
const auto type_id = arrow::ree_util::RunEndsArray(span).type->id(); | ||
if (type_id == Type::INT16) { | ||
SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset); | ||
} else if (type_id == Type::INT32) { | ||
SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset); | ||
} else { | ||
DCHECK_EQ(type_id, Type::INT64); | ||
SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset); | ||
} | ||
} | ||
|
||
Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { | ||
const ArraySpan& arr = batch[0].array; | ||
ArraySpan* out_span = out->array_span_mutable(); | ||
|
@@ -100,6 +168,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { | |
} else { | ||
// Input has no nulls => output is entirely false. | ||
bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false); | ||
// Except for union/ree which never has physical nulls, but can have logical | ||
// nulls from the child arrays -> set those bits to true | ||
const auto t = arr.type->id(); | ||
if (t == Type::SPARSE_UNION) { | ||
SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); | ||
} else if (t == Type::DENSE_UNION) { | ||
SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); | ||
} else if (t == Type::RUN_END_ENCODED) { | ||
SetREELogicalNullBits(arr, out_bitmap, out_span->offset); | ||
} | ||
} | ||
|
||
if (is_floating(arr.type->id()) && options.nan_is_null) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) { | |
"[true, false, false, true]", &nan_is_null_options); | ||
} | ||
|
||
TEST_F(TestBooleanValidityKernels, IsNullUnion) { | ||
auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]"); | ||
auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])"); | ||
auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])"); | ||
ASSERT_OK_AND_ASSIGN(auto arr1, | ||
SparseUnionArray::Make(*type_ids, {field_i64, field_str})); | ||
auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]"); | ||
CheckScalarUnary("is_null", arr1, expected); | ||
|
||
auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]"); | ||
auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])"); | ||
auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])"); | ||
ASSERT_OK_AND_ASSIGN(auto arr2, | ||
DenseUnionArray::Make(*type_ids, *value_offsets, | ||
{dense_field_i64, dense_field_str})); | ||
CheckScalarUnary("is_null", arr2, expected); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add floating-point child union values and test with/without the |
||
|
||
TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) { | ||
auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])"); | ||
auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])"); | ||
ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values)); | ||
ASSERT_OK(ree_array->ValidateFull()); | ||
auto expected = | ||
ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]"); | ||
CheckScalarUnary("is_null", ree_array, expected); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a test with floating-point run end values and clearing/setting the |
||
|
||
TEST(TestValidityKernels, IsFinite) { | ||
for (const auto& ty : IntTypes()) { | ||
CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")}, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1637,6 +1637,48 @@ def test_is_null(): | |
assert result.equals(expected) | ||
|
||
|
||
def test_is_null_union(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it necessary to add these tests on the Python side? You have similar tests in C++ already. |
||
arr = pa.UnionArray.from_sparse( | ||
pa.array([0, 1, 0, 0, 1], type=pa.int8()), | ||
[ | ||
pa.array([0.0, 1.1, None, 3.3, 4.4]), | ||
pa.array([True, None, False, True, False]), | ||
] | ||
) | ||
assert arr.to_pylist() == [0.0, None, None, 3.3, False] | ||
result = arr.is_null() | ||
expected = pa.array([False, True, True, False, False]) | ||
assert result.equals(expected) | ||
result = arr.slice(1, 2).is_null() | ||
assert result.equals(expected.slice(1, 2)) | ||
|
||
arr = pa.UnionArray.from_dense( | ||
pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()), | ||
pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()), | ||
[ | ||
pa.array([0.0, 1.1, None, 3.3]), | ||
pa.array([True, None, False]) | ||
] | ||
) | ||
assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False] | ||
result = arr.is_null() | ||
expected = pa.array([False, False, False, True, False, True, False]) | ||
assert result.equals(expected) | ||
result = arr.slice(1, 3).is_null() | ||
assert result.equals(expected.slice(1, 3)) | ||
|
||
|
||
@pytest.mark.parametrize("typ", ["int16", "int32", "int64"]) | ||
def test_is_null_run_end_encoded(typ): | ||
decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1]) | ||
arr = pc.run_end_encode(decoded, run_end_type=typ) | ||
result = arr.is_null() | ||
expected = pa.array([False, False, False, True, False, False, True, True, False]) | ||
assert result.equals(expected) | ||
result = arr.slice(2, 5).is_null() | ||
assert result.equals(expected.slice(2, 5)) | ||
|
||
|
||
def test_is_nan(): | ||
arr = pa.array([1, 2, 3, None, np.nan]) | ||
result = arr.is_nan() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.