Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-34361: [C++] Fix the handling of logical nulls for types without bitmaps like Unions and Run-End Encoded #34408

Merged
merged 11 commits into from
Apr 6, 2023
Merged
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ set(ARROW_SRCS
util/time.cc
util/tracing.cc
util/trie.cc
util/union_util.cc
util/unreachable.cc
util/uri.cc
util/utf8.cc
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ class ExtensionArray;

int64_t Array::null_count() const { return data_->GetNullCount(); }

int64_t Array::ComputeLogicalNullCount() const {
return data_->ComputeLogicalNullCount();
}

namespace internal {

struct ScalarFromArraySlotImpl {
Expand Down Expand Up @@ -175,6 +179,10 @@ struct ScalarFromArraySlotImpl {
array_.length());
}

// Skip checking for nulls in RUN_END_ENCODED arrays to avoid potentially
// making two O(log n) searches for the physical index of the slot -- one
// here and another in Visit(const RunEndEncodedArray&) in case the values
// is not null.
if (array_.type()->id() != Type::RUN_END_ENCODED && array_.IsNull(index_)) {
auto null = MakeNullScalar(array_.type());
if (is_dictionary(array_.type()->id())) {
Expand Down
37 changes: 29 additions & 8 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,28 @@ class ARROW_EXPORT Array {
virtual ~Array() = default;

/// \brief Return true if value at index is null. Does not boundscheck
bool IsNull(int64_t i) const {
return null_bitmap_data_ != NULLPTR
? !bit_util::GetBit(null_bitmap_data_, i + data_->offset)
: data_->null_count == data_->length;
}
bool IsNull(int64_t i) const { return !IsValid(i); }

/// \brief Return true if value at index is valid (not null). Does not
/// boundscheck
bool IsValid(int64_t i) const {
return null_bitmap_data_ != NULLPTR
? bit_util::GetBit(null_bitmap_data_, i + data_->offset)
: data_->null_count != data_->length;
if (null_bitmap_data_ != NULLPTR) {
return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
}
// Dispatching with a few conditionals like this makes IsNull more
// efficient for how it is used in practice. Making IsNull virtual
// would add a vtable lookup to every call and prevent inlining +
// a potential inner-branch removal.
if (type_id() == Type::SPARSE_UNION) {
return !internal::IsNullSparseUnion(*data_, i);
}
if (type_id() == Type::DENSE_UNION) {
return !internal::IsNullDenseUnion(*data_, i);
}
if (type_id() == Type::RUN_END_ENCODED) {
return !internal::IsNullRunEndEncoded(*data_, i);
}
return data_->null_count != data_->length;
}

/// \brief Return a Scalar containing the value of this array at i
Expand All @@ -85,6 +95,17 @@ class ARROW_EXPORT Array {
/// function
int64_t null_count() const;

/// \brief Computes the logical null count for arrays of all types including
/// those that do not have a validity bitmap like union and run-end encoded
/// arrays
///
/// If the array has a validity bitmap, this function behaves the same as
/// null_count(). For types that have no validity bitmap, this function will
/// recompute the null count every time it is called.
///
/// \see GetNullCount
int64_t ComputeLogicalNullCount() const;

std::shared_ptr<DataType> type() const { return data_->type; }
Type::type type_id() const { return data_->type->id(); }

Expand Down
83 changes: 65 additions & 18 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_dict.h"
#include "arrow/array/builder_run_end.h"
#include "arrow/array/builder_time.h"
#include "arrow/array/data.h"
#include "arrow/array/util.h"
Expand Down Expand Up @@ -83,15 +84,48 @@ TEST_F(TestArray, TestNullCount) {
auto data = std::make_shared<Buffer>(nullptr, 0);
auto null_bitmap = std::make_shared<Buffer>(nullptr, 0);

std::unique_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10));
std::shared_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10));
ASSERT_EQ(10, arr->ComputeLogicalNullCount());
ASSERT_EQ(10, arr->null_count());
ASSERT_TRUE(arr->data()->MayHaveNulls());
ASSERT_TRUE(arr->data()->MayHaveLogicalNulls());

std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
std::shared_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
ASSERT_EQ(0, arr_no_nulls->ComputeLogicalNullCount());
ASSERT_EQ(0, arr_no_nulls->null_count());
ASSERT_FALSE(arr_no_nulls->data()->MayHaveNulls());
ASSERT_FALSE(arr_no_nulls->data()->MayHaveLogicalNulls());

std::unique_ptr<Int32Array> arr_default_null_count(
std::shared_ptr<Int32Array> arr_default_null_count(
new Int32Array(100, data, null_bitmap));
ASSERT_EQ(kUnknownNullCount, arr_default_null_count->data()->null_count);
ASSERT_TRUE(arr_default_null_count->data()->MayHaveNulls());
ASSERT_TRUE(arr_default_null_count->data()->MayHaveLogicalNulls());

RunEndEncodedBuilder ree_builder(pool_, std::make_shared<Int32Builder>(pool_),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have existing tests like this for sparse and dense union? (given we now have distinct paths for those)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mentioned you in PR comments pointing to code that tests IsNull/Valid for unions.

std::make_shared<Int32Builder>(pool_),
run_end_encoded(int32(), int32()));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(2), 2));
ASSERT_OK(ree_builder.AppendNull());
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(4), 3));
ASSERT_OK(ree_builder.AppendNulls(2));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(8), 5));
ASSERT_OK(ree_builder.AppendNulls(7));
ASSERT_OK_AND_ASSIGN(auto ree, ree_builder.Finish());

ASSERT_EQ(0, ree->null_count());
ASSERT_EQ(10, ree->ComputeLogicalNullCount());
ASSERT_FALSE(ree->data()->MayHaveNulls());
ASSERT_TRUE(ree->data()->MayHaveLogicalNulls());

ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(2), 2));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(4), 3));
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(8), 5));
ASSERT_OK_AND_ASSIGN(auto ree_no_nulls, ree_builder.Finish());
ASSERT_EQ(0, ree_no_nulls->null_count());
ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls());
}

TEST_F(TestArray, TestSlicePreservesAllNullCount) {
Expand Down Expand Up @@ -377,20 +411,23 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
ASSERT_EQ(array->length(), length);
if (is_union(type->id())) {
ASSERT_EQ(array->null_count(), 0);
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Testing unions here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

const auto& union_array = checked_cast<const UnionArray&>(*array);
for (int i = 0; i < union_array.num_fields(); ++i) {
ASSERT_EQ(union_array.field(i)->null_count(), union_array.field(i)->length());
}
} else if (type->id() == Type::RUN_END_ENCODED) {
ASSERT_EQ(array->null_count(), 0);
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
const auto& ree_array = checked_cast<const RunEndEncodedArray&>(*array);
ASSERT_EQ(ree_array.values()->null_count(), ree_array.values()->length());
} else {
ASSERT_EQ(array->null_count(), length);
for (int64_t i = 0; i < length; ++i) {
ASSERT_TRUE(array->IsNull(i));
ASSERT_FALSE(array->IsValid(i));
}
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
}
for (int64_t i = 0; i < length; ++i) {
ASSERT_TRUE(array->IsNull(i));
ASSERT_FALSE(array->IsValid(i));
}
Comment on lines +428 to 431
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@westonpace testing unions here.

}
}
Expand Down Expand Up @@ -482,35 +519,45 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr<Scalar>& scalar)
std::unique_ptr<arrow::ArrayBuilder> builder;
auto null_scalar = MakeNullScalar(scalar->type);
ASSERT_OK(MakeBuilderExactIndex(pool, scalar->type, &builder));
ASSERT_OK(builder->AppendScalar(*scalar));
ASSERT_OK(builder->AppendScalar(*scalar));
ASSERT_OK(builder->AppendScalar(*null_scalar));
ASSERT_OK(builder->AppendScalars({scalar, null_scalar}));
ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2));
ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2));
ASSERT_OK(builder->AppendScalar(*scalar)); // [0] = scalar
ASSERT_OK(builder->AppendScalar(*scalar)); // [1] = scalar
ASSERT_OK(builder->AppendScalar(*null_scalar)); // [2] = NULL
ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); // [3, 4] = {scalar, NULL}
ASSERT_OK(
builder->AppendScalar(*scalar, /*n_repeats=*/2)); // [5, 6] = {scalar, scalar}
ASSERT_OK(
builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); // [7, 8] = {NULL, NULL}

std::shared_ptr<Array> out;
FinishAndCheckPadding(builder.get(), &out);
ASSERT_OK(out->ValidateFull());
AssertTypeEqual(scalar->type, out->type());
ASSERT_EQ(out->length(), 9);

const bool can_check_nulls = internal::HasValidityBitmap(out->type()->id());
auto out_type_id = out->type()->id();
const bool has_validity = internal::HasValidityBitmap(out_type_id);
// For a dictionary builder, the output dictionary won't necessarily be the same
const bool can_check_values = !is_dictionary(out->type()->id());
const bool can_check_values = !is_dictionary(out_type_id);

if (can_check_nulls) {
if (has_validity) {
ASSERT_EQ(out->null_count(), 4);
} else {
ASSERT_EQ(out->null_count(), 0);
}
if (scalar->is_valid) {
ASSERT_EQ(out->ComputeLogicalNullCount(), 4);
} else {
ASSERT_EQ(out->ComputeLogicalNullCount(), 9);
}

for (const auto index : {0, 1, 3, 5, 6}) {
ASSERT_FALSE(out->IsNull(index));
ASSERT_NE(out->IsNull(index), scalar->is_valid);
ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
ASSERT_OK(scalar_i->ValidateFull());
if (can_check_values) AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true);
}
for (const auto index : {2, 4, 7, 8}) {
ASSERT_EQ(out->IsNull(index), can_check_nulls);
ASSERT_TRUE(out->IsNull(index));
ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
ASSERT_OK(scalar_i->ValidateFull());
AssertScalarsEqual(*null_scalar, *scalar_i, /*verbose=*/true);
Expand Down
16 changes: 12 additions & 4 deletions cpp/src/arrow/array/builder_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,13 @@ class BaseListBuilder : public ArrayBuilder {
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
int64_t length) override {
const offset_type* offsets = array.GetValues<offset_type>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
const bool all_valid = !array.MayHaveLogicalNulls();
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
const bool is_valid =
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
array.IsValid(row);
if (is_valid) {
ARROW_RETURN_NOT_OK(Append());
int64_t slot_length = offsets[row + 1] - offsets[row];
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0],
Expand Down Expand Up @@ -301,9 +305,13 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
int64_t length) override {
const int32_t* offsets = array.GetValues<int32_t>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
const bool all_valid = !array.MayHaveLogicalNulls();
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
const bool is_valid =
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
array.IsValid(row);
if (is_valid) {
ARROW_RETURN_NOT_OK(Append());
const int64_t slot_length = offsets[row + 1] - offsets[row];
// Add together the inner StructArray offset to the Map/List offset
Expand Down
93 changes: 93 additions & 0 deletions cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ree_util.h"
#include "arrow/util/slice_util_internal.h"
#include "arrow/util/union_util.h"

namespace arrow {

Expand All @@ -60,6 +62,38 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length,
}
}

namespace internal {

bool IsNullSparseUnion(const ArrayData& data, int64_t i) {
auto* union_type = checked_cast<const SparseUnionType*>(data.type.get());
const auto* types = reinterpret_cast<const int8_t*>(data.buffers[1]->data());
const int child_id = union_type->child_ids()[types[data.offset + i]];
return data.child_data[child_id]->IsNull(i);
}

bool IsNullDenseUnion(const ArrayData& data, int64_t i) {
auto* union_type = checked_cast<const DenseUnionType*>(data.type.get());
const auto* types = reinterpret_cast<const int8_t*>(data.buffers[1]->data());
const int child_id = union_type->child_ids()[types[data.offset + i]];
const auto* offsets = reinterpret_cast<const int32_t*>(data.buffers[2]->data());
const int64_t child_offset = offsets[data.offset + i];
return data.child_data[child_id]->IsNull(child_offset);
}

bool IsNullRunEndEncoded(const ArrayData& data, int64_t i) {
return ArraySpan(data).IsNullRunEndEncoded(i);
}

bool UnionMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

} // namespace internal

std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count, int64_t offset) {
Expand Down Expand Up @@ -132,6 +166,13 @@ int64_t ArrayData::GetNullCount() const {
return precomputed;
}

int64_t ArrayData::ComputeLogicalNullCount() const {
if (this->buffers[0]) {
return GetNullCount();
}
return ArraySpan(*this).ComputeLogicalNullCount();
}

// ----------------------------------------------------------------------
// Methods for ArraySpan

Expand Down Expand Up @@ -407,6 +448,20 @@ int64_t ArraySpan::GetNullCount() const {
return precomputed;
}

int64_t ArraySpan::ComputeLogicalNullCount() const {
const auto t = this->type->id();
if (t == Type::SPARSE_UNION) {
return union_util::LogicalSparseUnionNullCount(*this);
}
if (t == Type::DENSE_UNION) {
return union_util::LogicalDenseUnionNullCount(*this);
}
if (t == Type::RUN_END_ENCODED) {
return ree_util::LogicalNullCount(*this);
}
return GetNullCount();
}

int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); }

std::shared_ptr<ArrayData> ArraySpan::ToArrayData() const {
Expand Down Expand Up @@ -445,6 +500,44 @@ std::shared_ptr<Array> ArraySpan::ToArray() const {
return MakeArray(this->ToArrayData());
}

bool ArraySpan::IsNullSparseUnion(int64_t i) const {
auto* union_type = checked_cast<const SparseUnionType*>(this->type);
const auto* types = reinterpret_cast<const int8_t*>(this->buffers[1].data);
const int child_id = union_type->child_ids()[types[this->offset + i]];
return this->child_data[child_id].IsNull(i);
}

bool ArraySpan::IsNullDenseUnion(int64_t i) const {
auto* union_type = checked_cast<const DenseUnionType*>(this->type);
const auto* types = reinterpret_cast<const int8_t*>(this->buffers[1].data);
const auto* offsets = reinterpret_cast<const int32_t*>(this->buffers[2].data);
const int64_t child_id = union_type->child_ids()[types[this->offset + i]];
const int64_t child_offset = offsets[this->offset + i];
return this->child_data[child_id].IsNull(child_offset);
}

bool ArraySpan::IsNullRunEndEncoded(int64_t i) const {
const auto& values = ree_util::ValuesArray(*this);
if (values.MayHaveLogicalNulls()) {
const int64_t physical_offset = ree_util::FindPhysicalIndex(*this, i, this->offset);
return ree_util::ValuesArray(*this).IsNull(physical_offset);
}
return false;
}

bool ArraySpan::UnionMayHaveLogicalNulls() const {
for (auto& child : this->child_data) {
if (child.MayHaveLogicalNulls()) {
return true;
}
}
return false;
}

bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const {
return ree_util::ValuesArray(*this).MayHaveLogicalNulls();
}

// ----------------------------------------------------------------------
// Implement internal::GetArrayView

Expand Down
Loading