Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-35141: [C++] Versions of IsNull/IsValid that don't branch on type #35149

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,37 @@ class ARROW_EXPORT Array {
// a potential inner-branch removal.
if (type_id() == Type::SPARSE_UNION) {
return !internal::IsNullSparseUnion(*data_, i);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why change it to if else? I think it's not necessary here
clang-tidy has a check about this style ( https://clang.llvm.org/extra/clang-tidy/checks/readability/else-after-return.html ), although we don't enable this check, I think just return without if else is ok

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More compact. In sync with the if constexpr used in the other cases. Arrow's .clang-tidy doesn't complain about this. No extra indentation levels are introduced because of this.

if (type_id() == Type::DENSE_UNION) {
} else if (type_id() == Type::DENSE_UNION) {
return !internal::IsNullDenseUnion(*data_, i);
}
if (type_id() == Type::RUN_END_ENCODED) {
} else if (type_id() == Type::RUN_END_ENCODED) {
return !internal::IsNullRunEndEncoded(*data_, i);
}
return data_->null_count != data_->length;
}

template <typename ArrowType>
bool IsNullFast(int64_t i) const {
return !IsValidFast<ArrowType>(i);
}

template <typename ArrowType>
bool IsValidFast(int64_t i) const {
if constexpr (ArrowType::type_id == Type::NA) {
return false;
} else if constexpr (ArrowType::type_id == Type::SPARSE_UNION) {
return !internal::IsNullSparseUnion(*data_, i);
} else if constexpr (ArrowType::type_id == Type::DENSE_UNION) {
return !internal::IsNullDenseUnion(*data_, i);
} else if constexpr (ArrowType::type_id == Type::RUN_END_ENCODED) {
return !internal::IsNullRunEndEncoded(*data_, i);
} else {
if (null_bitmap_data_ != NULLPTR) {
return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
}
return data_->null_count != data_->length;
}
}

/// \brief Return a Scalar containing the value of this array at i
Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;

Expand Down
26 changes: 26 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ TEST_F(TestArray, TestIsNullIsValid) {
for (size_t i = 0; i < null_bitmap.size(); ++i) {
EXPECT_EQ(null_bitmap[i] != 0, !arr->IsNull(i)) << i;
EXPECT_EQ(null_bitmap[i] != 0, arr->IsValid(i)) << i;
EXPECT_EQ(null_bitmap[i] != 0, !arr->IsNullFast<Int32Type>(i)) << i;
EXPECT_EQ(null_bitmap[i] != 0, arr->IsValidFast<Int32Type>(i)) << i;
}
}

Expand All @@ -341,6 +343,8 @@ TEST_F(TestArray, TestIsNullIsValidNoNulls) {
for (size_t i = 0; i < size; ++i) {
EXPECT_TRUE(arr->IsValid(i));
EXPECT_FALSE(arr->IsNull(i));
EXPECT_TRUE(arr->IsValidFast<Int32Type>(i));
EXPECT_FALSE(arr->IsNullFast<Int32Type>(i));
}
}

Expand Down Expand Up @@ -428,6 +432,25 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
for (int64_t i = 0; i < length; ++i) {
ASSERT_TRUE(array->IsNull(i));
ASSERT_FALSE(array->IsValid(i));
switch (type->id()) {
case Type::NA:
ASSERT_TRUE(array->IsNullFast<NullType>(i));
break;
case Type::SPARSE_UNION:
ASSERT_TRUE(array->IsNullFast<SparseUnionType>(i));
break;
case Type::DENSE_UNION:
ASSERT_TRUE(array->IsNullFast<DenseUnionType>(i));
break;
case Type::RUN_END_ENCODED:
ASSERT_TRUE(array->IsNullFast<RunEndEncodedType>(i));
break;
case Type::INT32: // a non-special type for IsNullFast
ASSERT_TRUE(array->IsNullFast<Int32Type>(i));
break;
default:
break;
}
}
}
}
Expand Down Expand Up @@ -1788,6 +1811,7 @@ TEST(TestBooleanBuilder, AppendNullsAdvanceBuilder) {
ASSERT_TRUE(barr.Value(0));
ASSERT_FALSE(barr.Value(1));
ASSERT_TRUE(barr.IsNull(2));
ASSERT_TRUE(barr.IsNullFast<BooleanType>(2));
ASSERT_TRUE(barr.Value(3));
}

Expand Down Expand Up @@ -1821,9 +1845,11 @@ TEST(TestBooleanBuilder, TestStdBoolVectorAppend) {
for (int i = 0; i < length; ++i) {
if (is_valid[i]) {
ASSERT_FALSE(arr.IsNull(i));
ASSERT_FALSE(arr.IsNullFast<BooleanType>(i));
ASSERT_EQ(values[i], arr.Value(i));
} else {
ASSERT_TRUE(arr.IsNull(i));
ASSERT_TRUE(arr.IsNullFast<BooleanType>(i));
}
ASSERT_EQ(values[i], arr_nn.Value(i));
}
Expand Down
71 changes: 56 additions & 15 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,25 +180,46 @@ struct ARROW_EXPORT ArrayData {

std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }

bool IsNull(int64_t i) const { return !IsValid(i); }
inline bool IsNull(int64_t i) const { return !IsValid(i); }

bool IsValid(int64_t i) const {
inline bool IsValid(int64_t i) const {
if (buffers[0] != NULLPTR) {
return bit_util::GetBit(buffers[0]->data(), i + offset);
}
const auto type = this->type->id();
if (type == Type::SPARSE_UNION) {
return !internal::IsNullSparseUnion(*this, i);
}
if (type == Type::DENSE_UNION) {
} else if (type == Type::DENSE_UNION) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this line of change seems to be complained by clang-tidy.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not on my editor. Where did you see this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this check is not enabled on Arrow's .clang-tidy and I don't think putting an else after a return is always bad.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this specific function, the use of else is not creating unnecessary indentation levels.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. I was overwhelmed by this kind of warning in the internal toolchain from my employer. :)

return !internal::IsNullDenseUnion(*this, i);
}
if (type == Type::RUN_END_ENCODED) {
} else if (type == Type::RUN_END_ENCODED) {
return !internal::IsNullRunEndEncoded(*this, i);
}
return null_count.load() != length;
}

template <typename ArrowType>
bool IsNullFast(int64_t i) const {
return !IsValidFast<ArrowType>(i);
}

template <typename ArrowType>
bool IsValidFast(int64_t i) const {
if constexpr (ArrowType::type_id == Type::NA) {
return false;
} else if constexpr (ArrowType::type_id == Type::SPARSE_UNION) {
return !internal::IsNullSparseUnion(*this, i);
} else if constexpr (ArrowType::type_id == Type::DENSE_UNION) {
return !internal::IsNullDenseUnion(*this, i);
} else if constexpr (ArrowType::type_id == Type::RUN_END_ENCODED) {
return !internal::IsNullRunEndEncoded(*this, i);
} else {
if (buffers[0] != NULLPTR) {
return bit_util::GetBit(buffers[0]->data(), i + offset);
}
return null_count.load() != length;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a dumb question: if the validity bitmap does not exist, shouldn't null_count be either 0 or -1 instead of other values?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NA is a known type that has no validity bitmap and null_count set to length. So in theory, this is very possible.

}
}

// Access a buffer's data as a typed C pointer
template <typename T>
inline const T* GetValues(int i, int64_t absolute_offset) const {
Expand Down Expand Up @@ -434,16 +455,36 @@ struct ARROW_EXPORT ArraySpan {
inline bool IsValid(int64_t i) const {
if (this->buffers[0].data != NULLPTR) {
return bit_util::GetBit(this->buffers[0].data, i + this->offset);
}
const auto type = this->type->id();
if (type == Type::SPARSE_UNION) {
return !IsNullSparseUnion(i);
} else if (type == Type::DENSE_UNION) {
return !IsNullDenseUnion(i);
} else if (type == Type::RUN_END_ENCODED) {
return !IsNullRunEndEncoded(i);
}
return this->null_count != this->length;
}

template <typename ArrowType>
inline bool IsNullFast(int64_t i) const {
return !IsValidFast<ArrowType>(i);
}

template <typename ArrowType>
inline bool IsValidFast(int64_t i) const {
if constexpr (ArrowType::type_id == Type::NA) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to add a DCHECK to check that ArrowType is equal to the real type in the Array?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was tempted, but it adds too high of an overhead for this function as it's used from tight loops and is expected to be fully inlined. Even considering that it would be affecting only debug builds.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we just need care the release performance? An array using outerside ArrowType without checking is so dangerous...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Debug perf can make CI run much slower. Imagine changing the code from IsNull to IsNullFast and getting a slower build in return. I'm counting on people defaulting to use IsNull (safe) instead of IsNullFast. The latter is for people working on kernels which usually do many unsafe things and have more carefully written code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mapleFU note that all the special branches perform runtime type checks, so this is not completely unchecked.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, though it's still a bit wired for me. I think we can hear about others idea.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be more clear: the body of these member functions in the .cc file, where logging.h can be used, contains runtime checks.

    } else if constexpr (ArrowType::type_id == Type::SPARSE_UNION) {
      return !IsNullSparseUnion(i);
    } else if constexpr (ArrowType::type_id == Type::DENSE_UNION) {
      return !IsNullDenseUnion(i);
    } else if constexpr (ArrowType::type_id == Type::RUN_END_ENCODED) {
      return !IsNullRunEndEncoded(i);

return false;
} else if constexpr (ArrowType::type_id == Type::SPARSE_UNION) {
return !IsNullSparseUnion(i);
} else if constexpr (ArrowType::type_id == Type::DENSE_UNION) {
return !IsNullDenseUnion(i);
} else if constexpr (ArrowType::type_id == Type::RUN_END_ENCODED) {
return !IsNullRunEndEncoded(i);
} else {
const auto type = this->type->id();
if (type == Type::SPARSE_UNION) {
return !IsNullSparseUnion(i);
}
if (type == Type::DENSE_UNION) {
return !IsNullDenseUnion(i);
}
if (type == Type::RUN_END_ENCODED) {
return !IsNullRunEndEncoded(i);
if (this->buffers[0].data != NULLPTR) {
return bit_util::GetBit(this->buffers[0].data, i + this->offset);
}
return this->null_count != this->length;
}
Expand Down