GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls #35036

Open
wants to merge 17 commits into main
38 changes: 35 additions & 3 deletions cpp/src/arrow/array/data.cc
@@ -236,6 +236,18 @@ void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_siz
span->buffers[buffer_index].size = 2 * sizeof(offset_type);
}

template <typename RunEndType>
void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) {
Member:

Are these changes related to this PR?

Member:

If so, can you perhaps add a test for FillFromScalar on a run-end-encoded scalar? For example in arrow/array/array_run_end_test.cc.

Member Author:

Yes, the tests for scalar kernels automatically run the kernel on actual scalars as well, and that ends up creating an array from the scalar. So these changes to FillFromScalar are needed to be able to run any kernel on a REE scalar.

At the moment we don't have any direct test for FillFromScalar (not for other types either); it is only exercised indirectly by testing the kernels on scalars.
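
For illustration, a minimal Python-level sketch of the code path being discussed (an assumption-laden sketch, not part of this PR): indexing a run-end-encoded array is assumed to yield a RunEndEncodedScalar, and running a scalar kernel on that value is what ends up going through ArraySpan::FillFromScalar.

```python
import pyarrow as pa
import pyarrow.compute as pc

decoded = pa.array([1, 1, None, 2])
ree = pc.run_end_encode(decoded)

scalar = ree[2]            # assumed: yields a RunEndEncodedScalar
# Running a scalar kernel on this value internally expands the scalar
# back into a one-element ArraySpan (ArraySpan::FillFromScalar).
print(pc.is_null(scalar))  # expected: true, since the decoded element is null
```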

using RunEndCType = typename RunEndType::c_type;
auto buffer = reinterpret_cast<RunEndCType*>(span->scratch_space);
buffer[0] = static_cast<RunEndCType>(1);
span->type = run_end_type;
span->length = 1;
span->null_count = 0;
span->buffers[1].data = reinterpret_cast<uint8_t*>(buffer);
span->buffers[1].size = sizeof(RunEndCType);
}

int GetNumBuffers(const DataType& type) {
switch (type.id()) {
case Type::NA:
@@ -304,9 +316,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {

Type::type type_id = value.type->id();

// Populate null count and validity bitmap (only for non-union/null types)
this->null_count = value.is_valid ? 0 : 1;
if (!is_union(type_id) && type_id != Type::NA) {
// Populate null count and validity bitmap
if (type_id == Type::NA) {
this->null_count = 1;
} else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
Member:

Suggested change
} else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
} else if (!HasValidityBitmap(type_id)) {

this->null_count = 0;
} else {
this->null_count = value.is_valid ? 0 : 1;
Member:

Do we have dictionary scalars?

Member:

We do, but they have the same semantics as dictionary arrays: the top-level validity refers to the indices, regardless of the underlying dictionary values.
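
To make the dictionary semantics concrete, a small illustrative pyarrow sketch (not part of this PR): a null index is reported by is_null, while an index that points at a null dictionary value is not.

```python
import pyarrow as pa
import pyarrow.compute as pc

indices = pa.array([0, 1, None], type=pa.int8())
dictionary = pa.array(["a", None])
arr = pa.DictionaryArray.from_arrays(indices, dictionary)

# Element 1 points at a null dictionary value, element 2 has a null index;
# only the null index counts as a top-level null.
print(pc.is_null(arr))  # -> [false, false, true]
```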

this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit;
this->buffers[0].size = 1;
}
@@ -422,6 +438,22 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
this->child_data[i].FillFromScalar(*scalar.value[i]);
}
}
} else if (type_id == Type::RUN_END_ENCODED) {
const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
this->child_data.resize(2);
auto& run_end_type = scalar.run_end_type();
Member:

Nit: style

Suggested change
auto& run_end_type = scalar.run_end_type();
const auto& run_end_type = scalar.run_end_type();

switch (run_end_type->id()) {
case Type::INT16:
FillRunEndsArrayForScalar<Int16Type>(&this->child_data[0], run_end_type.get());
break;
case Type::INT32:
FillRunEndsArrayForScalar<Int32Type>(&this->child_data[0], run_end_type.get());
break;
default:
DCHECK_EQ(run_end_type->id(), Type::INT64);
FillRunEndsArrayForScalar<Int64Type>(&this->child_data[0], run_end_type.get());
}
this->child_data[1].FillFromScalar(*scalar.value);
} else if (type_id == Type::EXTENSION) {
// Pass through storage
const auto& scalar = checked_cast<const ExtensionScalar&>(value);
78 changes: 78 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -22,6 +22,8 @@

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/ree_util.h"

namespace arrow {

@@ -82,6 +84,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
}
}

static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto* sparse_union_type =
::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);

const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = sparse_union_type->child_ids()[types[i]];
if (span.child_data[child_id].IsNull(i + span.offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}

static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto* dense_union_type =
::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);

const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
const int32_t* offsets = span.GetValues<int32_t>(2); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = dense_union_type->child_ids()[types[i]];
const int32_t offset = offsets[i];
if (span.child_data[child_id].IsNull(offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}

template <typename RunEndCType>
void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto& values = arrow::ree_util::ValuesArray(span);
DCHECK(!is_nested(values.type->id()));
Member:

AFAIK, it is not forbidden to have nested REE values. So this should be turned into an error return (perhaps Status::NotImplemented) at a higher level.

However, instead of refusing to implement this, you could use another strategy:

  • call IsNull on the values array
  • REE-decode the REE array comprised of (run ends, run values IsNull)

Contributor:

I've been meaning to send a PR forbidding nested REEs because it's pointless to run-end encode a values array: all the runs already have length 1, and the run-end array contains a strictly increasing sequence of indexes.

Member:

@felipecrv I think you're talking about something else? "Nested" here means any type with child fields (e.g. list, struct...).

Member Author:

> you could use another strategy:

That would indeed be a good alternative and would be more robust for whatever type is used for the REE values. I did a quick benchmark comparing both strategies in Python:

In [2]: run_lengths = np.random.randint(1, 10, 100_000)

In [3]: run_values = [1, 2, 3, 4, None] * 20000

In [4]: arr = pa.RunEndEncodedArray.from_arrays(run_lengths.cumsum(), run_values)

In [5]: res1 = pc.is_null(arr)

In [6]: res2 = pc.run_end_decode(pa.RunEndEncodedArray.from_arrays(np.asarray(arr.run_ends), pc.is_null(arr.values)))

In [7]: res1.equals(res2)
Out[7]: True

In [8]: %timeit pc.is_null(arr)
309 µs ± 843 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In [9]: %timeit pc.run_end_decode(pa.RunEndEncodedArray.from_arrays(np.asarray(arr.run_ends), pc.is_null(arr.values)))
1.07 ms ± 17.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

This is running with this branch (in release mode), so pc.is_null is using this PR's implementation, and the other is the Python equivalent of what you propose (IIUC).
The alternative seems significantly slower, although I don't know how much of that is due to the overhead of going through Python several times.

Member:

No idea, but the proposal here is to make things correct first ;-) It's also possible that run-end-decoding could be optimized...

Member Author:

Based on a quick profile, a significant part of the time is spent in the actual RunEndDecode implementation, indicating it's not just Python overhead. But it's also true that this implementation is not necessarily optimized ;)

Now maybe the DCHECK(!is_nested(values.type->id())); isn't actually correct. What it is protecting against is that I take the first buffer of the data and assume it is a validity bitmap. But that assumption only breaks for REE/union, not for "nested" types in general.

Although for supporting nan_is_null, it will still be easier to do that by recursively calling the is_null kernel. Will take a look at doing it that way.

Contributor:

@pitrou I misunderstood "nested REE" as a REE inside another REE.

const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;

if (!values_bitmap) {
return;
}

arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
auto end = ree_span.end();
for (auto it = ree_span.begin(); it != end; ++it) {
if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(),
true);
}
}
Comment on lines +130 to +137
Member:

Just checking my understanding here. If the function IsNull is given an array of type REE<X> then the return type is bool (and not REE<bool>) and so here we are exploding the (smaller) bitmap of the values child into a larger output bitmap?

Contributor:

Yes. For each run, a range of output bits is set to 1 when the single bit at values.offset + it.index_into_array() is 0.

Member Author:

> If the function IsNull is given an array of type REE<X> then the return type is bool (and not REE<bool>)

We could actually also have a function that preserves the REE structure, but currently the "is_null" kernel only has "any -> bool" signatures. In general I think people will expect a plain boolean result, and as long as REE<bool> isn't really treated as a boolean type (e.g. in filters), we should certainly return plain bool here, I think.
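
As a concrete illustration of that behaviour (mirroring the Python test added in this PR): is_null on an REE input produces a plain boolean array at the logical (decoded) length, with each run's bit repeated.

```python
import pyarrow as pa
import pyarrow.compute as pc

decoded = pa.array([1, 1, None, None, None, 2])
ree = pc.run_end_encode(decoded)   # 3 runs: 1, null, 2
result = pc.is_null(ree)

print(len(result))  # 6: the logical length, not the number of runs
print(result)       # -> [false, false, true, true, true, false]
```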

Contributor:

I have an open PR implementing REE-based filters which can help make this possible.

But note that each run in a REE<Boolean> costs 8 * sizeof(RunEndCType) + 1 bits of memory. Boolean runs need to be longer than 17, 33, or 65 elements (for int16, int32 and int64 run ends respectively) before REE<Boolean> starts being better than the plain bitmap encoding.
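
A quick back-of-the-envelope check of that break-even point (a sketch assuming one run-end integer plus one value bit per run, versus one bit per logical element in a plain bitmap):

```python
# Per-run cost of REE<bool>: one run-end integer plus one boolean value bit.
# A plain bitmap costs 1 bit per logical element, so a run only pays off
# once its length exceeds the per-run cost expressed in bits.
for run_end_bits in (16, 32, 64):
    per_run_bits = run_end_bits + 1
    print(f"int{run_end_bits} run ends: runs must be longer than "
          f"{per_run_bits} elements to beat the bitmap")
```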

}

void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto type_id = arrow::ree_util::RunEndsArray(span).type->id();
if (type_id == Type::INT16) {
SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset);
} else if (type_id == Type::INT32) {
SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset);
} else {
DCHECK_EQ(type_id, Type::INT64);
SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset);
}
}

Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& arr = batch[0].array;
ArraySpan* out_span = out->array_span_mutable();
@@ -100,6 +168,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
} else {
// Input has no nulls => output is entirely false.
bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
// Except for union/ree which never has physical nulls, but can have logical
// nulls from the child arrays -> set those bits to true
const auto t = arr.type->id();
if (t == Type::SPARSE_UNION) {
SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::DENSE_UNION) {
SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::RUN_END_ENCODED) {
SetREELogicalNullBits(arr, out_bitmap, out_span->offset);
}
}

if (is_floating(arr.type->id()) && options.nan_is_null) {
28 changes: 28 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity_test.cc
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
"[true, false, false, true]", &nan_is_null_options);
}

TEST_F(TestBooleanValidityKernels, IsNullUnion) {
auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
ASSERT_OK_AND_ASSIGN(auto arr1,
SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
CheckScalarUnary("is_null", arr1, expected);

auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
ASSERT_OK_AND_ASSIGN(auto arr2,
DenseUnionArray::Make(*type_ids, *value_offsets,
{dense_field_i64, dense_field_str}));
CheckScalarUnary("is_null", arr2, expected);
}
Member:

Can you add floating-point child union values and test with/without the nan_is_null option set?


TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
ASSERT_OK(ree_array->ValidateFull());
auto expected =
ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
CheckScalarUnary("is_null", ree_array, expected);
}
Member:

Can you add a test with floating-point run end values and clearing/setting the nan_is_null option?


TEST(TestValidityKernels, IsFinite) {
for (const auto& ty : IntTypes()) {
CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")},
42 changes: 42 additions & 0 deletions python/pyarrow/tests/test_compute.py
@@ -1637,6 +1637,48 @@ def test_is_null():
assert result.equals(expected)


def test_is_null_union():
Member:

Is it necessary to add these tests on the Python side? You have similar tests in C++ already.

arr = pa.UnionArray.from_sparse(
pa.array([0, 1, 0, 0, 1], type=pa.int8()),
[
pa.array([0.0, 1.1, None, 3.3, 4.4]),
pa.array([True, None, False, True, False]),
]
)
assert arr.to_pylist() == [0.0, None, None, 3.3, False]
result = arr.is_null()
expected = pa.array([False, True, True, False, False])
assert result.equals(expected)
result = arr.slice(1, 2).is_null()
assert result.equals(expected.slice(1, 2))

arr = pa.UnionArray.from_dense(
pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()),
pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()),
[
pa.array([0.0, 1.1, None, 3.3]),
pa.array([True, None, False])
]
)
assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
result = arr.is_null()
expected = pa.array([False, False, False, True, False, True, False])
assert result.equals(expected)
result = arr.slice(1, 3).is_null()
assert result.equals(expected.slice(1, 3))


@pytest.mark.parametrize("typ", ["int16", "int32", "int64"])
def test_is_null_run_end_encoded(typ):
decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
arr = pc.run_end_encode(decoded, run_end_type=typ)
result = arr.is_null()
expected = pa.array([False, False, False, True, False, False, True, True, False])
assert result.equals(expected)
result = arr.slice(2, 5).is_null()
assert result.equals(expected.slice(2, 5))


def test_is_nan():
arr = pa.array([1, 2, 3, None, np.nan])
result = arr.is_nan()