diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 15c726241b582..002e02df981cb 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -116,6 +116,19 @@ class ARROW_EXPORT ArrayBuilder { /// This method is useful when appending null values to a parent nested type. virtual Status AppendEmptyValues(int64_t length) = 0; + /// \brief Append a value from a scalar + Status AppendScalar(const Scalar& scalar); + Status AppendScalar(const Scalar& scalar, int64_t n_repeats); + Status AppendScalars(const ScalarVector& scalars); + + /// \brief Append a range of values from an array. + /// + /// The given array must be the same type as the builder. + virtual Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) { + return Status::NotImplemented("AppendArraySlice for builder for ", *type()); + } + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. @@ -182,6 +195,17 @@ class ARROW_EXPORT ArrayBuilder { null_count_ = null_bitmap_builder_.false_count(); } + // Vector append. Copy from a given bitmap. If bitmap is null assume + // all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) { + if (bitmap == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(bitmap, offset, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + // Append the same validity value a given number of times. 
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { if (value) { diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 6822dc89903bf..fd1be17981698 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -60,6 +60,14 @@ Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, return byte_builder_.Append(data, length * byte_width_); } +Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, + const uint8_t* validity, + int64_t bitmap_offset) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(validity, bitmap_offset, length); + return byte_builder_.Append(data, length * byte_width_); +} + Status FixedSizeBinaryBuilder::AppendNull() { RETURN_NOT_OK(Reserve(1)); UnsafeAppendNull(); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index bc49c7d6787f7..10c6b1a1f076d 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -241,6 +241,23 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { + auto bitmap = array.GetValues(0, 0); + auto offsets = array.GetValues(1); + auto data = array.GetValues(2, 0); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) { + const offset_type start = offsets[offset + i]; + const offset_type end = offsets[offset + i + 1]; + ARROW_RETURN_NOT_OK(Append(data + start, end - start)); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + void Reset() override { ArrayBuilder::Reset(); offsets_builder_.Reset(); @@ -452,12 +469,22 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* valid_bytes = NULLPTR); + Status AppendValues(const 
uint8_t* data, int64_t length, const uint8_t* validity, + int64_t bitmap_offset); + Status AppendNull() final; Status AppendNulls(int64_t length) final; Status AppendEmptyValue() final; Status AppendEmptyValues(int64_t length) final; + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { + return AppendValues( + array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, + array.GetValues(0, 0), array.offset + offset); + } + void UnsafeAppend(const uint8_t* value) { UnsafeAppendToBitmap(true); if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 12b999b786eaa..e53b758efa35b 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -122,6 +122,23 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { + const offset_type* offsets = array.GetValues(1); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array.offset + row)) { + ARROW_RETURN_NOT_OK(Append()); + int64_t slot_length = offsets[row + 1] - offsets[row]; + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0], + offsets[row], slot_length)); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(AppendNextOffset()); @@ -275,6 +292,28 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; + /// \brief Append `length` slots of `array`, starting at logical index `offset`. + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { + const int32_t* offsets = array.GetValues(1); + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0]->data() : NULLPTR; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array.offset + row)) { + ARROW_RETURN_NOT_OK(Append()); + const int64_t slot_length = offsets[row + 1] - offsets[row]; + // Keys/items are addressed relative to the entries struct, so add its offset. + const int64_t entries_offset = array.child_data[0]->offset + offsets[row]; + ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( + *array.child_data[0]->child_data[0], entries_offset, slot_length)); + ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( + *array.child_data[0]->child_data[1], entries_offset, slot_length)); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + /// \brief Get builder to append keys. /// /// Append a key with this builder should be followed by appending @@ -374,6 +413,20 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; + Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array.offset + row)) { + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( + *array.child_data[0], list_size_ * (array.offset + row), list_size_)); + ARROW_RETURN_NOT_OK(Append()); + } else { + ARROW_RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + ArrayBuilder* value_builder() const { return value_builder_.get(); } std::shared_ptr type() const override { @@ -467,6 +520,18 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { + for (int i = 0; static_cast(i) < children_.size(); i++) { + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i], + array.offset + offset, length)); + } + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0]->data() : NULLPTR; + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(validity, array.offset + offset, length); + return Status::OK(); + } + void Reset() override; ArrayBuilder* field_builder(int i) const { return children_[i].get(); } diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc index 037a1ecbf916f..7a832867d925f 100644 --- a/cpp/src/arrow/array/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -86,6 +86,14 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, return Status::OK(); } +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const uint8_t* validity, int64_t offset) { + RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, offset, length); + ArrayBuilder::UnsafeAppendToBitmap(validity, offset, length); + return Status::OK(); +} + Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, const std::vector& is_valid) { RETURN_NOT_OK(Reserve(length)); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index e10f11fdd6cf6..4dcbdd03fe7c2 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -52,6 +52,10 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status Append(std::nullptr_t) { return AppendNull(); } + Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override { + return AppendNulls(length); + } + Status FinishInternal(std::shared_ptr* out) override; /// \cond FALSE @@ -152,6 +156,21 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] bitmap a validity bitmap to copy (may be null) + /// \param[in] bitmap_offset an offset into the validity bitmap + /// \return 
Status + Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap, + int64_t bitmap_offset) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is updated by these + ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length); + return Status::OK(); + } + /// \brief Append a sequence of elements in one shot /// \param[in] values a contiguous C array of values /// \param[in] length the number of values to append @@ -255,6 +274,12 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1) + offset, length, + array.GetValues(0, 0), array.offset + offset); + } + /// Append a single scalar under the assumption that the underlying Buffer is /// large enough. /// @@ -362,6 +387,15 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* valid_bytes = NULLPTR); + /// \brief Append a sequence of elements in one shot + /// \param[in] values a bitmap of values + /// \param[in] length the number of values to append + /// \param[in] validity a validity bitmap to copy (may be null) + /// \param[in] offset an offset into the values and validity bitmaps + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity, + int64_t offset); + /// \brief Append a sequence of elements in one shot /// \param[in] values a contiguous C array of values /// \param[in] length the number of values to append @@ -458,6 +492,12 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(int64_t length, bool value); + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1, 0), length, + array.GetValues(0, 0), array.offset + offset); + } + Status 
FinishInternal(std::shared_ptr* out) override; /// \cond FALSE diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 90d4f42084af3..cae9acd7df297 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -45,6 +45,21 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } +Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, + const int64_t length) { + const int8_t* type_codes = array.GetValues(1); + const int32_t* offsets = array.GetValues(2); + for (int64_t row = offset; row < offset + length; row++) { + const int8_t type_code = type_codes[row]; + const int child_id = type_id_to_child_id_[type_code]; + const int32_t union_offset = offsets[row]; + RETURN_NOT_OK(Append(type_code)); + RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySlice( + *array.child_data[child_id], union_offset, /*length=*/1)); + } + return Status::OK(); +} + Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out)); (*out)->buffers.resize(3); @@ -64,6 +79,7 @@ BasicUnionBuilder::BasicUnionBuilder( type_codes_ = union_type.type_codes(); children_ = children; + type_id_to_child_id_.resize(union_type.max_type_code() + 1, -1); type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr); DCHECK_LT( type_id_to_children_.size(), @@ -73,6 +89,7 @@ BasicUnionBuilder::BasicUnionBuilder( child_fields_[i] = union_type.field(static_cast(i)); auto type_id = union_type.type_codes()[i]; + type_id_to_child_id_[type_id] = static_cast(i); type_id_to_children_[type_id] = children[i].get(); } } @@ -82,6 +99,7 @@ int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_c children_.push_back(new_child); auto new_type_id = NextTypeId(); + type_id_to_child_id_[new_type_id] = static_cast(children_.size() - 1); type_id_to_children_[new_type_id] = new_child.get(); 
child_fields_.push_back(field(field_name, nullptr)); type_codes_.push_back(static_cast(new_type_id)); @@ -114,8 +132,20 @@ int8_t BasicUnionBuilder::NextTypeId() { static_cast(UnionType::kMaxTypeCode)); // type_id_to_children_ is already densely packed, so just append the new child + type_id_to_child_id_.resize(type_id_to_child_id_.size() + 1); type_id_to_children_.resize(type_id_to_children_.size() + 1); return dense_type_id_++; } +Status SparseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, + const int64_t length) { + for (size_t i = 0; i < type_codes_.size(); i++) { + RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendArraySlice( + *array.child_data[i], array.offset + offset, length)); + } + const int8_t* type_codes = array.GetValues(1); + RETURN_NOT_OK(types_builder_.Append(type_codes + offset, length)); + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 060be474fb875..c1a799e56bf13 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -74,6 +74,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { UnionMode::type mode_; std::vector type_id_to_children_; + std::vector type_id_to_child_id_; // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr int8_t dense_type_id_ = 0; TypedBufferBuilder types_builder_; @@ -155,6 +156,9 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return offsets_builder_.Append(offset); } + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override; + Status FinishInternal(std::shared_ptr* out) override; private: @@ -230,6 +234,9 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// The corresponding child builder must be appended to independently after this method /// is called, and all other child builders must have null or empty value appended. 
Status Append(int8_t next_type) { return types_builder_.Append(next_type); } + + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override; }; } // namespace arrow diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 297745a2b1754..3d4c8b7a3dc1f 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -441,9 +441,12 @@ class NullArrayFactory { // First buffer is always null out_->buffers[0] = nullptr; - // Type codes are all zero, so we can use buffer_ which has had it's memory - // zeroed out_->buffers[1] = buffer_; + // buffer_ is zeroed, but 0 may not be a valid type code + if (type.type_codes()[0] != 0) { + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], AllocateBuffer(length_, pool_)); + std::memset(out_->buffers[1]->mutable_data(), type.type_codes()[0], length_); + } // For sparse unions, we now create children with the same length as the // parent diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index f525ec23c58af..712a2deac06a8 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -28,6 +28,7 @@ #include "arrow/status.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" @@ -322,6 +323,7 @@ class TypedBufferBuilder { ++bit_length_; } + /// \brief Append bits from an array of bytes (one value per byte) void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) { if (num_elements == 0) return; int64_t i = 0; @@ -333,6 +335,14 @@ class TypedBufferBuilder { bit_length_ += num_elements; } + /// \brief Append bits from a packed bitmap + void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) { + if (num_elements == 0) return; + internal::CopyBitmap(bitmap, offset, num_elements, mutable_data(), bit_length_); + false_count_ += num_elements - 
internal::CountSetBits(bitmap, offset, num_elements); + bit_length_ += num_elements; + } + void UnsafeAppend(const int64_t num_copies, bool value) { BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); false_count_ += num_copies * !value; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 0f25ac077679d..7aee3a92fdf4f 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -157,6 +157,15 @@ using TemporalArrowTypes = using DecimalArrowTypes = ::testing::Types; +using BinaryArrowTypes = + ::testing::Types; + +using StringArrowTypes = ::testing::Types; + +using ListArrowTypes = ::testing::Types; + +using UnionArrowTypes = ::testing::Types; + class Array; class ChunkedArray; class RecordBatch; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index b74aa3b0adbcb..ccf58a92afc28 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -991,6 +991,17 @@ static inline bool is_nested(Type::type type_id) { return false; } +static inline bool is_union(Type::type type_id) { + switch (type_id) { + case Type::SPARSE_UNION: + case Type::DENSE_UNION: + return true; + default: + break; + } + return false; +} + static inline int offset_bit_width(Type::type type_id) { switch (type_id) { case Type::STRING: