diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 7ff3261ecba5e..743b40c79f1d1 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1154,6 +1154,54 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { } } } + +TEST_F(TestBinaryBuilder, TestCapacityReserve) { + vector strings = {"a", "bb", "cc", "ddddd", "eeeee"}; + int64_t N = static_cast(strings.size()); + int64_t length = 0; + int64_t data_length = 0; + int64_t capacity = N; + + ASSERT_OK(builder_->Reserve(capacity)); + ASSERT_OK(builder_->ReserveData(capacity)); + + ASSERT_EQ(builder_->length(), length); + ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); + ASSERT_EQ(builder_->value_data_length(), data_length); + ASSERT_EQ(builder_->value_data_capacity(), capacity); + + for(const string& str : strings) { + ASSERT_OK(builder_->Append(str)); + length++; + data_length += static_cast(str.size()); + + ASSERT_EQ(builder_->length(), length); + ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); + ASSERT_EQ(builder_->value_data_length(), data_length); + if (data_length <= capacity) { + ASSERT_EQ(builder_->value_data_capacity(), capacity); + } else { + ASSERT_EQ(builder_->value_data_capacity(), data_length); + } + } + + int extra_capacity = 20; + + ASSERT_OK(builder_->Reserve(extra_capacity)); + ASSERT_OK(builder_->ReserveData(extra_capacity)); + + ASSERT_EQ(builder_->length(), length); + ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(length + extra_capacity)); + ASSERT_EQ(builder_->value_data_length(), data_length); + ASSERT_EQ(builder_->value_data_capacity(), data_length + extra_capacity); + + Done(); + + ASSERT_EQ(result_->length(), N); + ASSERT_EQ(result_->null_count(), 0); + ASSERT_EQ(result_->value_data()->size(), data_length); + ASSERT_EQ(result_->value_data()->capacity(), BitUtil::RoundUpToMultipleOf64(data_length + extra_capacity)); +} TEST_F(TestBinaryBuilder, TestZeroLength) { // All buffers are null diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 712e4194526df..63588867dfce0 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1208,7 +1208,7 @@ ArrayBuilder* ListBuilder::value_builder() const { // String and binary BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} + : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool), data_capacity_(0) {} BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} @@ -1227,8 +1227,15 @@ Status BinaryBuilder::Resize(int64_t capacity) { } Status BinaryBuilder::ReserveData(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits::max()); - return value_data_builder_.Resize(capacity * sizeof(int64_t)); + if (value_data_length() + capacity > data_capacity_) { + if (value_data_length() + capacity > std::numeric_limits::max()) { + return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 in length for binary data"); + } + + RETURN_NOT_OK(value_data_builder_.Resize(value_data_length() + capacity)); + data_capacity_ = value_data_length() + capacity; + } + return Status::OK(); } Status BinaryBuilder::AppendNextOffset() { @@ -1246,6 +1253,9 @@ Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { RETURN_NOT_OK(Reserve(1)); RETURN_NOT_OK(AppendNextOffset()); RETURN_NOT_OK(value_data_builder_.Append(value, length)); + if (data_capacity_ < value_data_length()) { + data_capacity_ = value_data_length(); + } UnsafeAppendToBitmap(true); return Status::OK(); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 045334e5e4e2e..a708122c2930b 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,13 +682,13 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; - Status ReserveData(int64_t bytes); + Status ReserveData(int64_t capacity); Status FinishInternal(std::shared_ptr* out) override; /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } /// \return capacity of values buffer - int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + int64_t value_data_capacity() const { return data_capacity_; } /// Temporary access to a value. /// @@ -699,6 +699,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; + int64_t data_capacity_; static constexpr int64_t kMaximumCapacity = std::numeric_limits::max() - 1; Status AppendNextOffset();