From 0a490222c4078a7cb0ff085cd5c9884fdda57998 Mon Sep 17 00:00:00 2001 From: Panchen Xue Date: Tue, 23 Jan 2018 22:03:50 -0500 Subject: [PATCH] ARROW-1712: [C++] Add method to BinaryBuilder to reserve space for value data Modified BinaryBuilder::Resize(int64_t) so that when building BinaryArrays with a known size, space is also reserved for value_data_builder_ to prevent internal reallocation. Author: Panchen Xue Closes #1481 from xuepanchen/master and squashes the following commits: 707b67bf [Panchen Xue] ARROW-1712: [C++] Fix lint errors 360e6018 [Panchen Xue] Merge branch 'master' of https://github.com/xuepanchen/arrow d4bbd151 [Panchen Xue] ARROW-1712: [C++] Modify test case for BinaryBuilder::ReserveData() and change arguments for offsets_builder_.Resize() 77f8f3c1 [Panchen Xue] Merge pull request #5 from apache/master bc5db7d3 [Panchen Xue] ARROW-1712: [C++] Remove unneeded data member in BinaryBuilder and modify test case 5a5b70e2 [Panchen Xue] Merge pull request #4 from apache/master 8e4c8925 [Panchen Xue] Merge pull request #3 from xuepanchen/xuepanchen-arrow-1712 d3c8202b [Panchen Xue] ARROW-1945: [C++] Fix a small typo 0b078955 [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data 18f90fb8 [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data bbc65270 [Panchen Xue] ARROW-1945: [C++] Update test case for BinaryBuild data value space reservation 15e045cc [Panchen Xue] Add test case for array-test.cc 5a5593e5 [Panchen Xue] Update again ReserveData(int64_t) method for BinaryBuilder 9b5e8059 [Panchen Xue] Update ReserveData(int64_t) method signature for BinaryBuilder 8dd5eaa9 [Panchen Xue] Update builder.cc b002e0bd [Panchen Xue] Remove override keyword from ReserveData(int64_t) method for BinaryBuilder de318f47 [Panchen Xue] Implement ReserveData(int64_t) method for BinaryBuilder e0434e61 [Panchen Xue] Add ReserveData(int64_t) and value_data_capacity() for methods for BinaryBuilder 5ebfb320 [Panchen Xue] Add capacity() method for TypedBufferBuilder 5b73c1c5 [Panchen Xue] Update again BinaryBuilder::Resize(int64_t capacity) in builder.cc d021c54b [Panchen Xue] Merge pull request #2 from xuepanchen/xuepanchen-arrow-1712 232024e3 [Panchen Xue] Update BinaryBuilder::Resize(int64_t capacity) in builder.cc c2f8dc4e [Panchen Xue] Merge pull request #1 from apache/master --- cpp/src/arrow/array-test.cc | 39 +++++++++++++++++++++++++++++++++++++ cpp/src/arrow/buffer.h | 1 + cpp/src/arrow/builder.cc | 18 +++++++++++++---- cpp/src/arrow/builder.h | 5 +++++ 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 7ff3261ecba5e..c53da8591e94e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1155,6 +1155,45 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { } } +TEST_F(TestBinaryBuilder, TestCapacityReserve) { + vector strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddd"}; + int N = static_cast(strings.size()); + int reps = 15; + int64_t length = 0; + int64_t capacity = 1000; + int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity); + + ASSERT_OK(builder_->ReserveData(capacity)); + + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + ASSERT_OK(builder_->Append(strings[i])); + length += static_cast(strings[i].size()); + + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); + } + } + + int extra_capacity = 500; + expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity); + + ASSERT_OK(builder_->ReserveData(extra_capacity)); + + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); + + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(0, result_->null_count()); + ASSERT_EQ(reps * 40, result_->value_data()->size()); + ASSERT_EQ(expected_capacity, result_->value_data()->capacity()); +} + TEST_F(TestBinaryBuilder, TestZeroLength) { // All buffers are null Done(); diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index b50b1a1aa041d..44c352a93f273 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -333,6 +333,7 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { const T* data() const { return reinterpret_cast(data_); } int64_t length() const { return size_ / sizeof(T); } + int64_t capacity() const { return capacity_ / sizeof(T); } }; /// \brief Allocate a fixed size mutable buffer from a memory pool diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index de132b5f6a0d1..db901526fc2ee 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1165,13 +1165,13 @@ Status ListBuilder::Init(int64_t elements) { DCHECK_LT(elements, std::numeric_limits::max()); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets - return offsets_builder_.Resize((elements + 1) * sizeof(int64_t)); + return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status ListBuilder::Resize(int64_t capacity) { DCHECK_LT(capacity, std::numeric_limits::max()); // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); } @@ -1216,16 +1216,26 @@ Status BinaryBuilder::Init(int64_t elements) { DCHECK_LT(elements, std::numeric_limits::max()); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets - return offsets_builder_.Resize((elements + 1) * sizeof(int64_t)); + return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status BinaryBuilder::Resize(int64_t capacity) { DCHECK_LT(capacity, std::numeric_limits::max()); // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); } +Status BinaryBuilder::ReserveData(int64_t elements) { + if (value_data_length() + elements > value_data_capacity()) { + if (value_data_length() + elements > std::numeric_limits::max()) { + return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary"); + } + RETURN_NOT_OK(value_data_builder_.Reserve(elements)); + } + return Status::OK(); +} + Status BinaryBuilder::AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) { diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index ce7b8cd197da3..d1611f60cd924 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -682,10 +682,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements); Status FinishInternal(std::shared_ptr* out) override; /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } /// Temporary access to a value. ///