Skip to content

Commit

Permalink
ARROW-1712: [C++] Add method to BinaryBuilder to reserve space for va…
Browse files Browse the repository at this point in the history
…lue data

Modified BinaryBuilder::Resize(int64_t) so that when building BinaryArrays with a known size, space is also reserved for value_data_builder_ to prevent internal reallocation.

Author: Panchen Xue <pan.panchen.xue@gmail.com>

Closes apache#1481 from xuepanchen/master and squashes the following commits:

707b67b [Panchen Xue] ARROW-1712: [C++] Fix lint errors
360e601 [Panchen Xue] Merge branch 'master' of https://github.com/xuepanchen/arrow
d4bbd15 [Panchen Xue] ARROW-1712: [C++] Modify test case for BinaryBuilder::ReserveData() and change arguments for offsets_builder_.Resize()
77f8f3c [Panchen Xue] Merge pull request apache#5 from apache/master
bc5db7d [Panchen Xue] ARROW-1712: [C++] Remove unneeded data member in BinaryBuilder and modify test case
5a5b70e [Panchen Xue] Merge pull request #4 from apache/master
8e4c892 [Panchen Xue] Merge pull request #3 from xuepanchen/xuepanchen-arrow-1712
d3c8202 [Panchen Xue] ARROW-1945: [C++] Fix a small typo
0b07895 [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data
18f90fb [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data
bbc6527 [Panchen Xue] ARROW-1945: [C++] Update test case for BinaryBuild data value space reservation
15e045c [Panchen Xue] Add test case for array-test.cc
5a5593e [Panchen Xue] Update again ReserveData(int64_t) method for BinaryBuilder
9b5e805 [Panchen Xue] Update ReserveData(int64_t) method signature for BinaryBuilder
8dd5eaa [Panchen Xue] Update builder.cc
b002e0b [Panchen Xue] Remove override keyword from ReserveData(int64_t) method for BinaryBuilder
de318f4 [Panchen Xue] Implement ReserveData(int64_t) method for BinaryBuilder
e0434e6 [Panchen Xue] Add ReserveData(int64_t) and value_data_capacity() for methods for BinaryBuilder
5ebfb32 [Panchen Xue] Add capacity() method for TypedBufferBuilder
5b73c1c [Panchen Xue] Update again BinaryBuilder::Resize(int64_t capacity) in builder.cc
d021c54 [Panchen Xue] Merge pull request #2 from xuepanchen/xuepanchen-arrow-1712
232024e [Panchen Xue] Update BinaryBuilder::Resize(int64_t capacity) in builder.cc
c2f8dc4 [Panchen Xue] Merge pull request #1 from apache/master
  • Loading branch information
xuepanchen authored and wesm committed Jan 24, 2018
1 parent 0930b1d commit 0a49022
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 4 deletions.
39 changes: 39 additions & 0 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,45 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
}
}

TEST_F(TestBinaryBuilder, TestCapacityReserve) {
vector<string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddd"};
int N = static_cast<int>(strings.size());
int reps = 15;
int64_t length = 0;
int64_t capacity = 1000;
int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity);

ASSERT_OK(builder_->ReserveData(capacity));

ASSERT_EQ(length, builder_->value_data_length());
ASSERT_EQ(expected_capacity, builder_->value_data_capacity());

for (int j = 0; j < reps; ++j) {
for (int i = 0; i < N; ++i) {
ASSERT_OK(builder_->Append(strings[i]));
length += static_cast<int>(strings[i].size());

ASSERT_EQ(length, builder_->value_data_length());
ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
}
}

int extra_capacity = 500;
expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity);

ASSERT_OK(builder_->ReserveData(extra_capacity));

ASSERT_EQ(length, builder_->value_data_length());
ASSERT_EQ(expected_capacity, builder_->value_data_capacity());

Done();

ASSERT_EQ(reps * N, result_->length());
ASSERT_EQ(0, result_->null_count());
ASSERT_EQ(reps * 40, result_->value_data()->size());
ASSERT_EQ(expected_capacity, result_->value_data()->capacity());
}

TEST_F(TestBinaryBuilder, TestZeroLength) {
// All buffers are null
Done();
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {

const T* data() const { return reinterpret_cast<const T*>(data_); }
int64_t length() const { return size_ / sizeof(T); }
int64_t capacity() const { return capacity_ / sizeof(T); }
};

/// \brief Allocate a fixed size mutable buffer from a memory pool
Expand Down
18 changes: 14 additions & 4 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1165,13 +1165,13 @@ Status ListBuilder::Init(int64_t elements) {
DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
RETURN_NOT_OK(ArrayBuilder::Init(elements));
// one more then requested for offsets
return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
}

Status ListBuilder::Resize(int64_t capacity) {
DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
// one more then requested for offsets
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
return ArrayBuilder::Resize(capacity);
}

Expand Down Expand Up @@ -1216,16 +1216,26 @@ Status BinaryBuilder::Init(int64_t elements) {
DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
RETURN_NOT_OK(ArrayBuilder::Init(elements));
// one more then requested for offsets
return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
}

Status BinaryBuilder::Resize(int64_t capacity) {
DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
// one more then requested for offsets
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
return ArrayBuilder::Resize(capacity);
}

Status BinaryBuilder::ReserveData(int64_t elements) {
if (value_data_length() + elements > value_data_capacity()) {
if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary");
}
RETURN_NOT_OK(value_data_builder_.Reserve(elements));
}
return Status::OK();
}

Status BinaryBuilder::AppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -682,10 +682,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {

Status Init(int64_t elements) override;
Status Resize(int64_t capacity) override;
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
Status ReserveData(int64_t elements);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

/// \return size of values buffer so far
int64_t value_data_length() const { return value_data_builder_.length(); }
/// \return capacity of values buffer
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }

/// Temporary access to a value.
///
Expand Down

0 comments on commit 0a49022

Please sign in to comment.