Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
a2f5663
Add buffer zeroing on allocation and fix copy
Jul 10, 2018
bd1b3e4
Prefer using AllocateBuffer, zero padding where it's impossible
Jul 10, 2018
d2f0db7
Add warning on unfinished buffers and fix some problems
Jul 10, 2018
4cea9fc
Don't pass shrink_to_fit
Jul 10, 2018
e035716
Move finished state unsetting to UnsafeAppendBitmap
Jul 10, 2018
e90248c
More consistency with padding zeroing
Jul 10, 2018
d1f0ddb
Consume overflow dicts status
Jul 10, 2018
28c64b0
Fix NullBuilder and use it in python to arrow conversion
Jul 10, 2018
cf0d381
Prevent a builder unfinished warning
Jul 10, 2018
1f30d42
Formatting
Jul 10, 2018
45b66d8
Consume status
Jul 10, 2018
3c91f51
Decrease log level
Jul 12, 2018
d9ded42
Avoid raw data access on unfinished buffer in dictionary
Jul 13, 2018
633a022
Add a builder reset method and fix some tests
Jul 13, 2018
bed63f7
Remove data() and null_bitmap() methods from builder
Jul 13, 2018
bfeaa2c
Fix comment
Jul 13, 2018
b8485bc
Revert NullBuilder changes
Jul 13, 2018
515d316
Switch nones_ from NullBuilder to BooleanBuilder
Jul 13, 2018
1f712e3
Adjust orc adapter to not to use builders internals
Jul 13, 2018
8b97572
Add data() and null_bitmap() with deprecation warning
Jul 14, 2018
de9fe25
Remove outdated comments
Jul 16, 2018
c9e709a
Remove extra semicolon
Jul 16, 2018
8c8291d
Consume status in test
Jul 16, 2018
cccac05
Add AppendValues method which uses iterators and avoid copies in orc …
Jul 17, 2018
42827e8
WIP lazy iterator
Jul 18, 2018
6259a9c
Add tests and benchmark to lazy range
Jul 18, 2018
8fe8095
Fix lint warnings
Jul 18, 2018
227245a
Remove couple of unneeded things
Jul 18, 2018
114c0ad
Change the iterator category
Jul 18, 2018
c25f134
Fix comment
Jul 18, 2018
10ec6d5
Fix compiler warnings
Jul 18, 2018
dcd6111
Formatting
Jul 18, 2018
7dae17b
Fix comment
Jul 18, 2018
e912f9f
Avoid code duplication
Jul 18, 2018
6ddf882
Assert lazy iterator is const
Jul 18, 2018
f6c66db
Deduce type from 0 argument
Jul 18, 2018
25a729c
lol msvc
Jul 18, 2018
2906df2
Don't include benchmark_main in the benchmark
Jul 18, 2018
f616ac5
msvc complains about unchecked iterators
Jul 18, 2018
33b98c8
Whoops, old includes
Jul 18, 2018
8b681b0
Second try to make msvc happy about iterators
Jul 18, 2018
92d2b14
Suppress msvc compiler warning about narrowing conversion
Jul 18, 2018
924467c
Better disable warning
Jul 18, 2018
3d3d55e
Maybe like this?
Jul 18, 2018
44f02aa
Try function scope suppression
Jul 18, 2018
0a2c6cb
Handle custom template cases
Jul 18, 2018
ee591f4
Formatting
Jul 18, 2018
bca761f
Convert to the correct type
Jul 18, 2018
4e829cb
Fix lint warning about nullptr
Jul 18, 2018
e200072
More consisten handling of null values
Jul 18, 2018
3fd5cbb
Handle nullptr for iterators
Jul 19, 2018
27347bc
More precise nullptr use detection
Jul 19, 2018
95bae66
Cleanup
Jul 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/build-support/lint_cpp_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@


_STRIP_COMMENT_REGEX = re.compile('(.+)?(?=//)')
_NULLPTR_REGEX = re.compile(r'.*\bnullptr\b.*')


def _strip_comments(line):
Expand All @@ -41,7 +42,7 @@ def _strip_comments(line):
def lint_file(path):
fail_rules = [
(lambda x: '<mutex>' in x, 'Uses <mutex>'),
(lambda x: 'nullptr' in x, 'Uses nullptr')
(lambda x: re.match(_NULLPTR_REGEX, x), 'Uses nullptr')
]

with open(path) as f:
Expand Down
42 changes: 18 additions & 24 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/lazy.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

Expand Down Expand Up @@ -521,18 +522,17 @@ class ORCFileReader::Impl {
if (length == 0) {
return Status::OK();
}
int64_t start = builder->length();

const uint8_t* valid_bytes = nullptr;
if (batch->hasNulls) {
valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
}
RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length));

const source_type* source = batch->data.data() + offset;
target_type* target = reinterpret_cast<target_type*>(builder->data()->mutable_data());
auto cast_iter = internal::MakeLazyRange(
[&source](int64_t index) { return static_cast<target_type>(source[index]); },
length);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool


std::copy(source, source + length, target + start);
RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));

return Status::OK();
}
Expand All @@ -545,24 +545,18 @@ class ORCFileReader::Impl {
if (length == 0) {
return Status::OK();
}
int64_t start = builder->length();

const uint8_t* valid_bytes = nullptr;
if (batch->hasNulls) {
valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
}
RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length));

const int64_t* source = batch->data.data() + offset;
uint8_t* target = reinterpret_cast<uint8_t*>(builder->data()->mutable_data());

for (int64_t i = 0; i < length; i++) {
if (source[i]) {
BitUtil::SetBit(target, start + i);
} else {
BitUtil::ClearBit(target, start + i);
}
}
auto cast_iter = internal::MakeLazyRange(
[&source](int64_t index) { return static_cast<bool>(source[index]); }, length);

RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));

return Status::OK();
}

Expand All @@ -574,23 +568,23 @@ class ORCFileReader::Impl {
if (length == 0) {
return Status::OK();
}
int64_t start = builder->length();

const uint8_t* valid_bytes = nullptr;
if (batch->hasNulls) {
valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
}
RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length));

const int64_t* seconds = batch->data.data() + offset;
const int64_t* nanos = batch->nanoseconds.data() + offset;
int64_t* target = reinterpret_cast<int64_t*>(builder->data()->mutable_data());

for (int64_t i = 0; i < length; i++) {
// TODO: boundscheck this, as ORC supports higher resolution timestamps
// than arrow for nanosecond resolution
target[start + i] = seconds[i] * kOneSecondNanos + nanos[i];
}
auto transform_timestamp = [seconds, nanos](int64_t index) {
return seconds[index] * kOneSecondNanos + nanos[index];
};

auto transform_range = internal::MakeLazyRange(transform_timestamp, length);

RETURN_NOT_OK(builder->AppendValues(transform_range.begin(), transform_range.end(),
valid_bytes));
return Status::OK();
}

Expand Down
132 changes: 114 additions & 18 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/lazy.h"

namespace arrow {

Expand Down Expand Up @@ -241,14 +242,17 @@ TEST_F(TestArray, TestIsNullIsValidNoNulls) {
}

TEST_F(TestArray, BuildLargeInMemoryArray) {
#ifdef NDEBUG
const int64_t length = static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
#else
// use a smaller size since the insert function isn't optimized properly on debug and
// the test takes a long time to complete
const int64_t length = 2 << 24;
#endif

BooleanBuilder builder;
ASSERT_OK(builder.Reserve(length));

// Advance does not write to data, see docstring
ASSERT_OK(builder.Advance(length));
memset(builder.data()->mutable_data(), 0, BitUtil::BytesForBits(length));
std::vector<bool> zeros(length);
ASSERT_OK(builder.AppendValues(zeros));

std::shared_ptr<Array> result;
FinishAndCheckPadding(&builder, &result);
Expand All @@ -265,10 +269,10 @@ TEST_F(TestBuilder, TestReserve) {
UInt8Builder builder(pool_);

ASSERT_OK(builder.Init(10));
ASSERT_EQ(2, builder.null_bitmap()->size());
ASSERT_EQ(10, builder.capacity());

ASSERT_OK(builder.Reserve(30));
ASSERT_EQ(4, builder.null_bitmap()->size());
ASSERT_EQ(BitUtil::NextPower2(30), builder.capacity());
}

template <typename Attrs>
Expand Down Expand Up @@ -328,7 +332,6 @@ class TestPrimitiveBuilder : public TestBuilder {
ASSERT_EQ(0, builder->length());
ASSERT_EQ(0, builder->capacity());
ASSERT_EQ(0, builder->null_count());
ASSERT_EQ(nullptr, builder->data());

ASSERT_EQ(ex_null_count, result->null_count());
ASSERT_TRUE(result->Equals(*expected));
Expand Down Expand Up @@ -468,7 +471,6 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr<BooleanBuilder>
ASSERT_EQ(0, builder->length());
ASSERT_EQ(0, builder->capacity());
ASSERT_EQ(0, builder->null_count());
ASSERT_EQ(nullptr, builder->data());
}

typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PInt16,
Expand All @@ -478,13 +480,9 @@ typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PIn
TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives);

TYPED_TEST(TestPrimitiveBuilder, TestInit) {
DECL_TYPE();

int64_t n = 1000;
ASSERT_OK(this->builder_->Reserve(n));
ASSERT_EQ(BitUtil::NextPower2(n), this->builder_->capacity());
ASSERT_EQ(BitUtil::NextPower2(TypeTraits<Type>::bytes_required(n)),
this->builder_->data()->size());

// unsure if this should go in all builder classes
ASSERT_EQ(0, this->builder_->num_children());
Expand Down Expand Up @@ -711,6 +709,109 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendValues) {
this->Check(this->builder_nn_, false);
}

TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesIter) {
int64_t size = 10000;
this->RandomData(size);

ASSERT_OK(this->builder_->AppendValues(this->draws_.begin(), this->draws_.end(),
this->valid_bytes_.begin()));
ASSERT_OK(this->builder_nn_->AppendValues(this->draws_.begin(), this->draws_.end()));

ASSERT_EQ(size, this->builder_->length());
ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity());

this->Check(this->builder_, true);
this->Check(this->builder_nn_, false);
}

TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesIterNullValid) {
int64_t size = 10000;
this->RandomData(size);

ASSERT_OK(this->builder_nn_->AppendValues(this->draws_.begin(),
this->draws_.begin() + size / 2,
static_cast<uint8_t*>(nullptr)));

ASSERT_EQ(BitUtil::NextPower2(size / 2), this->builder_nn_->capacity());

ASSERT_OK(this->builder_nn_->AppendValues(this->draws_.begin() + size / 2,
this->draws_.end(),
static_cast<uint64_t*>(nullptr)));

this->Check(this->builder_nn_, false);
}

TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesLazyIter) {
DECL_T();

int64_t size = 10000;
this->RandomData(size);

auto& draws = this->draws_;
auto& valid_bytes = this->valid_bytes_;

auto doubler = [&draws](int64_t index) { return draws[index] * 2; };
auto lazy_iter = internal::MakeLazyRange(doubler, size);

ASSERT_OK(this->builder_->AppendValues(lazy_iter.begin(), lazy_iter.end(),
valid_bytes.begin()));

std::vector<T> doubled;
transform(draws.begin(), draws.end(), back_inserter(doubled),
[](T in) { return in * 2; });

std::shared_ptr<Array> result;
FinishAndCheckPadding(this->builder_.get(), &result);

std::shared_ptr<Array> expected;
ASSERT_OK(
this->builder_->AppendValues(doubled.data(), doubled.size(), valid_bytes.data()));
FinishAndCheckPadding(this->builder_.get(), &expected);

ASSERT_TRUE(expected->Equals(result));
}

TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesIterConverted) {
DECL_T();
// find type we can safely convert the tested values to and from
using conversion_type =
typename std::conditional<std::is_floating_point<T>::value, double,
typename std::conditional<std::is_unsigned<T>::value,
uint64_t, int64_t>::type>::type;

int64_t size = 10000;
this->RandomData(size);

// append convertible values
vector<conversion_type> draws_converted(this->draws_.begin(), this->draws_.end());
vector<int32_t> valid_bytes_converted(this->valid_bytes_.begin(),
this->valid_bytes_.end());

auto cast_values = internal::MakeLazyRange(
[&draws_converted](int64_t index) {
return static_cast<T>(draws_converted[index]);
},
size);
auto cast_valid = internal::MakeLazyRange(
[&valid_bytes_converted](int64_t index) {
return static_cast<bool>(valid_bytes_converted[index]);
},
size);

ASSERT_OK(this->builder_->AppendValues(cast_values.begin(), cast_values.end(),
cast_valid.begin()));
ASSERT_OK(this->builder_nn_->AppendValues(cast_values.begin(), cast_values.end()));

ASSERT_EQ(size, this->builder_->length());
ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity());

ASSERT_EQ(size, this->builder_->length());
ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity());

this->Check(this->builder_, true);
this->Check(this->builder_nn_, false);
}

TYPED_TEST(TestPrimitiveBuilder, TestZeroPadded) {
DECL_T();

Expand Down Expand Up @@ -789,15 +890,10 @@ TYPED_TEST(TestPrimitiveBuilder, TestAdvance) {
}

TYPED_TEST(TestPrimitiveBuilder, TestResize) {
DECL_TYPE();

int64_t cap = kMinBuilderCapacity * 2;

ASSERT_OK(this->builder_->Reserve(cap));
ASSERT_EQ(cap, this->builder_->capacity());

ASSERT_EQ(TypeTraits<Type>::bytes_required(cap), this->builder_->data()->size());
ASSERT_EQ(BitUtil::BytesForBits(cap), this->builder_->null_bitmap()->size());
}

TYPED_TEST(TestPrimitiveBuilder, TestReserve) {
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/buffer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ TEST(TestBuffer, Copy) {

Buffer expected(data + 5, 4);
ASSERT_TRUE(out->Equals(expected));
// assert the padding is zeroed
std::vector<uint8_t> zeros(out->capacity() - out->size());
ASSERT_EQ(0, memcmp(out->data() + out->size(), zeros.data(), zeros.size()));
}

TEST(TestBuffer, SliceBuffer) {
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/arrow/buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ Status Buffer::Copy(const int64_t start, const int64_t nbytes, MemoryPool* pool,
DCHECK_LT(start, size_);
DCHECK_LE(nbytes, size_ - start);

auto new_buffer = std::make_shared<PoolBuffer>(pool);
RETURN_NOT_OK(new_buffer->Resize(nbytes));
std::shared_ptr<ResizableBuffer> new_buffer;
RETURN_NOT_OK(AllocateResizableBuffer(pool, nbytes, &new_buffer));

std::memcpy(new_buffer->mutable_data(), data() + start, static_cast<size_t>(nbytes));

Expand Down Expand Up @@ -123,6 +123,7 @@ Status PoolBuffer::Resize(const int64_t new_size, bool shrink_to_fit) {
}
}
size_ = new_size;

return Status::OK();
}

Expand All @@ -142,6 +143,7 @@ Status AllocateBuffer(MemoryPool* pool, const int64_t size,
std::shared_ptr<Buffer>* out) {
auto buffer = std::make_shared<PoolBuffer>(pool);
RETURN_NOT_OK(buffer->Resize(size));
buffer->ZeroPadding();
*out = buffer;
return Status::OK();
}
Expand All @@ -154,6 +156,7 @@ Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size,
std::shared_ptr<ResizableBuffer>* out) {
auto buffer = std::make_shared<PoolBuffer>(pool);
RETURN_NOT_OK(buffer->Resize(size));
buffer->ZeroPadding();
*out = buffer;
return Status::OK();
}
Expand Down
Loading