Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-20: Add null_count_ member to array containers, remove nullable_ member #9

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ endif()
# For CMAKE_BUILD_TYPE=Release
# -O3: Enable all compiler optimizations
# -g: Enable symbols for profiler tools (TODO: remove for shipping)
set(CXX_FLAGS_DEBUG "-ggdb")
set(CXX_FLAGS_DEBUG "-ggdb -O0")
set(CXX_FLAGS_FASTDEBUG "-ggdb -O1")
set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG")

Expand Down
57 changes: 28 additions & 29 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>

#include "arrow/array.h"
Expand All @@ -32,60 +31,60 @@
#include "arrow/util/memory-pool.h"
#include "arrow/util/status.h"

using std::string;
using std::vector;

namespace arrow {

static TypePtr int32 = TypePtr(new Int32Type());
static TypePtr int32_nn = TypePtr(new Int32Type(false));


class TestArray : public ::testing::Test {
public:
void SetUp() {
pool_ = GetDefaultMemoryPool();

auto data = std::make_shared<PoolBuffer>(pool_);
auto nulls = std::make_shared<PoolBuffer>(pool_);

ASSERT_OK(data->Resize(400));
ASSERT_OK(nulls->Resize(128));

arr_.reset(new Int32Array(100, data, nulls));
}

protected:
MemoryPool* pool_;
std::unique_ptr<Int32Array> arr_;
};


TEST_F(TestArray, TestNullable) {
std::shared_ptr<Buffer> tmp = arr_->data();
std::unique_ptr<Int32Array> arr_nn(new Int32Array(100, tmp));
TEST_F(TestArray, TestNullCount) {
auto data = std::make_shared<PoolBuffer>(pool_);
auto nulls = std::make_shared<PoolBuffer>(pool_);

ASSERT_TRUE(arr_->nullable());
ASSERT_FALSE(arr_nn->nullable());
std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, nulls));
ASSERT_EQ(10, arr->null_count());

std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
ASSERT_EQ(0, arr_no_nulls->null_count());
}


TEST_F(TestArray, TestLength) {
ASSERT_EQ(arr_->length(), 100);
auto data = std::make_shared<PoolBuffer>(pool_);
std::unique_ptr<Int32Array> arr(new Int32Array(100, data));
ASSERT_EQ(arr->length(), 100);
}

TEST_F(TestArray, TestIsNull) {
vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 0, 1};
std::vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 0, 1};
int32_t null_count = 0;
for (uint8_t x : nulls) {
if (x > 0) ++null_count;
}

std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(), nulls.size());
std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(),
nulls.size());
std::unique_ptr<Array> arr;
arr.reset(new Array(int32, nulls.size(), null_buf));
arr.reset(new Array(int32, nulls.size(), null_count, null_buf));

ASSERT_EQ(null_count, arr->null_count());
ASSERT_EQ(5, null_buf->size());

ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get()));

ASSERT_EQ(null_buf->size(), 5);
for (size_t i = 0; i < nulls.size(); ++i) {
ASSERT_EQ(static_cast<bool>(nulls[i]), arr->IsNull(i));
}
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,26 @@

#include "arrow/array.h"

#include <cstdint>

#include "arrow/util/buffer.h"

namespace arrow {

// ----------------------------------------------------------------------
// Base array class

Array::Array(const TypePtr& type, int64_t length,
Array::Array(const TypePtr& type, int32_t length, int32_t null_count,
const std::shared_ptr<Buffer>& nulls) {
Init(type, length, nulls);
Init(type, length, null_count, nulls);
}

void Array::Init(const TypePtr& type, int64_t length,
void Array::Init(const TypePtr& type, int32_t length, int32_t null_count,
const std::shared_ptr<Buffer>& nulls) {
type_ = type;
length_ = length;
null_count_ = null_count;
nulls_ = nulls;

nullable_ = type->nullable;
if (nulls_) {
null_bits_ = nulls_->data();
}
Expand Down
37 changes: 24 additions & 13 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,38 +30,49 @@ namespace arrow {
class Buffer;

// Immutable data array with some logical type and some length. Any memory is
// owned by the respective Buffer instance (or its parents). May or may not be
// nullable.
// owned by the respective Buffer instance (or its parents).
//
// The base class only has a null array (if the data type is nullable)
// The base class is only required to have a nulls buffer if the null count is
// greater than 0
//
// Any buffers used to initialize the array have their references "stolen". If
// you wish to use the buffer beyond the lifetime of the array, you need to
// explicitly increment its reference count
class Array {
public:
Array() : length_(0), nulls_(nullptr), null_bits_(nullptr) {}
Array(const TypePtr& type, int64_t length,
Array() :
null_count_(0),
length_(0),
nulls_(nullptr),
null_bits_(nullptr) {}

Array(const TypePtr& type, int32_t length, int32_t null_count = 0,
const std::shared_ptr<Buffer>& nulls = nullptr);

virtual ~Array() {}

void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& nulls);
void Init(const TypePtr& type, int32_t length, int32_t null_count,
const std::shared_ptr<Buffer>& nulls);

// Determine if a slot if null. For inner loops. Does *not* boundscheck
bool IsNull(int64_t i) const {
return nullable_ && util::get_bit(null_bits_, i);
// Determine if a slot is null. For inner loops. Does *not* boundscheck
bool IsNull(int i) const {
return null_count_ > 0 && util::get_bit(null_bits_, i);
}

int64_t length() const { return length_;}
bool nullable() const { return nullable_;}
int32_t length() const { return length_;}
int32_t null_count() const { return null_count_;}

const TypePtr& type() const { return type_;}
TypeEnum type_enum() const { return type_->type;}

const std::shared_ptr<Buffer>& nulls() const {
return nulls_;
}

protected:
TypePtr type_;
bool nullable_;
int64_t length_;
int32_t null_count_;
int32_t length_;

std::shared_ptr<Buffer> nulls_;
const uint8_t* null_bits_;
Expand Down
35 changes: 15 additions & 20 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,34 +25,29 @@

namespace arrow {

Status ArrayBuilder::Init(int64_t capacity) {
Status ArrayBuilder::Init(int32_t capacity) {
capacity_ = capacity;

if (nullable_) {
int64_t to_alloc = util::ceil_byte(capacity) / 8;
nulls_ = std::make_shared<PoolBuffer>(pool_);
RETURN_NOT_OK(nulls_->Resize(to_alloc));
null_bits_ = nulls_->mutable_data();
memset(null_bits_, 0, to_alloc);
}
int32_t to_alloc = util::ceil_byte(capacity) / 8;
nulls_ = std::make_shared<PoolBuffer>(pool_);
RETURN_NOT_OK(nulls_->Resize(to_alloc));
null_bits_ = nulls_->mutable_data();
memset(null_bits_, 0, to_alloc);
return Status::OK();
}

Status ArrayBuilder::Resize(int64_t new_bits) {
if (nullable_) {
int64_t new_bytes = util::ceil_byte(new_bits) / 8;
int64_t old_bytes = nulls_->size();
RETURN_NOT_OK(nulls_->Resize(new_bytes));
null_bits_ = nulls_->mutable_data();
if (old_bytes < new_bytes) {
memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes);
}
Status ArrayBuilder::Resize(int32_t new_bits) {
int32_t new_bytes = util::ceil_byte(new_bits) / 8;
int32_t old_bytes = nulls_->size();
RETURN_NOT_OK(nulls_->Resize(new_bytes));
null_bits_ = nulls_->mutable_data();
if (old_bytes < new_bytes) {
memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes);
}
return Status::OK();
}

Status ArrayBuilder::Advance(int64_t elements) {
if (nullable_ && length_ + elements > capacity_) {
Status ArrayBuilder::Advance(int32_t elements) {
if (length_ + elements > capacity_) {
return Status::Invalid("Builder must be expanded");
}
length_ += elements;
Expand Down
29 changes: 15 additions & 14 deletions cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,17 @@ class Array;
class MemoryPool;
class PoolBuffer;

static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8;
static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8;

// Base class for all data array builders
class ArrayBuilder {
public:
explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) :
pool_(pool),
type_(type),
nullable_(type_->nullable),
nulls_(nullptr), null_bits_(nullptr),
nulls_(nullptr),
null_count_(0),
null_bits_(nullptr),
length_(0),
capacity_(0) {}

Expand All @@ -57,21 +58,21 @@ class ArrayBuilder {
return children_.size();
}

int64_t length() const { return length_;}
int64_t capacity() const { return capacity_;}
bool nullable() const { return nullable_;}
int32_t length() const { return length_;}
int32_t null_count() const { return null_count_;}
int32_t capacity() const { return capacity_;}

// Allocates requires memory at this level, but children need to be
// initialized independently
Status Init(int64_t capacity);
Status Init(int32_t capacity);

// Resizes the nulls array (if nullable)
Status Resize(int64_t new_bits);
// Resizes the nulls array
Status Resize(int32_t new_bits);

// For cases where raw data was memcpy'd into the internal buffers, allows us
// to advance the length of the builder. It is your responsibility to use
// this function responsibly.
Status Advance(int64_t elements);
Status Advance(int32_t elements);

const std::shared_ptr<PoolBuffer>& nulls() const { return nulls_;}

Expand All @@ -83,15 +84,15 @@ class ArrayBuilder {
MemoryPool* pool_;

TypePtr type_;
bool nullable_;

// If the type is not nullable, then null_ is nullptr after initialization
// When nulls are first appended to the builder, the null bitmap is allocated
std::shared_ptr<PoolBuffer> nulls_;
int32_t null_count_;
uint8_t* null_bits_;

// Array length, so far. Also, the index of the next element to be added
int64_t length_;
int64_t capacity_;
int32_t length_;
int32_t capacity_;

// Child value array builders. These are owned by this class
std::vector<std::unique_ptr<ArrayBuilder> > children_;
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/test-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ void random_nulls(int64_t n, double pct_null, std::vector<bool>* nulls) {
}
}

static inline int null_count(const std::vector<uint8_t>& nulls) {
int result = 0;
for (size_t i = 0; i < nulls.size(); ++i) {
if (nulls[i] > 0) {
++result;
}
}
return result;
}

std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) {
std::shared_ptr<Buffer> out;

Expand Down
12 changes: 4 additions & 8 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,9 @@ struct LayoutType {
// either a primitive physical type (bytes or bits of some fixed size), a
// nested type consisting of other data types, or another data type (e.g. a
// timestamp encoded as an int64)
//
// Any data type can be nullable

enum class TypeEnum: char {
// A degerate NULL type represented as 0 bytes/bits
// A degenerate NULL type represented as 0 bytes/bits
NA = 0,

// Little-endian integer types
Expand Down Expand Up @@ -138,14 +136,12 @@ enum class TypeEnum: char {

struct DataType {
TypeEnum type;
bool nullable;

explicit DataType(TypeEnum type, bool nullable = true)
: type(type), nullable(nullable) {}
explicit DataType(TypeEnum type)
: type(type) {}

virtual bool Equals(const DataType* other) {
return (this == other) || (this->type == other->type &&
this->nullable == other->nullable);
return this == other || this->type == other->type;
}

virtual std::string ToString() const = 0;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/types/collection.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ template <TypeEnum T>
struct CollectionType : public DataType {
std::vector<TypePtr> child_types_;

explicit CollectionType(bool nullable = true) : DataType(T, nullable) {}
CollectionType() : DataType(T) {}

const TypePtr& child(int i) const {
return child_types_[i];
Expand Down
Loading