Skip to content

Commit

Permalink
ARROW-20: Add null_count_ member to Array containers, remove nullable…
Browse files Browse the repository at this point in the history
… member
  • Loading branch information
wesm committed Mar 3, 2016
1 parent e418020 commit 98be016
Show file tree
Hide file tree
Showing 25 changed files with 265 additions and 253 deletions.
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ endif()
# For CMAKE_BUILD_TYPE=Release
# -O3: Enable all compiler optimizations
# -g: Enable symbols for profiler tools (TODO: remove for shipping)
set(CXX_FLAGS_DEBUG "-ggdb")
set(CXX_FLAGS_DEBUG "-ggdb -O0")
set(CXX_FLAGS_FASTDEBUG "-ggdb -O1")
set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG")

Expand Down
57 changes: 28 additions & 29 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>

#include "arrow/array.h"
Expand All @@ -32,60 +31,60 @@
#include "arrow/util/memory-pool.h"
#include "arrow/util/status.h"

using std::string;
using std::vector;

namespace arrow {

static TypePtr int32 = TypePtr(new Int32Type());
static TypePtr int32_nn = TypePtr(new Int32Type(false));


class TestArray : public ::testing::Test {
public:
void SetUp() {
pool_ = GetDefaultMemoryPool();

auto data = std::make_shared<PoolBuffer>(pool_);
auto nulls = std::make_shared<PoolBuffer>(pool_);

ASSERT_OK(data->Resize(400));
ASSERT_OK(nulls->Resize(128));

arr_.reset(new Int32Array(100, data, nulls));
}

protected:
MemoryPool* pool_;
std::unique_ptr<Int32Array> arr_;
};


TEST_F(TestArray, TestNullable) {
std::shared_ptr<Buffer> tmp = arr_->data();
std::unique_ptr<Int32Array> arr_nn(new Int32Array(100, tmp));
TEST_F(TestArray, TestNullCount) {
auto data = std::make_shared<PoolBuffer>(pool_);
auto nulls = std::make_shared<PoolBuffer>(pool_);

ASSERT_TRUE(arr_->nullable());
ASSERT_FALSE(arr_nn->nullable());
std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, nulls));
ASSERT_EQ(10, arr->null_count());

std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
ASSERT_EQ(0, arr_no_nulls->null_count());
}


TEST_F(TestArray, TestLength) {
ASSERT_EQ(arr_->length(), 100);
auto data = std::make_shared<PoolBuffer>(pool_);
std::unique_ptr<Int32Array> arr(new Int32Array(100, data));
ASSERT_EQ(arr->length(), 100);
}

TEST_F(TestArray, TestIsNull) {
vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 0, 1};
std::vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 0, 1};
int32_t null_count = 0;
for (uint8_t x : nulls) {
if (x > 0) ++null_count;
}

std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(), nulls.size());
std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(),
nulls.size());
std::unique_ptr<Array> arr;
arr.reset(new Array(int32, nulls.size(), null_buf));
arr.reset(new Array(int32, nulls.size(), null_count, null_buf));

ASSERT_EQ(null_count, arr->null_count());
ASSERT_EQ(5, null_buf->size());

ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get()));

ASSERT_EQ(null_buf->size(), 5);
for (size_t i = 0; i < nulls.size(); ++i) {
ASSERT_EQ(static_cast<bool>(nulls[i]), arr->IsNull(i));
}
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,26 @@

#include "arrow/array.h"

#include <cstdint>

#include "arrow/util/buffer.h"

namespace arrow {

// ----------------------------------------------------------------------
// Base array class

Array::Array(const TypePtr& type, int64_t length,
Array::Array(const TypePtr& type, int32_t length, int32_t null_count,
const std::shared_ptr<Buffer>& nulls) {
Init(type, length, nulls);
Init(type, length, null_count, nulls);
}

void Array::Init(const TypePtr& type, int64_t length,
void Array::Init(const TypePtr& type, int32_t length, int32_t null_count,
const std::shared_ptr<Buffer>& nulls) {
type_ = type;
length_ = length;
null_count_ = null_count;
nulls_ = nulls;

nullable_ = type->nullable;
if (nulls_) {
null_bits_ = nulls_->data();
}
Expand Down
37 changes: 24 additions & 13 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,38 +30,49 @@ namespace arrow {
class Buffer;

// Immutable data array with some logical type and some length. Any memory is
// owned by the respective Buffer instance (or its parents). May or may not be
// nullable.
// owned by the respective Buffer instance (or its parents).
//
// The base class only has a null array (if the data type is nullable)
// The base class is only required to have a nulls buffer if the null count is
// greater than 0
//
// Any buffers used to initialize the array have their references "stolen". If
// you wish to use the buffer beyond the lifetime of the array, you need to
// explicitly increment its reference count
class Array {
public:
Array() : length_(0), nulls_(nullptr), null_bits_(nullptr) {}
Array(const TypePtr& type, int64_t length,
Array() :
null_count_(0),
length_(0),
nulls_(nullptr),
null_bits_(nullptr) {}

Array(const TypePtr& type, int32_t length, int32_t null_count = 0,
const std::shared_ptr<Buffer>& nulls = nullptr);

virtual ~Array() {}

void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& nulls);
void Init(const TypePtr& type, int32_t length, int32_t null_count,
const std::shared_ptr<Buffer>& nulls);

// Determine if a slot if null. For inner loops. Does *not* boundscheck
bool IsNull(int64_t i) const {
return nullable_ && util::get_bit(null_bits_, i);
// Determine if a slot is null. For inner loops. Does *not* boundscheck
bool IsNull(int i) const {
return null_count_ > 0 && util::get_bit(null_bits_, i);
}

int64_t length() const { return length_;}
bool nullable() const { return nullable_;}
int32_t length() const { return length_;}
int32_t null_count() const { return null_count_;}

const TypePtr& type() const { return type_;}
TypeEnum type_enum() const { return type_->type;}

const std::shared_ptr<Buffer>& nulls() const {
return nulls_;
}

protected:
TypePtr type_;
bool nullable_;
int64_t length_;
int32_t null_count_;
int32_t length_;

std::shared_ptr<Buffer> nulls_;
const uint8_t* null_bits_;
Expand Down
35 changes: 15 additions & 20 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,34 +25,29 @@

namespace arrow {

Status ArrayBuilder::Init(int64_t capacity) {
Status ArrayBuilder::Init(int32_t capacity) {
capacity_ = capacity;

if (nullable_) {
int64_t to_alloc = util::ceil_byte(capacity) / 8;
nulls_ = std::make_shared<PoolBuffer>(pool_);
RETURN_NOT_OK(nulls_->Resize(to_alloc));
null_bits_ = nulls_->mutable_data();
memset(null_bits_, 0, to_alloc);
}
int32_t to_alloc = util::ceil_byte(capacity) / 8;
nulls_ = std::make_shared<PoolBuffer>(pool_);
RETURN_NOT_OK(nulls_->Resize(to_alloc));
null_bits_ = nulls_->mutable_data();
memset(null_bits_, 0, to_alloc);
return Status::OK();
}

Status ArrayBuilder::Resize(int64_t new_bits) {
if (nullable_) {
int64_t new_bytes = util::ceil_byte(new_bits) / 8;
int64_t old_bytes = nulls_->size();
RETURN_NOT_OK(nulls_->Resize(new_bytes));
null_bits_ = nulls_->mutable_data();
if (old_bytes < new_bytes) {
memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes);
}
Status ArrayBuilder::Resize(int32_t new_bits) {
int32_t new_bytes = util::ceil_byte(new_bits) / 8;
int32_t old_bytes = nulls_->size();
RETURN_NOT_OK(nulls_->Resize(new_bytes));
null_bits_ = nulls_->mutable_data();
if (old_bytes < new_bytes) {
memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes);
}
return Status::OK();
}

Status ArrayBuilder::Advance(int64_t elements) {
if (nullable_ && length_ + elements > capacity_) {
Status ArrayBuilder::Advance(int32_t elements) {
if (length_ + elements > capacity_) {
return Status::Invalid("Builder must be expanded");
}
length_ += elements;
Expand Down
29 changes: 15 additions & 14 deletions cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,17 @@ class Array;
class MemoryPool;
class PoolBuffer;

static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8;
static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8;

// Base class for all data array builders
class ArrayBuilder {
public:
explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) :
pool_(pool),
type_(type),
nullable_(type_->nullable),
nulls_(nullptr), null_bits_(nullptr),
nulls_(nullptr),
null_count_(0),
null_bits_(nullptr),
length_(0),
capacity_(0) {}

Expand All @@ -57,21 +58,21 @@ class ArrayBuilder {
return children_.size();
}

int64_t length() const { return length_;}
int64_t capacity() const { return capacity_;}
bool nullable() const { return nullable_;}
int32_t length() const { return length_;}
int32_t null_count() const { return null_count_;}
int32_t capacity() const { return capacity_;}

// Allocates requires memory at this level, but children need to be
// initialized independently
Status Init(int64_t capacity);
Status Init(int32_t capacity);

// Resizes the nulls array (if nullable)
Status Resize(int64_t new_bits);
// Resizes the nulls array
Status Resize(int32_t new_bits);

// For cases where raw data was memcpy'd into the internal buffers, allows us
// to advance the length of the builder. It is your responsibility to use
// this function responsibly.
Status Advance(int64_t elements);
Status Advance(int32_t elements);

const std::shared_ptr<PoolBuffer>& nulls() const { return nulls_;}

Expand All @@ -83,15 +84,15 @@ class ArrayBuilder {
MemoryPool* pool_;

TypePtr type_;
bool nullable_;

// If the type is not nullable, then null_ is nullptr after initialization
// When nulls are first appended to the builder, the null bitmap is allocated
std::shared_ptr<PoolBuffer> nulls_;
int32_t null_count_;
uint8_t* null_bits_;

// Array length, so far. Also, the index of the next element to be added
int64_t length_;
int64_t capacity_;
int32_t length_;
int32_t capacity_;

// Child value array builders. These are owned by this class
std::vector<std::unique_ptr<ArrayBuilder> > children_;
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/test-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,16 @@ void random_nulls(int64_t n, double pct_null, std::vector<bool>* nulls) {
}
}

static inline int null_count(const std::vector<uint8_t>& nulls) {
int result = 0;
for (size_t i = 0; i < nulls.size(); ++i) {
if (nulls[i] > 0) {
++result;
}
}
return result;
}

std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) {
std::shared_ptr<Buffer> out;

Expand Down
12 changes: 4 additions & 8 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,9 @@ struct LayoutType {
// either a primitive physical type (bytes or bits of some fixed size), a
// nested type consisting of other data types, or another data type (e.g. a
// timestamp encoded as an int64)
//
// Any data type can be nullable

enum class TypeEnum: char {
// A degerate NULL type represented as 0 bytes/bits
// A degenerate NULL type represented as 0 bytes/bits
NA = 0,

// Little-endian integer types
Expand Down Expand Up @@ -138,14 +136,12 @@ enum class TypeEnum: char {

struct DataType {
TypeEnum type;
bool nullable;

explicit DataType(TypeEnum type, bool nullable = true)
: type(type), nullable(nullable) {}
explicit DataType(TypeEnum type)
: type(type) {}

virtual bool Equals(const DataType* other) {
return (this == other) || (this->type == other->type &&
this->nullable == other->nullable);
return this == other || this->type == other->type;
}

virtual std::string ToString() const = 0;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/types/collection.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ template <TypeEnum T>
struct CollectionType : public DataType {
std::vector<TypePtr> child_types_;

explicit CollectionType(bool nullable = true) : DataType(T, nullable) {}
CollectionType() : DataType(T) {}

const TypePtr& child(int i) const {
return child_types_[i];
Expand Down
Loading

0 comments on commit 98be016

Please sign in to comment.