Skip to content

Commit

Permalink
ARROW-47: [C++] Preliminary arrow::Scalar object model
Browse files Browse the repository at this point in the history
This is the start of a Scalar object model suitable for static and dynamic dispatch to correspond with the existing array and array builder types.

I modified the first aggregation kernel (sum) to use these types for outputs.

Author: Wes McKinney <wesm+git@apache.org>

Closes #3604 from wesm/ARROW-47 and squashes the following commits:

0d01bb3 <Wes McKinney> Fix unit test on MSVC for small integer types
d57f7aa <Wes McKinney> Remove ARROW_GTEST_VENDORED
03ca01c <Wes McKinney> Changes because MSVC tries to instantiate NumericScalar for Time/Timestamp types
271d602 <Wes McKinney> Add notes that Scalar API is experimental
e4a13b4 <Wes McKinney> flake
6626035 <Wes McKinney> Fix up date/time scalars, add tests
704daee <Wes McKinney> Code review comments
d922bd2 <Wes McKinney> Use new Scalar objects in aggregation code
fa89bd0 <Wes McKinney> Drafting, untested
94d5e62 <Wes McKinney> start
  • Loading branch information
wesm committed Feb 13, 2019
1 parent 27ba26c commit d831e2c
Show file tree
Hide file tree
Showing 15 changed files with 554 additions and 119 deletions.
2 changes: 2 additions & 0 deletions cpp/build-support/run_cpplint.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ def _check_some_files(completed_processes, filenames):
if problem_files:
msg = "{} had cpplint issues"
print("\n".join(map(msg.format, problem_files)))
if isinstance(stdout, bytes):
stdout = stdout.decode('utf8')
print(stdout, file=sys.stderr)
error = True
except Exception:
Expand Down
1 change: 1 addition & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,7 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS)
endif()

set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep")
set(GTEST_HOME ${GTEST_PREFIX})
set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include")
set(GTEST_STATIC_LIB
"${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_STATIC_LIBRARY_SUFFIX}")
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,12 @@ set(ARROW_SRCS
memory_pool.cc
pretty_print.cc
record_batch.cc
scalar.cc
sparse_tensor.cc
status.cc
table.cc
table_builder.cc
tensor.cc
sparse_tensor.cc
type.cc
visitor.cc
csv/converter.cc
Expand Down Expand Up @@ -306,6 +307,7 @@ add_arrow_test(buffer-test)
add_arrow_test(memory_pool-test)
add_arrow_test(pretty_print-test)
add_arrow_test(public-api-test)
add_arrow_test(scalar-test)
add_arrow_test(status-test)
add_arrow_test(stl-test)
add_arrow_test(type-test)
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/compute/compute-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,16 @@ void CheckImplicitConstructor(enum Datum::type expected_kind) {
}

TEST(TestDatum, ImplicitConstructors) {
CheckImplicitConstructor<Scalar>(Datum::SCALAR);

CheckImplicitConstructor<Array>(Datum::ARRAY);

// Instantiate from array subclass
CheckImplicitConstructor<BinaryArray>(Datum::ARRAY);

CheckImplicitConstructor<ChunkedArray>(Datum::CHUNKED_ARRAY);
CheckImplicitConstructor<RecordBatch>(Datum::RECORD_BATCH);

CheckImplicitConstructor<Table>(Datum::TABLE);
}

Expand Down
61 changes: 10 additions & 51 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include "arrow/array.h"
#include "arrow/record_batch.h"
#include "arrow/scalar.h"
#include "arrow/table.h"
#include "arrow/util/macros.h"
#include "arrow/util/variant.h" // IWYU pragma: export
Expand Down Expand Up @@ -55,68 +56,20 @@ class ARROW_EXPORT OpKernel {
virtual ~OpKernel() = default;
};

/// \brief Placeholder for Scalar values until we implement these
struct ARROW_EXPORT Scalar {
util::variant<bool, uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t,
int64_t, float, double>
value;

explicit Scalar(bool value) : value(value) {}
explicit Scalar(uint8_t value) : value(value) {}
explicit Scalar(int8_t value) : value(value) {}
explicit Scalar(uint16_t value) : value(value) {}
explicit Scalar(int16_t value) : value(value) {}
explicit Scalar(uint32_t value) : value(value) {}
explicit Scalar(int32_t value) : value(value) {}
explicit Scalar(uint64_t value) : value(value) {}
explicit Scalar(int64_t value) : value(value) {}
explicit Scalar(float value) : value(value) {}
explicit Scalar(double value) : value(value) {}

Type::type kind() const {
switch (this->value.which()) {
case 0:
return Type::BOOL;
case 1:
return Type::UINT8;
case 2:
return Type::INT8;
case 3:
return Type::UINT16;
case 4:
return Type::INT16;
case 5:
return Type::UINT32;
case 6:
return Type::INT32;
case 7:
return Type::UINT64;
case 8:
return Type::INT64;
case 9:
return Type::FLOAT;
case 10:
return Type::DOUBLE;
default:
return Type::NA;
}
}
};

/// \class Datum
/// \brief Variant type for various Arrow C++ data structures
struct ARROW_EXPORT Datum {
enum type { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION };

util::variant<decltype(NULLPTR), Scalar, std::shared_ptr<ArrayData>,
util::variant<decltype(NULLPTR), std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
std::shared_ptr<ChunkedArray>, std::shared_ptr<RecordBatch>,
std::shared_ptr<Table>, std::vector<Datum>>
value;

/// \brief Empty datum, to be populated elsewhere
Datum() : value(NULLPTR) {}

Datum(const Scalar& value) // NOLINT implicit conversion
Datum(const std::shared_ptr<Scalar>& value) // NOLINT implicit conversion
: value(value) {}
Datum(const std::shared_ptr<ArrayData>& value) // NOLINT implicit conversion
: value(value) {}
Expand Down Expand Up @@ -188,14 +141,18 @@ struct ARROW_EXPORT Datum {
return util::get<std::vector<Datum>>(this->value);
}

Scalar scalar() const { return util::get<Scalar>(this->value); }
std::shared_ptr<Scalar> scalar() const {
return util::get<std::shared_ptr<Scalar>>(this->value);
}

bool is_array() const { return this->kind() == Datum::ARRAY; }

bool is_arraylike() const {
return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY;
}

bool is_scalar() const { return this->kind() == Datum::SCALAR; }

/// \brief The value type of the variant, if any
///
/// \return nullptr if no type
Expand All @@ -204,6 +161,8 @@ struct ARROW_EXPORT Datum {
return util::get<std::shared_ptr<ArrayData>>(this->value)->type;
} else if (this->kind() == Datum::CHUNKED_ARRAY) {
return util::get<std::shared_ptr<ChunkedArray>>(this->value)->type();
} else if (this->kind() == Datum::SCALAR) {
return util::get<std::shared_ptr<Scalar>>(this->value)->type;
}
return NULLPTR;
}
Expand Down
69 changes: 36 additions & 33 deletions cpp/src/arrow/compute/kernels/aggregate-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include "arrow/compute/kernels/sum.h"
#include "arrow/compute/test-util.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"

#include "arrow/testing/gtest_common.h"
#include "arrow/testing/gtest_util.h"
Expand All @@ -36,47 +38,47 @@ using std::vector;
namespace arrow {
namespace compute {

template <typename CType, typename Enable = void>
template <typename Type, typename Enable = void>
struct DatumEqual {
static void EnsureEqual(const Datum& lhs, const Datum& rhs) {}
};

template <typename CType>
struct DatumEqual<CType,
typename std::enable_if<std::is_floating_point<CType>::value>::type> {
template <typename Type>
struct DatumEqual<Type, typename std::enable_if<IsFloatingPoint<Type>::Value>::type> {
static constexpr double kArbitraryDoubleErrorBound = 1.0;
using ScalarType = typename TypeTraits<Type>::ScalarType;

static void EnsureEqual(const Datum& lhs, const Datum& rhs) {
ASSERT_EQ(lhs.kind(), rhs.kind());
if (lhs.kind() == Datum::SCALAR) {
ASSERT_EQ(lhs.scalar().kind(), rhs.scalar().kind());
ASSERT_NEAR(util::get<CType>(lhs.scalar().value),
util::get<CType>(rhs.scalar().value), kArbitraryDoubleErrorBound);
auto left = static_cast<const ScalarType*>(lhs.scalar().get());
auto right = static_cast<const ScalarType*>(rhs.scalar().get());
ASSERT_EQ(left->type->id(), right->type->id());
ASSERT_NEAR(left->value, right->value, kArbitraryDoubleErrorBound);
}
}
};

template <typename CType>
struct DatumEqual<CType,
typename std::enable_if<!std::is_floating_point<CType>::value>::type> {
template <typename Type>
struct DatumEqual<Type, typename std::enable_if<!IsFloatingPoint<Type>::value>::type> {
using ScalarType = typename TypeTraits<Type>::ScalarType;
static void EnsureEqual(const Datum& lhs, const Datum& rhs) {
ASSERT_EQ(lhs.kind(), rhs.kind());
if (lhs.kind() == Datum::SCALAR) {
ASSERT_EQ(lhs.scalar().kind(), rhs.scalar().kind());
ASSERT_EQ(util::get<CType>(lhs.scalar().value),
util::get<CType>(rhs.scalar().value));
auto left = static_cast<const ScalarType*>(lhs.scalar().get());
auto right = static_cast<const ScalarType*>(rhs.scalar().get());
ASSERT_EQ(left->type->id(), right->type->id());
ASSERT_EQ(left->value, right->value);
}
}
};

template <typename ArrowType>
void ValidateSum(FunctionContext* ctx, const Array& input, Datum expected) {
using CType = typename ArrowType::c_type;
using SumType = typename FindAccumulatorType<CType>::Type;

using OutputType = typename FindAccumulatorType<ArrowType>::Type;
Datum result;
ASSERT_OK(Sum(ctx, input, &result));
DatumEqual<SumType>::EnsureEqual(result, expected);
DatumEqual<OutputType>::EnsureEqual(result, expected);
}

template <typename ArrowType>
Expand All @@ -87,11 +89,11 @@ void ValidateSum(FunctionContext* ctx, const char* json, Datum expected) {

template <typename ArrowType>
static Datum DummySum(const Array& array) {
using CType = typename ArrowType::c_type;
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
using SumType = typename FindAccumulatorType<CType>::Type;
using SumType = typename FindAccumulatorType<ArrowType>::Type;
using SumScalarType = typename TypeTraits<SumType>::ScalarType;

SumType sum = 0;
typename SumType::c_type sum = 0;
int64_t count = 0;

const auto& array_numeric = reinterpret_cast<const ArrayType&>(array);
Expand All @@ -104,7 +106,11 @@ static Datum DummySum(const Array& array) {
}
}

return (count > 0) ? Datum(Scalar(sum)) : Datum();
if (count > 0) {
return Datum(std::make_shared<SumScalarType>(sum));
} else {
return Datum(std::make_shared<SumScalarType>(0, false));
}
}

template <typename ArrowType>
Expand All @@ -115,24 +121,21 @@ void ValidateSum(FunctionContext* ctx, const Array& array) {
template <typename ArrowType>
class TestSumKernelNumeric : public ComputeFixture, public TestBase {};

typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>
NumericArrowTypes;

TYPED_TEST_CASE(TestSumKernelNumeric, NumericArrowTypes);
TYPED_TEST(TestSumKernelNumeric, SimpleSum) {
using CType = typename TypeParam::c_type;
using SumType = typename FindAccumulatorType<CType>::Type;
using SumType = typename FindAccumulatorType<TypeParam>::Type;
using ScalarType = typename TypeTraits<SumType>::ScalarType;
using T = typename TypeParam::c_type;

ValidateSum<TypeParam>(&this->ctx_, "[]", Datum());
ValidateSum<TypeParam>(&this->ctx_, "[]",
Datum(std::make_shared<ScalarType>(0, false)));

ValidateSum<TypeParam>(&this->ctx_, "[0, 1, 2, 3, 4, 5]",
Datum(Scalar(static_cast<SumType>(5 * 6 / 2))));
Datum(std::make_shared<ScalarType>(static_cast<T>(5 * 6 / 2))));

// Avoid this tests for (U)Int8Type
if (sizeof(CType) > 1)
ValidateSum<TypeParam>(&this->ctx_, "[1000, null, 300, null, 30, null, 7]",
Datum(Scalar(static_cast<SumType>(1337))));
const T expected_result = static_cast<T>(14);
ValidateSum<TypeParam>(&this->ctx_, "[1, null, 3, null, 3, null, 7]",
Datum(std::make_shared<ScalarType>(expected_result)));
}

template <typename ArrowType>
Expand Down
23 changes: 17 additions & 6 deletions cpp/src/arrow/compute/kernels/sum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
namespace arrow {
namespace compute {

template <typename CType, typename SumType = typename FindAccumulatorType<CType>::Type>
template <typename ArrowType,
typename SumType = typename FindAccumulatorType<ArrowType>::Type>
struct SumState {
using ThisType = SumState<CType, SumType>;
using ThisType = SumState<ArrowType, SumType>;

ThisType operator+(const ThisType& rhs) const {
return ThisType(this->count + rhs.count, this->sum + rhs.sum);
Expand All @@ -43,11 +44,16 @@ struct SumState {
return *this;
}

std::shared_ptr<Scalar> AsScalar() const {
using ScalarType = typename TypeTraits<SumType>::ScalarType;
return std::make_shared<ScalarType>(this->sum);
}

size_t count = 0;
SumType sum = 0;
typename SumType::c_type sum = 0;
};

template <typename ArrowType, typename StateType = SumState<typename ArrowType::c_type>>
template <typename ArrowType, typename StateType = SumState<ArrowType>>
class SumAggregateFunction final : public AggregateFunctionStaticState<StateType> {
using CType = typename TypeTraits<ArrowType>::CType;
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
Expand All @@ -71,7 +77,12 @@ class SumAggregateFunction final : public AggregateFunctionStaticState<StateType
}

Status Finalize(const StateType& src, Datum* output) const override {
*output = (src.count > 0) ? Datum(Scalar(src.sum)) : Datum();
auto boxed = src.AsScalar();
if (src.count == 0) {
// TODO(wesm): Currently null, but fix this
boxed->is_valid = false;
}
*output = boxed;
return Status::OK();
}

Expand Down Expand Up @@ -185,7 +196,7 @@ Status Sum(FunctionContext* ctx, const Datum& value, Datum* out) {
}

Status Sum(FunctionContext* ctx, const Array& array, Datum* out) {
return Sum(ctx, Datum(array.data()), out);
return Sum(ctx, array.data(), out);
}

} // namespace compute
Expand Down
19 changes: 9 additions & 10 deletions cpp/src/arrow/compute/kernels/sum.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <type_traits>

#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/visibility.h"

namespace arrow {
Expand All @@ -34,25 +36,22 @@ namespace compute {
// Find the largest compatible primitive type for a primitive type.
template <typename I, typename Enable = void>
struct FindAccumulatorType {
using Type = double;
using Type = DoubleType;
};

template <typename I>
struct FindAccumulatorType<I, typename std::enable_if<std::is_integral<I>::value &&
std::is_signed<I>::value>::type> {
using Type = int64_t;
struct FindAccumulatorType<I, typename std::enable_if<IsSignedInt<I>::value>::type> {
using Type = Int64Type;
};

template <typename I>
struct FindAccumulatorType<I, typename std::enable_if<std::is_integral<I>::value &&
std::is_unsigned<I>::value>::type> {
using Type = uint64_t;
struct FindAccumulatorType<I, typename std::enable_if<IsUnsignedInt<I>::value>::type> {
using Type = UInt64Type;
};

template <typename I>
struct FindAccumulatorType<
I, typename std::enable_if<std::is_floating_point<I>::value>::type> {
using Type = double;
struct FindAccumulatorType<I, typename std::enable_if<IsFloatingPoint<I>::value>::type> {
using Type = DoubleType;
};

struct Datum;
Expand Down
Loading

0 comments on commit d831e2c

Please sign in to comment.