diff --git a/.travis.yml b/.travis.yml index b219b03e0eb..f74a3b205c4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ addons: - valgrind - libboost-dev - libboost-filesystem-dev + - libboost-regex-dev - libboost-system-dev - libjemalloc-dev - gtk-doc-tools diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9947a34e4e7..5852fe59da0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -398,30 +398,36 @@ if (ARROW_BOOST_USE_SHARED) add_definitions(-DBOOST_ALL_DYN_LINK) endif() - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS system filesystem regex REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_shared) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + set(BOOST_REGEX_LIBRARY boost_regex_shared) else() # Find static boost headers and libs # TODO Differentiate here between release and debug builds set(Boost_USE_STATIC_LIBS ON) - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS system filesystem regex REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY 
boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) + set(BOOST_REGEX_LIBRARY boost_regex_static) endif() message(STATUS "Boost include dir: " ${Boost_INCLUDE_DIRS}) @@ -435,7 +441,11 @@ ADD_THIRDPARTY_LIB(boost_filesystem STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}" SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}") -SET(ARROW_BOOST_LIBS boost_system boost_filesystem) +ADD_THIRDPARTY_LIB(boost_regex + STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}" + SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}") + +SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex) include_directories(SYSTEM ${Boost_INCLUDE_DIR}) @@ -695,14 +705,16 @@ endif() set(ARROW_MIN_TEST_LIBS arrow_static arrow_test_main - ${ARROW_BASE_LIBS}) + ${ARROW_BASE_LIBS} + ${BOOST_REGEX_LIBRARY}) set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) set(ARROW_BENCHMARK_LINK_LIBS arrow_static arrow_benchmark_main - ${ARROW_BASE_LIBS}) + ${ARROW_BASE_LIBS} + ${BOOST_REGEX_LIBRARY}) ############################################################ # "make ctags" target @@ -796,7 +808,7 @@ endif() ############################################################ set(ARROW_LINK_LIBS -) + ${BOOST_REGEX_LIBRARY}) set(ARROW_PRIVATE_LINK_LIBS ) @@ -816,6 +828,7 @@ set(ARROW_SRCS src/arrow/visitor.cc src/arrow/util/bit-util.cc + src/arrow/util/decimal.cc ) if(NOT APPLE AND NOT MSVC) @@ -825,9 +838,11 @@ if(NOT APPLE AND NOT MSVC) set(ARROW_SHARED_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/src/arrow/symbols.map") endif() + ADD_ARROW_LIB(arrow SOURCES ${ARROW_SRCS} SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} + SHARED_LINK_LIBS ${ARROW_LINK_LIBS} ) add_subdirectory(src/arrow) diff --git a/cpp/cmake_modules/FindPythonLibsNew.cmake b/cpp/cmake_modules/FindPythonLibsNew.cmake index dfe5661b015..d9cc4b39557 100644 --- a/cpp/cmake_modules/FindPythonLibsNew.cmake +++ b/cpp/cmake_modules/FindPythonLibsNew.cmake @@ -175,7 +175,8 @@ else() find_library(PYTHON_LIBRARY NAMES 
"python${PYTHON_LIBRARY_SUFFIX}" PATHS ${_PYTHON_LIBS_SEARCH} - NO_SYSTEM_ENVIRONMENT_PATH) + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_SYSTEM_PATH) message(STATUS "Found Python lib ${PYTHON_LIBRARY}") endif() diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc index b64023bbc6a..4c01f928a6f 100644 --- a/cpp/src/arrow/array-decimal-test.cc +++ b/cpp/src/arrow/array-decimal-test.cc @@ -15,13 +15,16 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/type.h" #include "gtest/gtest.h" -#include "arrow/type.h" +#include "arrow/builder.h" +#include "arrow/test-util.h" +#include "arrow/util/decimal.h" namespace arrow { -TEST(TypesTest, TestDecimalType) { +TEST(TypesTest, TestDecimal32Type) { DecimalType t1(8, 4); ASSERT_EQ(t1.type, Type::DECIMAL); @@ -29,6 +32,193 @@ TEST(TypesTest, TestDecimalType) { ASSERT_EQ(t1.scale, 4); ASSERT_EQ(t1.ToString(), std::string("decimal(8, 4)")); + + // Test properties + ASSERT_EQ(t1.byte_width(), 4); + ASSERT_EQ(t1.bit_width(), 32); } +TEST(TypesTest, TestDecimal64Type) { + DecimalType t1(12, 5); + + ASSERT_EQ(t1.type, Type::DECIMAL); + ASSERT_EQ(t1.precision, 12); + ASSERT_EQ(t1.scale, 5); + + ASSERT_EQ(t1.ToString(), std::string("decimal(12, 5)")); + + // Test properties + ASSERT_EQ(t1.byte_width(), 8); + ASSERT_EQ(t1.bit_width(), 64); +} + +TEST(TypesTest, TestDecimal128Type) { + DecimalType t1(27, 7); + + ASSERT_EQ(t1.type, Type::DECIMAL); + ASSERT_EQ(t1.precision, 27); + ASSERT_EQ(t1.scale, 7); + + ASSERT_EQ(t1.ToString(), std::string("decimal(27, 7)")); + + // Test properties + ASSERT_EQ(t1.byte_width(), 16); + ASSERT_EQ(t1.bit_width(), 128); +} + +template +class DecimalTestBase { + public: + virtual std::vector data( + const std::vector& input, size_t byte_width) const = 0; + + void test(int precision, const std::vector& draw, + const std::vector& valid_bytes, + const std::vector& sign_bitmap = {}, int64_t offset = 0) const { + auto type = 
std::make_shared(precision, 4); + int byte_width = type->byte_width(); + auto pool = default_memory_pool(); + auto builder = std::make_shared(pool, type); + size_t null_count = 0; + + size_t size = draw.size(); + builder->Reserve(size); + + for (size_t i = 0; i < size; ++i) { + if (valid_bytes[i]) { + builder->Append(draw[i]); + } else { + builder->AppendNull(); + ++null_count; + } + } + + std::shared_ptr expected_sign_bitmap; + if (!sign_bitmap.empty()) { + BitUtil::BytesToBits(sign_bitmap, &expected_sign_bitmap); + } + + auto raw_bytes = data(draw, byte_width); + auto expected_data = std::make_shared(raw_bytes.data(), size * byte_width); + auto expected_null_bitmap = test::bytes_to_null_buffer(valid_bytes); + int64_t expected_null_count = test::null_count(valid_bytes); + auto expected = std::make_shared(type, size, expected_data, + expected_null_bitmap, expected_null_count, offset, expected_sign_bitmap); + + std::shared_ptr out; + ASSERT_OK(builder->Finish(&out)); + ASSERT_TRUE(out->Equals(*expected)); + } +}; + +template +class DecimalTest : public DecimalTestBase { + public: + std::vector data( + const std::vector& input, size_t byte_width) const override { + std::vector result; + result.resize(input.size() * byte_width); // must be sized: written via result[j] + // TODO(phillipc): There's probably a better way to do this + constexpr static const size_t bytes_per_element = sizeof(T); + for (size_t i = 0, j = 0; i < input.size(); ++i, j += bytes_per_element) { + *reinterpret_cast(&result[j]) = input[i].value; + } + return result; + } +}; + +template <> +class DecimalTest : public DecimalTestBase { + public: + std::vector data( + const std::vector& input, size_t byte_width) const override { + std::vector result; + result.reserve(input.size() * byte_width); + constexpr static const size_t bytes_per_element = 16; + for (size_t i = 0; i < input.size(); ++i) { + uint8_t stack_bytes[bytes_per_element] = {0}; + uint8_t* bytes = stack_bytes; + bool is_negative; + ToBytes(input[i], &bytes, &is_negative); + + 
for (size_t i = 0; i < bytes_per_element; ++i) { + result.push_back(bytes[i]); + } + } + return result; + } +}; + +class Decimal32BuilderTest : public ::testing::TestWithParam, + public DecimalTest {}; + +class Decimal64BuilderTest : public ::testing::TestWithParam, + public DecimalTest {}; + +class Decimal128BuilderTest : public ::testing::TestWithParam, + public DecimalTest {}; + +TEST_P(Decimal32BuilderTest, NoNulls) { + int precision = GetParam(); + std::vector draw = { + Decimal32(1), Decimal32(2), Decimal32(2389), Decimal32(4), Decimal32(-12348)}; + std::vector valid_bytes = {true, true, true, true, true}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal64BuilderTest, NoNulls) { + int precision = GetParam(); + std::vector draw = { + Decimal64(1), Decimal64(2), Decimal64(2389), Decimal64(4), Decimal64(-12348)}; + std::vector valid_bytes = {true, true, true, true, true}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal128BuilderTest, NoNulls) { + int precision = GetParam(); + std::vector draw = { + Decimal128(1), Decimal128(-2), Decimal128(2389), Decimal128(4), Decimal128(-12348)}; + std::vector valid_bytes = {true, true, true, true, true}; + std::vector sign_bitmap = {false, true, false, false, true}; + this->test(precision, draw, valid_bytes, sign_bitmap); +} + +TEST_P(Decimal32BuilderTest, WithNulls) { + int precision = GetParam(); + std::vector draw = { + Decimal32(1), Decimal32(2), Decimal32(-1), Decimal32(4), Decimal32(-1)}; + std::vector valid_bytes = {true, true, false, true, false}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal64BuilderTest, WithNulls) { + int precision = GetParam(); + std::vector draw = { + Decimal64(-1), Decimal64(2), Decimal64(-1), Decimal64(4), Decimal64(-1)}; + std::vector valid_bytes = {true, true, false, true, false}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal128BuilderTest, WithNulls) { + int precision = GetParam(); + std::vector draw = {Decimal128(1), 
Decimal128(2), Decimal128(-1), + Decimal128(4), Decimal128(-1), Decimal128(1), Decimal128(2), + Decimal128("230342903942.234234"), Decimal128("-23049302932.235234")}; + std::vector valid_bytes = { + true, true, false, true, false, true, true, true, true}; + std::vector sign_bitmap = { + false, false, false, false, false, false, false, false, true}; + this->test(precision, draw, valid_bytes, sign_bitmap); +} + +INSTANTIATE_TEST_CASE_P(Decimal32BuilderTest, Decimal32BuilderTest, + ::testing::Range( + DecimalPrecision::minimum, DecimalPrecision::maximum)); +INSTANTIATE_TEST_CASE_P(Decimal64BuilderTest, Decimal64BuilderTest, + ::testing::Range( + DecimalPrecision::minimum, DecimalPrecision::maximum)); +INSTANTIATE_TEST_CASE_P(Decimal128BuilderTest, Decimal128BuilderTest, + ::testing::Range( + DecimalPrecision::minimum, DecimalPrecision::maximum)); + } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index bd20654bc87..4e73e7176fa 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -27,6 +27,7 @@ #include "arrow/status.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/visitor.h" #include "arrow/visitor_inline.h" @@ -283,10 +284,8 @@ std::shared_ptr StringArray::Slice(int64_t offset, int64_t length) const FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) - : PrimitiveArray(type, length, data, null_bitmap, null_count, offset) { - DCHECK(type->type == Type::FIXED_SIZE_BINARY); - byte_width_ = static_cast(*type).byte_width(); -} + : PrimitiveArray(type, length, data, null_bitmap, null_count, offset), + byte_width_(static_cast(*type).byte_width()) {} std::shared_ptr FixedSizeBinaryArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(offset_, length_, &offset, &length); @@ 
-294,6 +293,49 @@ std::shared_ptr FixedSizeBinaryArray::Slice(int64_t offset, int64_t lengt type_, length, data_, null_bitmap_, kUnknownNullCount, offset); } +const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const { + return raw_data_ + (i + offset_) * byte_width_; +} + +// ---------------------------------------------------------------------- +// Decimal +DecimalArray::DecimalArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset, const std::shared_ptr& sign_bitmap) + : FixedSizeBinaryArray(type, length, data, null_bitmap, null_count, offset), + sign_bitmap_(sign_bitmap), + sign_bitmap_data_(sign_bitmap != nullptr ? sign_bitmap->data() : nullptr) {} + +bool DecimalArray::IsNegative(int64_t i) const { + // Slice() shares sign_bitmap_, so index by i + offset_ exactly as GetValue() does. + return sign_bitmap_data_ != nullptr ? BitUtil::GetBit(sign_bitmap_data_, i + offset_) : false; +} + +template +ARROW_EXPORT Decimal DecimalArray::Value(int64_t i) const { + Decimal result; + FromBytes(GetValue(i), &result); + return result; +} + +template ARROW_EXPORT Decimal32 DecimalArray::Value(int64_t i) const; +template ARROW_EXPORT Decimal64 DecimalArray::Value(int64_t i) const; + +template <> +ARROW_EXPORT Decimal128 DecimalArray::Value(int64_t i) const { + Decimal128 result; + FromBytes(GetValue(i), IsNegative(i), &result); + return result; +} + +template ARROW_EXPORT Decimal128 DecimalArray::Value(int64_t i) const; + +std::shared_ptr DecimalArray::Slice(int64_t offset, int64_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared( + type_, length, data_, null_bitmap_, kUnknownNullCount, offset, sign_bitmap_); +} + // ---------------------------------------------------------------------- // Struct diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 9f0e73914da..a4117facdef 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -39,6 +39,9 @@ class MemoryPool; class MutableBuffer; class Status; 
+template +struct Decimal; + /// Immutable data array with some logical type and some length. /// /// Any memory is owned by the respective Buffer instance (or its parents). @@ -356,9 +359,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, int64_t offset = 0); - const uint8_t* GetValue(int64_t i) const { - return raw_data_ + (i + offset_) * byte_width_; - } + const uint8_t* GetValue(int64_t i) const; int32_t byte_width() const { return byte_width_; } @@ -370,6 +371,30 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { int32_t byte_width_; }; +// ---------------------------------------------------------------------- +// DecimalArray +class ARROW_EXPORT DecimalArray : public FixedSizeBinaryArray { + public: + using TypeClass = Type; + + DecimalArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, int64_t null_count = 0, + int64_t offset = 0, const std::shared_ptr& sign_bitmap = nullptr); + + bool IsNegative(int64_t i) const; + + template + ARROW_EXPORT Decimal Value(int64_t i) const; + + std::shared_ptr Slice(int64_t offset, int64_t length) const override; + + private: + /// Only needed for 128 bit Decimals + std::shared_ptr sign_bitmap_; + const uint8_t* sign_bitmap_data_; +}; + // ---------------------------------------------------------------------- // Struct diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 40b81cf015a..a3677eff686 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -27,6 +27,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" namespace arrow { @@ -323,6 +324,85 @@ Status BooleanBuilder::Append( return Status::OK(); } +// ---------------------------------------------------------------------- +// DecimalBuilder 
+DecimalBuilder::DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type) + : FixedSizeBinaryBuilder(pool, type), + sign_bitmap_(nullptr), + sign_bitmap_data_(nullptr) {} + +template +ARROW_EXPORT Status DecimalBuilder::Append(const Decimal& val) { + DCHECK_EQ(sign_bitmap_, nullptr) << "sign_bitmap_ is not null"; + DCHECK_EQ(sign_bitmap_data_, nullptr) << "sign_bitmap_data_ is not null"; + + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + return FixedSizeBinaryBuilder::Append(reinterpret_cast(&val.value)); +} + +template ARROW_EXPORT Status DecimalBuilder::Append(const Decimal32& val); +template ARROW_EXPORT Status DecimalBuilder::Append(const Decimal64& val); + +template <> +ARROW_EXPORT Status DecimalBuilder::Append(const Decimal128& value) { + DCHECK_NE(sign_bitmap_, nullptr) << "sign_bitmap_ is null"; + DCHECK_NE(sign_bitmap_data_, nullptr) << "sign_bitmap_data_ is null"; + + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + uint8_t stack_bytes[16] = {0}; + uint8_t* bytes = stack_bytes; + bool is_negative; + ToBytes(value, &bytes, &is_negative); + RETURN_NOT_OK(FixedSizeBinaryBuilder::Append(bytes)); + + // TODO(phillipc): calculate the proper storage size here (do we have a function to do + // this)? + // TODO(phillipc): Reserve number of elements + RETURN_NOT_OK(sign_bitmap_->Reserve(1)); + BitUtil::SetBitTo(sign_bitmap_data_, length_ - 1, is_negative); + return Status::OK(); +} + +template ARROW_EXPORT Status DecimalBuilder::Append(const Decimal128& val); +// Initializes the parent builder; for 16-byte decimals also allocates a zeroed sign bitmap. +Status DecimalBuilder::Init(int64_t capacity) { + RETURN_NOT_OK(FixedSizeBinaryBuilder::Init(capacity)); + if (byte_width_ == 16) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, null_bitmap_->size(), &sign_bitmap_)); + sign_bitmap_data_ = sign_bitmap_->mutable_data(); + memset(sign_bitmap_data_, 0, static_cast(sign_bitmap_->capacity())); + } + return Status::OK(); +} + +Status DecimalBuilder::Resize(int64_t capacity) { + int64_t old_bytes = null_bitmap_ != nullptr ? 
null_bitmap_->size() : 0; + if (sign_bitmap_ == nullptr) { return Init(capacity); } + RETURN_NOT_OK(FixedSizeBinaryBuilder::Resize(capacity)); + + if (byte_width_ == 16) { + RETURN_NOT_OK(sign_bitmap_->Resize(null_bitmap_->size())); + int64_t new_bytes = sign_bitmap_->size(); + sign_bitmap_data_ = sign_bitmap_->mutable_data(); + + // The buffer might be overpadded to deal with padding according to the spec + if (old_bytes < new_bytes) { + memset(sign_bitmap_data_ + old_bytes, 0, + static_cast(sign_bitmap_->capacity() - old_bytes)); + } + } + return Status::OK(); +} + +Status DecimalBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr data = byte_builder_.Finish(); + + /// TODO(phillipc): not sure where to get the offset argument here + *out = std::make_shared( + type_, length_, data, null_bitmap_, null_count_, 0, sign_bitmap_); + return Status::OK(); +} + // ---------------------------------------------------------------------- // ListBuilder @@ -440,10 +520,9 @@ Status StringBuilder::Finish(std::shared_ptr* out) { FixedSizeBinaryBuilder::FixedSizeBinaryBuilder( MemoryPool* pool, const std::shared_ptr& type) - : ArrayBuilder(pool, type), byte_builder_(pool) { - DCHECK(type->type == Type::FIXED_SIZE_BINARY); - byte_width_ = static_cast(*type).byte_width(); -} + : ArrayBuilder(pool, type), + byte_width_(static_cast(*type).byte_width()), + byte_builder_(pool) {} Status FixedSizeBinaryBuilder::Append(const uint8_t* value) { RETURN_NOT_OK(Reserve(1)); @@ -543,6 +622,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(STRING, StringBuilder); BUILDER_CASE(BINARY, BinaryBuilder); BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); + BUILDER_CASE(DECIMAL, DecimalBuilder); case Type::LIST: { std::shared_ptr value_builder; std::shared_ptr value_type = diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 60cdc4cb3a5..d42ab5b01d1 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -37,6 +37,9 @@ 
namespace arrow { class Array; +template +struct Decimal; + static constexpr int64_t kMinBuilderCapacity = 1 << 5; /// Base class for all data array builders. @@ -76,12 +79,12 @@ class ARROW_EXPORT ArrayBuilder { Status SetNotNull(int64_t length); /// Allocates initial capacity requirements for the builder. In most - /// cases subclasses should override and call there parent classes + /// cases subclasses should override and call their parent class's /// method as well. virtual Status Init(int64_t capacity); /// Resizes the null_bitmap array. In most - /// cases subclasses should override and call there parent classes + /// cases subclasses should override and call their parent class's /// method as well. virtual Status Resize(int64_t new_bits); @@ -275,9 +278,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } - Status Append(uint8_t val) { - return Append(val != 0); - } + Status Append(uint8_t val) { return Append(val != 0); } /// Vector append /// @@ -415,6 +416,24 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { BufferBuilder byte_builder_; }; +class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { + public: + explicit DecimalBuilder(MemoryPool* pool, const std::shared_ptr& type); + + template + ARROW_EXPORT Status Append(const Decimal& val); + + Status Init(int64_t capacity) override; + Status Resize(int64_t capacity) override; + Status Finish(std::shared_ptr* out) override; + + private: + /// We only need these for 128 bit decimals, because boost stores the sign + /// separate from the underlying bytes. 
+ std::shared_ptr sign_bitmap_; + uint8_t* sign_bitmap_data_; +}; + // ---------------------------------------------------------------------- // Struct diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 7451439a875..2297e4b206d 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -29,6 +29,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -232,6 +233,41 @@ class RangeEqualsVisitor { return Status::OK(); } + Status Visit(const DecimalArray& left) { + const auto& right = static_cast(right_); + + int32_t width = left.byte_width(); + + const uint8_t* left_data = nullptr; + const uint8_t* right_data = nullptr; + + if (left.data()) { left_data = left.raw_data() + left.offset() * width; } + + if (right.data()) { right_data = right.raw_data() + right.offset() * width; } + + for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; + ++i, ++o_i) { + const bool is_null = left.IsNull(i); + if (is_null != right.IsNull(o_i)) { + result_ = false; + return Status::OK(); + } + if (is_null) continue; + // Only valid slots are compared: sign bits, like value bytes, are skipped for nulls. + if (left.IsNegative(i) != right.IsNegative(o_i)) { + result_ = false; + return Status::OK(); + } + + if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) { + result_ = false; + return Status::OK(); + } + } + result_ = true; + return Status::OK(); + } + Status Visit(const NullArray& left) { UNUSED(left); result_ = true; @@ -244,10 +280,6 @@ class RangeEqualsVisitor { return CompareValues(left); } - Status Visit(const DecimalArray& left) { - return Status::NotImplemented("Decimal type"); - } - Status Visit(const ListArray& left) { result_ = CompareLists(left); return Status::OK(); diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 57db03311c0..c6880c56e46 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ 
b/cpp/src/arrow/ipc/CMakeLists.txt @@ -27,7 +27,8 @@ set(ARROW_IPC_SHARED_LINK_LIBS set(ARROW_IPC_TEST_LINK_LIBS arrow_ipc_static arrow_io_static - arrow_static) + arrow_static + ${BOOST_REGEX_LIBRARY}) set(ARROW_IPC_SRCS feather.cc @@ -161,7 +162,8 @@ if(MSVC) arrow_io_static arrow_static ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_SYSTEM_LIBRARY}) + ${BOOST_SYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY}) else() set(UTIL_LINK_LIBS arrow_ipc_static @@ -169,6 +171,7 @@ else() arrow_static ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY} dl) endif() diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index c69d976737f..604527f6304 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -37,7 +37,8 @@ set(ARROW_PYTHON_MIN_TEST_LIBS arrow_python_static arrow_ipc_static arrow_io_static - arrow_static) + arrow_static + ${BOOST_REGEX_LIBRARY}) if(ARROW_BUILD_TESTS) ADD_THIRDPARTY_LIB(python diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 25b32ee26a0..189ecee4fe0 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -17,12 +17,16 @@ #include #include + +#include #include +#include #include "arrow/python/builtin_convert.h" #include "arrow/api.h" #include "arrow/status.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/python/helpers.h" @@ -109,7 +113,6 @@ class ScalarVisitor { int64_t float_count_; int64_t binary_count_; int64_t unicode_count_; - // Place to accumulate errors // std::vector errors_; }; @@ -394,8 +397,7 @@ class BytesConverter : public TypedConverter { } else if (PyBytes_Check(item)) { bytes_obj = item; } else { - return Status::Invalid( - "Value that cannot be converted to bytes was encountered"); + return Status::Invalid("Value that cannot be converted to bytes was encountered"); } // No error checking length = 
PyBytes_GET_SIZE(bytes_obj); @@ -429,8 +431,7 @@ class FixedWidthBytesConverter : public TypedConverter { } else if (PyBytes_Check(item)) { bytes_obj = item; } else { - return Status::Invalid( - "Value that cannot be converted to bytes was encountered"); + return Status::Invalid("Value that cannot be converted to bytes was encountered"); } // No error checking RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length)); @@ -495,6 +496,54 @@ class ListConverter : public TypedConverter { std::shared_ptr value_converter_; }; +#define DECIMAL_CONVERT_CASE(bit_width, item, builder) \ + case bit_width: { \ + arrow::Decimal##bit_width out; \ + RETURN_NOT_OK(PythonDecimalToArrowDecimal((item), &out)); \ + RETURN_NOT_OK((builder)->Append(out)); \ + break; \ + } + +class DecimalConverter : public TypedConverter { + public: + Status AppendData(PyObject* seq) override { + /// Ensure we've allocated enough space + Py_ssize_t size = PySequence_Size(seq); + RETURN_NOT_OK(typed_builder_->Reserve(size)); + + /// Can the compiler figure out that the case statement below isn't necessary + /// once we're running? + const int bit_width = + std::dynamic_pointer_cast(typed_builder_->type()) + ->bit_width(); + + OwnedRef ref; + PyObject* item = nullptr; + for (int64_t i = 0; i < size; ++i) { + ref.reset(PySequence_GetItem(seq, i)); + item = ref.obj(); + + /// TODO(phillipc): Check for nan? 
+ if (item != Py_None) { + switch (bit_width) { + DECIMAL_CONVERT_CASE(32, item, typed_builder_) + DECIMAL_CONVERT_CASE(64, item, typed_builder_) + DECIMAL_CONVERT_CASE(128, item, typed_builder_) + default: + break; + } + RETURN_IF_PYERROR(); + } else { + RETURN_NOT_OK(typed_builder_->AppendNull()); + } + } + + return Status::OK(); + } +}; + +#undef DECIMAL_CONVERT_CASE + // Dynamic constructor for sequence converters std::shared_ptr GetConverter(const std::shared_ptr& type) { switch (type->type) { @@ -516,6 +565,9 @@ std::shared_ptr GetConverter(const std::shared_ptr& type return std::make_shared(); case Type::LIST: return std::make_shared(); + case Type::DECIMAL: { + return std::make_shared(); + } case Type::STRUCT: default: return nullptr; diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h index 00ff0fd8236..3c2e350269a 100644 --- a/cpp/src/arrow/python/builtin_convert.h +++ b/cpp/src/arrow/python/builtin_convert.h @@ -25,7 +25,7 @@ #include -#include +#include "arrow/type.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 32bfa784acb..a6806ab95ab 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -57,12 +57,13 @@ class OwnedRef { } void reset(PyObject* obj) { - if (obj_ != nullptr) { Py_XDECREF(obj_); } + /// TODO(phillipc): Should we acquire the GIL here? 
It definitely needs to be + /// acquired, + /// but callers have probably already acquired it + Py_XDECREF(obj_); obj_ = obj; } - void release() { obj_ = nullptr; } - PyObject* obj() const { return obj_; } private: @@ -72,6 +73,7 @@ class OwnedRef { struct PyObjectStringify { OwnedRef tmp_obj; const char* bytes; + Py_ssize_t size; explicit PyObjectStringify(PyObject* obj) { PyObject* bytes_obj; @@ -82,6 +84,7 @@ struct PyObjectStringify { bytes_obj = obj; } bytes = PyBytes_AsString(bytes_obj); + size = PyBytes_GET_SIZE(bytes_obj); } }; diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index be5f412fbea..ffba7bbc21c 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -16,6 +16,8 @@ // under the License. #include "arrow/python/helpers.h" +#include "arrow/python/common.h" +#include "arrow/util/decimal.h" #include @@ -52,5 +54,82 @@ std::shared_ptr GetPrimitiveType(Type::type type) { } } +Status ImportModule(const std::string& module_name, OwnedRef* ref) { + PyAcquireGIL lock; + PyObject* module = PyImport_ImportModule(module_name.c_str()); + RETURN_IF_PYERROR(); + ref->reset(module); + return Status::OK(); +} + +Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref) { + /// Assumes that ImportModule was called first + DCHECK_NE(module.obj(), nullptr) << "Cannot import from nullptr Python module"; + + PyAcquireGIL lock; + PyObject* attr = PyObject_GetAttrString(module.obj(), name.c_str()); + RETURN_IF_PYERROR(); + ref->reset(attr); + return Status::OK(); +} + +template +Status PythonDecimalToArrowDecimal(PyObject* python_decimal, Decimal* arrow_decimal) { + // Call Python's str(decimal_object) + OwnedRef str_obj(PyObject_Str(python_decimal)); + RETURN_IF_PYERROR(); + + PyObjectStringify str(str_obj.obj()); + RETURN_IF_PYERROR(); + + const char* bytes = str.bytes; + DCHECK_NE(bytes, nullptr); + + Py_ssize_t size = str.size; + + std::string c_string(bytes, size); + return 
FromString(c_string, arrow_decimal); +} + +template Status PythonDecimalToArrowDecimal( + PyObject* python_decimal, Decimal32* arrow_decimal); +template Status PythonDecimalToArrowDecimal( + PyObject* python_decimal, Decimal64* arrow_decimal); +template Status PythonDecimalToArrowDecimal( + PyObject* python_decimal, Decimal128* arrow_decimal); + +Status InferDecimalPrecisionAndScale( + PyObject* python_decimal, int* precision, int* scale) { + // Call Python's str(decimal_object) + OwnedRef str_obj(PyObject_Str(python_decimal)); + RETURN_IF_PYERROR(); + PyObjectStringify str(str_obj.obj()); + + const char* bytes = str.bytes; + DCHECK_NE(bytes, nullptr); + + auto size = str.size; + + std::string c_string(bytes, size); + return FromString(c_string, static_cast(nullptr), precision, scale); +} + +Status DecimalFromString( + PyObject* decimal_constructor, const std::string& decimal_string, PyObject** out) { + DCHECK_NE(decimal_constructor, nullptr); + DCHECK_NE(out, nullptr); + + auto string_size = decimal_string.size(); + DCHECK_GT(string_size, 0); + + auto string_bytes = decimal_string.c_str(); + DCHECK_NE(string_bytes, nullptr); + + *out = PyObject_CallFunction( + decimal_constructor, const_cast("s#"), string_bytes, string_size); + RETURN_IF_PYERROR(); + return Status::OK(); +} + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 611e814b7d8..a19b25f7db8 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -18,16 +18,38 @@ #ifndef PYARROW_HELPERS_H #define PYARROW_HELPERS_H +#include + #include +#include +#include #include "arrow/type.h" #include "arrow/util/visibility.h" namespace arrow { + +template +struct Decimal; + namespace py { -ARROW_EXPORT -std::shared_ptr GetPrimitiveType(Type::type type); +class OwnedRef; + +ARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); + +Status ImportModule(const std::string& module_name, OwnedRef* ref); +Status 
ImportFromModule( + const OwnedRef& module, const std::string& module_name, OwnedRef* ref); + +template +Status PythonDecimalToArrowDecimal(PyObject* python_decimal, Decimal* arrow_decimal); + +Status InferDecimalPrecisionAndScale( + PyObject* python_decimal, int* precision = nullptr, int* scale = nullptr); + +Status DecimalFromString( + PyObject* decimal_constructor, const std::string& decimal_string, PyObject** out); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 48d3489bf90..f6e627e668e 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -41,12 +41,14 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" #include "arrow/python/config.h" +#include "arrow/python/helpers.h" #include "arrow/python/numpy-internal.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/type_traits.h" @@ -375,6 +377,7 @@ class PandasConverter : public TypeVisitor { Status ConvertDates(); Status ConvertLists(const std::shared_ptr& type); Status ConvertObjects(); + Status ConvertDecimals(); protected: MemoryPool* pool_; @@ -468,15 +471,14 @@ Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { RETURN_IF_PYERROR(); DCHECK_NE(type_name.obj(), nullptr); - OwnedRef bytes_obj(PyUnicode_AsUTF8String(type_name.obj())); + PyObjectStringify bytestring(type_name.obj()); RETURN_IF_PYERROR(); - DCHECK_NE(bytes_obj.obj(), nullptr); - - Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj.obj()); - const char* bytes = PyBytes_AS_STRING(bytes_obj.obj()); + const char* bytes = bytestring.bytes; DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null"; + Py_ssize_t size = bytestring.size; + std::string 
cpp_type_name(bytes, size); std::stringstream ss; @@ -517,6 +519,59 @@ Status PandasConverter::ConvertDates() { return date_builder.Finish(&out_); } +#define CONVERT_DECIMAL_CASE(bit_width, builder, object) \ + case bit_width: { \ + Decimal##bit_width d; \ + RETURN_NOT_OK(PythonDecimalToArrowDecimal((object), &d)); \ + RETURN_NOT_OK((builder).Append(d)); \ + break; \ + } + +Status PandasConverter::ConvertDecimals() { + PyAcquireGIL lock; + + // Import the decimal module and Decimal class + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(ImportModule("decimal", &decimal)); + RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + PyObject* object = objects[0]; + + int precision; + int scale; + + RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale)); + + type_ = std::make_shared(precision, scale); + + const int bit_width = std::dynamic_pointer_cast(type_)->bit_width(); + DecimalBuilder decimal_builder(pool_, type_); + + RETURN_NOT_OK(decimal_builder.Resize(length_)); + + for (int64_t i = 0; i < length_; ++i) { + object = objects[i]; + if (PyObject_IsInstance(object, Decimal.obj())) { + switch (bit_width) { + CONVERT_DECIMAL_CASE(32, decimal_builder, object) + CONVERT_DECIMAL_CASE(64, decimal_builder, object) + CONVERT_DECIMAL_CASE(128, decimal_builder, object) + default: + break; + } + } else if (PyObject_is_null(object)) { + decimal_builder.AppendNull(); + } else { + return InvalidConversion(object, "decimal.Decimal"); + } + } + return decimal_builder.Finish(&out_); +} + +#undef CONVERT_DECIMAL_CASE + Status PandasConverter::ConvertObjectStrings() { PyAcquireGIL lock; @@ -554,6 +609,90 @@ Status PandasConverter::ConvertObjectFixedWidthBytes( return Status::OK(); } +template +Status validate_precision(int precision) { + constexpr static const int maximum_precision = DecimalPrecision::maximum; + if (!(precision > 0 && precision <= maximum_precision)) { + std::stringstream 
ss; + ss << "Invalid precision: " << precision << ". Minimum is 1, maximum is " + << maximum_precision; + return Status::Invalid(ss.str()); + } + return Status::OK(); +} + +template +Status RawDecimalToString( + const uint8_t* bytes, int precision, int scale, std::string* result) { + DCHECK_NE(bytes, nullptr); + DCHECK_NE(result, nullptr); + RETURN_NOT_OK(validate_precision(precision)); + Decimal decimal; + FromBytes(bytes, &decimal); + *result = ToString(decimal, precision, scale); + return Status::OK(); +} + +template Status RawDecimalToString( + const uint8_t*, int, int, std::string* result); +template Status RawDecimalToString( + const uint8_t*, int, int, std::string* result); + +Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, + bool is_negative, std::string* result) { + DCHECK_NE(bytes, nullptr); + DCHECK_NE(result, nullptr); + RETURN_NOT_OK(validate_precision(precision)); + Decimal128 decimal; + FromBytes(bytes, is_negative, &decimal); + *result = ToString(decimal, precision, scale); + return Status::OK(); +} + +static Status ConvertDecimals(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + OwnedRef decimal_ref; + OwnedRef Decimal_ref; + RETURN_NOT_OK(ImportModule("decimal", &decimal_ref)); + RETURN_NOT_OK(ImportFromModule(decimal_ref, "Decimal", &Decimal_ref)); + PyObject* Decimal = Decimal_ref.obj(); + + for (int c = 0; c < data.num_chunks(); c++) { + auto* arr(static_cast(data.chunk(c).get())); + auto type(std::dynamic_pointer_cast(arr->type())); + const int precision = type->precision; + const int scale = type->scale; + const int bit_width = type->bit_width(); + + for (int64_t i = 0; i < arr->length(); ++i) { + if (arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values++ = Py_None; + } else { + const uint8_t* raw_value = arr->GetValue(i); + std::string s; + switch (bit_width) { + case 32: + RETURN_NOT_OK(RawDecimalToString(raw_value, precision, scale, &s)); + break; + case 64: + 
RETURN_NOT_OK(RawDecimalToString(raw_value, precision, scale, &s)); + break; + case 128: + RETURN_NOT_OK( + RawDecimalToString(raw_value, precision, scale, arr->IsNegative(i), &s)); + break; + default: + break; + } + RETURN_NOT_OK(DecimalFromString(Decimal, s, out_values++)); + } + } + } + + return Status::OK(); +} + Status PandasConverter::ConvertBooleans() { PyAcquireGIL lock; @@ -598,6 +737,7 @@ Status PandasConverter::ConvertObjects() { // // * Strings // * Booleans with nulls + // * decimal.Decimals // * Mixed type (not supported at the moment by arrow format) // // Additionally, nulls may be encoded either as np.nan or None. So we have to @@ -613,6 +753,7 @@ Status PandasConverter::ConvertObjects() { PyDateTime_IMPORT; } + // This means we received an explicit type from the user if (type_) { switch (type_->type) { case Type::STRING: @@ -627,10 +768,17 @@ Status PandasConverter::ConvertObjects() { const auto& list_field = static_cast(*type_); return ConvertLists(list_field.value_field()->type); } + case Type::DECIMAL: + return ConvertDecimals(); default: return Status::TypeError("No known conversion to Arrow type"); } } else { + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(ImportModule("decimal", &decimal)); + RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + for (int64_t i = 0; i < length_; ++i) { if (PyObject_is_null(objects[i])) { continue; @@ -640,6 +788,8 @@ Status PandasConverter::ConvertObjects() { return ConvertBooleans(); } else if (PyDate_CheckExact(objects[i])) { return ConvertDates(); + } else if (PyObject_IsInstance(const_cast(objects[i]), Decimal.obj())) { + return ConvertDecimals(); } else { return InvalidConversion( const_cast(objects[i]), "string, bool, or date"); @@ -847,6 +997,7 @@ class PandasBlock { INT64, FLOAT, DOUBLE, + DECIMAL, BOOL, DATETIME, DATETIME_WITH_TZ, @@ -1193,6 +1344,8 @@ class ObjectBlock : public PandasBlock { RETURN_NOT_OK(ConvertBinaryLike(data, out_buffer)); } else if (type == 
Type::FIXED_SIZE_BINARY) { RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer)); + } else if (type == Type::DECIMAL) { + RETURN_NOT_OK(ConvertDecimals(data, out_buffer)); } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast(col->type()); switch (list_type->value_type()->type) { @@ -1519,6 +1672,7 @@ Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, BLOCK_CASE(DOUBLE, Float64Block); BLOCK_CASE(BOOL, BoolBlock); BLOCK_CASE(DATETIME, DatetimeBlock); + BLOCK_CASE(DECIMAL, ObjectBlock); default: return Status::NotImplemented("Unsupported block type"); } @@ -1649,6 +1803,9 @@ class DataFrameBlockCreator { case Type::DICTIONARY: output_type = PandasBlock::CATEGORICAL; break; + case Type::DECIMAL: + output_type = PandasBlock::DECIMAL; + break; default: return Status::NotImplemented(col->type()->ToString()); } @@ -1892,6 +2049,7 @@ class ArrowDeserializer { CONVERT_CASE(TIMESTAMP); CONVERT_CASE(DICTIONARY); CONVERT_CASE(LIST); + CONVERT_CASE(DECIMAL); default: { std::stringstream ss; ss << "Arrow type reading not implemented for " << col_->type()->ToString(); @@ -1999,6 +2157,13 @@ class ArrowDeserializer { return ConvertFixedSizeBinary(data_, out_values); } + template + inline typename std::enable_if::type ConvertValues() { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + auto out_values = reinterpret_cast(PyArray_DATA(arr_)); + return ConvertDecimals(data_, out_values); + } + #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ return ConvertListsLike(col_, out_values); @@ -2021,6 +2186,7 @@ class ArrowDeserializer { CONVERTVALUES_LISTSLIKE_CASE(FloatType, FLOAT) CONVERTVALUES_LISTSLIKE_CASE(DoubleType, DOUBLE) CONVERTVALUES_LISTSLIKE_CASE(StringType, STRING) + CONVERTVALUES_LISTSLIKE_CASE(DecimalType, DECIMAL) default: { std::stringstream ss; ss << "Not implemented type for lists: " << list_type->value_type()->ToString(); diff --git a/cpp/src/arrow/python/python-test.cc 
b/cpp/src/arrow/python/python-test.cc index f269ebfb642..b63d2ffb1cd 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -28,8 +28,11 @@ #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" +#include "arrow/python/helpers.h" #include "arrow/python/pandas_convert.h" +#include "arrow/util/decimal.h" + namespace arrow { namespace py { @@ -37,6 +40,36 @@ TEST(PyBuffer, InvalidInputObject) { PyBuffer buffer(Py_None); } +TEST(DecimalTest, TestPythonDecimalToArrowDecimal128) { + PyAcquireGIL lock; + + OwnedRef decimal; + OwnedRef Decimal; + ASSERT_OK(ImportModule("decimal", &decimal)); + ASSERT_NE(decimal.obj(), nullptr); + + ASSERT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + ASSERT_NE(Decimal.obj(), nullptr); + + std::string decimal_string("-39402950693754869342983"); + const char* format = "s#"; + auto c_string = decimal_string.c_str(); + ASSERT_NE(c_string, nullptr); + + auto c_string_size = decimal_string.size(); + ASSERT_GT(c_string_size, 0); + OwnedRef pydecimal(PyObject_CallFunction( + Decimal.obj(), const_cast(format), c_string, c_string_size)); + ASSERT_NE(pydecimal.obj(), nullptr); + ASSERT_EQ(PyErr_Occurred(), nullptr); + + Decimal128 arrow_decimal; + int128_t boost_decimal(decimal_string); + PyObject* obj = pydecimal.obj(); + ASSERT_OK(PythonDecimalToArrowDecimal(obj, &arrow_decimal)); + ASSERT_EQ(boost_decimal, arrow_decimal.value); +} + TEST(PandasConversionTest, TestObjectBlockWriteFails) { StringBuilder builder(default_memory_pool()); const char value[] = {'\xf1', '\0'}; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index abbb626e0fc..df4590f18d7 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -17,6 +17,7 @@ #include "arrow/type.h" +#include #include #include @@ -91,7 +92,7 @@ std::string BinaryType::ToString() const { } int FixedSizeBinaryType::bit_width() const { - return 8 * byte_width(); + return CHAR_BIT * byte_width(); } std::string 
FixedSizeBinaryType::ToString() const { @@ -380,6 +381,10 @@ std::shared_ptr field( return std::make_shared(name, type, nullable); } +std::shared_ptr decimal(int precision, int scale) { + return std::make_shared(precision, scale); +} + static const BufferDescr kValidityBuffer(BufferType::VALIDITY, 1); static const BufferDescr kOffsetBuffer(BufferType::OFFSET, 32); static const BufferDescr kTypeBuffer(BufferType::TYPE, 32); @@ -402,7 +407,11 @@ std::vector BinaryType::GetBufferLayout() const { } std::vector FixedSizeBinaryType::GetBufferLayout() const { - return {kValidityBuffer, BufferDescr(BufferType::DATA, byte_width_ * 8)}; + return {kValidityBuffer, BufferDescr(BufferType::DATA, bit_width())}; +} + +std::vector DecimalType::GetBufferLayout() const { + return {kValidityBuffer, kBooleanBuffer, BufferDescr(BufferType::DATA, bit_width())}; } std::vector ListType::GetBufferLayout() const { @@ -427,9 +436,4 @@ std::string DecimalType::ToString() const { return s.str(); } -std::vector DecimalType::GetBufferLayout() const { - // TODO(wesm) - return {}; -} - } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 36ab9d8b2b9..3a35f563811 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -360,6 +360,8 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType { explicit FixedSizeBinaryType(int32_t byte_width) : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {} + explicit FixedSizeBinaryType(int32_t byte_width, Type::type type_id) + : FixedWidthType(type_id), byte_width_(byte_width) {} Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; @@ -399,19 +401,31 @@ struct ARROW_EXPORT StructType : public NestedType { std::vector GetBufferLayout() const override; }; -struct ARROW_EXPORT DecimalType : public DataType { +static inline int decimal_byte_width(int precision) { + if (precision >= 0 && precision < 10) { + return 4; + } else if (precision >= 10 && precision < 19) { + 
return 8; + } else { + // TODO(phillipc): validate that we can't construct > 128 bit types + return 16; + } +} + +struct ARROW_EXPORT DecimalType : public FixedSizeBinaryType { static constexpr Type::type type_id = Type::DECIMAL; explicit DecimalType(int precision_, int scale_) - : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} - int precision; - int scale; - + : FixedSizeBinaryType(decimal_byte_width(precision_), Type::DECIMAL), + precision(precision_), + scale(scale_) {} + std::vector GetBufferLayout() const override; Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; static std::string name() { return "decimal"; } - std::vector GetBufferLayout() const override; + int precision; + int scale; }; enum class UnionMode : char { SPARSE, DENSE }; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 2e27ce98589..acf12c3d9d1 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -69,6 +69,7 @@ class StructBuilder; struct DecimalType; class DecimalArray; +class DecimalBuilder; struct UnionType; class UnionArray; @@ -146,6 +147,7 @@ std::shared_ptr ARROW_EXPORT binary(); std::shared_ptr ARROW_EXPORT date32(); std::shared_ptr ARROW_EXPORT date64(); +std::shared_ptr ARROW_EXPORT decimal(int precision, int scale); } // namespace arrow diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 353b638fed8..3e8ea23432b 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -228,6 +228,13 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return float64(); } }; +template <> +struct TypeTraits { + using ArrayType = DecimalArray; + using BuilderType = DecimalBuilder; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = BooleanArray; @@ -289,12 +296,6 @@ struct TypeTraits { constexpr static bool is_parameter_free = false; }; -template <> -struct TypeTraits { - // using 
ArrayType = DecimalArray; - constexpr static bool is_parameter_free = false; -}; - // Not all type classes have a c_type template struct as_void { diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index c1b6877a3e9..054f11055b6 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -22,6 +22,7 @@ # Headers: top level install(FILES bit-util.h + decimal.h logging.h macros.h random.h @@ -70,3 +71,4 @@ endif() ADD_ARROW_TEST(bit-util-test) ADD_ARROW_TEST(stl-util-test) +ADD_ARROW_TEST(decimal-test) diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 42afd0705f0..90a1c3eab92 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -149,7 +149,6 @@ int64_t ARROW_EXPORT CountSetBits( bool ARROW_EXPORT BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right, int64_t right_offset, int64_t bit_length); - } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc new file mode 100644 index 00000000000..1e22643962d --- /dev/null +++ b/cpp/src/arrow/util/decimal-test.cc @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// + +#include "arrow/util/decimal.h" + +#include "gtest/gtest.h" + +#include "arrow/test-util.h" + +namespace arrow { + +template +class DecimalTest : public ::testing::Test { + public: + DecimalTest() : string_value("234.23445") { integer_value.value = 23423445; } + Decimal integer_value; + std::string string_value; +}; + +typedef ::testing::Types DecimalTypes; +TYPED_TEST_CASE(DecimalTest, DecimalTypes); + +TYPED_TEST(DecimalTest, TestToString) { + Decimal decimal(this->integer_value); + int precision = 8; + int scale = 5; + std::string result = ToString(decimal, precision, scale); + ASSERT_EQ(result, this->string_value); +} + +TYPED_TEST(DecimalTest, TestFromString) { + Decimal expected(this->integer_value); + Decimal result; + int precision, scale; + ASSERT_OK(FromString(this->string_value, &result, &precision, &scale)); + ASSERT_EQ(result.value, expected.value); + ASSERT_EQ(precision, 8); + ASSERT_EQ(scale, 5); +} + +TEST(DecimalTest, TestStringToInt32) { + int32_t value = 0; + StringToInteger("123", "456", 1, &value); + ASSERT_EQ(value, 123456); +} + +TEST(DecimalTest, TestStringToInt64) { + int64_t value = 0; + StringToInteger("123456789", "456", -1, &value); + ASSERT_EQ(value, -123456789456); +} + +TEST(DecimalTest, TestStringToInt128) { + int128_t value = 0; + StringToInteger("123456789", "456789123", 1, &value); + ASSERT_EQ(value, 123456789456789123); +} + +TEST(DecimalTest, TestFromString128) { + static const std::string string_value("-23049223942343532412"); + Decimal result(string_value); + int128_t expected = -230492239423435324; + ASSERT_EQ(result.value, expected * 100 - 12); + + // Sanity check that our number is actually using more than 64 bits + ASSERT_NE(result.value, static_cast(result.value)); +} + +TEST(DecimalTest, TestFromDecimalString128) { + static const std::string string_value("-23049223942343.532412"); + Decimal 
result(string_value); + int128_t expected = -230492239423435324; + ASSERT_EQ(result.value, expected * 100 - 12); + + // Sanity check that our number is actually using more than 64 bits + ASSERT_NE(result.value, static_cast(result.value)); +} + +TEST(DecimalTest, TestDecimal32Precision) { + auto min_precision = DecimalPrecision::minimum; + auto max_precision = DecimalPrecision::maximum; + ASSERT_EQ(min_precision, 1); + ASSERT_EQ(max_precision, 9); +} + +TEST(DecimalTest, TestDecimal64Precision) { + auto min_precision = DecimalPrecision::minimum; + auto max_precision = DecimalPrecision::maximum; + ASSERT_EQ(min_precision, 10); + ASSERT_EQ(max_precision, 18); +} + +TEST(DecimalTest, TestDecimal128Precision) { + auto min_precision = DecimalPrecision::minimum; + auto max_precision = DecimalPrecision::maximum; + ASSERT_EQ(min_precision, 19); + ASSERT_EQ(max_precision, 38); +} + +TEST(DecimalTest, TestDecimal32SignedRoundTrip) { + Decimal32 expected(std::string("-3402692")); + + uint8_t stack_bytes[4] = {0}; + uint8_t* bytes = stack_bytes; + ToBytes(expected, &bytes); + + Decimal32 result; + FromBytes(bytes, &result); + ASSERT_EQ(expected.value, result.value); +} + +TEST(DecimalTest, TestDecimal64SignedRoundTrip) { + Decimal64 expected(std::string("-34034293045.921")); + + uint8_t stack_bytes[8] = {0}; + uint8_t* bytes = stack_bytes; + ToBytes(expected, &bytes); + + Decimal64 result; + FromBytes(bytes, &result); + + ASSERT_EQ(expected.value, result.value); +} + +TEST(DecimalTest, TestDecimal128StringAndBytesRoundTrip) { + std::string string_value("-340282366920938463463374607431.711455"); + Decimal128 expected(string_value); + + std::string expected_string_value("-340282366920938463463374607431711455"); + int128_t expected_underlying_value(expected_string_value); + + ASSERT_EQ(expected.value, expected_underlying_value); + + uint8_t stack_bytes[16] = {0}; + uint8_t* bytes = stack_bytes; + bool is_negative; + ToBytes(expected, &bytes, &is_negative); + + 
ASSERT_TRUE(is_negative); + + Decimal128 result; + FromBytes(bytes, is_negative, &result); + + ASSERT_EQ(expected.value, result.value); +} +} // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc new file mode 100644 index 00000000000..1ac347180fe --- /dev/null +++ b/cpp/src/arrow/util/decimal.cc @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/decimal.h" + +#include + +namespace arrow { + +static const boost::regex DECIMAL_PATTERN("(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?"); + +template +ARROW_EXPORT Status FromString( + const std::string& s, Decimal* out, int* precision, int* scale) { + if (s.empty()) { + return Status::Invalid("Empty string cannot be converted to decimal"); + } + boost::smatch match; + if (!boost::regex_match(s, match, DECIMAL_PATTERN)) { + std::stringstream ss; + ss << "String " << s << " is not a valid decimal string"; + return Status::Invalid(ss.str()); + } + const int8_t sign = match[1].str() == "-" ? 
-1 : 1; + std::string whole_part = match[4].str(); + std::string fractional_part = match[6].str(); + if (scale != nullptr) { *scale = static_cast(fractional_part.size()); } + if (precision != nullptr) { + *precision = + static_cast(whole_part.size()) + static_cast(fractional_part.size()); + } + if (out != nullptr) { StringToInteger(whole_part, fractional_part, sign, &out->value); } + return Status::OK(); +} + +template ARROW_EXPORT Status FromString( + const std::string& s, Decimal32* out, int* precision, int* scale); +template ARROW_EXPORT Status FromString( + const std::string& s, Decimal64* out, int* precision, int* scale); +template ARROW_EXPORT Status FromString( + const std::string& s, Decimal128* out, int* precision, int* scale); + +void StringToInteger( + const std::string& whole, const std::string& fractional, int8_t sign, int32_t* out) { + DCHECK(sign == -1 || sign == 1); + DCHECK_NE(out, nullptr); + DCHECK(!whole.empty() || !fractional.empty()); + if (!whole.empty()) { + *out = std::stoi(whole, nullptr, 10) * + static_cast(pow(10.0, static_cast(fractional.size()))); + } + if (!fractional.empty()) { *out += std::stoi(fractional, nullptr, 10); } + *out *= sign; +} + +void StringToInteger( + const std::string& whole, const std::string& fractional, int8_t sign, int64_t* out) { + DCHECK(sign == -1 || sign == 1); + DCHECK_NE(out, nullptr); + DCHECK(!whole.empty() || !fractional.empty()); + if (!whole.empty()) { + *out = static_cast(std::stoll(whole, nullptr, 10)) * + static_cast(pow(10.0, static_cast(fractional.size()))); + } + if (!fractional.empty()) { *out += std::stoll(fractional, nullptr, 10); } + *out *= sign; +} + +void StringToInteger( + const std::string& whole, const std::string& fractional, int8_t sign, int128_t* out) { + DCHECK(sign == -1 || sign == 1); + DCHECK_NE(out, nullptr); + DCHECK(!whole.empty() || !fractional.empty()); + *out = int128_t(whole + fractional) * sign; +} + +void FromBytes(const uint8_t* bytes, Decimal32* decimal) { + 
DCHECK_NE(bytes, nullptr); + DCHECK_NE(decimal, nullptr); + decimal->value = *reinterpret_cast(bytes); +} + +void FromBytes(const uint8_t* bytes, Decimal64* decimal) { + DCHECK_NE(bytes, nullptr); + DCHECK_NE(decimal, nullptr); + decimal->value = *reinterpret_cast(bytes); +} + +constexpr static const size_t BYTES_IN_128_BITS = 128 / CHAR_BIT; +constexpr static const size_t LIMB_SIZE = + sizeof(std::remove_pointer::type); +constexpr static const size_t BYTES_PER_LIMB = BYTES_IN_128_BITS / LIMB_SIZE; + +void FromBytes(const uint8_t* bytes, bool is_negative, Decimal128* decimal) { + DCHECK_NE(bytes, nullptr); + DCHECK_NE(decimal, nullptr); + + auto& decimal_value(decimal->value); + int128_t::backend_type& backend(decimal_value.backend()); + backend.resize(BYTES_PER_LIMB, BYTES_PER_LIMB); + std::memcpy(backend.limbs(), bytes, BYTES_IN_128_BITS); + if (is_negative) { decimal->value = -decimal->value; } +} + +void ToBytes(const Decimal32& value, uint8_t** bytes) { + DCHECK_NE(*bytes, nullptr); + *reinterpret_cast(*bytes) = value.value; +} + +void ToBytes(const Decimal64& value, uint8_t** bytes) { + DCHECK_NE(*bytes, nullptr); + *reinterpret_cast(*bytes) = value.value; +} + +void ToBytes(const Decimal128& decimal, uint8_t** bytes, bool* is_negative) { + DCHECK_NE(*bytes, nullptr); + DCHECK_NE(is_negative, nullptr); + + /// TODO(phillipc): boost multiprecision is unreliable here, int128_t can't be + /// roundtripped + const auto& backend(decimal.value.backend()); + auto boost_bytes = reinterpret_cast(backend.limbs()); + std::memcpy(*bytes, boost_bytes, BYTES_IN_128_BITS); + *is_negative = backend.isneg(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h new file mode 100644 index 00000000000..46883e3de93 --- /dev/null +++ b/cpp/src/arrow/util/decimal.h @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_DECIMAL_H +#define ARROW_DECIMAL_H + +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" + +#include + +namespace arrow { + +using boost::multiprecision::int128_t; + +template +struct ARROW_EXPORT Decimal; + +ARROW_EXPORT void StringToInteger( + const std::string& whole, const std::string& fractional, int8_t sign, int32_t* out); +ARROW_EXPORT void StringToInteger( + const std::string& whole, const std::string& fractional, int8_t sign, int64_t* out); +ARROW_EXPORT void StringToInteger( + const std::string& whole, const std::string& fractional, int8_t sign, int128_t* out); + +template +ARROW_EXPORT Status FromString(const std::string& s, Decimal* out, + int* precision = nullptr, int* scale = nullptr); + +template +struct ARROW_EXPORT Decimal { + Decimal() : value() {} + explicit Decimal(const std::string& s) : value() { FromString(s, this); } + explicit Decimal(const char* s) : Decimal(std::string(s)) {} + explicit Decimal(const T& value) : value(value) {} + + using value_type = T; + value_type value; +}; + +using Decimal32 = Decimal; +using Decimal64 = Decimal; +using Decimal128 = Decimal; + +template +struct ARROW_EXPORT DecimalPrecision {}; + +template <> +struct 
ARROW_EXPORT DecimalPrecision { + constexpr static const int minimum = 1; + constexpr static const int maximum = 9; +}; + +template <> +struct ARROW_EXPORT DecimalPrecision { + constexpr static const int minimum = 10; + constexpr static const int maximum = 18; +}; + +template <> +struct ARROW_EXPORT DecimalPrecision { + constexpr static const int minimum = 19; + constexpr static const int maximum = 38; +}; + +template +ARROW_EXPORT std::string ToString( + const Decimal& decimal_value, int precision, int scale) { + T value = decimal_value.value; + + // Decimal values are sent to clients as strings so in the interest of + // speed the string will be created without the using stringstream with the + // whole/fractional_part(). + size_t last_char_idx = precision + (scale > 0) // Add a space for decimal place + + (scale == precision) // Add a space for leading 0 + + (value < 0); // Add a space for negative sign + std::string str = std::string(last_char_idx, '0'); + // Start filling in the values in reverse order by taking the last digit + // of the value. Use a positive value and worry about the sign later. At this + // point the last_char_idx points to the string terminator. + T remaining_value = value; + size_t first_digit_idx = 0; + if (value < 0) { + remaining_value = -value; + first_digit_idx = 1; + } + if (scale > 0) { + int remaining_scale = scale; + do { + str[--last_char_idx] = static_cast( + (remaining_value % 10) + static_cast('0')); // Ascii offset + remaining_value /= 10; + } while (--remaining_scale > 0); + str[--last_char_idx] = '.'; + DCHECK_GT(last_char_idx, first_digit_idx) << "Not enough space remaining"; + } + do { + str[--last_char_idx] = + static_cast((remaining_value % 10) + static_cast('0')); // Ascii offset + remaining_value /= 10; + if (remaining_value == 0) { + // Trim any extra leading 0's. 
+ if (last_char_idx > first_digit_idx) str.erase(0, last_char_idx - first_digit_idx); + break; + } + // For safety, enforce string length independent of remaining_value. + } while (last_char_idx > first_digit_idx); + if (value < 0) str[0] = '-'; + return str; +} + +/// Conversion from raw bytes to a Decimal value +ARROW_EXPORT void FromBytes(const uint8_t* bytes, Decimal32* value); +ARROW_EXPORT void FromBytes(const uint8_t* bytes, Decimal64* value); +ARROW_EXPORT void FromBytes(const uint8_t* bytes, bool is_negative, Decimal128* decimal); + +/// Conversion from a Decimal value to raw bytes +ARROW_EXPORT void ToBytes(const Decimal32& value, uint8_t** bytes); +ARROW_EXPORT void ToBytes(const Decimal64& value, uint8_t** bytes); +ARROW_EXPORT void ToBytes(const Decimal128& decimal, uint8_t** bytes, bool* is_negative); + +} // namespace arrow +#endif // ARROW_DECIMAL_H diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index c61c9f59f7a..29b3db60cad 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -93,7 +93,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { ARRAY_VISIT_INLINE(TimestampType); ARRAY_VISIT_INLINE(Time32Type); ARRAY_VISIT_INLINE(Time64Type); - // ARRAY_VISIT_INLINE(DecimalType); + ARRAY_VISIT_INLINE(DecimalType); ARRAY_VISIT_INLINE(ListType); ARRAY_VISIT_INLINE(StructType); ARRAY_VISIT_INLINE(UnionType); diff --git a/format/Schema.fbs b/format/Schema.fbs index ca9c8e6c3e7..badc7ea8bef 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -77,7 +77,9 @@ table Bool { } table Decimal { + /// Total number of decimal digits precision: int; + /// Number of digits after the decimal point "." 
scale: int; } diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 8c520748cf3..7b23cf66c6f 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -71,7 +71,7 @@ uint8, uint16, uint32, uint64, timestamp, date32, date64, float16, float32, float64, - binary, string, + binary, string, decimal, list_, struct, dictionary, field, DataType, FixedSizeBinaryType, Field, Schema, schema) diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd index f6aaea2582e..3ba48718265 100644 --- a/python/pyarrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -116,6 +116,10 @@ cdef class FixedSizeBinaryArray(Array): pass +cdef class DecimalArray(FixedSizeBinaryArray): + pass + + cdef class ListArray(Array): pass diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 9f302e02cdb..ee500e68129 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -481,6 +481,10 @@ cdef class FixedSizeBinaryArray(Array): pass +cdef class DecimalArray(FixedSizeBinaryArray): + pass + + cdef class ListArray(Array): pass @@ -602,6 +606,7 @@ cdef dict _array_classes = { Type_STRING: StringArray, Type_DICTIONARY: DictionaryArray, Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, + Type_DECIMAL: DecimalArray, } cdef object box_array(const shared_ptr[CArray]& sp_array): diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index ab38ff3084f..4860334a921 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -51,6 +51,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsTypeError() +cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: + cdef cppclass int128_t: + pass + + cdef inline object PyObject_to_object(PyObject* o): # Cast to "object" increments reference count cdef object result = o diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2a0488f3a01..73d96b25f52 100644 --- 
a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -39,6 +39,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: Type_FLOAT" arrow::Type::FLOAT" Type_DOUBLE" arrow::Type::DOUBLE" + Type_DECIMAL" arrow::Type::DECIMAL" + Type_DATE32" arrow::Type::DATE32" Type_DATE64" arrow::Type::DATE64" Type_TIMESTAMP" arrow::Type::TIMESTAMP" @@ -58,6 +60,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: TimeUnit_MICRO" arrow::TimeUnit::MICRO" TimeUnit_NANO" arrow::TimeUnit::NANO" + cdef cppclass Decimal[T]: + Decimal(const T&) + + cdef c_string ToString[T](const Decimal[T]&, int, int) + cdef cppclass CDataType" arrow::DataType": Type type @@ -144,6 +151,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryType" arrow::FixedSizeBinaryType"(CFixedWidthType): CFixedSizeBinaryType(int byte_width) int byte_width() + int bit_width() + + cdef cppclass CDecimalType" arrow::DecimalType"(CFixedSizeBinaryType): + int precision + int scale + CDecimalType(int precision, int scale) cdef cppclass CField" arrow::Field": c_string name @@ -212,6 +225,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryArray" arrow::FixedSizeBinaryArray"(CArray): const uint8_t* GetValue(int i) + cdef cppclass CDecimalArray" arrow::DecimalArray"(CFixedSizeBinaryArray): + Decimal[T] Value[T](int i) + cdef cppclass CListArray" arrow::ListArray"(CArray): const int32_t* raw_value_offsets() int32_t value_offset(int i) diff --git a/python/pyarrow/scalar.pxd b/python/pyarrow/scalar.pxd index d6c3b35160c..62a5664e57e 100644 --- a/python/pyarrow/scalar.pxd +++ b/python/pyarrow/scalar.pxd @@ -20,6 +20,7 @@ from pyarrow.includes.libarrow cimport * from pyarrow.schema cimport DataType + cdef class Scalar: cdef readonly: DataType type diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 1c0790a4fdc..f3d93213269 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx 
@@ -17,9 +17,10 @@ from pyarrow.schema cimport DataType, box_data_type +from pyarrow.includes.common cimport int128_t from pyarrow.compat import frombytes import pyarrow.schema as schema - +import decimal import datetime cimport cpython as cp @@ -64,7 +65,7 @@ cdef class ArrayValue(Scalar): if hasattr(self, 'as_py'): return repr(self.as_py()) else: - return Scalar.__repr__(self) + return super(Scalar, self).__repr__() cdef class BooleanValue(ArrayValue): @@ -199,6 +200,25 @@ cdef class DoubleValue(ArrayValue): return ap.Value(self.index) +cdef class DecimalValue(ArrayValue): + + def as_py(self): + cdef: + CDecimalArray* ap = self.sp_array.get() + CDecimalType* t = ap.type().get() + int bit_width = t.bit_width() + int precision = t.precision + int scale = t.scale + c_string s + if bit_width == 32: + s = ToString[int32_t](ap.Value[int32_t](self.index), precision, scale) + elif bit_width == 64: + s = ToString[int64_t](ap.Value[int64_t](self.index), precision, scale) + elif bit_width == 128: + s = ToString[int128_t](ap.Value[int128_t](self.index), precision, scale) + return decimal.Decimal(s.decode('utf8')) + + cdef class StringValue(ArrayValue): def as_py(self): @@ -286,6 +306,7 @@ cdef dict _scalar_classes = { Type_BINARY: BinaryValue, Type_STRING: StringValue, Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue, + Type_DECIMAL: DecimalValue, } cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array, diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd index 94d65bfc157..eceedbad0ba 100644 --- a/python/pyarrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -20,6 +20,7 @@ from pyarrow.includes.libarrow cimport (CDataType, CDictionaryType, CTimestampType, CFixedSizeBinaryType, + CDecimalType, CField, CSchema) cdef class DataType: @@ -27,7 +28,7 @@ cdef class DataType: shared_ptr[CDataType] sp_type CDataType* type - cdef init(self, const shared_ptr[CDataType]& type) + cdef void init(self, const shared_ptr[CDataType]& type) cdef class 
DictionaryType(DataType): @@ -45,6 +46,11 @@ cdef class FixedSizeBinaryType(DataType): const CFixedSizeBinaryType* fixed_size_binary_type +cdef class DecimalType(FixedSizeBinaryType): + cdef: + const CDecimalType* decimal_type + + cdef class Field: cdef: shared_ptr[CField] sp_field @@ -55,6 +61,7 @@ cdef class Field: cdef init(self, const shared_ptr[CField]& field) + cdef class Schema: cdef: shared_ptr[CSchema] sp_schema @@ -63,6 +70,7 @@ cdef class Schema: cdef init(self, const vector[shared_ptr[CField]]& fields) cdef init_schema(self, const shared_ptr[CSchema]& schema) + cdef DataType box_data_type(const shared_ptr[CDataType]& type) cdef Field box_field(const shared_ptr[CField]& field) cdef Schema box_schema(const shared_ptr[CSchema]& schema) diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index 253be4590b5..4b931bf4522 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -29,6 +29,7 @@ from pyarrow.array cimport Array from pyarrow.error cimport check_status from pyarrow.includes.libarrow cimport (CDataType, CStructType, CListType, CFixedSizeBinaryType, + CDecimalType, TimeUnit_SECOND, TimeUnit_MILLI, TimeUnit_MICRO, TimeUnit_NANO, Type, TimeUnit) @@ -45,7 +46,7 @@ cdef class DataType: def __cinit__(self): pass - cdef init(self, const shared_ptr[CDataType]& type): + cdef void init(self, const shared_ptr[CDataType]& type): self.sp_type = type self.type = type.get() @@ -66,14 +67,14 @@ cdef class DataType: cdef class DictionaryType(DataType): - cdef init(self, const shared_ptr[CDataType]& type): + cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) self.dict_type = type.get() cdef class TimestampType(DataType): - cdef init(self, const shared_ptr[CDataType]& type): + cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) self.ts_type = type.get() @@ -93,7 +94,7 @@ cdef class TimestampType(DataType): cdef class FixedSizeBinaryType(DataType): - cdef init(self, const 
shared_ptr[CDataType]& type): + cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) self.fixed_size_binary_type = type.get() @@ -103,6 +104,13 @@ cdef class FixedSizeBinaryType(DataType): return self.fixed_size_binary_type.byte_width() +cdef class DecimalType(FixedSizeBinaryType): + + cdef void init(self, const shared_ptr[CDataType]& type): + FixedSizeBinaryType.init(self, type) + self.decimal_type = type.get() + + cdef class Field: def __cinit__(self): @@ -354,6 +362,12 @@ def float64(): return primitive_type(la.Type_DOUBLE) +cpdef DataType decimal(int precision, int scale=0): + cdef shared_ptr[CDataType] decimal_type + decimal_type.reset(new CDecimalType(precision, scale)) + return box_data_type(decimal_type) + + def string(): """ UTF8 string @@ -374,11 +388,9 @@ def binary(int length=-1): if length == -1: return primitive_type(la.Type_BINARY) - cdef FixedSizeBinaryType out = FixedSizeBinaryType() cdef shared_ptr[CDataType] fixed_size_binary_type fixed_size_binary_type.reset(new CFixedSizeBinaryType(length)) - out.init(fixed_size_binary_type) - return out + return box_data_type(fixed_size_binary_type) def list_(DataType value_type): @@ -436,6 +448,8 @@ cdef DataType box_data_type(const shared_ptr[CDataType]& type): out = TimestampType() elif type.get().type == la.Type_FIXED_SIZE_BINARY: out = FixedSizeBinaryType() + elif type.get().type == la.Type_DECIMAL: + out = DecimalType() else: out = DataType() diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index e2b03d85ecd..d89a8e0c54c 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -20,6 +20,7 @@ import pyarrow as pa import datetime +import decimal class TestConvertList(unittest.TestCase): @@ -162,3 +163,42 @@ def test_mixed_types_fails(self): data = ['a', 1, 2.0] with self.assertRaises(pa.ArrowException): pa.from_pylist(data) + + def test_decimal(self): + data = 
[decimal.Decimal('1234.183'), decimal.Decimal('8094.234')] + type = pa.decimal(precision=7, scale=3) + arr = pa.from_pylist(data, type=type) + assert arr.to_pylist() == data + + def test_decimal_different_precisions(self): + data = [ + decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234') + ] + type = pa.decimal(precision=13, scale=3) + arr = pa.from_pylist(data, type=type) + assert arr.to_pylist() == data + + def test_decimal_no_scale(self): + data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')] + type = pa.decimal(precision=10) + arr = pa.from_pylist(data, type=type) + assert arr.to_pylist() == data + + def test_decimal_negative(self): + data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')] + type = pa.decimal(precision=10, scale=6) + arr = pa.from_pylist(data, type=type) + assert arr.to_pylist() == data + + def test_decimal_no_whole_part(self): + data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')] + type = pa.decimal(precision=7, scale=7) + arr = pa.from_pylist(data, type=type) + assert arr.to_pylist() == data + + def test_decimal_large_integer(self): + data = [decimal.Decimal('-394029506937548693.42983'), + decimal.Decimal('32358695912932.01033')] + type = pa.decimal(precision=23, scale=5) + arr = pa.from_pylist(data, type=type) + assert arr.to_pylist() == data diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 87c9c03d7da..0504e1ddb4f 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -20,6 +20,7 @@ import datetime import unittest +import decimal import numpy as np @@ -451,3 +452,72 @@ def test_strided_data_import(self): self._check_pandas_roundtrip(df) self._check_array_roundtrip(col) self._check_array_roundtrip(col, mask=strided_mask) + + def test_decimal_32_from_pandas(self): + expected = pd.DataFrame({ + 'decimals': [ + decimal.Decimal('-1234.123'), + decimal.Decimal('1234.439'), + ] 
+ }) + converted = A.Table.from_pandas(expected) + field = A.Field.from_py('decimals', A.decimal(7, 3)) + schema = A.Schema.from_fields([field]) + assert converted.schema.equals(schema) + + def test_decimal_32_to_pandas(self): + expected = pd.DataFrame({ + 'decimals': [ + decimal.Decimal('-1234.123'), + decimal.Decimal('1234.439'), + ] + }) + converted = A.Table.from_pandas(expected) + df = converted.to_pandas() + tm.assert_frame_equal(df, expected) + + def test_decimal_64_from_pandas(self): + expected = pd.DataFrame({ + 'decimals': [ + decimal.Decimal('-129934.123331'), + decimal.Decimal('129534.123731'), + ] + }) + converted = A.Table.from_pandas(expected) + field = A.Field.from_py('decimals', A.decimal(12, 6)) + schema = A.Schema.from_fields([field]) + assert converted.schema.equals(schema) + + def test_decimal_64_to_pandas(self): + expected = pd.DataFrame({ + 'decimals': [ + decimal.Decimal('-129934.123331'), + decimal.Decimal('129534.123731'), + ] + }) + converted = A.Table.from_pandas(expected) + df = converted.to_pandas() + tm.assert_frame_equal(df, expected) + + def test_decimal_128_from_pandas(self): + expected = pd.DataFrame({ + 'decimals': [ + decimal.Decimal('394092382910493.12341234678'), + -decimal.Decimal('314292388910493.12343437128'), + ] + }) + converted = A.Table.from_pandas(expected) + field = A.Field.from_py('decimals', A.decimal(26, 11)) + schema = A.Schema.from_fields([field]) + assert converted.schema.equals(schema) + + def test_decimal_128_to_pandas(self): + expected = pd.DataFrame({ + 'decimals': [ + decimal.Decimal('394092382910493.12341234678'), + -decimal.Decimal('314292388910493.12343437128'), + ] + }) + converted = A.Table.from_pandas(expected) + df = converted.to_pandas() + tm.assert_frame_equal(df, expected)