From 4f01cc2edabe31b7504f75468dc2214d3fbf4bc6 Mon Sep 17 00:00:00 2001 From: Ezra Date: Mon, 12 Oct 2020 14:44:28 -0700 Subject: [PATCH] Add BasicDecimal256 Multiplication Support (PR for decimal256 branch, not master) (#8344) --- cpp/src/arrow/util/basic_decimal.cc | 167 ++++++++++++++++++------ cpp/src/arrow/util/basic_decimal.h | 36 ++++- cpp/src/arrow/util/decimal.cc | 5 + cpp/src/arrow/util/decimal.h | 3 + cpp/src/arrow/util/decimal_benchmark.cc | 16 +++ cpp/src/arrow/util/decimal_test.cc | 63 +++++++++ 6 files changed, 245 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc index ac85bd088ad25..ea247b74ea5e1 100644 --- a/cpp/src/arrow/util/basic_decimal.cc +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -123,7 +123,7 @@ static const BasicDecimal128 ScaleMultipliersHalf[] = { #ifdef ARROW_USE_NATIVE_INT128 static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF; #else -static constexpr uint64_t kIntMask = 0xFFFFFFFF; +static constexpr uint64_t kInt32Mask = 0xFFFFFFFF; #endif // same as ScaleMultipliers[38] - 1 @@ -254,67 +254,125 @@ BasicDecimal128& BasicDecimal128::operator>>=(uint32_t bits) { namespace { -// TODO: Remove this guard once it's used by BasicDecimal256 -#ifndef ARROW_USE_NATIVE_INT128 -// This method losslessly multiplies x and y into a 128 bit unsigned integer -// whose high bits will be stored in hi and low bits in lo. -void ExtendAndMultiplyUint64(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) { +// Convenience wrapper type over 128 bit unsigned integers. We opt not to +// replace the uint128_t type in int128_internal.h because it would require +// significantly more implementation work to be done. This class merely +// provides the minimum necessary set of functions to perform 128+ bit +// multiplication operations when there may or may not be native support. #ifdef ARROW_USE_NATIVE_INT128 - const __uint128_t r = static_cast<__uint128_t>(x) * y; - *lo = r & kInt64Mask; - *hi = r >> 64; +struct uint128_t { + uint128_t() {} + uint128_t(uint64_t hi, uint64_t lo) : val_((static_cast<__uint128_t>(hi) << 64) | lo) {} + uint128_t(const BasicDecimal128& decimal) { + val_ = (static_cast<__uint128_t>(decimal.high_bits()) << 64) | decimal.low_bits(); + } + + uint64_t hi() { return val_ >> 64; } + uint64_t lo() { return val_ & kInt64Mask; } + + uint128_t& operator+=(const uint128_t& other) { + val_ += other.val_; + return *this; + } + + uint128_t& operator*=(const uint128_t& other) { + val_ *= other.val_; + return *this; + } + + __uint128_t val_; +}; + #else - // If we can't use a native fallback, perform multiplication +// Multiply two 64 bit word components into a 128 bit result, with high bits +// stored in hi and low bits in lo. +inline void ExtendAndMultiply(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) { + // Perform multiplication on two 64 bit words x and y into a 128 bit result // by splitting up x and y into 32 bit high/low bit components, // allowing us to represent the multiplication as // x * y = x_lo * y_lo + x_hi * y_lo * 2^32 + y_hi * x_lo * 2^32 - // + x_hi * y_hi * 2^64. + // + x_hi * y_hi * 2^64 // - // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi. + // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi // Therefore, // lo_lo is (x_lo * y_lo)_lo, // lo_hi is ((x_lo * y_lo)_hi + (x_hi * y_lo)_lo + (x_lo * y_hi)_lo)_lo, // hi_lo is ((x_hi * y_hi)_lo + (x_hi * y_lo)_hi + (x_lo * y_hi)_hi)_hi, // hi_hi is (x_hi * y_hi)_hi - const uint64_t x_lo = x & kIntMask; - const uint64_t y_lo = y & kIntMask; + const uint64_t x_lo = x & kInt32Mask; + const uint64_t y_lo = y & kInt32Mask; const uint64_t x_hi = x >> 32; const uint64_t y_hi = y >> 32; const uint64_t t = x_lo * y_lo; - const uint64_t t_lo = t & kIntMask; + const uint64_t t_lo = t & kInt32Mask; const uint64_t t_hi = t >> 32; const uint64_t u = x_hi * y_lo + t_hi; - const uint64_t u_lo = u & kIntMask; + const uint64_t u_lo = u & kInt32Mask; const uint64_t u_hi = u >> 32; const uint64_t v = x_lo * y_hi + u_lo; const uint64_t v_hi = v >> 32; *hi = x_hi * y_hi + u_hi + v_hi; - *lo = (v << 32) | t_lo; -#endif + *lo = (v << 32) + t_lo; } -#endif -void MultiplyUint128(uint64_t x_hi, uint64_t x_lo, uint64_t y_hi, uint64_t y_lo, - uint64_t* hi, uint64_t* lo) { -#ifdef ARROW_USE_NATIVE_INT128 - const __uint128_t x = (static_cast<__uint128_t>(x_hi) << 64) | x_lo; - const __uint128_t y = (static_cast<__uint128_t>(y_hi) << 64) | y_lo; - const __uint128_t r = x * y; - *lo = r & kInt64Mask; - *hi = r >> 64; -#else - // To perform 128 bit multiplication without a native fallback - // we first perform lossless 64 bit multiplication of the low - // bits, and then add x_hi * y_lo and x_lo * y_hi to the high - // bits. Note that we can skip adding x_hi * y_hi because it - // always will be over 128 bits. - ExtendAndMultiplyUint64(x_lo, y_lo, hi, lo); - *hi += (x_hi * y_lo) + (x_lo * y_hi); +struct uint128_t { + uint128_t() {} + uint128_t(uint64_t hi, uint64_t lo) : hi_(hi), lo_(lo) {} + uint128_t(const BasicDecimal128& decimal) { + hi_ = decimal.high_bits(); + lo_ = decimal.low_bits(); + } + + uint64_t hi() const { return hi_; } + uint64_t lo() const { return lo_; } + + uint128_t& operator+=(const uint128_t& other) { + // To deduce the carry bit, we perform "65 bit" addition on the low bits and + // seeing if the resulting high bit is 1. This is accomplished by shifting the + // low bits to the right by 1 (chopping off the lowest bit), then adding 1 if the + // result of adding the two chopped bits would have produced a carry. + uint64_t carry = (((lo_ & other.lo_) & 1) + (lo_ >> 1) + (other.lo_ >> 1)) >> 63; + hi_ += other.hi_ + carry; + lo_ += other.lo_; + return *this; + } + + uint128_t& operator*=(const uint128_t& other) { + uint128_t r; + ExtendAndMultiply(lo_, other.lo_, &r.hi_, &r.lo_); + r.hi_ += (hi_ * other.lo_) + (lo_ * other.hi_); + *this = r; + return *this; + } + + uint64_t hi_; + uint64_t lo_; +}; #endif + +// Multiplies two N * 64 bit unsigned integer types, represented by a uint64_t +// array into a same sized output. Elements in the array should be in +// little endian order, and output will be the same. Overflow in multiplication +// will result in the lower N * 64 bits of the result being set. +template +inline void MultiplyUnsignedArray(const std::array& lh, + const std::array& rh, + std::array* result) { + for (int j = 0; j < N; ++j) { + uint64_t carry = 0; + for (int i = 0; i < N - j; ++i) { + uint128_t tmp(lh[i]); + tmp *= uint128_t(rh[j]); + tmp += uint128_t((*result)[i + j]); + tmp += uint128_t(carry); + (*result)[i + j] = tmp.lo(); + carry = tmp.hi(); + } + } } } // namespace @@ -325,10 +383,10 @@ BasicDecimal128& BasicDecimal128::operator*=(const BasicDecimal128& right) { const bool negate = Sign() != right.Sign(); BasicDecimal128 x = BasicDecimal128::Abs(*this); BasicDecimal128 y = BasicDecimal128::Abs(right); - uint64_t hi; - MultiplyUint128(x.high_bits(), x.low_bits(), y.high_bits(), y.low_bits(), &hi, - &low_bits_); - high_bits_ = hi; + uint128_t r(x); + r *= y; + high_bits_ = r.hi(); + low_bits_ = r.lo(); if (negate) { Negate(); } @@ -800,6 +858,13 @@ BasicDecimal256& BasicDecimal256::Negate() { return *this; } +BasicDecimal256& BasicDecimal256::Abs() { return *this < 0 ? Negate() : *this; } + +BasicDecimal256 BasicDecimal256::Abs(const BasicDecimal256& in) { + BasicDecimal256 result(in); + return result.Abs(); +} + std::array BasicDecimal256::ToBytes() const { std::array out{{0}}; ToBytes(out.data()); @@ -821,12 +886,36 @@ void BasicDecimal256::ToBytes(uint8_t* out) const { #endif } +BasicDecimal256& BasicDecimal256::operator*=(const BasicDecimal256& right) { + // Since the max value of BasicDecimal256 is supposed to be 1e76 - 1 and the + // min the negation taking the absolute values here should always be safe. + const bool negate = Sign() != right.Sign(); + BasicDecimal256 x = BasicDecimal256::Abs(*this); + BasicDecimal256 y = BasicDecimal256::Abs(right); + + uint128_t r_hi; + uint128_t r_lo; + std::array res({0, 0, 0, 0}); + MultiplyUnsignedArray<4>(x.little_endian_array_, y.little_endian_array_, &res); + little_endian_array_ = res; + if (negate) { + Negate(); + } + return *this; +} + DecimalStatus BasicDecimal256::Rescale(int32_t original_scale, int32_t new_scale, BasicDecimal256* out) const { // TODO: implement. return DecimalStatus::kSuccess; } +BasicDecimal256 operator*(const BasicDecimal256& left, const BasicDecimal256& right) { + BasicDecimal256 result = left; + result *= right; + return result; +} + bool operator==(const BasicDecimal256& left, const BasicDecimal256& right) { return left.little_endian_array() == right.little_endian_array(); } diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h index 54c8808f65e17..b11412684bb66 100644 --- a/cpp/src/arrow/util/basic_decimal.h +++ b/cpp/src/arrow/util/basic_decimal.h @@ -109,10 +109,10 @@ class ARROW_EXPORT BasicDecimal128 { BasicDecimal128& operator>>=(uint32_t bits); /// \brief Get the high bits of the two's complement representation of the number. - inline int64_t high_bits() const { return high_bits_; } + inline constexpr int64_t high_bits() const { return high_bits_; } /// \brief Get the low bits of the two's complement representation of the number. - inline uint64_t low_bits() const { return low_bits_; } + inline constexpr uint64_t low_bits() const { return low_bits_; } /// \brief Return the raw bytes of the value in native-endian byte order. std::array ToBytes() const; @@ -179,6 +179,14 @@ ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left, const BasicDecimal128& right); class ARROW_EXPORT BasicDecimal256 { + private: + // Due to a bug in clang, we have to declare the extend method prior to its + // usage. + template + inline static constexpr uint64_t extend(T low_bits) noexcept { + return low_bits >= T() ? uint64_t{0} : ~uint64_t{0}; + } + public: /// \brief Create a BasicDecimal256 from the two's complement representation. constexpr BasicDecimal256(const std::array& little_endian_array) noexcept @@ -195,6 +203,10 @@ class ARROW_EXPORT BasicDecimal256 { : little_endian_array_({static_cast(value), extend(value), extend(value), extend(value)}) {} + constexpr BasicDecimal256(const BasicDecimal128& value) noexcept + : little_endian_array_({value.low_bits(), static_cast(value.high_bits()), + extend(value.high_bits()), extend(value.high_bits())}) {} + /// \brief Create a BasicDecimal256 from an array of bytes. Bytes are assumed to be in /// native-endian byte order. explicit BasicDecimal256(const uint8_t* bytes); @@ -202,6 +214,12 @@ class ARROW_EXPORT BasicDecimal256 { /// \brief Negate the current value (in-place) BasicDecimal256& Negate(); + /// \brief Absolute value (in-place) + BasicDecimal256& Abs(); + + /// \brief Absolute value + static BasicDecimal256 Abs(const BasicDecimal256& left); + /// \brief Get the bits of the two's complement representation of the number. The 4 /// elements are in little endian order. The bits within each uint64_t element are in /// native endian order. For example, @@ -220,11 +238,14 @@ class ARROW_EXPORT BasicDecimal256 { DecimalStatus Rescale(int32_t original_scale, int32_t new_scale, BasicDecimal256* out) const; - private: - template - inline static constexpr uint64_t extend(T low_bits) noexcept { - return low_bits >= T() ? uint64_t{0} : ~uint64_t{0}; + inline int64_t Sign() const { + return 1 | (static_cast(little_endian_array_[3]) >> 63); } + + /// \brief Multiply this number by another number. The result is truncated to 256 bits. + BasicDecimal256& operator*=(const BasicDecimal256& right); + + private: std::array little_endian_array_; }; @@ -234,4 +255,7 @@ ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& ARROW_EXPORT bool operator<=(const BasicDecimal256& left, const BasicDecimal256& right); ARROW_EXPORT bool operator>(const BasicDecimal256& left, const BasicDecimal256& right); ARROW_EXPORT bool operator>=(const BasicDecimal256& left, const BasicDecimal256& right); + +ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left, + const BasicDecimal256& right); } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index e2f3a07c2ba13..c38e66ca81004 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -721,4 +721,9 @@ Result Decimal256::FromString(const char* s) { Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const { return arrow::ToArrowStatus(dstatus, 256); } + +std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) { + os << decimal.ToIntegerString(); + return os; +} } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index a883797008610..3b159bc8d88fa 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -227,6 +227,9 @@ class ARROW_EXPORT Decimal256 : public BasicDecimal256 { return std::move(out); } + friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os, + const Decimal256& decimal); + private: /// Converts internal error code to Status Status ToArrowStatus(DecimalStatus dstatus) const; diff --git a/cpp/src/arrow/util/decimal_benchmark.cc b/cpp/src/arrow/util/decimal_benchmark.cc index c1acefc268e25..abdcb21a23f76 100644 --- a/cpp/src/arrow/util/decimal_benchmark.cc +++ b/cpp/src/arrow/util/decimal_benchmark.cc @@ -148,6 +148,21 @@ static void BinaryMathOp(benchmark::State& state) { // NOLINT non-const referen state.SetItemsProcessed(state.iterations() * kValueSize); } +static void BinaryMathOp256(benchmark::State& state) { // NOLINT non-const reference + std::vector v1, v2; + for (uint64_t x = 0; x < kValueSize; x++) { + v1.push_back(BasicDecimal256({100 + x, 100 + x, 100 + x, 100 + x})); + v2.push_back(BasicDecimal256({200 + x, 200 + x, 200 + x, 200 + x})); + } + + for (auto _ : state) { + for (int x = 0; x < kValueSize; x += 5) { + benchmark::DoNotOptimize(v1[x + 2] * v2[x + 2]); + } + } + state.SetItemsProcessed(state.iterations() * kValueSize); +} + static void UnaryOp(benchmark::State& state) { // NOLINT non-const reference std::vector v; for (int x = 0; x < kValueSize; x++) { @@ -191,6 +206,7 @@ static void BinaryBitOp(benchmark::State& state) { // NOLINT non-const referenc BENCHMARK(FromString); BENCHMARK(ToString); BENCHMARK(BinaryMathOp); +BENCHMARK(BinaryMathOp256); BENCHMARK(BinaryMathOpAggregate); BENCHMARK(BinaryCompareOp); BENCHMARK(BinaryCompareOpConstant); diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc index 372dbee67b078..3c2f5335e7431 100644 --- a/cpp/src/arrow/util/decimal_test.cc +++ b/cpp/src/arrow/util/decimal_test.cc @@ -26,6 +26,7 @@ #include #include +#include #include "arrow/status.h" #include "arrow/testing/gtest_util.h" @@ -37,6 +38,7 @@ namespace arrow { using internal::int128_t; +using internal::uint128_t; class DecimalTestFixture : public ::testing::Test { public: @@ -1233,6 +1235,67 @@ TEST(Decimal256Test, ConstructibleFromBool) { EXPECT_EQ(Decimal256(1), Decimal256(true)); } +Decimal256 Decimal256FromInt128(int128_t value) { + return Decimal256(Decimal128(static_cast(value >> 64), + static_cast(value & 0xFFFFFFFFFFFFFFFFULL))); +} + +TEST(Decimal256Test, Multiply) { + using boost::multiprecision::int256_t; + using boost::multiprecision::uint256_t; + + ASSERT_EQ(Decimal256(60501), Decimal256(301) * Decimal256(201)); + + ASSERT_EQ(Decimal256(-60501), Decimal256(-301) * Decimal256(201)); + + ASSERT_EQ(Decimal256(-60501), Decimal256(301) * Decimal256(-201)); + + ASSERT_EQ(Decimal256(60501), Decimal256(-301) * Decimal256(-201)); + + // Test some random numbers. + std::vector left; + std::vector right; + for (auto x : GetRandomNumbers(16)) { + for (auto y : GetRandomNumbers(16)) { + for (auto z : GetRandomNumbers(16)) { + for (auto w : GetRandomNumbers(16)) { + // Test two 128 bit numbers which have a large amount of bits set. + int128_t l = static_cast(x) << 96 | static_cast(y) << 64 | + static_cast(z) << 32 | static_cast(w); + int128_t r = static_cast(w) << 96 | static_cast(z) << 64 | + static_cast(y) << 32 | static_cast(x); + int256_t expected = int256_t(l) * r; + Decimal256 actual = Decimal256FromInt128(l) * Decimal256FromInt128(r); + ASSERT_EQ(expected.str(), actual.ToIntegerString()) + << " " << int256_t(l).str() << " * " << int256_t(r).str(); + // Test a 96 bit number against a 160 bit number. + int128_t s = l >> 32; + uint256_t b = uint256_t(r) << 32; + Decimal256 b_dec = + Decimal256FromInt128(r) * Decimal256(static_cast(1) << 32); + ASSERT_EQ(b.str(), b_dec.ToIntegerString()) << int256_t(r).str(); + expected = int256_t(s) * b; + actual = Decimal256FromInt128(s) * b_dec; + ASSERT_EQ(expected.str(), actual.ToIntegerString()) + << " " << int256_t(s).str() << " * " << int256_t(b).str(); + } + } + } + } + + // Test some edge cases + for (auto x : std::vector{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) { + for (auto y : + std::vector{-INT32_MAX, -32, -2, -1, 0, 1, 2, 32, INT32_MAX}) { + Decimal256 decimal_x = Decimal256FromInt128(x); + Decimal256 decimal_y = Decimal256FromInt128(y); + Decimal256 result = decimal_x * decimal_y; + EXPECT_EQ(Decimal256FromInt128(x * y), result) + << " x: " << decimal_x << " y: " << decimal_y; + } + } +} + class Decimal256ToStringTest : public ::testing::TestWithParam {}; TEST_P(Decimal256ToStringTest, ToString) {