From 4f01cc2edabe31b7504f75468dc2214d3fbf4bc6 Mon Sep 17 00:00:00 2001
From: Ezra <eumen@google.com>
Date: Mon, 12 Oct 2020 14:44:28 -0700
Subject: [PATCH] Add BasicDecimal256 Multiplication Support (PR for decimal256
 branch, not master) (#8344)

---
 cpp/src/arrow/util/basic_decimal.cc     | 167 ++++++++++++++++++------
 cpp/src/arrow/util/basic_decimal.h      |  36 ++++-
 cpp/src/arrow/util/decimal.cc           |   5 +
 cpp/src/arrow/util/decimal.h            |   3 +
 cpp/src/arrow/util/decimal_benchmark.cc |  16 +++
 cpp/src/arrow/util/decimal_test.cc      |  63 +++++++++
 6 files changed, 245 insertions(+), 45 deletions(-)

diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc
index ac85bd088ad25..ea247b74ea5e1 100644
--- a/cpp/src/arrow/util/basic_decimal.cc
+++ b/cpp/src/arrow/util/basic_decimal.cc
@@ -123,7 +123,7 @@ static const BasicDecimal128 ScaleMultipliersHalf[] = {
 #ifdef ARROW_USE_NATIVE_INT128
 static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF;
 #else
-static constexpr uint64_t kIntMask = 0xFFFFFFFF;
+static constexpr uint64_t kInt32Mask = 0xFFFFFFFF;
 #endif
 
 // same as ScaleMultipliers[38] - 1
@@ -254,67 +254,125 @@ BasicDecimal128& BasicDecimal128::operator>>=(uint32_t bits) {
 
 namespace {
 
-// TODO: Remove this guard once it's used by BasicDecimal256
-#ifndef ARROW_USE_NATIVE_INT128
-// This method losslessly multiplies x and y into a 128 bit unsigned integer
-// whose high bits will be stored in hi and low bits in lo.
-void ExtendAndMultiplyUint64(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) {
+// Convenience wrapper type over 128 bit unsigned integers. We opt not to
+// replace the uint128_t type in int128_internal.h because it would require
+// significantly more implementation work to be done. This class merely
+// provides the minimum necessary set of functions to perform 128+ bit
+// multiplication operations when there may or may not be native support.
 #ifdef ARROW_USE_NATIVE_INT128
-  const __uint128_t r = static_cast<__uint128_t>(x) * y;
-  *lo = r & kInt64Mask;
-  *hi = r >> 64;
+struct uint128_t {
+  uint128_t() {}
+  uint128_t(uint64_t hi, uint64_t lo) : val_((static_cast<__uint128_t>(hi) << 64) | lo) {}
+  uint128_t(const BasicDecimal128& decimal) {
+    val_ = (static_cast<__uint128_t>(decimal.high_bits()) << 64) | decimal.low_bits();
+  }
+
+  uint64_t hi() { return val_ >> 64; }
+  uint64_t lo() { return val_ & kInt64Mask; }
+
+  uint128_t& operator+=(const uint128_t& other) {
+    val_ += other.val_;
+    return *this;
+  }
+
+  uint128_t& operator*=(const uint128_t& other) {
+    val_ *= other.val_;
+    return *this;
+  }
+
+  __uint128_t val_;
+};
+
 #else
-  // If we can't use a native fallback, perform multiplication
+// Multiply two 64 bit word components into a 128 bit result, with high bits
+// stored in hi and low bits in lo.
+inline void ExtendAndMultiply(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) {
+  // Perform multiplication on two 64 bit words x and y into a 128 bit result
   // by splitting up x and y into 32 bit high/low bit components,
   // allowing us to represent the multiplication as
   // x * y = x_lo * y_lo + x_hi * y_lo * 2^32 + y_hi * x_lo * 2^32
-  // + x_hi * y_hi * 2^64.
+  // + x_hi * y_hi * 2^64
   //
-  // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi.
+  // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi
   // Therefore,
   // lo_lo is (x_lo * y_lo)_lo,
   // lo_hi is ((x_lo * y_lo)_hi + (x_hi * y_lo)_lo + (x_lo * y_hi)_lo)_lo,
   // hi_lo is ((x_hi * y_hi)_lo + (x_hi * y_lo)_hi + (x_lo * y_hi)_hi)_hi,
   // hi_hi is (x_hi * y_hi)_hi
-  const uint64_t x_lo = x & kIntMask;
-  const uint64_t y_lo = y & kIntMask;
+  const uint64_t x_lo = x & kInt32Mask;
+  const uint64_t y_lo = y & kInt32Mask;
   const uint64_t x_hi = x >> 32;
   const uint64_t y_hi = y >> 32;
 
   const uint64_t t = x_lo * y_lo;
-  const uint64_t t_lo = t & kIntMask;
+  const uint64_t t_lo = t & kInt32Mask;
   const uint64_t t_hi = t >> 32;
 
   const uint64_t u = x_hi * y_lo + t_hi;
-  const uint64_t u_lo = u & kIntMask;
+  const uint64_t u_lo = u & kInt32Mask;
   const uint64_t u_hi = u >> 32;
 
   const uint64_t v = x_lo * y_hi + u_lo;
   const uint64_t v_hi = v >> 32;
 
   *hi = x_hi * y_hi + u_hi + v_hi;
-  *lo = (v << 32) | t_lo;
-#endif
+  *lo = (v << 32) + t_lo;
 }
-#endif
 
-void MultiplyUint128(uint64_t x_hi, uint64_t x_lo, uint64_t y_hi, uint64_t y_lo,
-                     uint64_t* hi, uint64_t* lo) {
-#ifdef ARROW_USE_NATIVE_INT128
-  const __uint128_t x = (static_cast<__uint128_t>(x_hi) << 64) | x_lo;
-  const __uint128_t y = (static_cast<__uint128_t>(y_hi) << 64) | y_lo;
-  const __uint128_t r = x * y;
-  *lo = r & kInt64Mask;
-  *hi = r >> 64;
-#else
-  // To perform 128 bit multiplication without a native fallback
-  // we first perform lossless 64 bit multiplication of the low
-  // bits, and then add x_hi * y_lo and x_lo * y_hi to the high
-  // bits. Note that we can skip adding x_hi * y_hi because it
-  // always will be over 128 bits.
-  ExtendAndMultiplyUint64(x_lo, y_lo, hi, lo);
-  *hi += (x_hi * y_lo) + (x_lo * y_hi);
+struct uint128_t {
+  uint128_t() {}
+  uint128_t(uint64_t hi, uint64_t lo) : hi_(hi), lo_(lo) {}
+  uint128_t(const BasicDecimal128& decimal) {
+    hi_ = decimal.high_bits();
+    lo_ = decimal.low_bits();
+  }
+
+  uint64_t hi() const { return hi_; }
+  uint64_t lo() const { return lo_; }
+
+  uint128_t& operator+=(const uint128_t& other) {
+    // To deduce the carry bit, we perform "65 bit" addition on the low bits and
+    // seeing if the resulting high bit is 1. This is accomplished by shifting the
+    // low bits to the right by 1 (chopping off the lowest bit), then adding 1 if the
+    // result of adding the two chopped bits would have produced a carry.
+    uint64_t carry = (((lo_ & other.lo_) & 1) + (lo_ >> 1) + (other.lo_ >> 1)) >> 63;
+    hi_ += other.hi_ + carry;
+    lo_ += other.lo_;
+    return *this;
+  }
+
+  uint128_t& operator*=(const uint128_t& other) {
+    uint128_t r;
+    ExtendAndMultiply(lo_, other.lo_, &r.hi_, &r.lo_);
+    r.hi_ += (hi_ * other.lo_) + (lo_ * other.hi_);
+    *this = r;
+    return *this;
+  }
+
+  uint64_t hi_;
+  uint64_t lo_;
+};
 #endif
+
+// Multiplies two N * 64 bit unsigned integer types, represented by a uint64_t
+// array into a same sized output. Elements in the array should be in
+// little endian order, and output will be the same. Overflow in multiplication
+// will result in the lower N * 64 bits of the result being set.
+template <int N>
+inline void MultiplyUnsignedArray(const std::array<uint64_t, N>& lh,
+                                  const std::array<uint64_t, N>& rh,
+                                  std::array<uint64_t, N>* result) {
+  for (int j = 0; j < N; ++j) {
+    uint64_t carry = 0;
+    for (int i = 0; i < N - j; ++i) {
+      uint128_t tmp(lh[i]);
+      tmp *= uint128_t(rh[j]);
+      tmp += uint128_t((*result)[i + j]);
+      tmp += uint128_t(carry);
+      (*result)[i + j] = tmp.lo();
+      carry = tmp.hi();
+    }
+  }
 }
 
 }  // namespace
@@ -325,10 +383,10 @@ BasicDecimal128& BasicDecimal128::operator*=(const BasicDecimal128& right) {
   const bool negate = Sign() != right.Sign();
   BasicDecimal128 x = BasicDecimal128::Abs(*this);
   BasicDecimal128 y = BasicDecimal128::Abs(right);
-  uint64_t hi;
-  MultiplyUint128(x.high_bits(), x.low_bits(), y.high_bits(), y.low_bits(), &hi,
-                  &low_bits_);
-  high_bits_ = hi;
+  uint128_t r(x);
+  r *= y;
+  high_bits_ = r.hi();
+  low_bits_ = r.lo();
   if (negate) {
     Negate();
   }
@@ -800,6 +858,13 @@ BasicDecimal256& BasicDecimal256::Negate() {
   return *this;
 }
 
+BasicDecimal256& BasicDecimal256::Abs() { return *this < 0 ? Negate() : *this; }
+
+BasicDecimal256 BasicDecimal256::Abs(const BasicDecimal256& in) {
+  BasicDecimal256 result(in);
+  return result.Abs();
+}
+
 std::array<uint8_t, 32> BasicDecimal256::ToBytes() const {
   std::array<uint8_t, 32> out{{0}};
   ToBytes(out.data());
@@ -821,12 +886,36 @@ void BasicDecimal256::ToBytes(uint8_t* out) const {
 #endif
 }
 
+BasicDecimal256& BasicDecimal256::operator*=(const BasicDecimal256& right) {
+  // Since the max value of BasicDecimal256 is supposed to be 1e76 - 1 and the
+  // min the negation taking the absolute values here should always be safe.
+  const bool negate = Sign() != right.Sign();
+  BasicDecimal256 x = BasicDecimal256::Abs(*this);
+  BasicDecimal256 y = BasicDecimal256::Abs(right);
+
+  uint128_t r_hi;
+  uint128_t r_lo;
+  std::array<uint64_t, 4> res({0, 0, 0, 0});
+  MultiplyUnsignedArray<4>(x.little_endian_array_, y.little_endian_array_, &res);
+  little_endian_array_ = res;
+  if (negate) {
+    Negate();
+  }
+  return *this;
+}
+
 DecimalStatus BasicDecimal256::Rescale(int32_t original_scale, int32_t new_scale,
                                        BasicDecimal256* out) const {
   // TODO: implement.
   return DecimalStatus::kSuccess;
 }
 
+BasicDecimal256 operator*(const BasicDecimal256& left, const BasicDecimal256& right) {
+  BasicDecimal256 result = left;
+  result *= right;
+  return result;
+}
+
 bool operator==(const BasicDecimal256& left, const BasicDecimal256& right) {
   return left.little_endian_array() == right.little_endian_array();
 }
diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h
index 54c8808f65e17..b11412684bb66 100644
--- a/cpp/src/arrow/util/basic_decimal.h
+++ b/cpp/src/arrow/util/basic_decimal.h
@@ -109,10 +109,10 @@ class ARROW_EXPORT BasicDecimal128 {
   BasicDecimal128& operator>>=(uint32_t bits);
 
   /// \brief Get the high bits of the two's complement representation of the number.
-  inline int64_t high_bits() const { return high_bits_; }
+  inline constexpr int64_t high_bits() const { return high_bits_; }
 
   /// \brief Get the low bits of the two's complement representation of the number.
-  inline uint64_t low_bits() const { return low_bits_; }
+  inline constexpr uint64_t low_bits() const { return low_bits_; }
 
   /// \brief Return the raw bytes of the value in native-endian byte order.
   std::array<uint8_t, 16> ToBytes() const;
@@ -179,6 +179,14 @@ ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left,
                                        const BasicDecimal128& right);
 
 class ARROW_EXPORT BasicDecimal256 {
+ private:
+  // Due to a bug in clang, we have to declare the extend method prior to its
+  // usage.
+  template <typename T>
+  inline static constexpr uint64_t extend(T low_bits) noexcept {
+    return low_bits >= T() ? uint64_t{0} : ~uint64_t{0};
+  }
+
  public:
   /// \brief Create a BasicDecimal256 from the two's complement representation.
   constexpr BasicDecimal256(const std::array<uint64_t, 4>& little_endian_array) noexcept
@@ -195,6 +203,10 @@ class ARROW_EXPORT BasicDecimal256 {
       : little_endian_array_({static_cast<uint64_t>(value), extend(value), extend(value),
                               extend(value)}) {}
 
+  constexpr BasicDecimal256(const BasicDecimal128& value) noexcept
+      : little_endian_array_({value.low_bits(), static_cast<uint64_t>(value.high_bits()),
+                              extend(value.high_bits()), extend(value.high_bits())}) {}
+
   /// \brief Create a BasicDecimal256 from an array of bytes. Bytes are assumed to be in
   /// native-endian byte order.
   explicit BasicDecimal256(const uint8_t* bytes);
@@ -202,6 +214,12 @@ class ARROW_EXPORT BasicDecimal256 {
   /// \brief Negate the current value (in-place)
   BasicDecimal256& Negate();
 
+  /// \brief Absolute value (in-place)
+  BasicDecimal256& Abs();
+
+  /// \brief Absolute value
+  static BasicDecimal256 Abs(const BasicDecimal256& left);
+
   /// \brief Get the bits of the two's complement representation of the number. The 4
   /// elements are in little endian order. The bits within each uint64_t element are in
   /// native endian order. For example,
@@ -220,11 +238,14 @@ class ARROW_EXPORT BasicDecimal256 {
   DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
                         BasicDecimal256* out) const;
 
- private:
-  template <typename T>
-  inline static constexpr uint64_t extend(T low_bits) noexcept {
-    return low_bits >= T() ? uint64_t{0} : ~uint64_t{0};
+  inline int64_t Sign() const {
+    return 1 | (static_cast<int64_t>(little_endian_array_[3]) >> 63);
   }
+
+  /// \brief Multiply this number by another number. The result is truncated to 256 bits.
+  BasicDecimal256& operator*=(const BasicDecimal256& right);
+
+ private:
   std::array<uint64_t, 4> little_endian_array_;
 };
 
@@ -234,4 +255,7 @@ ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256&
 ARROW_EXPORT bool operator<=(const BasicDecimal256& left, const BasicDecimal256& right);
 ARROW_EXPORT bool operator>(const BasicDecimal256& left, const BasicDecimal256& right);
 ARROW_EXPORT bool operator>=(const BasicDecimal256& left, const BasicDecimal256& right);
+
+ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
+                                       const BasicDecimal256& right);
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
index e2f3a07c2ba13..c38e66ca81004 100644
--- a/cpp/src/arrow/util/decimal.cc
+++ b/cpp/src/arrow/util/decimal.cc
@@ -721,4 +721,9 @@ Result<Decimal256> Decimal256::FromString(const char* s) {
 Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const {
   return arrow::ToArrowStatus(dstatus, 256);
 }
+
+std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) {
+  os << decimal.ToIntegerString();
+  return os;
+}
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h
index a883797008610..3b159bc8d88fa 100644
--- a/cpp/src/arrow/util/decimal.h
+++ b/cpp/src/arrow/util/decimal.h
@@ -227,6 +227,9 @@ class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
     return std::move(out);
   }
 
+  friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
+                                               const Decimal256& decimal);
+
  private:
   /// Converts internal error code to Status
   Status ToArrowStatus(DecimalStatus dstatus) const;
diff --git a/cpp/src/arrow/util/decimal_benchmark.cc b/cpp/src/arrow/util/decimal_benchmark.cc
index c1acefc268e25..abdcb21a23f76 100644
--- a/cpp/src/arrow/util/decimal_benchmark.cc
+++ b/cpp/src/arrow/util/decimal_benchmark.cc
@@ -148,6 +148,21 @@ static void BinaryMathOp(benchmark::State& state) {  // NOLINT non-const referen
   state.SetItemsProcessed(state.iterations() * kValueSize);
 }
 
+static void BinaryMathOp256(benchmark::State& state) {  // NOLINT non-const reference
+  std::vector<BasicDecimal256> v1, v2;
+  for (uint64_t x = 0; x < kValueSize; x++) {
+    v1.push_back(BasicDecimal256({100 + x, 100 + x, 100 + x, 100 + x}));
+    v2.push_back(BasicDecimal256({200 + x, 200 + x, 200 + x, 200 + x}));
+  }
+
+  for (auto _ : state) {
+    for (int x = 0; x < kValueSize; x += 5) {
+      benchmark::DoNotOptimize(v1[x + 2] * v2[x + 2]);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * kValueSize);
+}
+
 static void UnaryOp(benchmark::State& state) {  // NOLINT non-const reference
   std::vector<BasicDecimal128> v;
   for (int x = 0; x < kValueSize; x++) {
@@ -191,6 +206,7 @@ static void BinaryBitOp(benchmark::State& state) {  // NOLINT non-const referenc
 BENCHMARK(FromString);
 BENCHMARK(ToString);
 BENCHMARK(BinaryMathOp);
+BENCHMARK(BinaryMathOp256);
 BENCHMARK(BinaryMathOpAggregate);
 BENCHMARK(BinaryCompareOp);
 BENCHMARK(BinaryCompareOpConstant);
diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc
index 372dbee67b078..3c2f5335e7431 100644
--- a/cpp/src/arrow/util/decimal_test.cc
+++ b/cpp/src/arrow/util/decimal_test.cc
@@ -26,6 +26,7 @@
 #include <vector>
 
 #include <gtest/gtest.h>
+#include <boost/multiprecision/cpp_int.hpp>
 
 #include "arrow/status.h"
 #include "arrow/testing/gtest_util.h"
@@ -37,6 +38,7 @@
 namespace arrow {
 
 using internal::int128_t;
+using internal::uint128_t;
 
 class DecimalTestFixture : public ::testing::Test {
  public:
@@ -1233,6 +1235,67 @@ TEST(Decimal256Test, ConstructibleFromBool) {
   EXPECT_EQ(Decimal256(1), Decimal256(true));
 }
 
+Decimal256 Decimal256FromInt128(int128_t value) {
+  return Decimal256(Decimal128(static_cast<int64_t>(value >> 64),
+                               static_cast<uint64_t>(value & 0xFFFFFFFFFFFFFFFFULL)));
+}
+
+TEST(Decimal256Test, Multiply) {
+  using boost::multiprecision::int256_t;
+  using boost::multiprecision::uint256_t;
+
+  ASSERT_EQ(Decimal256(60501), Decimal256(301) * Decimal256(201));
+
+  ASSERT_EQ(Decimal256(-60501), Decimal256(-301) * Decimal256(201));
+
+  ASSERT_EQ(Decimal256(-60501), Decimal256(301) * Decimal256(-201));
+
+  ASSERT_EQ(Decimal256(60501), Decimal256(-301) * Decimal256(-201));
+
+  // Test some random numbers.
+  std::vector<int128_t> left;
+  std::vector<int128_t> right;
+  for (auto x : GetRandomNumbers<Int32Type>(16)) {
+    for (auto y : GetRandomNumbers<Int32Type>(16)) {
+      for (auto z : GetRandomNumbers<Int32Type>(16)) {
+        for (auto w : GetRandomNumbers<Int32Type>(16)) {
+          // Test two 128 bit numbers which have a large amount of bits set.
+          int128_t l = static_cast<uint128_t>(x) << 96 | static_cast<uint128_t>(y) << 64 |
+                       static_cast<uint128_t>(z) << 32 | static_cast<uint128_t>(w);
+          int128_t r = static_cast<uint128_t>(w) << 96 | static_cast<uint128_t>(z) << 64 |
+                       static_cast<uint128_t>(y) << 32 | static_cast<uint128_t>(x);
+          int256_t expected = int256_t(l) * r;
+          Decimal256 actual = Decimal256FromInt128(l) * Decimal256FromInt128(r);
+          ASSERT_EQ(expected.str(), actual.ToIntegerString())
+              << " " << int256_t(l).str() << " * " << int256_t(r).str();
+          // Test a 96 bit number against a 160 bit number.
+          int128_t s = l >> 32;
+          uint256_t b = uint256_t(r) << 32;
+          Decimal256 b_dec =
+              Decimal256FromInt128(r) * Decimal256(static_cast<uint64_t>(1) << 32);
+          ASSERT_EQ(b.str(), b_dec.ToIntegerString()) << int256_t(r).str();
+          expected = int256_t(s) * b;
+          actual = Decimal256FromInt128(s) * b_dec;
+          ASSERT_EQ(expected.str(), actual.ToIntegerString())
+              << " " << int256_t(s).str() << " * " << int256_t(b).str();
+        }
+      }
+    }
+  }
+
+  // Test some edge cases
+  for (auto x : std::vector<int128_t>{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) {
+    for (auto y :
+         std::vector<int128_t>{-INT32_MAX, -32, -2, -1, 0, 1, 2, 32, INT32_MAX}) {
+      Decimal256 decimal_x = Decimal256FromInt128(x);
+      Decimal256 decimal_y = Decimal256FromInt128(y);
+      Decimal256 result = decimal_x * decimal_y;
+      EXPECT_EQ(Decimal256FromInt128(x * y), result)
+          << " x: " << decimal_x << " y: " << decimal_y;
+    }
+  }
+}
+
 class Decimal256ToStringTest : public ::testing::TestWithParam<ToStringTestParam> {};
 
 TEST_P(Decimal256ToStringTest, ToString) {