Fix the precision when converting a decimal128 column to an arrow arr…

…ay (#14230) This PR fixes #13749. As discussed in the issue linked, the precision is unnecessarily being limited to 18 when converting decimal128 to arrow. Implementation-wise, I wasn't sure where is the best place to define the max precision for decimal types. Given that the decimal types don't store the precision in libcudf, I thought it would be better to not expose the max precision to the outside of to-arrow conversion. However, this led to replicating the definition of max precision across multiple places. Appreciate any suggestion. Finally, it was suggested in #13749 (comment) to add some tests for round tripping. The existing tests look sufficient to me for round trip tests, so I just modified them instead of adding new tests. Please let me know if we need new tests in addition to them. I'm also not sure whether any documentation should be fixed. Please let me know. Authors: - Jihoon Son (https://github.com/jihoonson) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) URL: #14230
rapidsai · Oct 28, 2023 · 2a923df · 2a923df
1 parent 2bc454a
commit 2a923df
Show file tree

Hide file tree

Showing 5 changed files with 38 additions and 6 deletions.
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp
@@ -194,5 +194,18 @@ std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
 std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Return a maximum precision for a given type.
+ *
+ * @tparam T the type to get the maximum precision for
+ */
+template <typename T>
+constexpr std::size_t max_precision()
+{
+  auto constexpr num_bits = sizeof(T) * 8;
+  return std::floor(num_bits * std::log(2) / std::log(10));
+}
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
@@ -129,6 +129,12 @@ struct column_metadata {
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param ar_mr arrow memory pool to allocate memory for arrow Table
  * @return arrow Table generated from `input`
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf,
+ * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
+ * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
+ * converted to Arrow decimal128 of the precision 38.
  */
 std::shared_ptr<arrow::Table> to_arrow(table_view input,
                                        std::vector<column_metadata> const& metadata = {},
@@ -145,6 +151,12 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param ar_mr arrow memory pool to allocate memory for arrow Scalar
  * @return arrow Scalar generated from `input`
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf,
+ * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
+ * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
+ * converted to Arrow decimal128 of the precision 38.
  */
 std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
                                         column_metadata const& metadata = {},

diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
@@ -197,7 +197,9 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal32>(
   arrow::MemoryPool* ar_mr,
   rmm::cuda_stream_view stream)
 {
-  return unsupported_decimals_to_arrow<int32_t>(input, 9, ar_mr, stream);
+  using DeviceType = int32_t;
+  return unsupported_decimals_to_arrow<DeviceType>(
+    input, cudf::detail::max_precision<DeviceType>(), ar_mr, stream);
 }
 
 template <>
@@ -208,7 +210,9 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
   arrow::MemoryPool* ar_mr,
   rmm::cuda_stream_view stream)
 {
-  return unsupported_decimals_to_arrow<int64_t>(input, 18, ar_mr, stream);
+  using DeviceType = int64_t;
+  return unsupported_decimals_to_arrow<DeviceType>(
+    input, cudf::detail::max_precision<DeviceType>(), ar_mr, stream);
 }
 
 template <>
@@ -219,7 +223,8 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal128>
   arrow::MemoryPool* ar_mr,
   rmm::cuda_stream_view stream)
 {
-  using DeviceType = __int128_t;
+  using DeviceType         = __int128_t;
+  auto const max_precision = cudf::detail::max_precision<DeviceType>();
 
   rmm::device_uvector<DeviceType> buf(input.size(), stream);
 
@@ -234,7 +239,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal128>
   CUDF_CUDA_TRY(cudaMemcpyAsync(
     data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value()));
 
-  auto type    = arrow::decimal(18, -input.type().scale());
+  auto type    = arrow::decimal(max_precision, -input.type().scale());
   auto mask    = fetch_mask_buffer(input, ar_mr, stream);
   auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, std::move(data_buffer)};
   auto data    = std::make_shared<arrow::ArrayData>(type, input.size(), buffers);

diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp
@@ -186,7 +186,7 @@ template <typename T>
   auto constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(T);
 
   std::shared_ptr<arrow::Array> arr;
-  arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -scale),
+  arrow::Decimal128Builder decimal_builder(arrow::decimal(cudf::detail::max_precision<T>(), -scale),
                                            arrow::default_memory_pool());
 
   for (T i = 0; i < static_cast<T>(data.size() / BIT_WIDTH_RATIO); ++i) {

diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
@@ -604,7 +604,9 @@ struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {};
 TEST_F(ToArrowDecimalScalarTest, Basic)
 {
   auto const value{42};
-  auto const precision{18};  // cudf will convert to the widest-precision Arrow scalar of the type
+  auto const precision =
+    cudf::detail::max_precision<__int128_t>();  // cudf will convert to the widest-precision Arrow
+                                                // scalar of the type
   int32_t const scale{4};
 
   auto const cudf_scalar =