Skip to content

Commit

Permalink
Fix the precision when converting a decimal128 column to an arrow arr…
Browse files Browse the repository at this point in the history
…ay (#14230)

This PR fixes #13749. As discussed in the issue linked, the precision is unnecessarily being limited to 18 when converting decimal128 to arrow. 

Implementation-wise, I wasn't sure where is the best place to define the max precision for decimal types. Given that the decimal types don't store the precision in libcudf, I thought it would be better to not expose the max precision to the outside of to-arrow conversion. However, this led to replicating the definition of max precision across multiple places. Appreciate any suggestion.

Finally, it was suggested in #13749 (comment) to add some tests for round tripping. The existing tests look sufficient to me for round trip tests, so I just modified them instead of adding new tests. Please let me know if we need new tests in addition to them.

I'm also not sure whether any documentation should be fixed. Please let me know.

Authors:
  - Jihoon Son (https://github.com/jihoonson)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14230
  • Loading branch information
jihoonson authored Oct 28, 2023
1 parent 2bc454a commit 2a923df
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 6 deletions.
13 changes: 13 additions & 0 deletions cpp/include/cudf/detail/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,5 +194,18 @@ std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Return a maximum precision for a given type.
*
* @tparam T the type to get the maximum precision for
*/
template <typename T>
constexpr std::size_t max_precision()
{
auto constexpr num_bits = sizeof(T) * 8;
return std::floor(num_bits * std::log(2) / std::log(10));
}

} // namespace detail
} // namespace cudf
12 changes: 12 additions & 0 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,12 @@ struct column_metadata {
* @param stream CUDA stream used for device memory operations and kernel launches
* @param ar_mr arrow memory pool to allocate memory for arrow Table
* @return arrow Table generated from `input`
*
* @note For decimals, since the precision is not stored for them in libcudf,
* it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
* supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
* 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
* converted to Arrow decimal128 of the precision 38.
*/
std::shared_ptr<arrow::Table> to_arrow(table_view input,
std::vector<column_metadata> const& metadata = {},
Expand All @@ -145,6 +151,12 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
* @param stream CUDA stream used for device memory operations and kernel launches
* @param ar_mr arrow memory pool to allocate memory for arrow Scalar
* @return arrow Scalar generated from `input`
*
* @note For decimals, since the precision is not stored for them in libcudf,
* it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
* supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
* 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
* converted to Arrow decimal128 of the precision 38.
*/
std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
column_metadata const& metadata = {},
Expand Down
13 changes: 9 additions & 4 deletions cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,9 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal32>(
arrow::MemoryPool* ar_mr,
rmm::cuda_stream_view stream)
{
return unsupported_decimals_to_arrow<int32_t>(input, 9, ar_mr, stream);
using DeviceType = int32_t;
return unsupported_decimals_to_arrow<DeviceType>(
input, cudf::detail::max_precision<DeviceType>(), ar_mr, stream);
}

template <>
Expand All @@ -208,7 +210,9 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
arrow::MemoryPool* ar_mr,
rmm::cuda_stream_view stream)
{
return unsupported_decimals_to_arrow<int64_t>(input, 18, ar_mr, stream);
using DeviceType = int64_t;
return unsupported_decimals_to_arrow<DeviceType>(
input, cudf::detail::max_precision<DeviceType>(), ar_mr, stream);
}

template <>
Expand All @@ -219,7 +223,8 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal128>
arrow::MemoryPool* ar_mr,
rmm::cuda_stream_view stream)
{
using DeviceType = __int128_t;
using DeviceType = __int128_t;
auto const max_precision = cudf::detail::max_precision<DeviceType>();

rmm::device_uvector<DeviceType> buf(input.size(), stream);

Expand All @@ -234,7 +239,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal128>
CUDF_CUDA_TRY(cudaMemcpyAsync(
data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value()));

auto type = arrow::decimal(18, -input.type().scale());
auto type = arrow::decimal(max_precision, -input.type().scale());
auto mask = fetch_mask_buffer(input, ar_mr, stream);
auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, std::move(data_buffer)};
auto data = std::make_shared<arrow::ArrayData>(type, input.size(), buffers);
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/interop/arrow_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ template <typename T>
auto constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(T);

std::shared_ptr<arrow::Array> arr;
arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -scale),
arrow::Decimal128Builder decimal_builder(arrow::decimal(cudf::detail::max_precision<T>(), -scale),
arrow::default_memory_pool());

for (T i = 0; i < static_cast<T>(data.size() / BIT_WIDTH_RATIO); ++i) {
Expand Down
4 changes: 3 additions & 1 deletion cpp/tests/interop/to_arrow_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,9 @@ struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {};
TEST_F(ToArrowDecimalScalarTest, Basic)
{
auto const value{42};
auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type
auto const precision =
cudf::detail::max_precision<__int128_t>(); // cudf will convert to the widest-precision Arrow
// scalar of the type
int32_t const scale{4};

auto const cudf_scalar =
Expand Down

0 comments on commit 2a923df

Please sign in to comment.