ARROW-14844: [R] Implement decimal256()

@jonkeane & @romainfrancois this is the 2nd attempt at implementing `decimal256()`. The first one is #11805. Closes #11898 from dragosmg/ARROW-14844_decimal256_take2. Lead-authored-by: Dragos Moldovan-Grünfeld <dragos.mold@gmail.com> Co-authored-by: Dragoș Moldovan-Grünfeld <dragos.mold@gmail.com> Signed-off-by: Jonathan Keane <jkeane@gmail.com>
apache · Dec 20, 2021 · cfcce5a · cfcce5a
1 parent 281dee5
commit cfcce5a
Show file tree

Hide file tree

Showing 14 changed files with 237 additions and 48 deletions.
diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -215,6 +215,7 @@ export(date32)
 export(date64)
 export(decimal)
 export(decimal128)
+export(decimal256)
 export(default_memory_pool)
 export(dictionary)
 export(duration)

diff --git a/r/NEWS.md b/r/NEWS.md
@@ -19,11 +19,12 @@
 
 # arrow 6.0.1.9000
 
+* Added `decimal256()`. Updated `decimal()`, which now calls `decimal256()` or `decimal128()` based on the value of the `precision` argument.
 * updated `write_csv_arrow()` to follow the signature of `readr::write_csv()`. The following arguments are supported:
   * `file` identical to `sink`
   * `col_names` identical to `include_header`
   * other arguments are currently unsupported, but the function errors with a meaningful message.
-* Added `decimal128()` (identical to `decimal()`) as the name is more explicit and updated docs to encourage its use. 
+* Added `decimal128()` (~~identical to `decimal()`~~) as the name is more explicit and updated docs to encourage its use. 
 * Source builds now by default use `pkg-config` to search for system dependencies (such as `libz`) and link to them 
 if present. To retain the previous behaviour of downloading and building all dependencies, set `ARROW_DEPENDENCY_SOURCE=BUNDLED`. 
 

diff --git a/r/R/array.R b/r/R/array.R
@@ -187,8 +187,30 @@ Array$create <- function(x, type = NULL) {
     }
     return(out)
   }
-  vec_to_Array(x, type)
+
+  if (is.null(type)) {
+    return(vec_to_Array(x, type))
+  }
+
+  # when a type is given, try to create a vector of the desired type. If that
+  # fails, attempt to cast and if casting is successful, suggest to the user
+  # to try casting manually. If the casting fails, return the original error
+  # message.
+  tryCatch(
+    vec_to_Array(x, type),
+    error = function(cnd) {
+      attempt <- try(vec_to_Array(x, NULL)$cast(type), silent = TRUE)
+      abort(
+        c(conditionMessage(cnd),
+          i = if (!inherits(attempt, "try-error")) {
+            "You might want to try casting manually with `Array$create(...)$cast(...)`."
+          }
+        )
+      )
+    }
+  )
 }
+
 #' @include arrowExports.R
 Array$import_from_c <- ImportArray
 

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
diff --git a/r/R/type.R b/r/R/type.R
@@ -157,8 +157,11 @@ DecimalType <- R6Class("DecimalType",
     scale = function() DecimalType__scale(self)
   )
 )
+
 Decimal128Type <- R6Class("Decimal128Type", inherit = DecimalType)
 
+Decimal256Type <- R6Class("Decimal256Type", inherit = DecimalType)
+
 NestedType <- R6Class("NestedType", inherit = DataType)
 
 #' Apache Arrow data types
@@ -188,7 +191,7 @@ NestedType <- R6Class("NestedType", inherit = DataType)
 #' `bit64::integer64` object) by setting `options(arrow.int64_downcast =
 #' FALSE)`.
 #'
-#' `decimal128()` creates a `decimal128` type. Arrow decimals are fixed-point
+#' `decimal128()` creates a `Decimal128Type`. Arrow decimals are fixed-point
 #' decimal numbers encoded as a scalar integer. The `precision` is the number of
 #' significant digits that the decimal type can represent; the `scale` is the
 #' number of digits after the decimal point. For example, the number 1234.567
@@ -204,21 +207,30 @@ NestedType <- R6Class("NestedType", inherit = DataType)
 #' negative, `scale` causes the number to be expressed using scientific notation
 #' and power of 10.
 #'
-#' `decimal()` is identical to `decimal128()`, defined for backward compatibility.
-#' Use `decimal128()` as the name  is more informative and `decimal()` might be
-#' deprecated in the future.
+#' `decimal256()` creates a `Decimal256Type`, which allows for higher maximum
+#' precision. For most use cases, the maximum precision offered by `Decimal128Type`
+#' is sufficient, and it will result in a more compact and more efficient encoding.
+#'
+#' `decimal()` creates either a `Decimal128Type` or a `Decimal256Type`
+#' depending on the value for `precision`. If `precision` is greater than 38 a
+#' `Decimal256Type` is returned, otherwise a `Decimal128Type`.
+#'
+#' Use `decimal128()` or `decimal256()` as the names are more informative than
+#' `decimal()`.
 #'
 #' @param unit For time/timestamp types, the time unit. `time32()` can take
 #' either "s" or "ms", while `time64()` can be "us" or "ns". `timestamp()` can
 #' take any of those four values.
 #' @param timezone For `timestamp()`, an optional time zone string.
 #' @param byte_width byte width for `FixedSizeBinary` type.
 #' @param list_size list size for `FixedSizeList` type.
-#' @param precision For `decimal()`, `decimal128()` the number of significant
-#'    digits the arrow `decimal` type can represent. The maximum precision for
-#'    `decimal()` and `decimal128()` is 38 significant digits.
-#' @param scale For `decimal()` and `decimal128()`, the number of digits after
-#'    the decimal point. It can be negative.
+#' @param precision For `decimal()`, `decimal128()`, and `decimal256()` the
+#'    number of significant digits the arrow `decimal` type can represent. The
+#'    maximum precision for `decimal128()` is 38 significant digits, while for
+#'    `decimal256()` it is 76 digits. `decimal()` will use it to choose which
+#'    type of decimal to return.
+#' @param scale For `decimal()`, `decimal128()`, and `decimal256()` the number
+#'    of digits after the decimal point. It can be negative.
 #' @param type For `list_of()`, a data type to make a list-of-type
 #' @param ... For `struct()`, a named list of types to define the struct columns
 #'
@@ -399,25 +411,49 @@ timestamp <- function(unit = c("s", "ms", "us", "ns"), timezone = "") {
   Timestamp__initialize(unit, timezone)
 }
 
+#' @rdname data-type
+#' @export
+decimal <- function(precision, scale) {
+  args <- check_decimal_args(precision, scale)
+
+  if (args$precision > 38) {
+    decimal256(args$precision, args$scale)
+  } else {
+    decimal128(args$precision, args$scale)
+  }
+}
+
 #' @rdname data-type
 #' @export
 decimal128 <- function(precision, scale) {
+  args <- check_decimal_args(precision, scale)
+  Decimal128Type__initialize(args$precision, args$scale)
+}
+
+#' @rdname data-type
+#' @export
+decimal256 <- function(precision, scale) {
+  args <- check_decimal_args(precision, scale)
+  Decimal256Type__initialize(args$precision, args$scale)
+}
+
+check_decimal_args <- function(precision, scale) {
   if (is.numeric(precision)) {
-    precision <- as.integer(precision)
+    precision <- vec_cast(precision, to = integer())
+    vctrs::vec_assert(precision, size = 1L)
   } else {
-    stop('"precision" must be an integer', call. = FALSE)
+    stop("`precision` must be an integer", call. = FALSE)
   }
+
   if (is.numeric(scale)) {
-    scale <- as.integer(scale)
+    scale <- vec_cast(scale, to = integer())
+    vctrs::vec_assert(scale, size = 1L)
   } else {
-    stop('"scale" must be an integer', call. = FALSE)
+    stop("`scale` must be an integer", call. = FALSE)
   }
-  Decimal128Type__initialize(precision, scale)
-}
 
-#' @rdname data-type
-#' @export
-decimal <- decimal128
+  list(precision = precision, scale = scale)
+}
 
 StructType <- R6Class("StructType",
   inherit = NestedType,
@@ -520,6 +556,7 @@ canonical_type_str <- function(type_str) {
     null = "null",
     timestamp = "timestamp",
     decimal128 = "decimal128",
+    decimal256 = "decimal256",
     struct = "struct",
     list_of = "list",
     list = "list",

diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
@@ -960,6 +960,7 @@ class Converter_Timestamp : public Converter_Time<value_type, TimestampType> {
   }
 };
 
+template <typename Type>
 class Converter_Decimal : public Converter {
  public:
   explicit Converter_Decimal(const std::shared_ptr<ChunkedArray>& chunked_array)
@@ -974,8 +975,9 @@ class Converter_Decimal : public Converter {
 
   Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
                            R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+    using DecimalArray = typename TypeTraits<Type>::ArrayType;
     auto p_data = REAL(data) + start;
-    const auto& decimals_arr = checked_cast<const arrow::Decimal128Array&>(*array);
+    const auto& decimals_arr = checked_cast<const DecimalArray&>(*array);
 
     auto ingest_one = [&](R_xlen_t i) {
       p_data[i] = std::stod(decimals_arr.FormatValue(i).c_str());
@@ -1275,7 +1277,10 @@ std::shared_ptr<Converter> Converter::Make(
       }
 
     case Type::DECIMAL128:
-      return std::make_shared<arrow::r::Converter_Decimal>(chunked_array);
+      return std::make_shared<arrow::r::Converter_Decimal<Decimal128Type>>(chunked_array);
+
+    case Type::DECIMAL256:
+      return std::make_shared<arrow::r::Converter_Decimal<Decimal256Type>>(chunked_array);
 
       // nested
     case Type::STRUCT:
@@ -1303,7 +1308,7 @@ std::shared_ptr<Converter> Converter::Make(
       break;
   }
 
-  cpp11::stop("cannot handle Array of type ", type->name().c_str());
+  cpp11::stop("cannot handle Array of type <%s>", type->name().c_str());
 }
 
 std::shared_ptr<ChunkedArray> to_chunks(const std::shared_ptr<Array>& array) {

diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp
@@ -84,6 +84,8 @@ const char* r6_class_name<arrow::DataType>::get(
 
     case Type::DECIMAL128:
       return "Decimal128Type";
+    case Type::DECIMAL256:
+      return "Decimal256Type";
 
     case Type::LIST:
       return "ListType";
@@ -182,6 +184,13 @@ std::shared_ptr<arrow::DataType> Decimal128Type__initialize(int32_t precision,
   return ValueOrStop(arrow::Decimal128Type::Make(precision, scale));
 }
 
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Decimal256Type__initialize(int32_t precision,
+                                                            int32_t scale) {
+  // Use the builder that validates inputs
+  return ValueOrStop(arrow::Decimal256Type::Make(precision, scale));
+}
+
 // [[arrow::export]]
 std::shared_ptr<arrow::DataType> FixedSizeBinary__initialize(R_xlen_t byte_width) {
   if (byte_width == NA_INTEGER) {

diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
@@ -801,6 +801,24 @@ test_that("Array$create() should have helpful error", {
   expect_error(Array$create(list()), "Requires at least one element to infer")
   expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector")
   expect_error(Array$create(list(char, num, char)), "Expecting a character vector")
+
+  # hint at casting if direct fails and casting looks like it might work
+  expect_error(
+    Array$create(as.double(1:10), type = decimal(4, 2)),
+    "You might want to try casting manually"
+  )
+
+  expect_error(
+    Array$create(1:10, type = decimal(12, 2)),
+    "You might want to try casting manually"
+  )
+
+  a <- expect_error(Array$create("one", int32()))
+  b <- expect_error(vec_to_Array("one", int32()))
+  # the captured conditions (errors) are not identical, but their messages should be
+  expect_s3_class(a, "rlang_error")
+  expect_s3_class(b, "simpleError")
+  expect_equal(a$message, b$message)
 })
 
 test_that("Array$View() (ARROW-6542)", {