Merged

92 commits (changes shown from all commits):
2eb3c9b  add test file for general compute (paleolimbot, Jun 17, 2022)
80293f7  scalar function creator (paleolimbot, Jun 17, 2022)
29d02d8  implement registration in R (paleolimbot, Jun 17, 2022)
190d059  sketch C++ UDF behaviour (paleolimbot, Jun 17, 2022)
5e3f682  working R execution (paleolimbot, Jun 17, 2022)
e129471  Update r/src/compute.cpp (paleolimbot, Jun 20, 2022)
83ad7ad  Update r/src/compute.cpp (paleolimbot, Jun 20, 2022)
94c0b2f  check Array argument (paleolimbot, Jun 20, 2022)
9e1a362  don't force arguments to Array (paleolimbot, Jun 20, 2022)
ddc0d46  remove unused code (paleolimbot, Jun 20, 2022)
fadf258  Update r/src/compute.cpp (paleolimbot, Jun 21, 2022)
2eb48ae  better names for variables (paleolimbot, Jun 21, 2022)
c29fc00  handle more cases on execution (paleolimbot, Jun 21, 2022)
b1c8cbf  use Resolver as an R function (paleolimbot, Jun 21, 2022)
cf98635  add a more user-friendly scalar function wrapper (paleolimbot, Jun 21, 2022)
c171da6  better fun resolution (paleolimbot, Jun 21, 2022)
e129028  add kernel state class (paleolimbot, Jun 21, 2022)
4631cb9  use ScalarKernel.data to keep function references (paleolimbot, Jun 21, 2022)
5b82d79  documentation for functions (paleolimbot, Jun 21, 2022)
99f7225  add the output type to the kernel context (paleolimbot, Jun 21, 2022)
80e5683  touch up (paleolimbot, Jun 21, 2022)
1e343a2  add reference page (paleolimbot, Jun 21, 2022)
df6ea0c  don't create lists in growables (paleolimbot, Jun 22, 2022)
36feaac  separate ExecPlan_prepare and ExecPlan_run (paleolimbot, Jun 22, 2022)
32e8d83  push as much exec plan execution into C++ as is possibe (paleolimbot, Jun 22, 2022)
8ff947b  test UDF in dplyr query (paleolimbot, Jun 22, 2022)
87402fc  clang-format (paleolimbot, Jun 22, 2022)
6d60921  maybe fixed sign compare error (paleolimbot, Jun 22, 2022)
e338f7d  limit scope on test (paleolimbot, Jun 22, 2022)
e0dd5c0  try to fix lintr errors (paleolimbot, Jun 22, 2022)
cdecb55  see if this example is the problem on 32-bit windows (paleolimbot, Jun 22, 2022)
65a5dc0  maybe fix on old windows (paleolimbot, Jun 22, 2022)
f5ec713  add larger-scale dataset test whilst executing a user-defined function (paleolimbot, Jul 6, 2022)
bb38274  better variable names in tests (paleolimbot, Jul 6, 2022)
565c5b5  base_scalar_function -> advanced_scalar_function (paleolimbot, Jul 6, 2022)
b96469b  get write_dataset() to work with user-defined function (paleolimbot, Jul 6, 2022)
0c139d1  better names for arrow_scalar_function() test (paleolimbot, Jul 6, 2022)
52880b1  change argument order for scalar function constructor (paleolimbot, Jul 7, 2022)
e8856b7  register_scalar_function -> register_user_defined_function (paleolimbot, Jul 7, 2022)
2665fdf  better argument names and inline comments (paleolimbot, Jul 7, 2022)
010ccf6  fix pkgdown reference (paleolimbot, Jul 7, 2022)
f732505  remove unused doc entry (paleolimbot, Jul 7, 2022)
4c01654  simplify detection of when we can and can't use SafeCallIntoR() (paleolimbot, Jul 7, 2022)
21a932a  abstract and document RunWithCapturedR usage (paleolimbot, Jul 7, 2022)
0c1b8cf  better failure mode for calling user-defined functions when we can't … (paleolimbot, Jul 7, 2022)
017f681  fix + clarify registration (paleolimbot, Jul 7, 2022)
85519a2  don't namespace rlang:: (paleolimbot, Jul 7, 2022)
9f251fc  clang-format (paleolimbot, Jul 7, 2022)
ebc0b84  constrain Arity specification to a fixed number of arguments (per fun… (paleolimbot, Jul 7, 2022)
ed735e1  Update r/src/compute.cpp (paleolimbot, Jul 8, 2022)
8877c12  Update r/src/compute.cpp (paleolimbot, Jul 8, 2022)
a89ce07  Update r/src/compute.cpp (paleolimbot, Jul 8, 2022)
f687451  clang-format (paleolimbot, Jul 8, 2022)
2e9e261  more readable path when RunWithCapturedR does not return a Result (paleolimbot, Jul 8, 2022)
514d91e  fix the void version of RunWithCapturedR (paleolimbot, Jul 8, 2022)
58c8573  fix kernel signature assignment (paleolimbot, Jul 8, 2022)
b4154af  fix for updated master (paleolimbot, Jul 8, 2022)
1ed6d25  inline some short variable definitions (paleolimbot, Jul 11, 2022)
88bf4d2  documentation updates (paleolimbot, Jul 11, 2022)
6f3d601  see if the lack of error on Windows is because it actually works (paleolimbot, Jul 11, 2022)
031ec64  test adding multiple kernels at once (paleolimbot, Jul 11, 2022)
0652ae0  cleaner handling of number of arguments in user-provided kernels (paleolimbot, Jul 11, 2022)
49261d6  improvements for readability and performance in safe-call-into-r.h (paleolimbot, Jul 11, 2022)
c1207eb  revert change to executor checking (paleolimbot, Jul 11, 2022)
72d650d  don't automatically cast output types (paleolimbot, Jul 11, 2022)
a1f8b53  document return value of advanced_fun (paleolimbot, Jul 11, 2022)
4acaa61  revert static variable change (paleolimbot, Jul 11, 2022)
7ccb23b  keep old error behaviour (paleolimbot, Jul 12, 2022)
6ff4fb2  fix one more safe call into R change (paleolimbot, Jul 12, 2022)
83aa148  unify test skipping based on whether or not we can runwithcapturedr a… (paleolimbot, Jul 12, 2022)
8a8955f  more skips aligned with run with captured R usage (paleolimbot, Jul 12, 2022)
0d3520a  nix the advanced interface (paleolimbot, Jul 14, 2022)
4ac9ec5  document auto_convert (paleolimbot, Jul 14, 2022)
ba34d1a  fix one more doc link (paleolimbot, Jul 14, 2022)
dfbdbc2  fix link in documentation (paleolimbot, Jul 14, 2022)
7fd6a77  back to the Python interface (paleolimbot, Jul 15, 2022)
b07e736  better example, fix pkgdown reference (paleolimbot, Jul 15, 2022)
c79aca5  fix doc (paleolimbot, Jul 15, 2022)
8d18754  maybe fix doc again (paleolimbot, Jul 15, 2022)
8d33b07  more doc fixes (paleolimbot, Jul 15, 2022)
abd938a  adapt for updated register_binding() (paleolimbot, Jul 16, 2022)
86c1c7e  improve comments in compute.R (paleolimbot, Jul 20, 2022)
652175f  maybe fix linter error (paleolimbot, Jul 20, 2022)
12b9721  better names for in_type/out_type sanitizers (paleolimbot, Jul 20, 2022)
259eed9  make sure an exec plan with head() works (paleolimbot, Jul 20, 2022)
1f8b248  clarify the `context` argument (paleolimbot, Jul 21, 2022)
510221f  don't allow rlang-style lambdas quite yet (paleolimbot, Jul 21, 2022)
e906632  check formals of fun (paleolimbot, Jul 21, 2022)
1805836  don't use glue::glue (paleolimbot, Jul 21, 2022)
aa9165f  Update r/tests/testthat/test-compute.R (paleolimbot, Jul 22, 2022)
7952710  revert very bad eager evaluation! (paleolimbot, Jul 22, 2022)
e31f2b1  fix test for plan with head() (paleolimbot, Jul 22, 2022)
3 changes: 3 additions & 0 deletions r/NAMESPACE
@@ -45,7 +45,9 @@ S3method(as_arrow_array,data.frame)
S3method(as_arrow_array,default)
S3method(as_arrow_array,pyarrow.lib.Array)
S3method(as_arrow_table,RecordBatch)
S3method(as_arrow_table,RecordBatchReader)
S3method(as_arrow_table,Table)
S3method(as_arrow_table,arrow_dplyr_query)
S3method(as_arrow_table,data.frame)
S3method(as_arrow_table,default)
S3method(as_arrow_table,pyarrow.lib.RecordBatch)
@@ -343,6 +345,7 @@ export(read_schema)
export(read_tsv_arrow)
export(record_batch)
export(register_extension_type)
export(register_scalar_function)
export(reregister_extension_type)
export(s3_bucket)
export(schema)
20 changes: 16 additions & 4 deletions r/R/arrowExports.R

Some generated files are not rendered by default.

176 changes: 176 additions & 0 deletions r/R/compute.R
@@ -306,3 +306,179 @@ cast_options <- function(safe = TRUE, ...) {
  )
  modifyList(opts, list(...))
}

#' Register user-defined functions
#'
#' These functions support calling R code from query engine execution
#' (i.e., a [dplyr::mutate()] or [dplyr::filter()] on a [Table] or [Dataset]).
#' Use [register_scalar_function()] to attach Arrow input and output types to an
#' R function and make it available for use in the dplyr interface and/or
#' [call_function()]. Scalar functions are currently the only type of
#' user-defined function supported. In Arrow, scalar functions must be
#' stateless and return output with the same shape (i.e., the same number
#' of rows) as the input.
#'
#' @param name The function name to be used in the dplyr bindings
#' @param in_type A [DataType] of the input type or a [schema()]
#' for functions with more than one argument. This signature will be used
#' to determine if this function is appropriate for a given set of arguments.
#' If this function is appropriate for more than one signature, pass a
#' `list()` of the above.
#' @param out_type A [DataType] of the output type or a function accepting
#' a single argument (`types`), which is a `list()` of [DataType]s. If a
#' function, it must return a [DataType].
#' @param fun An R function or rlang-style lambda expression. The function
#' will be called with a first argument `context` which is a `list()`
Review comment (Member):
This first argument `context` feels like a real 🦶 🔫. A few questions:

  • Does it need to be first?
  • Does it need to be called context?
  • What does failure look like if I forget to include context as an arg to my function? (I'm guessing it's not pretty.) Can we detect up front if someone has forgotten to put context in the function? Something like checking that length(formals(fun)) == length(as_schema(in_type)) + 1 and raising a useful error message if the check fails?
  • You explain what it contains, but what do I do with it? Is there something I would do with batch_size or output_type?
  • You say above that functions need to be stateless, but what happens if I assign something into context in my function?

Reply (Member, author):

A previous version of this PR didn't require the `context` argument when the equivalent of `auto_convert` was `TRUE`, but that raised the question of "why two APIs?" (and I agree...one wrapper-function scheme is easier to remember).

In its current form, the context argument provides the information needed for auto_convert to do its magic. When auto_convert is TRUE, you could also use it to do something like runif(n = context$batch_size). The python version also provides the memory pool here but we don't provide a way to use the memory pool for constructing arrays, so I didn't add it to the context object.

Because it's a list(), assignments won't have any effect outside fun. A future version may be an environment to avoid the extra unwind protects needed to allocate a new list for each call (but could be one with an overridden [[<- to prevent modification).

I added some text to the documentation for fun and disallowed lambdas for now, since a potential future workaround could be to not pass the context argument for an rlang/purrr style lambda (e.g., ~.x + .y would be the equivalent of function(context, x, y) x + y). I hesitate to add too much convenience functionality in this PR since it's already rather unwieldy.

I added a length(formals(fun)) check...you're right that the error message was awful.
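To make that concrete, here is a minimal editorial sketch (not code from this PR) of a kernel that uses context$batch_size as described above; the name add_jitter and the jitter amount are invented, and register_scalar_function() is assumed to behave exactly as documented in this diff:

library(arrow)

# Hypothetical UDF: add a little noise to a float64 column. With
# auto_convert = TRUE, `x` arrives as a plain R vector and the return value
# is converted back via as_arrow_array() using context$output_type.
register_scalar_function(
  "add_jitter",
  function(context, x) {
    # context$batch_size is the number of rows in the batch being processed,
    # so the noise vector always matches the input length
    x + runif(n = context$batch_size, max = 0.001)
  },
  in_type = float64(),
  out_type = float64(),
  auto_convert = TRUE
)

call_function("add_jitter", Array$create(c(1, 2, 3)))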

#' with elements `batch_size` (the expected length of the output) and
#' `output_type` (the required [DataType] of the output) that may be used
#' to ensure that the output has the correct type and length. Subsequent
#' arguments are passed by position as specified by `in_type`. If
#' `auto_convert` is `TRUE`, subsequent arguments are converted to
#' R vectors before being passed to `fun` and the output is automatically
#' constructed with the expected output type via [as_arrow_array()].
#' @param auto_convert Use `TRUE` to convert inputs before passing to `fun`
Review comment (Member):
Should TRUE be default?

Reply (Member, author):

I envision it being a lot more common to use auto_convert = TRUE and went back and forth on the default value a few times. I went with this because (1) it's what the Python bindings do and (2) forcing a user to "opt-in" to the auto-convert behaviour at least clues them in that there's something magical going on, even if they don't understand exactly what it is. I don't really have strong feelings about this; I guess FALSE just seemed like a safer default.
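For illustration (again editorial, not part of the PR), the two settings ask the function author to write rather different code; the names below are invented and assume the API as documented here:

library(arrow)

# auto_convert = FALSE (the default): `x` arrives as an Arrow object and
# `fun` is responsible for returning an Arrow Array of context$output_type.
register_scalar_function(
  "times_two_arrow",
  function(context, x) {
    as_arrow_array(as.vector(x) * 2, type = context$output_type)
  },
  in_type = float64(),
  out_type = float64()
)

# auto_convert = TRUE: `x` arrives as a plain R vector and the result is
# converted to the declared out_type automatically.
register_scalar_function(
  "times_two_r",
  function(context, x) x * 2,
  in_type = float64(),
  out_type = float64(),
  auto_convert = TRUE
)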

#' and construct an Array of the correct type from the output. Use this
#' option to write functions of R objects as opposed to functions of
#' Arrow R6 objects.
#'
#' @return `NULL`, invisibly
#' @export
#'
#' @examplesIf arrow_with_dataset()
#' library(dplyr, warn.conflicts = FALSE)
#'
#' some_model <- lm(mpg ~ disp + cyl, data = mtcars)
#' register_scalar_function(
#' "mtcars_predict_mpg",
#' function(context, disp, cyl) {
#' predict(some_model, newdata = data.frame(disp, cyl))
#' },
#' in_type = schema(disp = float64(), cyl = float64()),
#' out_type = float64(),
#' auto_convert = TRUE
#' )
#'
#' as_arrow_table(mtcars) %>%
#' transmute(mpg, mpg_predicted = mtcars_predict_mpg(disp, cyl)) %>%
#' collect() %>%
#' head()
#'
register_scalar_function <- function(name, fun, in_type, out_type,
                                     auto_convert = FALSE) {
  assert_that(is.string(name))

  scalar_function <- arrow_scalar_function(
    fun,
    in_type,
    out_type,
    auto_convert = auto_convert
  )

  # register with Arrow C++ function registry (enables its use in
  # call_function() and Expression$create())
  RegisterScalarUDF(name, scalar_function)

  # register with dplyr binding (enables its use in mutate(), filter(), etc.)
  register_binding(
    name,
    function(...) build_expr(name, ...),
    update_cache = TRUE
  )

  invisible(NULL)
}
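As a usage note (an editorial sketch, not part of the diff; the name nchar_udf is invented), a function registered this way becomes callable from both entry points mentioned in the comments above:

library(arrow)
library(dplyr, warn.conflicts = FALSE)

register_scalar_function(
  "nchar_udf",
  function(context, x) nchar(x),
  in_type = utf8(),
  out_type = int32(),
  auto_convert = TRUE
)

# via the Arrow C++ function registry
call_function("nchar_udf", Array$create(c("a", "bb", "ccc")))

# via the dplyr binding
record_batch(x = c("a", "bb", "ccc")) %>%
  mutate(n = nchar_udf(x)) %>%
  collect()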

arrow_scalar_function <- function(fun, in_type, out_type, auto_convert = FALSE) {
  assert_that(is.function(fun))

  # Create a small wrapper function that is easier to call from C++.
  # TODO(ARROW-17148): This wrapper could be implemented in C/C++ to
  # reduce evaluation overhead and generate prettier backtraces when
  # errors occur (probably using a similar approach to purrr).
  if (auto_convert) {
    wrapper_fun <- function(context, args) {
      args <- lapply(args, as.vector)
      result <- do.call(fun, c(list(context), args))
      as_arrow_array(result, type = context$output_type)
    }
  } else {
    wrapper_fun <- function(context, args) {
      do.call(fun, c(list(context), args))
    }
  }

  # in_type can be a list() if registering multiple kernels at once
  if (is.list(in_type)) {
    in_type <- lapply(in_type, in_type_as_schema)
  } else {
    in_type <- list(in_type_as_schema(in_type))
  }

  # out_type can be a list() if registering multiple kernels at once
  if (is.list(out_type)) {
    out_type <- lapply(out_type, out_type_as_function)
  } else {
    out_type <- list(out_type_as_function(out_type))
  }

  # recycle out_type (which is frequently length 1 even if multiple kernels
  # are being registered at once)
  out_type <- rep_len(out_type, length(in_type))

  # check n_kernels and number of args in fun
  n_kernels <- length(in_type)
  if (n_kernels == 0) {
    abort("Can't register user-defined scalar function with 0 kernels")
  }

  expected_n_args <- in_type[[1]]$num_fields + 1L
  fun_formals_have_dots <- any(names(formals(fun)) == "...")
  if (!fun_formals_have_dots && length(formals(fun)) != expected_n_args) {
    abort(
      sprintf(
        paste0(
          "Expected `fun` to accept %d argument(s)\n",
          "but found a function that accepts %d argument(s)\n",
          "Did you forget to include `context` as the first argument?"
        ),
        expected_n_args,
        length(formals(fun))
      )
    )
  }

  structure(
    list(
      wrapper_fun = wrapper_fun,
      in_type = in_type,
      out_type = out_type
    ),
    class = "arrow_scalar_function"
  )
}

# This function sanitizes the in_type argument for arrow_scalar_function(),
# which can be a data type (e.g., int32()), a field for a unary function
# or a schema() for functions accepting more than one argument. C++ expects
# a schema().
in_type_as_schema <- function(x) {
  if (inherits(x, "Field")) {
    schema(x)
  } else if (inherits(x, "DataType")) {
    schema(field("", x))
  } else {
    as_schema(x)
  }
}
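For illustration, the three spellings this helper normalizes (it is internal, so the calls below are only meant to show the mapping, e.g. when run from inside the package namespace):

in_type_as_schema(int32())                           # bare DataType -> one-field schema with an empty name
in_type_as_schema(field("x", int32()))               # Field         -> schema containing that field
in_type_as_schema(schema(x = int32(), y = utf8()))   # schema        -> passed through as_schema()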

# This function sanitizes the out_type argument for arrow_scalar_function(),
# which can be a data type (e.g., int32()) or a function of the input types.
# C++ currently expects a function.
out_type_as_function <- function(x) {
  if (is.function(x)) {
    x
  } else {
    x <- as_data_type(x)
    function(types) x
  }
}
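Putting the two sanitizers together, a single name can carry several kernels, with out_type given as a function so the output type follows the input type. This is an editorial sketch (the name identity_udf is invented) assuming the API as documented above:

library(arrow)

register_scalar_function(
  "identity_udf",
  function(context, x) x,
  in_type = list(schema(x = int32()), schema(x = float64())),
  out_type = function(types) types[[1]],  # mirror the input type
  auto_convert = TRUE
)

call_function("identity_udf", Array$create(1:3))           # int32 in, int32 out
call_function("identity_udf", Array$create(c(1.5, 2.5)))   # float64 in, float64 out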
2 changes: 1 addition & 1 deletion r/R/dplyr-collect.R
@@ -20,7 +20,7 @@

collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
  tryCatch(
-    out <- as_record_batch_reader(x)$read_table(),
+    out <- as_arrow_table(x),
    # n = 4 because we want the error to show up as being from collect()
    # and not handle_csv_read_error()
    error = function(e, call = caller_env(n = 4)) {
47 changes: 40 additions & 7 deletions r/R/dplyr-funcs.R
@@ -50,20 +50,28 @@ NULL
#' - `fun`: string function name
#' - `data`: `Expression` (these are all currently a single field)
#' - `options`: list of function options, as passed to call_function
#' @param update_cache Update .cache$functions at the time of registration.
Review comment (Member):
Newbie question, but what is this cache exactly? ".cache$functions" isn't exactly informative...

Reply (Member, author):

It's an excellent question...as far as I know, nobody remembers why this cache exists; I just didn't want to poke that bear here (I'll add a note about that).

#' the default is FALSE because the majority of usage is to register
#' bindings at package load, after which we create the cache once. The
#' reason why .cache$functions is needed in addition to nse_funcs for
#' non-aggregate functions could be revisited...it is currently used
#' as the data mask in mutate, filter, and aggregate (but not
#' summarise) because the data mask has to be a list.
#' @param registry An environment in which the functions should be
#' assigned.
#'
#' @return The previously registered binding or `NULL` if no previously
#' registered function existed.
#' @keywords internal
#'
-register_binding <- function(fun_name, fun, registry = nse_funcs) {
+register_binding <- function(fun_name, fun, registry = nse_funcs,
+                             update_cache = FALSE) {
  unqualified_name <- sub("^.*?:{+}", "", fun_name)

  previous_fun <- registry[[unqualified_name]]

  # if the unqualified name exists in the registry, warn
-  if (!is.null(fun) && !is.null(previous_fun)) {
+  if (!is.null(previous_fun)) {
    warn(
      paste0(
        "A \"",
@@ -73,11 +81,36 @@ register_binding <- function(fun_name, fun, registry = nse_funcs) {
  }

  # register both as `pkg::fun` and as `fun` if `qualified_name` is prefixed
-  if (grepl("::", fun_name)) {
-    registry[[unqualified_name]] <- fun
-    registry[[fun_name]] <- fun
-  } else {
-    registry[[unqualified_name]] <- fun
+  # unqualified_name and fun_name will be the same if not prefixed
+  registry[[unqualified_name]] <- fun
+  registry[[fun_name]] <- fun

  if (update_cache) {
    fun_cache <- .cache$functions
    fun_cache[[unqualified_name]] <- fun
    fun_cache[[fun_name]] <- fun
    .cache$functions <- fun_cache
  }

  invisible(previous_fun)
}

unregister_binding <- function(fun_name, registry = nse_funcs,
                               update_cache = FALSE) {
  unqualified_name <- sub("^.*?:{+}", "", fun_name)
  previous_fun <- registry[[unqualified_name]]

  rm(
    list = unique(c(fun_name, unqualified_name)),
    envir = registry,
    inherits = FALSE
  )

  if (update_cache) {
    fun_cache <- .cache$functions
    fun_cache[[unqualified_name]] <- NULL
    fun_cache[[fun_name]] <- NULL
    .cache$functions <- fun_cache
  }

  invisible(previous_fun)
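A hedged sketch of how these helpers fit together in practice (they are internal, so the arrow::: access below is purely illustrative, e.g. what a test might do; "mypkg::shout" is an invented name):

arrow:::register_binding(
  "mypkg::shout",
  function(x) arrow:::build_expr("utf8_upper", x),
  update_cache = TRUE  # also expose it through the mutate()/filter() data mask
)

# later (e.g., test teardown), remove it again and drop it from the cache
arrow:::unregister_binding("mypkg::shout", update_cache = TRUE)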
4 changes: 2 additions & 2 deletions r/R/feather.R
@@ -190,7 +190,7 @@ FeatherReader <- R6Class("FeatherReader",
  inherit = ArrowObject,
  public = list(
    Read = function(columns) {
-      ipc___feather___Reader__Read(self, columns, on_old_windows())
+      ipc___feather___Reader__Read(self, columns)
    },
    print = function(...) {
      cat("FeatherReader:\n")
@@ -211,5 +211,5 @@ names.FeatherReader <- function(x) x$column_names

FeatherReader$create <- function(file) {
  assert_is(file, "RandomAccessFile")
-  ipc___feather___Reader__Open(file, on_old_windows())
+  ipc___feather___Reader__Open(file)
}