-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[R] Allow functions with {{pkg::}} prefixes #30124
Comments
Neal Richardson / @nealrichardson: |
Jonathan Keane / @jonkeane: Getting the execution exactly right and being able to select the right |
Dewey Dunnington / @paleolimbot: Erroring out could look like this (only slightly better than the current error, though).
A slightly different definition of
|
Dewey Dunnington / @paleolimbot: Translations for specific packages as R6 classes? Advantage here just that we get documentation that isn’t checked by CRAN for free. BaseTranslations <- R6::R6Class(
"BaseTranslations",
public = list(
#' Arrow compute translation for `base::sum()`
#'
#' @param x A field reference or scalar
sum = function(x) {
arrow::Expression$create(
"sum",
if (inherits(x, "Expression")) x else sum(x)
)
}
)
) ...and a totally untested interface for keeping track of function translations from multiple packages. Helper to detect the package that a user intended based on their current search path detect_package_name <- function(name, env = parent.frame()) {
tryCatch({
obj <- get(name, env = env, mode = "function", inherits = TRUE)
obj_env <- environment(obj)
if (is.null(obj_env)) {
"base"
} else if (".packageName" %in% names(obj_env)) {
obj_env$.packageName
} else {
NULL
}
}, error = function(e) {
NULL
})
} Using R6 for my basic implementation of a function registry that includes the intended package for a translation. These can be stacked and recombined similar to how environments can be stacked and combined so that each set of translations can have a limited scope. TranslationRegistry <- R6::R6Class(
"TranslationRegistry",
public = list(
initialize = function(parent = NULL) {
private$parent <- parent
private$functions <- new.env(parent = emptyenv())
private$functions$`_all_functions` <- new.env(parent = emptyenv())
},
register_translation = function(name, fun, package = NULL, env = parent.frame()) {
if (is.null(package)) {
package <- detect_package_name(name)
}
if (!is.null(package) && !(package %in% names(private$functions))) {
private$functions[[package]] <- new.env(parent = emptyenv())
}
if (!is.null(package)) {
private$functions[[package]][[name]] <- fun
}
private$functions$`_all_functions`[[name]] <- fun
invisible(self)
},
get_translation = function(name, package = NULL, env = parent.frame()) {
if (is.null(package)) {
package_det <- detect_package_name(name, env = env)
} else {
package_det <- package
}
if (is.null(package_det)) {
package_det <- "_all_functions"
}
if (is.null(private$parent)) {
tryCatch(get(name, envir = get(package_det, envir = private$functions)), error = function(e) {
self$abort_function_not_found(name, package)
})
} else {
tryCatch(
get(name, envir = get(package_det, envir = private$functions)),
error = function(e) private$parent$get_translation(name, package = package, env = env)
)
}
},
with_search = function(order = search()) {
order <- rev(intersect(gsub("package:", "", order), names(private$functions)))
funs_with_pkg <- unlist(
lapply(
setdiff(names(order), "_all_functions"),
function(pkg) names(private$functions[[pkg]])
)
)
funs_without_pkg <- setdiff(names(private$functions$`_all_functions`), funs_with_pkg)
out_names <- c(funs_without_pkg, funs_with_pkg)
out <- vector("list", length(out_names))
names(out) <- out_names
i <- 1L
for (pkg in order) {
for (name in names(private$functions[[pkg]])) {
out[[i]] <- private$functions[[pkg]][[name]]
i <- i + 1L
}
}
out
},
abort_function_not_found = function(name, package) {
if (is.null(package)) {
msg <- glue::glue("No translations registered for function `{ name }()`")
} else if (!(package %in% names(private$functions))) {
msg <- glue::glue("No translations registered for package '{ package }'")
} else {
msg <- glue::glue("No translations registered for function `{ package }::{ name }()`")
}
rlang::abort(msg , class = "function_not_found")
}
),
private = list(
parent = NULL,
functions = NULL
)
) In practice this would be an under-the-hood thing accessed by regular functions translations <- TranslationRegistry$new()
register_translation <- function(name, fun, package = NULL, env = parent.frame()) {
translations$register_translation(name, fun, package = package, env = env)
}
get_translation <- function(name, package = NULL, env = parent.frame()) {
translations$get_translation(name, package = package, env = env)
}
translations_with_search <- function(order = search()) {
translations$with_search(order)
} The basic idea is that you can register some functions and get a decent error when requesting one that doesn’t exist. register_translation("sum", function(x) {
arrow::Expression$create(
"sum",
if (inherits(x, "Expression")) x else sum(x)
)
})
get_translation("sum")
#> function(x) {
#> arrow::Expression$create(
#> "sum",
#> if (inherits(x, "Expression")) x else sum(x)
#> )
#> }
get_translation("sum", package = "base")
#> function(x) {
#> arrow::Expression$create(
#> "sum",
#> if (inherits(x, "Expression")) x else sum(x)
#> )
#> }
get_translation("summ")
#> Error: No translations registered for function `summ()`
get_translation("sum", package = "basee")
#> Error: No translations registered for package 'basee'
get_translation("summ", package = "base")
#> Error: No translations registered for function `base::summ()` Need to override register_translation("::", function(lhs, rhs) {
get_translation(
as.character(substitute(rhs)),
as.character(substitute(lhs))
)
}) Use rlang::eval_tidy(
rlang::quo(base::sum(some_field)),
data = c(
list(some_field = arrow::Expression$field_ref("some_field")),
translations_with_search()
)
)
#> Expression
#> sum(some_field, {skip_nulls=true, min_count=1}) |
Dragoș Moldovan-Grünfeld / @dragosmg:
|
Dragoș Moldovan-Grünfeld / @dragosmg:
|
Neal Richardson / @nealrichardson: |
{}Proposed approach{}:
pkg::fun()
name;nse_funcs
requiring indirect mappingnse_funcs
requiring direct mapping (unary and binary bindings)agg_funcs
for use withsummarise()
going forward we should be using pkg::fun when defining a binding, which will register 2 copies of the same binding.
Different implementation options are outlined and discussed in the design document.
{}Description{}:
Currently we implement a number of functions from packages like
lubridate
which work well when called without namespacing (e.g.{}year(){
}), however if someone callslubridate::year()
we get a not-implemented method (e.g.{}Warning: Expression lubridate::year(time_hour) not supported in Arrow{
}). Is it possible for us to look and see if we have an arrow function that matches the function itself.{code:r}
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
ds <- InMemoryDataset$create(nycflights13::flights)
ds %>%
mutate(year = lubridate::year(time_hour)) %>%
collect()
#> Warning: Expression lubridate::year(time_hour) not supported in Arrow; pulling
#> data into R
#> # A tibble: 336,776 × 19
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>
#> 1 2013 1 1 517 515 2 830 819
#> 2 2013 1 1 533 529 4 850 830
#> 3 2013 1 1 542 540 2 923 850
#> 4 2013 1 1 544 545 -1 1004 1022
#> 5 2013 1 1 554 600 -6 812 837
#> 6 2013 1 1 554 558 -4 740 728
#> 7 2013 1 1 555 600 -5 913 854
#> 8 2013 1 1 557 600 -3 709 723
#> 9 2013 1 1 557 600 -3 838 846
#> 10 2013 1 1 558 600 -2 753 745
#> # … with 336,766 more rows, and 11 more variables: arr_delay ,
#> # carrier , flight , tailnum , origin , dest ,
#> # air_time , distance , hour , minute , time_hour
ds %>%
mutate(year = year(time_hour)) %>%
collect()
#> # A tibble: 336,776 × 19
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>
#> 1 2013 1 1 517 515 2 830 819
#> 2 2013 1 1 533 529 4 850 830
#> 3 2013 1 1 542 540 2 923 850
#> 4 2013 1 1 544 545 -1 1004 1022
#> 5 2013 1 1 554 600 -6 812 837
#> 6 2013 1 1 554 558 -4 740 728
#> 7 2013 1 1 555 600 -5 913 854
#> 8 2013 1 1 557 600 -3 709 723
#> 9 2013 1 1 557 600 -3 838 846
#> 10 2013 1 1 558 600 -2 753 745
#> # … with 336,766 more rows, and 11 more variables: arr_delay ,
#> # carrier , flight , tailnum , origin , dest ,
#> # air_time , distance , hour , minute , time_hour
{code}
Reporter: Jonathan Keane / @jonkeane
Assignee: Dragoș Moldovan-Grünfeld / @dragosmg
Subtasks:
pkg::fun
namense_funcs
requiring indirect mappingnse_funcs
requiring direct mapping (unary & binary)Related issues:
n
function cannot be called withdplyr::n()
(is duplicated by)PRs and other links:
Note: This issue was originally created as ARROW-14575. Please see the migration documentation for further details.
The text was updated successfully, but these errors were encountered: