Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41834: [R] Better error handling in dplyr code #41576

Merged
merged 12 commits into from
May 29, 2024
Merged
6 changes: 5 additions & 1 deletion r/R/dplyr-across.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ expand_across <- function(.data, quos_in, exclude_cols = NULL) {
)

if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
abort("`...` argument to `across()` is deprecated in dplyr and not supported in Arrow")
arrow_not_supported(
"`...` argument to `across()` is deprecated in dplyr and",
body = c(">" = "Convert your call into a function or formula including the arguments"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIL about this making arrows!

call = rlang::caller_call()
)
}

if (!is.null(across_call[[".cols"]])) {
Expand Down
87 changes: 44 additions & 43 deletions r/R/dplyr-arrange.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,47 +19,46 @@
# The following S3 methods are registered on load if dplyr is present

arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) {
call <- match.call()
.data <- as_adq(.data)
exprs <- expand_across(.data, quos(...))
try_arrow_dplyr({
.data <- as_adq(.data)
exprs <- expand_across(.data, quos(...))

if (.by_group) {
# when the data is grouped and .by_group is TRUE, order the result by
# the grouping columns first
exprs <- c(quos(!!!dplyr::groups(.data)), exprs)
}
if (length(exprs) == 0) {
# Nothing to do
return(.data)
}
.data <- as_adq(.data)
# find and remove any dplyr::desc() and tidy-eval
# the arrange expressions inside an Arrow data_mask
sorts <- vector("list", length(exprs))
descs <- logical(0)
mask <- arrow_mask(.data)
for (i in seq_along(exprs)) {
x <- find_and_remove_desc(exprs[[i]])
exprs[[i]] <- x[["quos"]]
sorts[[i]] <- arrow_eval(exprs[[i]], mask)
names(sorts)[i] <- format_expr(exprs[[i]])
if (inherits(sorts[[i]], "try-error")) {
msg <- paste("Expression", names(sorts)[i], "not supported in Arrow")
return(abandon_ship(call, .data, msg))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's an example of "not just an indentation change": in the new code, we don't have to evaluate, catch the error, and re-raise in abandon_ship, we just let arrow_eval() raise, and try_arrow_dplyr() handles the abandon_ship.

if (.by_group) {
# when the data is grouped and .by_group is TRUE, order the result by
# the grouping columns first
exprs <- c(quos(!!!dplyr::groups(.data)), exprs)
}
if (length(mask$.aggregations)) {
# dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it.
# But we could, the same way it works in mutate() via join, if someone asks.
# Until then, just error.
# TODO: add a test for this
msg <- paste("Expression", format_expr(expr), "not supported in arrange() in Arrow")
return(abandon_ship(call, .data, msg))
if (length(exprs) == 0) {
# Nothing to do
return(.data)
}
descs[i] <- x[["desc"]]
}
.data$arrange_vars <- c(sorts, .data$arrange_vars)
.data$arrange_desc <- c(descs, .data$arrange_desc)
.data
.data <- as_adq(.data)
# find and remove any dplyr::desc() and tidy-eval
# the arrange expressions inside an Arrow data_mask
sorts <- vector("list", length(exprs))
descs <- logical(0)
mask <- arrow_mask(.data)
for (i in seq_along(exprs)) {
x <- find_and_remove_desc(exprs[[i]])
exprs[[i]] <- x[["quos"]]
sorts[[i]] <- arrow_eval(exprs[[i]], mask)
names(sorts)[i] <- format_expr(exprs[[i]])
if (length(mask$.aggregations)) {
# dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it.
# But we could, the same way it works in mutate() via join, if someone asks.
# Until then, just error.
# TODO: add a test for this
arrow_not_supported(
.actual_msg = "Expression not supported in arrange() in Arrow",
call = expr
)
}
descs[i] <- x[["desc"]]
}
.data$arrange_vars <- c(sorts, .data$arrange_vars)
.data$arrange_desc <- c(descs, .data$arrange_desc)
.data
})
}
arrange.Dataset <- arrange.ArrowTabular <- arrange.RecordBatchReader <- arrange.arrow_dplyr_query

Expand All @@ -73,10 +72,9 @@ find_and_remove_desc <- function(quosure) {
expr <- quo_get_expr(quosure)
descending <- FALSE
if (length(all.vars(expr)) < 1L) {
stop(
"Expression in arrange() does not contain any field names: ",
deparse(expr),
call. = FALSE
validation_error(
"Expression in arrange() does not contain any field names",
call = quosure
)
}
# Use a while loop to remove any number of nested pairs of enclosing
Expand All @@ -90,7 +88,10 @@ find_and_remove_desc <- function(quosure) {
# ensure desc() has only one argument (when an R expression is a function
# call, length == 2 means it has exactly one argument)
if (length(expr) > 2) {
stop("desc() expects only one argument", call. = FALSE)
validation_error(
"desc() expects only one argument",
call = expr
)
}
# remove desc() and toggle descending
expr <- expr[[2]]
Expand Down
31 changes: 16 additions & 15 deletions r/R/dplyr-datetime-helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) {
if (tolower(Sys.info()[["sysname"]]) == "windows" && locale != "C") {
# MingW C++ std::locale only supports "C" and "POSIX"
stop(paste0(
"On Windows, time locales other than 'C' are not supported in Arrow. ",
"Consider setting `Sys.setlocale('LC_TIME', 'C')`"
))
arrow_not_supported(
"On Windows, time locales other than 'C'",
body = c(">" = "Consider setting `Sys.setlocale('LC_TIME', 'C')`")
)
}
locale
}
Expand Down Expand Up @@ -56,13 +56,15 @@ duration_from_chunks <- function(chunks) {
matched_chunks <- accepted_chunks[pmatch(names(chunks), accepted_chunks, duplicates.ok = TRUE)]

if (any(is.na(matched_chunks))) {
abort(
paste0(
"named `difftime` units other than: ",
oxford_paste(accepted_chunks, quote_symbol = "`"),
" not supported in Arrow. \nInvalid `difftime` parts: ",
arrow_not_supported(
paste(
"named `difftime` units other than:",
oxford_paste(accepted_chunks, quote_symbol = "`")
),
body = c(i = paste(
"Invalid `difftime` parts:",
oxford_paste(names(chunks[is.na(matched_chunks)]), quote_symbol = "`")
)
))
)
}

Expand Down Expand Up @@ -114,7 +116,6 @@ binding_as_date_character <- function(x,
}

binding_as_date_numeric <- function(x, origin = "1970-01-01") {

# Arrow does not support direct casting from double to date32(), but for
# integer-like values we can go via int32()
# TODO: revisit after ARROW-15798
Expand Down Expand Up @@ -442,7 +443,7 @@ parse_period_unit <- function(x) {
unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L

if (any(is.na(unit))) {
abort(
validation_error(
sprintf(
"Invalid period name: '%s'",
str_unit,
Expand Down Expand Up @@ -484,13 +485,13 @@ parse_period_unit <- function(x) {
# more special cases: lubridate imposes sensible maximum
# values on the number of seconds, minutes and hours
if (unit == 3L && multiple > 60) {
abort("Rounding with second > 60 is not supported")
validation_error("Rounding with second > 60 is not supported")
}
if (unit == 4L && multiple > 60) {
abort("Rounding with minute > 60 is not supported")
validation_error("Rounding with minute > 60 is not supported")
}
if (unit == 5L && multiple > 24) {
abort("Rounding with hour > 24 is not supported")
validation_error("Rounding with hour > 24 is not supported")
}

list(unit = unit, multiple = multiple)
Expand Down
Loading
Loading