diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R index 2604d86261cad..8e6a7f6185366 100644 --- a/r/R/dplyr-datetime-helpers.R +++ b/r/R/dplyr-datetime-helpers.R @@ -18,10 +18,10 @@ check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) { if (tolower(Sys.info()[["sysname"]]) == "windows" && locale != "C") { # MingW C++ std::locale only supports "C" and "POSIX" - arrow_not_supported(.actual_msg = paste0( - "On Windows, time locales other than 'C' are not supported in Arrow. ", - "Consider setting `Sys.setlocale('LC_TIME', 'C')`" - )) + arrow_not_supported( + "On Windows, time locales other than 'C'", + body = c(">" = "Consider setting `Sys.setlocale('LC_TIME', 'C')`") + ) } locale } @@ -57,12 +57,14 @@ duration_from_chunks <- function(chunks) { if (any(is.na(matched_chunks))) { arrow_not_supported( - .actual_msg = paste0( - "named `difftime` units other than: ", - oxford_paste(accepted_chunks, quote_symbol = "`"), - " not supported in Arrow. \nInvalid `difftime` parts: ", + paste( + "named `difftime` units other than:", + oxford_paste(accepted_chunks, quote_symbol = "`") + ), + body = c(i = paste( + "Invalid `difftime` parts:", oxford_paste(names(chunks[is.na(matched_chunks)]), quote_symbol = "`") - ) + )) ) } diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index d31166518ed39..6ef2b2d5f2193 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -38,26 +38,36 @@ arrow_eval <- function(expr, mask) { if (arrow_debug) print(msg) # A few cases: - # 1. Invalid input. Retry with dplyr won't help - if (inherits(e, "validation_error")) { - validation_error(msg, call = expr) + # 1. Evaluation raised one of our error classes. Add the expr as the call + # and re-raise it. + if (inherits(e, c("validation_error", "arrow_not_supported"))) { + e$call <- expr + stop(e) } - # 2. Not supported in Arrow. Retry with dplyr may help - if (inherits(e, "arrow_not_supported") || grepl("NotImplemented", msg)) { - arrow_not_supported(.actual_msg = msg, call = expr) + # 2. Error is from assert_that: raise as validation_error + if (inherits(e, "assertError")) { + validation_error(msg, call = expr) } # 3. Check to see if this is a standard R error message (not found etc.). - # Retry with dplyr won't help + # Retry with dplyr won't help. if (grepl(get_standard_error_messages(), msg)) { # Raise the original error: it's actually helpful here validation_error(msg, call = expr) } - # 4. Otherwise, we're not sure why this errored: it's not an error we raised - # explicitly. We'll assume it's because the function it calls isn't - # supported in arrow, and retry with dplyr may help. + # 4. Check for NotImplemented error raised from Arrow C++ code. + # Not sure where exactly we may raise this, but if we see it, it means + # that something isn't supported in Arrow. Retry in dplyr may help? + if (grepl("NotImplemented", msg)) { + arrow_not_supported(.actual_msg = msg, call = expr) + } + + + # 5. Otherwise, we're not sure why this errored: it's not an error we raised + # explicitly. We'll assume it's because the function it calls isn't + # supported in arrow, and retry with dplyr may help. if (arrow_debug) { arrow_not_supported(.actual_msg = msg, call = expr) } else { @@ -116,6 +126,9 @@ get_standard_error_messages <- function() { patterns <- .cache$i18ized_error_pattern if (is.null(patterns)) { patterns <- i18ize_error_messages() + # Add to the patterns something for match.arg() errors + # (the function name won't be internationalized, so we can't just look for it) + patterns <- paste0(patterns, "|match\\.arg") # Memoize it .cache$i18ized_error_pattern <- patterns } @@ -132,13 +145,50 @@ i18ize_error_messages <- function() { paste(map(out, ~ sub("X_____X", ".*", .)), collapse = "|") } -# Helpers to raise classed errors +#' Helpers to raise classed errors +#' +#' `arrow_not_supported()` and `validation_error()` raise classed errors that +#' allow us to distinguish between things that are not supported in Arrow and +#' things that are just invalid input. Additional wrapping in `arrow_eval()` +#' and `try_arrow_dplyr()` provide more context and suggestions. +#' Importantly, if `arrow_not_supported` is raised, then retrying the same code +#' in regular dplyr in R may work. But if `validation_error` is raised, then we +#' shouldn't recommend retrying with regular dplyr because it will fail there +#' too. +#' +#' Use these in function bindings and in the dplyr methods. Inside of function +#' bindings, you don't need to provide the `call` argument, as it will be +#' automatically filled in with the expression that caused the error in +#' `arrow_eval()`. In dplyr methods, you should provide the `call` argument; +#' `rlang::caller_call()` often is correct, but you may need to experiment to +#' find how far up the call stack you need to look. +#' +#' You may provide additional information in the `body` argument, a named +#' character vector. Use `i` for additional information about the error and `>` +#' to indicate potential solutions or workarounds that don't require pulling the +#' data into R. If you have an `arrow_not_supported()` error with a `>` +#' suggestion, when the error is ultimately raised by `try_error_dplyr()`, +#' `Call collect() first to pull data into R` won't be the only suggestion. +#' +#' You can still use `match.arg()` and `assert_that()` for simple input +#' validation inside of the function bindings. `arrow_eval()` will catch their +#' errors and re-raise them as `validation_error`. +#' +#' @param msg The message to show. `arrow_not_supported()` will append +#' "not supported in Arrow" to this message. +#' @param .actual_msg If you don't want to append "not supported in Arrow" to +#' the message, you can provide the full message here. +#' @param ... Additional arguments to pass to `rlang::abort()`. Useful arguments +#' include `call` to provide the call or expression that caused the error, and +#' `body` to provide additional context about the error. +#' @keywords internal arrow_not_supported <- function(msg, .actual_msg = paste(msg, "not supported in Arrow"), ...) { abort(.actual_msg, class = "arrow_not_supported", use_cli_format = TRUE, ...) } +#' @rdname arrow_not_supported validation_error <- function(msg, ...) { abort(msg, class = "validation_error", use_cli_format = TRUE, ...) } @@ -146,13 +196,10 @@ validation_error <- function(msg, ...) { # Wrap the contents of an arrow dplyr verb function in a tryCatch block to # handle arrow_not_supported errors: # * If it errors because of arrow_not_supported, abandon ship -# * If it's another error, just stop +# * If it's another error, just stop, retry with regular dplyr won't help try_arrow_dplyr <- function(expr) { parent <- caller_env() tryCatch(eval(expr, parent), error = function(e) { - # Instead of checking for arrow_not_supported, we could check !validation_error. - # Difference is in how non-classed (regular) errors are handled. - # This way, regular errors just stop. If we want them to abandon_ship, change it. if (inherits(e, "arrow_not_supported")) { abandon_ship(e, parent) } else { @@ -170,17 +217,21 @@ abandon_ship <- function(err, env) { call <- get("call", envir = env) if (query_on_dataset(.data)) { - err$body <- c( - err$body, - # TODO: if there are things in body, they should be recommendations - # for solving this within arrow. So the message to collect() should start - # with "Or, ..." - ">" = "Call collect() first to pull data into R." + # Add a note suggesting `collect()` to the error message. + # If there are other suggestions already there (with the > arrow name), + # collect() isn't the only suggestion, so message differently + msg <- ifelse( + ">" %in% names(err$body), + "Or, call collect() first to pull data into R.", + "Call collect() first to pull data into R." ) + err$body <- c(err$body, ">" = msg) stop(err) } - # else, warn, collect(), and run in regular dplyr + # Else, warn, collect(), and run in regular dplyr + # Note that this drops any suggestions for fixing in arrow + # (since that ship has sailed by the time you see the warning) rlang::warn( message = paste0("In ", format_expr(err$call), ": "), body = c("i" = conditionMessage(err), ">" = "Pulling data into R") diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 3bd77992aed8a..5e6ac4a1035f8 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -326,9 +326,9 @@ register_bindings_datetime_conversion <- function() { tz = "UTC") { if (is.null(format) && length(tryFormats) > 1) { arrow_not_supported( - .actual_msg = paste( - "`as.Date()` with multiple `tryFormats` is not supported in Arrow.", - "Consider using the lubridate specialised parsing functions `ymd()`, `ymd()`, etc." + "`as.Date()` with multiple `tryFormats`", + body = c( + ">" = "Consider using the lubridate specialised parsing functions `ymd()`, `ymd()`, etc." ) ) } diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 412a2a6a3dfd5..77e1a5405a692 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -135,10 +135,8 @@ format_string_replacement <- function(replacement, ignore.case, fixed) { stop_if_locale_provided <- function(locale) { if (!identical(locale, "en")) { arrow_not_supported( - .actual_msg = paste( - "Providing a value for 'locale' other than the default ('en') is not", - "supported in Arrow. To change locale, use 'Sys.setlocale()'" - ) + "Providing a value for 'locale' other than the default ('en')", + body = c(">" = "To change locale, use 'Sys.setlocale()'") ) } } @@ -160,10 +158,11 @@ register_bindings_string_join <- function() { # handle scalar literal args, and cast all args to string for # consistency with base::paste(), base::paste0(), and stringr::str_c() if (!inherits(arg, "Expression")) { - assert_that( - length(arg) == 1, - msg = "Literal vectors of length != 1 not supported in string concatenation" - ) + if (length(arg) != 1) { + arrow_not_supported( + "Literal vectors of length != 1 in string concatenation" + ) + } Expression$scalar(as.character(arg)) } else { call_binding("as.character", arg) @@ -183,12 +182,11 @@ register_bindings_string_join <- function() { register_binding( "base::paste", function(..., sep = " ", collapse = NULL, recycle0 = FALSE) { - assert_that( - is.null(collapse), - msg = "paste() with the collapse argument is not yet supported in Arrow" - ) - if (!inherits(sep, "Expression")) { - assert_that(!is.na(sep), msg = "Invalid separator") + if (!is.null(collapse)) { + arrow_not_supported("`collapse` argument") + } + if (!inherits(sep, "Expression") && is.na(sep)) { + validation_error("Invalid separator") } arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep) }, @@ -198,10 +196,9 @@ register_bindings_string_join <- function() { register_binding( "base::paste0", function(..., collapse = NULL, recycle0 = FALSE) { - assert_that( - is.null(collapse), - msg = "paste0() with the collapse argument is not yet supported in Arrow" - ) + if (!is.null(collapse)) { + arrow_not_supported("`collapse` argument") + } arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "") }, notes = "the `collapse` argument is not yet supported" @@ -210,12 +207,11 @@ register_bindings_string_join <- function() { register_binding( "stringr::str_c", function(..., sep = "", collapse = NULL) { - assert_that( - is.null(collapse), - msg = "str_c() with the collapse argument is not yet supported in Arrow" - ) - if (!inherits(sep, "Expression")) { - assert_that(!is.na(sep), msg = "`sep` must be a single string, not `NA`.") + if (!is.null(collapse)) { + arrow_not_supported("`collapse` argument") + } + if (!inherits(sep, "Expression") && is.na(sep)) { + validation_error("`sep` must be a single string, not `NA`.") } arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep) }, @@ -514,14 +510,12 @@ register_bindings_string_other <- function() { register_binding( "base::substr", function(x, start, stop) { - assert_that( - length(start) == 1, - msg = "`start` must be length 1 - other lengths are not supported in Arrow" - ) - assert_that( - length(stop) == 1, - msg = "`stop` must be length 1 - other lengths are not supported in Arrow" - ) + if (length(start) != 1) { + arrow_not_supported("`start` must be length 1 - other lengths") + } + if (length(stop) != 1) { + arrow_not_supported("`stop` must be length 1 - other lengths") + } # substr treats values as if they're on a continuous number line, so values # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics @@ -563,14 +557,12 @@ register_bindings_string_other <- function() { }) register_binding("stringr::str_sub", function(string, start = 1L, end = -1L) { - assert_that( - length(start) == 1, - msg = "`start` must be length 1 - other lengths are not supported in Arrow" - ) - assert_that( - length(end) == 1, - msg = "`end` must be length 1 - other lengths are not supported in Arrow" - ) + if (length(start) != 1) { + arrow_not_supported("`start` must be length 1 - other lengths") + } + if (length(end) != 1) { + arrow_not_supported("`end` must be length 1 - other lengths") + } # In stringr::str_sub, an `end` value of -1 means the end of the string, so # set it to the maximum integer to match this behavior diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 9f73d7acf39ea..85c26ec05c8ba 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -219,7 +219,10 @@ register_bindings_type_inspect <- function() { call_binding("is.character", x) }) register_binding("rlang::is_double", function(x, n = NULL, finite = NULL) { - assert_that(is.null(n) && is.null(finite)) + assert_that(is.null(n)) + if (!is.null(finite)) { + arrow_not_supported("`finite` argument") + } call_binding("is.double", x) }) register_binding("rlang::is_integer", function(x, n = NULL) { diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 23aded0ee9245..e5b7ae71ee4f8 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -61,10 +61,7 @@ mutate.arrow_dplyr_query <- function(.data, !is.null(results[[new_var]])) { # We need some wrapping to handle literal values if (length(results[[new_var]]) != 1) { - arrow_not_supported(.actual_msg = paste0( - "In ", new_var, " = ", format_expr(exprs[[i]]), - ", only values of size one are recycled" - )) + arrow_not_supported("Recycling values of length != 1", call = exprs[[i]]) } results[[new_var]] <- Expression$scalar(results[[new_var]]) } diff --git a/r/man/arrow_not_supported.Rd b/r/man/arrow_not_supported.Rd new file mode 100644 index 0000000000000..be6a001fa1fa4 --- /dev/null +++ b/r/man/arrow_not_supported.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-eval.R +\name{arrow_not_supported} +\alias{arrow_not_supported} +\alias{validation_error} +\title{Helpers to raise classed errors} +\usage{ +arrow_not_supported( + msg, + .actual_msg = paste(msg, "not supported in Arrow"), + ... +) + +validation_error(msg, ...) +} +\arguments{ +\item{msg}{The message to show. \code{arrow_not_supported()} will append +"not supported in Arrow" to this message.} + +\item{.actual_msg}{If you don't want to append "not supported in Arrow" to +the message, you can provide the full message here.} + +\item{...}{Additional arguments to pass to \code{rlang::abort()}. Useful arguments +include \code{call} to provide the call or expression that caused the error, and +\code{body} to provide additional context about the error.} +} +\description{ +\code{arrow_not_supported()} and \code{validation_error()} raise classed errors that +allow us to distinguish between things that are not supported in Arrow and +things that are just invalid input. Additional wrapping in \code{arrow_eval()} +and \code{try_arrow_dplyr()} provide more context and suggestions. +Importantly, if \code{arrow_not_supported} is raised, then retrying the same code +in regular dplyr in R may work. But if \code{validation_error} is raised, then we +shouldn't recommend retrying with regular dplyr because it will fail there +too. +} +\details{ +Use these in function bindings and in the dplyr methods. Inside of function +bindings, you don't need to provide the \code{call} argument, as it will be +automatically filled in with the expression that caused the error in +\code{arrow_eval()}. In dplyr methods, you should provide the \code{call} argument; +\code{rlang::caller_call()} often is correct, but you may need to experiment to +find how far up the call stack you need to look. + +You may provide additional information in the \code{body} argument, a named +character vector. Use \code{i} for additional information about the error and \code{>} +to indicate potential solutions or workarounds that don't require pulling the +data into R. If you have an \code{arrow_not_supported()} error with a \code{>} +suggestion, when the error is ultimately raised by \code{try_error_dplyr()}, +\verb{Call collect() first to pull data into R} won't be the only suggestion. + +You can still use \code{match.arg()} and \code{assert_that()} for simple input +validation inside of the function bindings. \code{arrow_eval()} will catch their +errors and re-raise them as \code{validation_error}. +} +\keyword{internal}