Skip to content

Commit

Permalink
Handle assert_that and match.arg; implement alternative suggestions; …
Browse files Browse the repository at this point in the history
…add docs
  • Loading branch information
nealrichardson committed May 11, 2024
1 parent c0bdbba commit 178648a
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 79 deletions.
20 changes: 11 additions & 9 deletions r/R/dplyr-datetime-helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) {
if (tolower(Sys.info()[["sysname"]]) == "windows" && locale != "C") {
# MingW C++ std::locale only supports "C" and "POSIX"
arrow_not_supported(.actual_msg = paste0(
"On Windows, time locales other than 'C' are not supported in Arrow. ",
"Consider setting `Sys.setlocale('LC_TIME', 'C')`"
))
arrow_not_supported(
"On Windows, time locales other than 'C'",
body = c(">" = "Consider setting `Sys.setlocale('LC_TIME', 'C')`")
)
}
locale
}
Expand Down Expand Up @@ -57,12 +57,14 @@ duration_from_chunks <- function(chunks) {

if (any(is.na(matched_chunks))) {
arrow_not_supported(
.actual_msg = paste0(
"named `difftime` units other than: ",
oxford_paste(accepted_chunks, quote_symbol = "`"),
" not supported in Arrow. \nInvalid `difftime` parts: ",
paste(
"named `difftime` units other than:",
oxford_paste(accepted_chunks, quote_symbol = "`")
),
body = c(i = paste(
"Invalid `difftime` parts:",
oxford_paste(names(chunks[is.na(matched_chunks)]), quote_symbol = "`")
)
))
)
}

Expand Down
95 changes: 73 additions & 22 deletions r/R/dplyr-eval.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,26 +38,36 @@ arrow_eval <- function(expr, mask) {
if (arrow_debug) print(msg)

# A few cases:
# 1. Invalid input. Retry with dplyr won't help
if (inherits(e, "validation_error")) {
validation_error(msg, call = expr)
# 1. Evaluation raised one of our error classes. Add the expr as the call
# and re-raise it.
if (inherits(e, c("validation_error", "arrow_not_supported"))) {
e$call <- expr
stop(e)
}

# 2. Not supported in Arrow. Retry with dplyr may help
if (inherits(e, "arrow_not_supported") || grepl("NotImplemented", msg)) {
arrow_not_supported(.actual_msg = msg, call = expr)
# 2. Error is from assert_that: raise as validation_error
if (inherits(e, "assertError")) {
validation_error(msg, call = expr)
}

# 3. Check to see if this is a standard R error message (not found etc.).
# Retry with dplyr won't help
# Retry with dplyr won't help.
if (grepl(get_standard_error_messages(), msg)) {
# Raise the original error: it's actually helpful here
validation_error(msg, call = expr)
}

# 4. Otherwise, we're not sure why this errored: it's not an error we raised
# explicitly. We'll assume it's because the function it calls isn't
# supported in arrow, and retry with dplyr may help.
# 4. Check for NotImplemented error raised from Arrow C++ code.
# Not sure where exactly we may raise this, but if we see it, it means
# that something isn't supported in Arrow. Retry in dplyr may help?
if (grepl("NotImplemented", msg)) {
arrow_not_supported(.actual_msg = msg, call = expr)
}


# 5. Otherwise, we're not sure why this errored: it's not an error we raised
# explicitly. We'll assume it's because the function it calls isn't
# supported in arrow, and retry with dplyr may help.
if (arrow_debug) {
arrow_not_supported(.actual_msg = msg, call = expr)
} else {
Expand Down Expand Up @@ -116,6 +126,9 @@ get_standard_error_messages <- function() {
patterns <- .cache$i18ized_error_pattern
if (is.null(patterns)) {
patterns <- i18ize_error_messages()
# Add to the patterns something for match.arg() errors
# (the function name won't be internationalized, so we can't just look for it)
patterns <- paste0(patterns, "|match\\.arg")
# Memoize it
.cache$i18ized_error_pattern <- patterns
}
Expand All @@ -132,27 +145,61 @@ i18ize_error_messages <- function() {
paste(map(out, ~ sub("X_____X", ".*", .)), collapse = "|")
}

# Helpers to raise classed errors
#' Helpers to raise classed errors
#'
#' `arrow_not_supported()` and `validation_error()` raise classed errors that
#' allow us to distinguish between things that are not supported in Arrow and
#' things that are just invalid input. Additional wrapping in `arrow_eval()`
#' and `try_arrow_dplyr()` provide more context and suggestions.
#' Importantly, if `arrow_not_supported` is raised, then retrying the same code
#' in regular dplyr in R may work. But if `validation_error` is raised, then we
#' shouldn't recommend retrying with regular dplyr because it will fail there
#' too.
#'
#' Use these in function bindings and in the dplyr methods. Inside of function
#' bindings, you don't need to provide the `call` argument, as it will be
#' automatically filled in with the expression that caused the error in
#' `arrow_eval()`. In dplyr methods, you should provide the `call` argument;
#' `rlang::caller_call()` often is correct, but you may need to experiment to
#' find how far up the call stack you need to look.
#'
#' You may provide additional information in the `body` argument, a named
#' character vector. Use `i` for additional information about the error and `>`
#' to indicate potential solutions or workarounds that don't require pulling the
#' data into R. If you have an `arrow_not_supported()` error with a `>`
#' suggestion, when the error is ultimately raised by `try_error_dplyr()`,
#' `Call collect() first to pull data into R` won't be the only suggestion.
#'
#' You can still use `match.arg()` and `assert_that()` for simple input
#' validation inside of the function bindings. `arrow_eval()` will catch their
#' errors and re-raise them as `validation_error`.
#'
#' @param msg The message to show. `arrow_not_supported()` will append
#' "not supported in Arrow" to this message.
#' @param .actual_msg If you don't want to append "not supported in Arrow" to
#' the message, you can provide the full message here.
#' @param ... Additional arguments to pass to `rlang::abort()`. Useful arguments
#' include `call` to provide the call or expression that caused the error, and
#' `body` to provide additional context about the error.
#' @keywords internal
arrow_not_supported <- function(msg,
.actual_msg = paste(msg, "not supported in Arrow"),
...) {
abort(.actual_msg, class = "arrow_not_supported", use_cli_format = TRUE, ...)
}

#' @rdname arrow_not_supported
validation_error <- function(msg, ...) {
abort(msg, class = "validation_error", use_cli_format = TRUE, ...)
}

# Wrap the contents of an arrow dplyr verb function in a tryCatch block to
# handle arrow_not_supported errors:
# * If it errors because of arrow_not_supported, abandon ship
# * If it's another error, just stop
# * If it's another error, just stop, retry with regular dplyr won't help
try_arrow_dplyr <- function(expr) {
parent <- caller_env()
tryCatch(eval(expr, parent), error = function(e) {
# Instead of checking for arrow_not_supported, we could check !validation_error.
# Difference is in how non-classed (regular) errors are handled.
# This way, regular errors just stop. If we want them to abandon_ship, change it.
if (inherits(e, "arrow_not_supported")) {
abandon_ship(e, parent)
} else {
Expand All @@ -170,17 +217,21 @@ abandon_ship <- function(err, env) {
call <- get("call", envir = env)

if (query_on_dataset(.data)) {
err$body <- c(
err$body,
# TODO: if there are things in body, they should be recommendations
# for solving this within arrow. So the message to collect() should start
# with "Or, ..."
">" = "Call collect() first to pull data into R."
# Add a note suggesting `collect()` to the error message.
# If there are other suggestions already there (with the > arrow name),
# collect() isn't the only suggestion, so message differently
msg <- ifelse(
">" %in% names(err$body),
"Or, call collect() first to pull data into R.",
"Call collect() first to pull data into R."
)
err$body <- c(err$body, ">" = msg)
stop(err)
}

# else, warn, collect(), and run in regular dplyr
# Else, warn, collect(), and run in regular dplyr
# Note that this drops any suggestions for fixing in arrow
# (since that ship has sailed by the time you see the warning)
rlang::warn(
message = paste0("In ", format_expr(err$call), ": "),
body = c("i" = conditionMessage(err), ">" = "Pulling data into R")
Expand Down
6 changes: 3 additions & 3 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -326,9 +326,9 @@ register_bindings_datetime_conversion <- function() {
tz = "UTC") {
if (is.null(format) && length(tryFormats) > 1) {
arrow_not_supported(
.actual_msg = paste(
"`as.Date()` with multiple `tryFormats` is not supported in Arrow.",
"Consider using the lubridate specialised parsing functions `ymd()`, `ymd()`, etc."
"`as.Date()` with multiple `tryFormats`",
body = c(
">" = "Consider using the lubridate specialised parsing functions `ymd()`, `ymd()`, etc."
)
)
}
Expand Down
72 changes: 32 additions & 40 deletions r/R/dplyr-funcs-string.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,8 @@ format_string_replacement <- function(replacement, ignore.case, fixed) {
stop_if_locale_provided <- function(locale) {
if (!identical(locale, "en")) {
arrow_not_supported(
.actual_msg = paste(
"Providing a value for 'locale' other than the default ('en') is not",
"supported in Arrow. To change locale, use 'Sys.setlocale()'"
)
"Providing a value for 'locale' other than the default ('en')",
body = c(">" = "To change locale, use 'Sys.setlocale()'")
)
}
}
Expand All @@ -160,10 +158,11 @@ register_bindings_string_join <- function() {
# handle scalar literal args, and cast all args to string for
# consistency with base::paste(), base::paste0(), and stringr::str_c()
if (!inherits(arg, "Expression")) {
assert_that(
length(arg) == 1,
msg = "Literal vectors of length != 1 not supported in string concatenation"
)
if (length(arg) != 1) {
arrow_not_supported(
"Literal vectors of length != 1 in string concatenation"
)
}
Expression$scalar(as.character(arg))
} else {
call_binding("as.character", arg)
Expand All @@ -183,12 +182,11 @@ register_bindings_string_join <- function() {
register_binding(
"base::paste",
function(..., sep = " ", collapse = NULL, recycle0 = FALSE) {
assert_that(
is.null(collapse),
msg = "paste() with the collapse argument is not yet supported in Arrow"
)
if (!inherits(sep, "Expression")) {
assert_that(!is.na(sep), msg = "Invalid separator")
if (!is.null(collapse)) {
arrow_not_supported("`collapse` argument")
}
if (!inherits(sep, "Expression") && is.na(sep)) {
validation_error("Invalid separator")
}
arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep)
},
Expand All @@ -198,10 +196,9 @@ register_bindings_string_join <- function() {
register_binding(
"base::paste0",
function(..., collapse = NULL, recycle0 = FALSE) {
assert_that(
is.null(collapse),
msg = "paste0() with the collapse argument is not yet supported in Arrow"
)
if (!is.null(collapse)) {
arrow_not_supported("`collapse` argument")
}
arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "")
},
notes = "the `collapse` argument is not yet supported"
Expand All @@ -210,12 +207,11 @@ register_bindings_string_join <- function() {
register_binding(
"stringr::str_c",
function(..., sep = "", collapse = NULL) {
assert_that(
is.null(collapse),
msg = "str_c() with the collapse argument is not yet supported in Arrow"
)
if (!inherits(sep, "Expression")) {
assert_that(!is.na(sep), msg = "`sep` must be a single string, not `NA`.")
if (!is.null(collapse)) {
arrow_not_supported("`collapse` argument")
}
if (!inherits(sep, "Expression") && is.na(sep)) {
validation_error("`sep` must be a single string, not `NA`.")
}
arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep)
},
Expand Down Expand Up @@ -514,14 +510,12 @@ register_bindings_string_other <- function() {
register_binding(
"base::substr",
function(x, start, stop) {
assert_that(
length(start) == 1,
msg = "`start` must be length 1 - other lengths are not supported in Arrow"
)
assert_that(
length(stop) == 1,
msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
)
if (length(start) != 1) {
arrow_not_supported("`start` must be length 1 - other lengths")
}
if (length(stop) != 1) {
arrow_not_supported("`stop` must be length 1 - other lengths")
}

# substr treats values as if they're on a continuous number line, so values
# 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
Expand Down Expand Up @@ -563,14 +557,12 @@ register_bindings_string_other <- function() {
})

register_binding("stringr::str_sub", function(string, start = 1L, end = -1L) {
assert_that(
length(start) == 1,
msg = "`start` must be length 1 - other lengths are not supported in Arrow"
)
assert_that(
length(end) == 1,
msg = "`end` must be length 1 - other lengths are not supported in Arrow"
)
if (length(start) != 1) {
arrow_not_supported("`start` must be length 1 - other lengths")
}
if (length(end) != 1) {
arrow_not_supported("`end` must be length 1 - other lengths")
}

# In stringr::str_sub, an `end` value of -1 means the end of the string, so
# set it to the maximum integer to match this behavior
Expand Down
5 changes: 4 additions & 1 deletion r/R/dplyr-funcs-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,10 @@ register_bindings_type_inspect <- function() {
call_binding("is.character", x)
})
register_binding("rlang::is_double", function(x, n = NULL, finite = NULL) {
assert_that(is.null(n) && is.null(finite))
assert_that(is.null(n))
if (!is.null(finite)) {
arrow_not_supported("`finite` argument")
}
call_binding("is.double", x)
})
register_binding("rlang::is_integer", function(x, n = NULL) {
Expand Down
5 changes: 1 addition & 4 deletions r/R/dplyr-mutate.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,7 @@ mutate.arrow_dplyr_query <- function(.data,
!is.null(results[[new_var]])) {
# We need some wrapping to handle literal values
if (length(results[[new_var]]) != 1) {
arrow_not_supported(.actual_msg = paste0(
"In ", new_var, " = ", format_expr(exprs[[i]]),
", only values of size one are recycled"
))
arrow_not_supported("Recycling values of length != 1", call = exprs[[i]])
}
results[[new_var]] <- Expression$scalar(results[[new_var]])
}
Expand Down
Loading

0 comments on commit 178648a

Please sign in to comment.