diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 477fa67e7c6..e1c391c4917 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -38,27 +38,33 @@ supported_dplyr_methods <- list( select = NULL, filter = NULL, collect = NULL, - summarise = NULL, + summarise = c( + "window functions not currently supported;", + 'arguments `.drop = FALSE` and `.groups = "rowwise" not supported' + ), group_by = NULL, groups = NULL, group_vars = NULL, group_by_drop_default = NULL, ungroup = NULL, - mutate = NULL, + mutate = c( + "window functions (e.g. things that require aggregation within groups)", + "not currently supported" + ), transmute = NULL, arrange = NULL, rename = NULL, - pull = NULL, + pull = "returns an Arrow [ChunkedArray], not an R vector", relocate = NULL, compute = NULL, collapse = NULL, - distinct = NULL, - left_join = NULL, - right_join = NULL, - inner_join = NULL, - full_join = NULL, - semi_join = NULL, - anti_join = NULL, + distinct = "`.keep_all = TRUE` not supported", + left_join = "the `copy` and `na_matches` arguments are ignored", + right_join = "the `copy` and `na_matches` arguments are ignored", + inner_join = "the `copy` and `na_matches` arguments are ignored", + full_join = "the `copy` and `na_matches` arguments are ignored", + semi_join = "the `copy` and `na_matches` arguments are ignored", + anti_join = "the `copy` and `na_matches` arguments are ignored", count = NULL, tally = NULL, rename_with = NULL, diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 6106adbc5e4..3ecc32d3fe5 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -29,52 +29,59 @@ register_bindings_datetime <- function() { } register_bindings_datetime_utility <- function() { - register_binding("base::strptime", function(x, - format = "%Y-%m-%d %H:%M:%S", - tz = "", - unit = "ms") { - # Arrow uses unit for time parsing, strptime() does not. 
- # Arrow has no default option for strptime (format, unit), - # we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms", - # (ARROW-12809) - - unit <- make_valid_time_unit( - unit, - c(valid_time64_units, valid_time32_units) - ) - - output <- build_expr( - "strptime", - x, - options = - list( - format = format, - unit = unit, - error_is_null = TRUE - ) - ) - - if (tz == "") { - tz <- Sys.timezone() - } + register_binding( + "base::strptime", + function(x, + format = "%Y-%m-%d %H:%M:%S", + tz = "", + unit = "ms") { + # Arrow uses unit for time parsing, strptime() does not. + # Arrow has no default option for strptime (format, unit), + # we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms", + # (ARROW-12809) + + unit <- make_valid_time_unit( + unit, + c(valid_time64_units, valid_time32_units) + ) - # if a timestamp does not contain timezone information (i.e. it is - # "timezone-naive") we can attach timezone information (i.e. convert it into - # a "timezone-aware" timestamp) with `assume_timezone` - # if we want to cast to a different timezone, we can only do it for - # timezone-aware timestamps, not for timezone-naive ones - if (!is.null(tz)) { output <- build_expr( - "assume_timezone", - output, + "strptime", + x, options = list( - timezone = tz + format = format, + unit = unit, + error_is_null = TRUE ) ) - } - output - }) + + if (tz == "") { + tz <- Sys.timezone() + } + + # if a timestamp does not contain timezone information (i.e. it is + # "timezone-naive") we can attach timezone information (i.e. 
convert it into + # a "timezone-aware" timestamp) with `assume_timezone` + # if we want to cast to a different timezone, we can only do it for + # timezone-aware timestamps, not for timezone-naive ones + if (!is.null(tz)) { + output <- build_expr( + "assume_timezone", + output, + options = + list( + timezone = tz + ) + ) + } + output + }, + notes = c( + "accepts a `unit` argument not present in the `base` function.", + 'Valid values are "s", "ms" (default), "us", "ns".' + ) + ) register_binding("base::strftime", function(x, format = "", @@ -251,23 +258,27 @@ register_bindings_datetime_components <- function() { } register_bindings_datetime_conversion <- function() { - register_binding("lubridate::make_datetime", function(year = 1970L, - month = 1L, - day = 1L, - hour = 0L, - min = 0L, - sec = 0, - tz = "UTC") { - - # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820). - # Stop if tz other than 'UTC' is provided. - if (tz != "UTC") { - arrow_not_supported("Time zone other than 'UTC'") - } + register_binding( + "lubridate::make_datetime", + function(year = 1970L, + month = 1L, + day = 1L, + hour = 0L, + min = 0L, + sec = 0, + tz = "UTC") { + + # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820). + # Stop if tz other than 'UTC' is provided. 
+ if (tz != "UTC") { + arrow_not_supported("Time zone other than 'UTC'") + } - x <- call_binding("str_c", year, month, day, hour, min, sec, sep = "-") - build_expr("strptime", x, options = list(format = "%Y-%m-%d-%H-%M-%S", unit = 0L)) - }) + x <- call_binding("str_c", year, month, day, hour, min, sec, sep = "-") + build_expr("strptime", x, options = list(format = "%Y-%m-%d-%H-%M-%S", unit = 0L)) + }, + notes = "only supports UTC (default) timezone" + ) register_binding("lubridate::make_date", function(year = 1970L, month = 1L, @@ -305,37 +316,44 @@ register_bindings_datetime_conversion <- function() { call_binding("make_datetime", year, month, day, hour, min, sec, tz) }) - register_binding("base::as.Date", function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01", - tz = "UTC") { - if (is.null(format) && length(tryFormats) > 1) { - abort( - paste( - "`as.Date()` with multiple `tryFormats` is not supported in Arrow,", - "consider using the lubridate specialised parsing functions such as, `ymd()`, `ymd()`, etc." + register_binding( + "base::as.Date", + function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC") { + if (is.null(format) && length(tryFormats) > 1) { + abort( + paste( + "`as.Date()` with multiple `tryFormats` is not supported in Arrow.", + "Consider using the lubridate specialised parsing functions `ymd()`, `ymd()`, etc." + ) ) - ) - } + } - # base::as.Date() and lubridate::as_date() differ in the way they use the - # `tz` argument. Both cast to the desired timezone, if present. The - # difference appears when the `tz` argument is not set: `as.Date()` uses the - # default value ("UTC"), while `as_date()` keeps the original attribute - # => we only cast when we want the behaviour of the base version or when - # `tz` is set (i.e. 
not NULL) - if (call_binding("is.POSIXct", x)) { - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - } + # base::as.Date() and lubridate::as_date() differ in the way they use the + # `tz` argument. Both cast to the desired timezone, if present. The + # difference appears when the `tz` argument is not set: `as.Date()` uses the + # default value ("UTC"), while `as_date()` keeps the original attribute + # => we only cast when we want the behaviour of the base version or when + # `tz` is set (i.e. not NULL) + if (call_binding("is.POSIXct", x)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } - binding_as_date( - x = x, - format = format, - tryFormats = tryFormats, - origin = origin + binding_as_date( + x = x, + format = format, + tryFormats = tryFormats, + origin = origin + ) + }, + notes = c( + "Multiple `tryFormats` not supported in Arrow.", + "Consider using the lubridate specialised parsing functions `ymd()`, `ymd()`, etc." 
) - }) + ) register_binding("lubridate::as_date", function(x, format = NULL, @@ -412,108 +430,127 @@ register_bindings_datetime_conversion <- function() { } register_bindings_duration <- function() { - register_binding("base::difftime", function(time1, - time2, - tz, - units = "secs") { - if (units != "secs") { - abort("`difftime()` with units other than `secs` not supported in Arrow") - } + register_binding( + "base::difftime", + function(time1, + time2, + tz, + units = "secs") { + if (units != "secs") { + abort("`difftime()` with units other than `secs` not supported in Arrow") + } - if (!missing(tz)) { - warn("`tz` argument is not supported in Arrow, so it will be ignored") - } + if (!missing(tz)) { + warn("`tz` argument is not supported in Arrow, so it will be ignored") + } - # cast to timestamp if time1 and time2 are not dates or timestamp expressions - # (the subtraction of which would output a `duration`) - if (!call_binding("is.instant", time1)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) - } + # cast to timestamp if time1 and time2 are not dates or timestamp expressions + # (the subtraction of which would output a `duration`) + if (!call_binding("is.instant", time1)) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) + } - if (!call_binding("is.instant", time2)) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) - } + if (!call_binding("is.instant", time2)) { + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) + } - # if time1 or time2 are timestamps they cannot be expressed in "s" /seconds - # otherwise they cannot be added subtracted with durations - # TODO delete the casting to "us" once - # https://issues.apache.org/jira/browse/ARROW-16060 is solved - if (inherits(time1, "Expression") && - time1$type_id() %in% Type[c("TIMESTAMP")] && time1$type()$unit() != 2L) { - time1 <- build_expr("cast", time1, options = 
cast_options(to_type = timestamp("us"))) - } + # if time1 or time2 are timestamps they cannot be expressed in "s" /seconds + # otherwise they cannot be added subtracted with durations + # TODO delete the casting to "us" once + # https://issues.apache.org/jira/browse/ARROW-16060 is solved + if (inherits(time1, "Expression") && + time1$type_id() %in% Type[c("TIMESTAMP")] && time1$type()$unit() != 2L) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp("us"))) + } - if (inherits(time2, "Expression") && - time2$type_id() %in% Type[c("TIMESTAMP")] && time2$type()$unit() != 2L) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp("us"))) - } + if (inherits(time2, "Expression") && + time2$type_id() %in% Type[c("TIMESTAMP")] && time2$type()$unit() != 2L) { + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp("us"))) + } - # we need to go build the subtract expression instead of `time1 - time2` to - # prevent complaints when we try to subtract an R object from an Expression - subtract_output <- build_expr("-", time1, time2) - build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) - }) - register_binding("base::as.difftime", function(x, - format = "%X", - units = "secs") { - # windows doesn't seem to like "%X" - if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { - format <- "%H:%M:%S" - } + # we need to go build the subtract expression instead of `time1 - time2` to + # prevent complaints when we try to subtract an R object from an Expression + subtract_output <- build_expr("-", time1, time2) + build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) + }, + notes = c( + 'only supports `units = "secs"` (the default);', + "`tz` argument not supported" + ) + ) - if (units != "secs") { - abort("`as.difftime()` with units other than 'secs' not supported in Arrow") - } + register_binding( + "base::as.difftime", + 
function(x, + format = "%X", + units = "secs") { + # windows doesn't seem to like "%X" + if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { + format <- "%H:%M:%S" + } - if (call_binding("is.character", x)) { - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - # we do a final cast to duration ("s") at the end - x <- make_duration(x$cast(time64("us")), unit = "us") - } + if (units != "secs") { + abort("`as.difftime()` with units other than 'secs' not supported in Arrow") + } - # numeric -> duration not supported in Arrow yet so we use int64() as an - # intermediate step - # TODO: revisit after ARROW-15862 + if (call_binding("is.character", x)) { + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # we do a final cast to duration ("s") at the end + x <- make_duration(x$cast(time64("us")), unit = "us") + } - if (call_binding("is.numeric", x)) { - # coerce x to be int64(). it should work for integer-like doubles and fail - # for pure doubles - # if we abort for all doubles, we risk erroring in cases in which - # coercion to int64() would work - x <- build_expr("cast", x, options = cast_options(to_type = int64())) - } + # numeric -> duration not supported in Arrow yet so we use int64() as an + # intermediate step + # TODO: revisit after ARROW-15862 - build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) - }) + if (call_binding("is.numeric", x)) { + # coerce x to be int64(). 
it should work for integer-like doubles and fail + # for pure doubles + # if we abort for all doubles, we risk erroring in cases in which + # coercion to int64() would work + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + } + + build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) + }, + notes = 'only supports `units = "secs"` (the default)' + ) } register_bindings_duration_constructor <- function() { - register_binding("lubridate::make_difftime", function(num = NULL, - units = "secs", - ...) { - if (units != "secs") { - abort("`make_difftime()` with units other than 'secs' not supported in Arrow") - } + register_binding( + "lubridate::make_difftime", + function(num = NULL, + units = "secs", + ...) { + if (units != "secs") { + abort("`make_difftime()` with units other than 'secs' not supported in Arrow") + } - chunks <- list(...) + chunks <- list(...) - # lubridate concatenates durations passed via the `num` argument with those - # passed via `...` resulting in a vector of length 2 - which is virtually - # unusable in a dplyr pipeline. Arrow errors in this situation - if (!is.null(num) && length(chunks) > 0) { - abort("`make_difftime()` with both `num` and `...` not supported in Arrow") - } + # lubridate concatenates durations passed via the `num` argument with those + # passed via `...` resulting in a vector of length 2 - which is virtually + # unusable in a dplyr pipeline. Arrow errors in this situation + if (!is.null(num) && length(chunks) > 0) { + abort("`make_difftime()` with both `num` and `...` not supported in Arrow") + } - if (!is.null(num)) { - # build duration from num if present - duration <- num - } else { - # build duration from chunks when nothing is passed via ... - duration <- duration_from_chunks(chunks) - } + if (!is.null(num)) { + # build duration from num if present + duration <- num + } else { + # build duration from chunks when nothing is passed via ... 
+ duration <- duration_from_chunks(chunks) + } - make_duration(duration, "s") - }) + make_duration(duration, "s") + }, + notes = c( + 'only supports `units = "secs"` (the default);', + "providing both `num` and `...` is not supported" + ) + ) } register_bindings_duration_helpers <- function() { @@ -533,52 +570,62 @@ register_bindings_duration_helpers <- function() { ) } - register_binding("lubridate::dpicoseconds", function(x = 1) { - abort("Duration in picoseconds not supported in Arrow.") - }) + register_binding( + "lubridate::dpicoseconds", + function(x = 1) { + abort("Duration in picoseconds not supported in Arrow.") + }, + notes = "not supported" + ) } register_bindings_datetime_parsers <- function() { - register_binding("lubridate::parse_date_time", function(x, - orders, - tz = "UTC", - truncated = 0, - quiet = TRUE, - exact = FALSE) { - if (!quiet) { - arrow_not_supported("`quiet = FALSE`") - } + register_binding( + "lubridate::parse_date_time", + function(x, + orders, + tz = "UTC", + truncated = 0, + quiet = TRUE, + exact = FALSE) { + if (!quiet) { + arrow_not_supported("`quiet = FALSE`") + } - if (truncated > 0) { - if (truncated > (nchar(orders) - 3)) { - arrow_not_supported(paste0("a value for `truncated` > ", nchar(orders) - 3)) + if (truncated > 0) { + if (truncated > (nchar(orders) - 3)) { + arrow_not_supported(paste0("a value for `truncated` > ", nchar(orders) - 3)) + } + # build several orders for truncated formats + orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop = nchar(orders) - .x)) } - # build several orders for truncated formats - orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop = nchar(orders) - .x)) - } - if (!inherits(x, "Expression")) { - x <- Expression$scalar(x) - } + if (!inherits(x, "Expression")) { + x <- Expression$scalar(x) + } - if (exact == TRUE) { - # no data processing takes place & we don't derive formats - parse_attempts <- build_strptime_exprs(x, orders) - } else { - parse_attempts <- 
attempt_parsing(x, orders = orders) - } + if (exact == TRUE) { + # no data processing takes place & we don't derive formats + parse_attempts <- build_strptime_exprs(x, orders) + } else { + parse_attempts <- attempt_parsing(x, orders = orders) + } - coalesce_output <- build_expr("coalesce", args = parse_attempts) + coalesce_output <- build_expr("coalesce", args = parse_attempts) - # we need this binding to be able to handle a NULL `tz`, which, in turn, - # will be used by bindings such as `ymd()` to return a date or timestamp, - # based on whether tz is NULL or not - if (!is.null(tz)) { - build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) - } else { - coalesce_output - } - }) + # we need this binding to be able to handle a NULL `tz`, which, in turn, + # will be used by bindings such as `ymd()` to return a date or timestamp, + # based on whether tz is NULL or not + if (!is.null(tz)) { + build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) + } else { + coalesce_output + } + }, + notes = c( + "`quiet = FALSE` is not supported" + ) + ) parser_vec <- c( "ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my", "yq", @@ -610,45 +657,50 @@ register_bindings_datetime_parsers <- function() { for (order in parser_vec) { register_binding( paste0("lubridate::", tolower(order)), - parser_map_factory(order) + parser_map_factory(order), + notes = "`locale` argument not supported" ) } - register_binding("lubridate::fast_strptime", function(x, - format, - tz = "UTC", - lt = FALSE, - cutoff_2000 = 68L) { - # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play - # well with mutate, for example) - if (lt) { - arrow_not_supported("`lt = TRUE` argument") - } - - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16596 is done - if (cutoff_2000 != 68L) { - arrow_not_supported("`cutoff_2000` != 68L argument") - } + register_binding( + "lubridate::fast_strptime", + function(x, + format, + tz = "UTC", + lt = 
FALSE, + cutoff_2000 = 68L) { + # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play + # well with mutate, for example) + if (lt) { + arrow_not_supported("`lt = TRUE` argument") + } - parse_attempt_expressions <- list() + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16596 is done + if (cutoff_2000 != 68L) { + arrow_not_supported("`cutoff_2000` != 68L argument") + } - parse_attempt_expressions <- map( - format, - ~ build_expr( - "strptime", - x, - options = list( - format = .x, - unit = 0L, - error_is_null = TRUE + parse_attempt_expressions <- list() + + parse_attempt_expressions <- map( + format, + ~ build_expr( + "strptime", + x, + options = list( + format = .x, + unit = 0L, + error_is_null = TRUE + ) ) ) - ) - coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) + coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions) - build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) - }) + build_expr("assume_timezone", coalesce_output, options = list(timezone = tz)) + }, + notes = "non-default values of `lt` and `cutoff_2000` not supported" + ) } register_bindings_datetime_rounding <- function() { diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index d399e37e101..8132537af87 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -36,38 +36,38 @@ #' which returns an `arrow` [Table], or `collect()`, which pulls the resulting #' Table into an R `data.frame`. 
#' -#' * [`anti_join()`][dplyr::anti_join()] +#' * [`anti_join()`][dplyr::anti_join()]: the `copy` and `na_matches` arguments are ignored #' * [`arrange()`][dplyr::arrange()] #' * [`collapse()`][dplyr::collapse()] #' * [`collect()`][dplyr::collect()] #' * [`compute()`][dplyr::compute()] #' * [`count()`][dplyr::count()] -#' * [`distinct()`][dplyr::distinct()] +#' * [`distinct()`][dplyr::distinct()]: `.keep_all = TRUE` not supported #' * [`explain()`][dplyr::explain()] #' * [`filter()`][dplyr::filter()] -#' * [`full_join()`][dplyr::full_join()] +#' * [`full_join()`][dplyr::full_join()]: the `copy` and `na_matches` arguments are ignored #' * [`glimpse()`][dplyr::glimpse()] #' * [`group_by()`][dplyr::group_by()] #' * [`group_by_drop_default()`][dplyr::group_by_drop_default()] #' * [`group_vars()`][dplyr::group_vars()] #' * [`groups()`][dplyr::groups()] -#' * [`inner_join()`][dplyr::inner_join()] -#' * [`left_join()`][dplyr::left_join()] -#' * [`mutate()`][dplyr::mutate()] -#' * [`pull()`][dplyr::pull()] +#' * [`inner_join()`][dplyr::inner_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`left_join()`][dplyr::left_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`mutate()`][dplyr::mutate()]: window functions (e.g. 
things that require aggregation within groups) not currently supported +#' * [`pull()`][dplyr::pull()]: returns an Arrow [ChunkedArray], not an R vector #' * [`relocate()`][dplyr::relocate()] #' * [`rename()`][dplyr::rename()] #' * [`rename_with()`][dplyr::rename_with()] -#' * [`right_join()`][dplyr::right_join()] +#' * [`right_join()`][dplyr::right_join()]: the `copy` and `na_matches` arguments are ignored #' * [`select()`][dplyr::select()] -#' * [`semi_join()`][dplyr::semi_join()] +#' * [`semi_join()`][dplyr::semi_join()]: the `copy` and `na_matches` arguments are ignored #' * [`show_query()`][dplyr::show_query()] #' * [`slice_head()`][dplyr::slice_head()]: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; `prop` only supported on queries where `nrow()` is knowable without evaluating #' * [`slice_max()`][dplyr::slice_max()]: slicing within groups not supported; `with_ties = TRUE` (dplyr default) is not supported; `prop` only supported on queries where `nrow()` is knowable without evaluating #' * [`slice_min()`][dplyr::slice_min()]: slicing within groups not supported; `with_ties = TRUE` (dplyr default) is not supported; `prop` only supported on queries where `nrow()` is knowable without evaluating #' * [`slice_sample()`][dplyr::slice_sample()]: slicing within groups not supported; `replace = TRUE` and the `weight_by` argument not supported; `n` only supported on queries where `nrow()` is knowable without evaluating #' * [`slice_tail()`][dplyr::slice_tail()]: slicing within groups not supported; Arrow datasets do not have row order, so tail is non-deterministic; `prop` only supported on queries where `nrow()` is knowable without evaluating -#' * [`summarise()`][dplyr::summarise()] +#' * [`summarise()`][dplyr::summarise()]: window functions not currently supported; arguments `.drop = FALSE` and `.groups = "rowwise"` not supported #' * [`tally()`][dplyr::tally()] #' * [`transmute()`][dplyr::transmute()] #' * 
[`ungroup()`][dplyr::ungroup()] @@ -121,8 +121,9 @@ #' * [`all()`][base::all()] #' * [`any()`][base::any()] #' * [`as.character()`][base::as.character()] -#' * [`as.Date()`][base::as.Date()] -#' * [`as.difftime()`][base::as.difftime()] +#' * [`as.Date()`][base::as.Date()]: Multiple `tryFormats` not supported in Arrow. +#' Consider using the lubridate specialised parsing functions `ymd()`, `mdy()`, etc. +#' * [`as.difftime()`][base::as.difftime()]: only supports `units = "secs"` (the default) #' * [`as.double()`][base::as.double()] #' * [`as.integer()`][base::as.integer()] #' * [`as.logical()`][base::as.logical()] @@ -130,8 +131,10 @@ #' * [`asin()`][base::asin()] #' * [`ceiling()`][base::ceiling()] #' * [`cos()`][base::cos()] -#' * [`data.frame()`][base::data.frame()] -#' * [`difftime()`][base::difftime()] +#' * [`data.frame()`][base::data.frame()]: `row.names` and `check.rows` arguments not supported; +#' `stringsAsFactors` must be `FALSE` +#' * [`difftime()`][base::difftime()]: only supports `units = "secs"` (the default); +#' `tz` argument not supported #' * [`endsWith()`][base::endsWith()] #' * [`exp()`][base::exp()] #' * [`floor()`][base::floor()] @@ -160,7 +163,7 @@ #' * [`max()`][base::max()] #' * [`mean()`][base::mean()] #' * [`min()`][base::min()] -#' * [`nchar()`][base::nchar()] +#' * [`nchar()`][base::nchar()]: `allowNA = TRUE` and `keepNA = FALSE` not supported #' * [`paste()`][base::paste()]: the `collapse` argument is not yet supported #' * [`paste0()`][base::paste0()]: the `collapse` argument is not yet supported #' * [`pmax()`][base::pmax()] @@ -171,11 +174,12 @@ #' * [`sqrt()`][base::sqrt()] #' * [`startsWith()`][base::startsWith()] #' * [`strftime()`][base::strftime()] -#' * [`strptime()`][base::strptime()] +#' * [`strptime()`][base::strptime()]: accepts a `unit` argument not present in the `base` function. +#' Valid values are "s", "ms" (default), "us", "ns". 
#' * [`strrep()`][base::strrep()] #' * [`strsplit()`][base::strsplit()] #' * [`sub()`][base::sub()] -#' * [`substr()`][base::substr()] +#' * [`substr()`][base::substr()]: `start` and `stop` must be length 1 #' * [`substring()`][base::substring()] #' * [`sum()`][base::sum()] #' * [`tan()`][base::tan()] @@ -217,20 +221,20 @@ #' * [`dmilliseconds()`][lubridate::dmilliseconds()] #' * [`dminutes()`][lubridate::dminutes()] #' * [`dmonths()`][lubridate::dmonths()] -#' * [`dmy()`][lubridate::dmy()] -#' * [`dmy_h()`][lubridate::dmy_h()] -#' * [`dmy_hm()`][lubridate::dmy_hm()] -#' * [`dmy_hms()`][lubridate::dmy_hms()] +#' * [`dmy()`][lubridate::dmy()]: `locale` argument not supported +#' * [`dmy_h()`][lubridate::dmy_h()]: `locale` argument not supported +#' * [`dmy_hm()`][lubridate::dmy_hm()]: `locale` argument not supported +#' * [`dmy_hms()`][lubridate::dmy_hms()]: `locale` argument not supported #' * [`dnanoseconds()`][lubridate::dnanoseconds()] -#' * [`dpicoseconds()`][lubridate::dpicoseconds()] +#' * [`dpicoseconds()`][lubridate::dpicoseconds()]: not supported #' * [`dseconds()`][lubridate::dseconds()] #' * [`dst()`][lubridate::dst()] #' * [`dweeks()`][lubridate::dweeks()] #' * [`dyears()`][lubridate::dyears()] -#' * [`dym()`][lubridate::dym()] +#' * [`dym()`][lubridate::dym()]: `locale` argument not supported #' * [`epiweek()`][lubridate::epiweek()] #' * [`epiyear()`][lubridate::epiyear()] -#' * [`fast_strptime()`][lubridate::fast_strptime()] +#' * [`fast_strptime()`][lubridate::fast_strptime()]: non-default values of `lt` and `cutoff_2000` not supported #' * [`floor_date()`][lubridate::floor_date()] #' * [`format_ISO8601()`][lubridate::format_ISO8601()] #' * [`hour()`][lubridate::hour()] @@ -242,18 +246,19 @@ #' * [`isoyear()`][lubridate::isoyear()] #' * [`leap_year()`][lubridate::leap_year()] #' * [`make_date()`][lubridate::make_date()] -#' * [`make_datetime()`][lubridate::make_datetime()] -#' * [`make_difftime()`][lubridate::make_difftime()] +#' * 
[`make_datetime()`][lubridate::make_datetime()]: only supports UTC (default) timezone +#' * [`make_difftime()`][lubridate::make_difftime()]: only supports `units = "secs"` (the default); +#' providing both `num` and `...` is not supported #' * [`mday()`][lubridate::mday()] -#' * [`mdy()`][lubridate::mdy()] -#' * [`mdy_h()`][lubridate::mdy_h()] -#' * [`mdy_hm()`][lubridate::mdy_hm()] -#' * [`mdy_hms()`][lubridate::mdy_hms()] +#' * [`mdy()`][lubridate::mdy()]: `locale` argument not supported +#' * [`mdy_h()`][lubridate::mdy_h()]: `locale` argument not supported +#' * [`mdy_hm()`][lubridate::mdy_hm()]: `locale` argument not supported +#' * [`mdy_hms()`][lubridate::mdy_hms()]: `locale` argument not supported #' * [`minute()`][lubridate::minute()] #' * [`month()`][lubridate::month()] -#' * [`my()`][lubridate::my()] -#' * [`myd()`][lubridate::myd()] -#' * [`parse_date_time()`][lubridate::parse_date_time()] +#' * [`my()`][lubridate::my()]: `locale` argument not supported +#' * [`myd()`][lubridate::myd()]: `locale` argument not supported +#' * [`parse_date_time()`][lubridate::parse_date_time()]: `quiet = FALSE` is not supported #' * [`pm()`][lubridate::pm()] #' * [`qday()`][lubridate::qday()] #' * [`quarter()`][lubridate::quarter()] @@ -264,17 +269,17 @@ #' * [`wday()`][lubridate::wday()] #' * [`week()`][lubridate::week()] #' * [`yday()`][lubridate::yday()] -#' * [`ydm()`][lubridate::ydm()] -#' * [`ydm_h()`][lubridate::ydm_h()] -#' * [`ydm_hm()`][lubridate::ydm_hm()] -#' * [`ydm_hms()`][lubridate::ydm_hms()] +#' * [`ydm()`][lubridate::ydm()]: `locale` argument not supported +#' * [`ydm_h()`][lubridate::ydm_h()]: `locale` argument not supported +#' * [`ydm_hm()`][lubridate::ydm_hm()]: `locale` argument not supported +#' * [`ydm_hms()`][lubridate::ydm_hms()]: `locale` argument not supported #' * [`year()`][lubridate::year()] -#' * [`ym()`][lubridate::ym()] -#' * [`ymd()`][lubridate::ymd()] -#' * [`ymd_h()`][lubridate::ymd_h()] -#' * [`ymd_hm()`][lubridate::ymd_hm()] -#' * 
[`ymd_hms()`][lubridate::ymd_hms()] -#' * [`yq()`][lubridate::yq()] +#' * [`ym()`][lubridate::ym()]: `locale` argument not supported +#' * [`ymd()`][lubridate::ymd()]: `locale` argument not supported +#' * [`ymd_h()`][lubridate::ymd_h()]: `locale` argument not supported +#' * [`ymd_hm()`][lubridate::ymd_hm()]: `locale` argument not supported +#' * [`ymd_hms()`][lubridate::ymd_hms()]: `locale` argument not supported +#' * [`yq()`][lubridate::yq()]: `locale` argument not supported #' #' ## methods #' @@ -290,8 +295,9 @@ #' #' ## stats #' -#' * [`median()`][stats::median()] -#' * [`quantile()`][stats::quantile()] +#' * [`median()`][stats::median()]: approximate median (t-digest) is computed +#' * [`quantile()`][stats::quantile()]: `probs` must be length 1; +#' approximate quantile (t-digest) is computed #' * [`sd()`][stats::sd()] #' * [`var()`][stats::var()] #' @@ -301,8 +307,10 @@ #' #' ## stringr #' +#' Pattern modifiers `coll()` and `boundary()` are not supported in any functions. +#' #' * [`str_c()`][stringr::str_c()]: the `collapse` argument is not yet supported -#' * [`str_count()`][stringr::str_count()] +#' * [`str_count()`][stringr::str_count()]: `pattern` must be a length 1 character vector #' * [`str_detect()`][stringr::str_detect()] #' * [`str_dup()`][stringr::str_dup()] #' * [`str_ends()`][stringr::str_ends()] @@ -311,9 +319,9 @@ #' * [`str_pad()`][stringr::str_pad()] #' * [`str_replace()`][stringr::str_replace()] #' * [`str_replace_all()`][stringr::str_replace_all()] -#' * [`str_split()`][stringr::str_split()] +#' * [`str_split()`][stringr::str_split()]: Case-insensitive string splitting and splitting into 0 parts not supported #' * [`str_starts()`][stringr::str_starts()] -#' * [`str_sub()`][stringr::str_sub()] +#' * [`str_sub()`][stringr::str_sub()]: `start` and `end` must be length 1 #' * [`str_to_lower()`][stringr::str_to_lower()] #' * [`str_to_title()`][stringr::str_to_title()] #' * [`str_to_upper()`][stringr::str_to_upper()] diff --git 
a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 69102f2f710..4b87ed1e761 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -254,18 +254,22 @@ register_bindings_string_regex <- function() { notes = "not yet in a released version of `stringr`, but it is supported in `arrow`" ) - register_binding("stringr::str_count", function(string, pattern) { - opts <- get_stringr_pattern_options(enexpr(pattern)) - if (!is.string(pattern)) { - arrow_not_supported("`pattern` must be a length 1 character vector; other values") - } - arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex") - Expression$create( - arrow_fun, - string, - options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) - ) - }) + register_binding( + "stringr::str_count", + function(string, pattern) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (!is.string(pattern)) { + arrow_not_supported("`pattern` must be a length 1 character vector; other values") + } + arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex") + Expression$create( + arrow_fun, + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) + }, + notes = "`pattern` must be a length 1 character vector" + ) register_binding("base::startsWith", function(x, prefix) { Expression$create( @@ -372,58 +376,66 @@ register_bindings_string_regex <- function() { ) }) - register_binding("stringr::str_split", function(string, - pattern, - n = Inf, - simplify = FALSE) { - opts <- get_stringr_pattern_options(enexpr(pattern)) - arrow_fun <- ifelse(opts$fixed, "split_pattern", "split_pattern_regex") - if (opts$ignore_case) { - arrow_not_supported("Case-insensitive string splitting") - } - if (n == 0) { - arrow_not_supported("Splitting strings into zero parts") - } - if (identical(n, Inf)) { - n <- 0L - } - if (simplify) { - warning("Argument 'simplify = TRUE' will be ignored", call. 
= FALSE) - } - # The max_splits option in the Arrow C++ library controls the maximum number - # of places at which the string is split, whereas the argument n to - # str_split() controls the maximum number of pieces to return. So we must - # subtract 1 from n to get max_splits. - Expression$create( - arrow_fun, - string, - options = list( - pattern = opts$pattern, - reverse = FALSE, - max_splits = n - 1L + register_binding( + "stringr::str_split", + function(string, + pattern, + n = Inf, + simplify = FALSE) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + arrow_fun <- ifelse(opts$fixed, "split_pattern", "split_pattern_regex") + if (opts$ignore_case) { + arrow_not_supported("Case-insensitive string splitting") + } + if (n == 0) { + arrow_not_supported("Splitting strings into zero parts") + } + if (identical(n, Inf)) { + n <- 0L + } + if (simplify) { + warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE) + } + # The max_splits option in the Arrow C++ library controls the maximum number + # of places at which the string is split, whereas the argument n to + # str_split() controls the maximum number of pieces to return. So we must + # subtract 1 from n to get max_splits. 
+ Expression$create( + arrow_fun, + string, + options = list( + pattern = opts$pattern, + reverse = FALSE, + max_splits = n - 1L + ) ) - ) - }) + }, + notes = "Case-insensitive string splitting and splitting into 0 parts not supported" + ) } register_bindings_string_other <- function() { - register_binding("base::nchar", function(x, type = "chars", allowNA = FALSE, keepNA = NA) { - if (allowNA) { - arrow_not_supported("allowNA = TRUE") - } - if (is.na(keepNA)) { - keepNA <- !identical(type, "width") - } - if (!keepNA) { - # TODO: I think there is a fill_null kernel we could use, set null to 2 - arrow_not_supported("keepNA = TRUE") - } - if (identical(type, "bytes")) { - Expression$create("binary_length", x) - } else { - Expression$create("utf8_length", x) - } - }) + register_binding( + "base::nchar", + function(x, type = "chars", allowNA = FALSE, keepNA = NA) { + if (allowNA) { + arrow_not_supported("allowNA = TRUE") + } + if (is.na(keepNA)) { + keepNA <- !identical(type, "width") + } + if (!keepNA) { + # TODO: I think there is a fill_null kernel we could use, set null to 2 + arrow_not_supported("keepNA = TRUE") + } + if (identical(type, "bytes")) { + Expression$create("binary_length", x) + } else { + Expression$create("utf8_length", x) + } + }, + notes = "`allowNA = TRUE` and `keepNA = TRUE` not supported" + ) register_binding("stringr::str_to_lower", function(string, locale = "en") { stop_if_locale_provided(locale) @@ -450,37 +462,41 @@ register_bindings_string_other <- function() { Expression$create(trim_fun, string) }) - register_binding("base::substr", function(x, start, stop) { - assert_that( - length(start) == 1, - msg = "`start` must be length 1 - other lengths are not supported in Arrow" - ) - assert_that( - length(stop) == 1, - msg = "`stop` must be length 1 - other lengths are not supported in Arrow" - ) + register_binding( + "base::substr", + function(x, start, stop) { + assert_that( + length(start) == 1, + msg = "`start` must be length 1 - other 
lengths are not supported in Arrow" + ) + assert_that( + length(stop) == 1, + msg = "`stop` must be length 1 - other lengths are not supported in Arrow" + ) - # substr treats values as if they're on a continous number line, so values - # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics - # this behavior - if (start <= 0) { - start <- 1 - } + # substr treats values as if they're on a continous number line, so values + # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics + # this behavior + if (start <= 0) { + start <- 1 + } - # if `stop` is lower than `start`, this is invalid, so set `stop` to - # 0 so that an empty string will be returned (consistent with base::substr()) - if (stop < start) { - stop <- 0 - } + # if `stop` is lower than `start`, this is invalid, so set `stop` to + # 0 so that an empty string will be returned (consistent with base::substr()) + if (stop < start) { + stop <- 0 + } - Expression$create( - "utf8_slice_codeunits", - x, - # we don't need to subtract 1 from `stop` as C++ counts exclusively - # which effectively cancels out the difference in indexing between R & C++ - options = list(start = start - 1L, stop = stop) - ) - }) + Expression$create( + "utf8_slice_codeunits", + x, + # we don't need to subtract 1 from `stop` as C++ counts exclusively + # which effectively cancels out the difference in indexing between R & C++ + options = list(start = start - 1L, stop = stop) + ) + }, + notes = "`start` and `stop` must be length 1" + ) register_binding("base::substring", function(text, first, last) { call_binding("substr", x = text, start = first, stop = last) @@ -520,7 +536,9 @@ register_bindings_string_other <- function() { string, options = list(start = start, stop = end) ) - }) + }, + notes = "`start` and `end` must be length 1" + ) register_binding("stringr::str_pad", function(string, diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 429ea51e029..296133daeed 100644 --- 
a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -135,42 +135,49 @@ register_bindings_type_cast <- function() { ) }) - register_binding("base::data.frame", function(..., - row.names = NULL, - check.rows = NULL, - check.names = TRUE, - fix.empty.names = TRUE, - stringsAsFactors = FALSE) { - # we need a specific value of stringsAsFactors because the default was - # TRUE in R <= 3.6 - if (!identical(stringsAsFactors, FALSE)) { - arrow_not_supported("stringsAsFactors = TRUE") - } + register_binding( + "base::data.frame", + function(..., + row.names = NULL, + check.rows = NULL, + check.names = TRUE, + fix.empty.names = TRUE, + stringsAsFactors = FALSE) { + # we need a specific value of stringsAsFactors because the default was + # TRUE in R <= 3.6 + if (!identical(stringsAsFactors, FALSE)) { + arrow_not_supported("stringsAsFactors = TRUE") + } - # ignore row.names and check.rows with a warning - if (!is.null(row.names)) arrow_not_supported("row.names") - if (!is.null(check.rows)) arrow_not_supported("check.rows") + # ignore row.names and check.rows with a warning + if (!is.null(row.names)) arrow_not_supported("row.names") + if (!is.null(check.rows)) arrow_not_supported("check.rows") - args <- dots_list(..., .named = fix.empty.names) - if (is.null(names(args))) { - names(args) <- rep("", length(args)) - } + args <- dots_list(..., .named = fix.empty.names) + if (is.null(names(args))) { + names(args) <- rep("", length(args)) + } - if (identical(check.names, TRUE)) { - if (identical(fix.empty.names, TRUE)) { - names(args) <- make.names(names(args), unique = TRUE) - } else { - name_emtpy <- names(args) == "" - names(args)[!name_emtpy] <- make.names(names(args)[!name_emtpy], unique = TRUE) + if (identical(check.names, TRUE)) { + if (identical(fix.empty.names, TRUE)) { + names(args) <- make.names(names(args), unique = TRUE) + } else { + name_emtpy <- names(args) == "" + names(args)[!name_emtpy] <- make.names(names(args)[!name_emtpy], unique = TRUE) + } } - } - 
build_expr( - "make_struct", - args = unname(args), - options = list(field_names = names(args)) + build_expr( + "make_struct", + args = unname(args), + options = list(field_names = names(args)) + ) + }, + notes = c( + "`row.names` and `check.rows` arguments not supported;", + "`stringsAsFactors` must be `FALSE`" ) - }) + ) } register_bindings_type_inspect <- function() { diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 33054b6f406..3fb85f5490c 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -123,8 +123,11 @@ unregister_binding <- function(fun_name, registry = nse_funcs, invisible(previous_fun) } -register_binding_agg <- function(fun_name, agg_fun, registry = agg_funcs) { - register_binding(fun_name, agg_fun, registry = registry) +register_binding_agg <- function(fun_name, + agg_fun, + registry = agg_funcs, + notes = character(0)) { + register_binding(fun_name, agg_fun, registry = registry, notes = notes) } # Supports functions and tests that call previously-defined bindings diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 3181cee1378..20251c6a105 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -98,39 +98,50 @@ register_bindings_aggregate <- function() { options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg("stats::quantile", function(x, probs, na.rm = FALSE) { - if (length(probs) != 1) { - arrow_not_supported("quantile() with length(probs) != 1") - } - # TODO: Bind to the Arrow function that returns an exact quantile and remove - # this warning (ARROW-14021) - warn( - "quantile() currently returns an approximate quantile in Arrow", - .frequency = "once", - .frequency_id = "arrow.quantile.approximate", - class = "arrow.quantile.approximate" - ) - list( - fun = "tdigest", - data = x, - options = list(skip_nulls = na.rm, q = probs) - ) - }) - register_binding_agg("stats::median", function(x, na.rm = FALSE) { - # TODO: Bind to the Arrow function that returns an exact median and 
remove - # this warning (ARROW-14021) - warn( - "median() currently returns an approximate median in Arrow", - .frequency = "once", - .frequency_id = "arrow.median.approximate", - class = "arrow.median.approximate" - ) - list( - fun = "approximate_median", - data = x, - options = list(skip_nulls = na.rm) + register_binding_agg( + "stats::quantile", + function(x, probs, na.rm = FALSE) { + if (length(probs) != 1) { + arrow_not_supported("quantile() with length(probs) != 1") + } + # TODO: Bind to the Arrow function that returns an exact quantile and remove + # this warning (ARROW-14021) + warn( + "quantile() currently returns an approximate quantile in Arrow", + .frequency = "once", + .frequency_id = "arrow.quantile.approximate", + class = "arrow.quantile.approximate" + ) + list( + fun = "tdigest", + data = x, + options = list(skip_nulls = na.rm, q = probs) + ) + }, + notes = c( + "`probs` must be length 1;", + "approximate quantile (t-digest) is computed" ) - }) + ) + register_binding_agg( + "stats::median", + function(x, na.rm = FALSE) { + # TODO: Bind to the Arrow function that returns an exact median and remove + # this warning (ARROW-14021) + warn( + "median() currently returns an approximate median in Arrow", + .frequency = "once", + .frequency_id = "arrow.median.approximate", + class = "arrow.median.approximate" + ) + list( + fun = "approximate_median", + data = x, + options = list(skip_nulls = na.rm) + ) + }, + notes = "approximate median (t-digest) is computed" + ) register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { list( fun = "count_distinct", diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R index 8db3bb7e804..6d8b06611c0 100644 --- a/r/data-raw/docgen.R +++ b/r/data-raw/docgen.R @@ -89,6 +89,10 @@ do_not_link <- c( "stringr::str_like" # Still only in the unreleased version ) +package_notes <- list( + stringr = "Pattern modifiers `coll()` and `boundary()` are not supported in any functions." 
+) + # Vectorized function to make entries for each function render_fun <- function(fun, pkg_fun, notes) { # Add () to fun if it's not an operator @@ -114,12 +118,14 @@ render_pkg <- function(df, pkg) { bullets <- df %>% transmute(render_fun(fun, pkg_fun, notes)) %>% pull() - # Add header - bullets <- c( - paste0("## ", pkg, "\n#'"), - bullets - ) - paste("#'", bullets, collapse = "\n") + header <- paste0("## ", pkg, "\n#'") + # Some packages have global notes to include + pkg_notes <- package_notes[[pkg]] + if (!is.null(pkg_notes)) { + pkg_notes <- paste(pkg_notes, collapse = "\n#' ") + header <- c(header, paste0(pkg_notes, "\n#'")) + } + paste("#'", c(header, bullets), collapse = "\n") } docs <- arrow:::.cache$docs diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 5cbe211d00d..053438e0604 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -20,38 +20,38 @@ the query on the data. To run the query, call either \code{compute()}, which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting Table into an R \code{data.frame}. 
\itemize{ -\item \code{\link[dplyr:filter-joins]{anti_join()}} +\item \code{\link[dplyr:filter-joins]{anti_join()}}: the \code{copy} and \code{na_matches} arguments are ignored \item \code{\link[dplyr:arrange]{arrange()}} \item \code{\link[dplyr:compute]{collapse()}} \item \code{\link[dplyr:compute]{collect()}} \item \code{\link[dplyr:compute]{compute()}} \item \code{\link[dplyr:count]{count()}} -\item \code{\link[dplyr:distinct]{distinct()}} +\item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} not supported \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} -\item \code{\link[dplyr:mutate-joins]{full_join()}} +\item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} and \code{na_matches} arguments are ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} \item \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} \item \code{\link[dplyr:group_data]{group_vars()}} \item \code{\link[dplyr:group_data]{groups()}} -\item \code{\link[dplyr:mutate-joins]{inner_join()}} -\item \code{\link[dplyr:mutate-joins]{left_join()}} -\item \code{\link[dplyr:mutate]{mutate()}} -\item \code{\link[dplyr:pull]{pull()}} +\item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. 
things that require aggregation within groups) not currently supported +\item \code{\link[dplyr:pull]{pull()}}: returns an Arrow \link{ChunkedArray}, not an R vector \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} \item \code{\link[dplyr:rename]{rename_with()}} -\item \code{\link[dplyr:mutate-joins]{right_join()}} +\item \code{\link[dplyr:mutate-joins]{right_join()}}: the \code{copy} and \code{na_matches} arguments are ignored \item \code{\link[dplyr:select]{select()}} -\item \code{\link[dplyr:filter-joins]{semi_join()}} +\item \code{\link[dplyr:filter-joins]{semi_join()}}: the \code{copy} and \code{na_matches} arguments are ignored \item \code{\link[dplyr:explain]{show_query()}} \item \code{\link[dplyr:slice]{slice_head()}}: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating \item \code{\link[dplyr:slice]{slice_max()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating \item \code{\link[dplyr:slice]{slice_min()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating \item \code{\link[dplyr:slice]{slice_sample()}}: slicing within groups not supported; \code{replace = TRUE} and the \code{weight_by} argument not supported; \code{n} only supported on queries where \code{nrow()} is knowable without evaluating \item \code{\link[dplyr:slice]{slice_tail()}}: slicing within groups not supported; Arrow datasets do not have row order, so tail is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating -\item \code{\link[dplyr:summarise]{summarise()}} +\item 
\code{\link[dplyr:summarise]{summarise()}}: window functions not currently supported; arguments \code{.drop = FALSE} and \code{.groups = "rowwise"} not supported \item \code{\link[dplyr:count]{tally()}} \item \code{\link[dplyr:mutate]{transmute()}} \item \code{\link[dplyr:group_by]{ungroup()}} @@ -107,8 +107,9 @@ as \code{arrow_ascii_is_decimal}. \item \code{\link[base:all]{all()}} \item \code{\link[base:any]{any()}} \item \code{\link[base:character]{as.character()}} -\item \code{\link[base:as.Date]{as.Date()}} -\item \code{\link[base:difftime]{as.difftime()}} +\item \code{\link[base:as.Date]{as.Date()}}: Multiple \code{tryFormats} not supported in Arrow. +Consider using the lubridate specialised parsing functions \code{ymd()}, \code{dmy()}, etc. +\item \code{\link[base:difftime]{as.difftime()}}: only supports \code{units = "secs"} (the default) \item \code{\link[base:double]{as.double()}} \item \code{\link[base:integer]{as.integer()}} \item \code{\link[base:logical]{as.logical()}} @@ -116,8 +117,10 @@ as \code{arrow_ascii_is_decimal}. \item \code{\link[base:Trig]{asin()}} \item \code{\link[base:Round]{ceiling()}} \item \code{\link[base:Trig]{cos()}} -\item \code{\link[base:data.frame]{data.frame()}} -\item \code{\link[base:difftime]{difftime()}} +\item \code{\link[base:data.frame]{data.frame()}}: \code{row.names} and \code{check.rows} arguments not supported; +\code{stringsAsFactors} must be \code{FALSE} +\item \code{\link[base:difftime]{difftime()}}: only supports \code{units = "secs"} (the default); +\code{tz} argument not supported \item \code{\link[base:startsWith]{endsWith()}} \item \code{\link[base:Log]{exp()}} \item \code{\link[base:Round]{floor()}} @@ -146,7 +149,7 @@ as \code{arrow_ascii_is_decimal}.
\item \code{\link[base:Extremes]{max()}} \item \code{\link[base:mean]{mean()}} \item \code{\link[base:Extremes]{min()}} -\item \code{\link[base:nchar]{nchar()}} +\item \code{\link[base:nchar]{nchar()}}: \code{allowNA = TRUE} and \code{keepNA = TRUE} not supported \item \code{\link[base:paste]{paste()}}: the \code{collapse} argument is not yet supported \item \code{\link[base:paste]{paste0()}}: the \code{collapse} argument is not yet supported \item \code{\link[base:Extremes]{pmax()}} @@ -157,11 +160,12 @@ as \code{arrow_ascii_is_decimal}. \item \code{\link[base:MathFun]{sqrt()}} \item \code{\link[base:startsWith]{startsWith()}} \item \code{\link[base:strptime]{strftime()}} -\item \code{\link[base:strptime]{strptime()}} +\item \code{\link[base:strptime]{strptime()}}: accepts a \code{unit} argument not present in the \code{base} function. +Valid values are "s", "ms" (default), "us", "ns". \item \code{\link[base:strrep]{strrep()}} \item \code{\link[base:strsplit]{strsplit()}} \item \code{\link[base:grep]{sub()}} -\item \code{\link[base:substr]{substr()}} +\item \code{\link[base:substr]{substr()}}: \code{start} and \code{stop} must be length 1 \item \code{\link[base:substr]{substring()}} \item \code{\link[base:sum]{sum()}} \item \code{\link[base:Trig]{tan()}} @@ -209,20 +213,20 @@ as \code{arrow_ascii_is_decimal}. 
\item \code{\link[lubridate:duration]{dmilliseconds()}} \item \code{\link[lubridate:duration]{dminutes()}} \item \code{\link[lubridate:duration]{dmonths()}} -\item \code{\link[lubridate:ymd]{dmy()}} -\item \code{\link[lubridate:ymd_hms]{dmy_h()}} -\item \code{\link[lubridate:ymd_hms]{dmy_hm()}} -\item \code{\link[lubridate:ymd_hms]{dmy_hms()}} +\item \code{\link[lubridate:ymd]{dmy()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{dmy_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{dmy_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{dmy_hms()}}: \code{locale} argument not supported \item \code{\link[lubridate:duration]{dnanoseconds()}} -\item \code{\link[lubridate:duration]{dpicoseconds()}} +\item \code{\link[lubridate:duration]{dpicoseconds()}}: not supported \item \code{\link[lubridate:duration]{dseconds()}} \item \code{\link[lubridate:dst]{dst()}} \item \code{\link[lubridate:duration]{dweeks()}} \item \code{\link[lubridate:duration]{dyears()}} -\item \code{\link[lubridate:ymd]{dym()}} +\item \code{\link[lubridate:ymd]{dym()}}: \code{locale} argument not supported \item \code{\link[lubridate:week]{epiweek()}} \item \code{\link[lubridate:year]{epiyear()}} -\item \code{\link[lubridate:parse_date_time]{fast_strptime()}} +\item \code{\link[lubridate:parse_date_time]{fast_strptime()}}: non-default values of \code{lt} and \code{cutoff_2000} not supported \item \code{\link[lubridate:round_date]{floor_date()}} \item \code{\link[lubridate:format_ISO8601]{format_ISO8601()}} \item \code{\link[lubridate:hour]{hour()}} @@ -234,18 +238,19 @@ as \code{arrow_ascii_is_decimal}. 
\item \code{\link[lubridate:year]{isoyear()}} \item \code{\link[lubridate:leap_year]{leap_year()}} \item \code{\link[lubridate:make_datetime]{make_date()}} -\item \code{\link[lubridate:make_datetime]{make_datetime()}} -\item \code{\link[lubridate:make_difftime]{make_difftime()}} +\item \code{\link[lubridate:make_datetime]{make_datetime()}}: only supports UTC (default) timezone +\item \code{\link[lubridate:make_difftime]{make_difftime()}}: only supports \code{units = "secs"} (the default); +providing both \code{num} and \code{...} is not supported \item \code{\link[lubridate:day]{mday()}} -\item \code{\link[lubridate:ymd]{mdy()}} -\item \code{\link[lubridate:ymd_hms]{mdy_h()}} -\item \code{\link[lubridate:ymd_hms]{mdy_hm()}} -\item \code{\link[lubridate:ymd_hms]{mdy_hms()}} +\item \code{\link[lubridate:ymd]{mdy()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{mdy_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{mdy_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{mdy_hms()}}: \code{locale} argument not supported \item \code{\link[lubridate:minute]{minute()}} \item \code{\link[lubridate:month]{month()}} -\item \code{\link[lubridate:ymd]{my()}} -\item \code{\link[lubridate:ymd]{myd()}} -\item \code{\link[lubridate:parse_date_time]{parse_date_time()}} +\item \code{\link[lubridate:ymd]{my()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd]{myd()}}: \code{locale} argument not supported +\item \code{\link[lubridate:parse_date_time]{parse_date_time()}}: \code{quiet = FALSE} is not supported \item \code{\link[lubridate:am]{pm()}} \item \code{\link[lubridate:day]{qday()}} \item \code{\link[lubridate:quarter]{quarter()}} @@ -256,17 +261,17 @@ as \code{arrow_ascii_is_decimal}. 
\item \code{\link[lubridate:day]{wday()}} \item \code{\link[lubridate:week]{week()}} \item \code{\link[lubridate:day]{yday()}} -\item \code{\link[lubridate:ymd]{ydm()}} -\item \code{\link[lubridate:ymd_hms]{ydm_h()}} -\item \code{\link[lubridate:ymd_hms]{ydm_hm()}} -\item \code{\link[lubridate:ymd_hms]{ydm_hms()}} +\item \code{\link[lubridate:ymd]{ydm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{ydm_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{ydm_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{ydm_hms()}}: \code{locale} argument not supported \item \code{\link[lubridate:year]{year()}} -\item \code{\link[lubridate:ymd]{ym()}} -\item \code{\link[lubridate:ymd]{ymd()}} -\item \code{\link[lubridate:ymd_hms]{ymd_h()}} -\item \code{\link[lubridate:ymd_hms]{ymd_hm()}} -\item \code{\link[lubridate:ymd_hms]{ymd_hms()}} -\item \code{\link[lubridate:ymd]{yq()}} +\item \code{\link[lubridate:ymd]{ym()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd]{ymd()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{ymd_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{ymd_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hms]{ymd_hms()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd]{yq()}}: \code{locale} argument not supported } } @@ -288,8 +293,9 @@ as \code{arrow_ascii_is_decimal}. \subsection{stats}{ \itemize{ -\item \code{\link[stats:median]{median()}} -\item \code{\link[stats:quantile]{quantile()}} +\item \code{\link[stats:median]{median()}}: approximate median (t-digest) is computed +\item \code{\link[stats:quantile]{quantile()}}: \code{probs} must be length 1; +approximate quantile (t-digest) is computed \item \code{\link[stats:sd]{sd()}} \item \code{\link[stats:cor]{var()}} } @@ -302,9 +308,11 @@ as \code{arrow_ascii_is_decimal}. 
} \subsection{stringr}{ + +Pattern modifiers \code{coll()} and \code{boundary()} are not supported in any functions. \itemize{ \item \code{\link[stringr:str_c]{str_c()}}: the \code{collapse} argument is not yet supported -\item \code{\link[stringr:str_count]{str_count()}} +\item \code{\link[stringr:str_count]{str_count()}}: \code{pattern} must be a length 1 character vector \item \code{\link[stringr:str_detect]{str_detect()}} \item \code{\link[stringr:str_dup]{str_dup()}} \item \code{\link[stringr:str_starts]{str_ends()}} @@ -313,9 +321,9 @@ as \code{arrow_ascii_is_decimal}. \item \code{\link[stringr:str_pad]{str_pad()}} \item \code{\link[stringr:str_replace]{str_replace()}} \item \code{\link[stringr:str_replace]{str_replace_all()}} -\item \code{\link[stringr:str_split]{str_split()}} +\item \code{\link[stringr:str_split]{str_split()}}: Case-insensitive string splitting and splitting into 0 parts not supported \item \code{\link[stringr:str_starts]{str_starts()}} -\item \code{\link[stringr:str_sub]{str_sub()}} +\item \code{\link[stringr:str_sub]{str_sub()}}: \code{start} and \code{end} must be length 1 \item \code{\link[stringr:case]{str_to_lower()}} \item \code{\link[stringr:case]{str_to_title()}} \item \code{\link[stringr:case]{str_to_upper()}} diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index 81a9ba3f6e5..21a78ee06e4 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -289,7 +289,7 @@ test_that("filter environment scope", { tbl ) isShortString <- function(x) nchar(x) < 10 - skip("TODO: 14071") + skip("TODO: ARROW-14071") compare_dplyr_binding( .input %>% select(-fct) %>% @@ -419,7 +419,6 @@ test_that("filter() with namespaced functions", { }) test_that("filter() with across()", { - compare_dplyr_binding( .input %>% filter(if_any(ends_with("l"), ~ is.na(.))) %>% @@ -437,5 +436,4 @@ test_that("filter() with across()", { collect(), tbl ) - }) diff --git 
a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 2608f9d6545..3ddc9ec3bed 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1897,7 +1897,7 @@ test_that("`as.Date()` and `as_date()`", { ) ) %>% collect(), - regexp = "consider using the lubridate specialised parsing functions" + regexp = "Consider using the lubridate specialised parsing functions" ) # record batch test @@ -1911,7 +1911,7 @@ test_that("`as.Date()` and `as_date()`", { ) ) %>% collect(), - regexp = "consider using the lubridate specialised parsing functions" + regexp = "Consider using the lubridate specialised parsing functions" ) # strptime does not support a partial format - Arrow returns NA, while