Update coalesce() to use vec_case_when() (#6265)

* Port tidyverse/funs#80 to `coalesce()` * NEWS bullet * Use `vec_case_when()` infrastructure in `coalesce()` * Backtick `NULL` * Tweak parameter documentation one more time
tidyverse · Jul 11, 2022 · a6df2a0 · a6df2a0
1 parent 5e27ef1
commit a6df2a0
Show file tree

Hide file tree

Showing 5 changed files with 231 additions and 84 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,17 @@
 # dplyr (development version)
 
+* `coalesce()` now more fully embraces the principles of vctrs (#6265).
+
+  * `.ptype` and `.size` arguments have been added to allow you to explicitly
+    enforce an output type and size.
+
+  * `NULL` inputs are now discarded up front.
+
+  * `coalesce()` no longer iterates over the columns of data frame input.
+    Instead, a row is now only coalesced if it is entirely missing, which is
+    consistent with `vctrs::vec_equal_na()` and greatly simplifies the
+    implementation.
+
 * `group_by()` now uses a new algorithm for computing groups. It is often faster
   than the previous approach (especially when there are many groups), and in
   most cases there should be no changes. The exception is with character vector

diff --git a/R/coalesce.R b/R/coalesce.R
@@ -1,69 +1,79 @@
-#' Find first non-missing element
+#' Find the first non-missing element
 #'
-#' Given a set of vectors, `coalesce()` finds the first non-missing value
-#' at each position. This is inspired by the SQL `COALESCE` function
-#' which does the same thing for `NULL`s.
+#' Given a set of vectors, `coalesce()` finds the first non-missing value at
+#' each position. It's inspired by the SQL `COALESCE` function which does the
+#' same thing for SQL `NULL`s.
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]>
+#'
+#'   One or more vectors. These will be
+#'   [recycled][vctrs::vector_recycling_rules] against each other, and will be
+#'   cast to their common type.
+#'
+#' @param .ptype An optional prototype declaring the desired output type. If
+#'   supplied, this overrides the common type of the vectors in `...`.
+#'
+#' @param .size An optional size declaring the desired output size. If supplied,
+#'   this overrides the common size of the vectors in `...`.
+#'
+#' @return A vector with the same type and size as the common type and common
+#'   size of the vectors in `...`.
+#'
+#' @seealso [na_if()] to replace specified values with an `NA`.
+#'   [tidyr::replace_na()] to replace `NA` with a value.
 #'
-#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Vectors. Inputs should be
-#'   recyclable (either be length 1 or same length as the longest vector) and
-#'   coercible to a common type. If data frames, they are coalesced column by
-#'   column.
-#' @return A vector the same length as the first `...` argument with
-#'   missing values replaced by the first non-missing value.
-#' @seealso [na_if()] to replace specified values with a `NA`.
-#' [tidyr::replace_na()] to replace `NA` with a value
 #' @export
 #' @examples
 #' # Use a single value to replace all missing values
 #' x <- sample(c(1:5, NA, NA, NA))
 #' coalesce(x, 0L)
 #'
-#' # Or match together a complete vector from missing pieces
+#' # The equivalent to a missing value in a list is `NULL`
+#' coalesce(list(1, 2, NULL), list(NA))
+#'
+#' # Or generate a complete vector from partially missing pieces
 #' y <- c(1, 2, NA, NA, 5)
 #' z <- c(NA, NA, 3, 4, 5)
 #' coalesce(y, z)
 #'
-#' # Supply lists by with dynamic dots
+#' # Supply lists by splicing them into dots:
 #' vecs <- list(
 #'   c(1, 2, NA, NA, 5),
 #'   c(NA, NA, 3, 4, 5)
 #' )
 #' coalesce(!!!vecs)
-coalesce <- function(...) {
-  if (missing(..1)) {
-    abort("At least one argument must be supplied.")
-  }
+coalesce <- function(..., .ptype = NULL, .size = NULL) {
+  args <- list2(...)
 
-  values <- list2(...)
-  values <- fix_call(vec_cast_common(!!!values))
-  values <- fix_call(vec_recycle_common(!!!values))
+  # Drop `NULL`s
+  # TODO: Use cheaper `vctrs::vec_any_missing()` approach
+  # https://github.com/r-lib/vctrs/issues/1563
+  args <- discard(args, is.null)
 
-  x <- values[[1]]
-  values <- values[-1]
-
-  if (is.array(x) && length(dim(x)) > 1) {
-    abort("Can't coalesce matrices.")
-  }
-  if (is.data.frame(x)) {
-    df_coalesce(x, values)
-  } else {
-    vec_coalesce(x, values)
+  if (length(args) == 0L) {
+    abort("`...` can't be empty.")
   }
-}
 
-vec_coalesce <- function(x, values) {
-  for (i in seq_along(values)) {
-    x_miss <- is.na(x)
-    vec_slice(x, x_miss) <- vec_slice(values[[i]], x_miss)
-  }
+  # Recycle early so logical conditions computed below will be the same length,
+  # as required by `vec_case_when()`
+  args <- vec_recycle_common(!!!args, .size = .size)
 
-  x
-}
-df_coalesce <- function(x, values) {
-  for (i in seq_along(x)) {
-    col_values <- map(values, `[[`, i)
-    x[[i]] <- vec_coalesce(x[[i]], col_values)
-  }
+  # Name early to get correct indexing in `vec_case_when()` error messages
+  names <- names2(args)
+  names <- names_as_error_names(names)
+  args <- set_names(args, names)
+
+  conditions <- map(args, ~{
+    !vec_equal_na(.x)
+  })
 
-  x
+  vec_case_when(
+    conditions = conditions,
+    values = args,
+    conditions_arg = "",
+    values_arg = "",
+    ptype = .ptype,
+    size = .size,
+    call = current_env()
+  )
 }
diff --git a/man/coalesce.Rd b/man/coalesce.Rd
diff --git a/tests/testthat/_snaps/coalesce.md b/tests/testthat/_snaps/coalesce.md
@@ -13,3 +13,51 @@
       Error in `coalesce()`:
       ! Can't combine `..1` <integer> and `..2` <character>.
 
+# `.size` overrides the common size
+
+    Code
+      coalesce(x, 1:2, .size = vec_size(x))
+    Condition
+      Error in `coalesce()`:
+      ! Can't recycle `..2` (size 2) to size 1.
+
+# must have at least one non-`NULL` vector
+
+    Code
+      coalesce()
+    Condition
+      Error in `coalesce()`:
+      ! `...` can't be empty.
+
+---
+
+    Code
+      coalesce(NULL, NULL)
+    Condition
+      Error in `coalesce()`:
+      ! `...` can't be empty.
+
+# inputs must be vectors
+
+    Code
+      coalesce(1, environment())
+    Condition
+      Error in `coalesce()`:
+      ! `..2` must be a vector, not an environment.
+
+# names in error messages are indexed correctly
+
+    Code
+      coalesce(1, "x")
+    Condition
+      Error in `coalesce()`:
+      ! Can't combine `..1` <double> and `..2` <character>.
+
+---
+
+    Code
+      coalesce(1, y = "x")
+    Condition
+      Error in `coalesce()`:
+      ! Can't combine `..1` <double> and `y` <character>.
+
diff --git a/tests/testthat/test-coalesce.R b/tests/testthat/test-coalesce.R
@@ -10,6 +10,10 @@ test_that("coerces to common type", {
   expect_identical(coalesce(NA, f), f)
 })
 
+test_that("inputs are recycled to their common size", {
+  expect_identical(coalesce(1, c(2, 3)), c(1, 1))
+})
+
 test_that("finds non-missing values in multiple positions", {
   x1 <- c(1L, NA, NA)
   x2 <- c(NA, 2L, NA)
@@ -25,33 +29,96 @@ test_that("coalesce() gives meaningful error messages", {
   })
 })
 
-test_that("coalesce() supports data frames (#5326)", {
-  out <- coalesce(
-    data.frame(x = c(NA, 1)),
-    data.frame(x = 1:2)
+test_that("coalesce() supports one-dimensional arrays (#5557)", {
+  x <- array(1:10)
+  out <- coalesce(x, 0L)
+  expect_identical(out, x)
+})
+
+test_that("only updates entirely missing matrix rows", {
+  x <- c(
+    1, NA,
+    NA, NA
   )
-  expect_identical(out, data.frame(x = c(1, 1)))
-
-  df1 <- data.frame(x = c(NA, 1, NA), y = c(2, NA, NA), z = c(1:2, NA))
-  df2 <- tibble::tibble(x = 1:3, y = c(3, 4, NA), z = c(NA, NA, NA))
-  df3 <- data.frame(x = NA, y = c(30, 40, 50), z = 101:103)
-  out <- coalesce(df1, df2, df3)
-  exp <- tibble(x = c(1, 1, 3), y = c(2, 4, 50), z = c(1L, 2L, 103L))
-  expect_identical(out, exp)
-
-  expect_error(
-    coalesce(
-      data.frame(x = c(NA, 1)),
-      data.frame(x = c("a", "b"))
-    ),
-    class = "vctrs_error_incompatible_type"
+  x <- matrix(x, nrow = 2, byrow = TRUE)
+
+  y <- c(
+    2, 2,
+    NA, 1
   )
+  y <- matrix(y, nrow = 2, byrow = TRUE)
 
-  expect_error(coalesce(as.matrix(mtcars), as.matrix(mtcars)), "matrices")
+  expect <- c(
+    1, NA,
+    NA, 1
+  )
+  expect <- matrix(expect, nrow = 2, byrow = TRUE)
+
+  expect_identical(coalesce(x, y), expect)
 })
 
-test_that("coalesce() supports one-dimensional arrays (#5557)", {
-  x <- array(1:10)
-  out <- coalesce(x, 0)
-  expect_equal(out, x)
+test_that("only updates entirely missing data frame rows", {
+  x <- tibble(x = c(1, NA), y = c(NA, NA))
+  y <- tibble(x = c(2, NA), y = c(TRUE, TRUE))
+
+  expect <- tibble(x = c(1, NA), y = c(NA, TRUE))
+
+  expect_identical(coalesce(x, y), expect)
+})
+
+test_that("only updates entirely missing rcrd observations", {
+  x <- new_rcrd(list(x = c(1, NA), y = c(NA, NA)))
+  y <- new_rcrd(list(x = c(2, NA), y = c(TRUE, TRUE)))
+
+  expect <- new_rcrd(list(x = c(1, NA), y = c(NA, TRUE)))
+
+  expect_identical(coalesce(x, y), expect)
+})
+
+test_that("recycling is done on the values early", {
+  expect_identical(coalesce(1, 1:2), c(1, 1))
+})
+
+test_that("`.ptype` overrides the common type (r-lib/funs#64)", {
+  x <- c(1L, NA)
+  expect_identical(coalesce(x, 99, .ptype = x), c(1L, 99L))
+})
+
+test_that("`.size` overrides the common size", {
+  x <- 1L
+
+  expect_snapshot(error = TRUE, {
+    coalesce(x, 1:2, .size = vec_size(x))
+  })
+})
+
+test_that("must have at least one non-`NULL` vector", {
+  expect_snapshot(error = TRUE, {
+    coalesce()
+  })
+  expect_snapshot(error = TRUE, {
+    coalesce(NULL, NULL)
+  })
+})
+
+test_that("`NULL`s are discarded (r-lib/funs#80)", {
+  expect_identical(
+    coalesce(c(1, NA, NA), NULL, c(1, 2, NA), NULL, 3),
+    c(1, 2, 3)
+  )
+})
+
+test_that("inputs must be vectors", {
+  expect_snapshot(error = TRUE, {
+    coalesce(1, environment())
+  })
+})
+
+test_that("names in error messages are indexed correctly", {
+  expect_snapshot(error = TRUE, {
+    coalesce(1, "x")
+  })
+  expect_snapshot(error = TRUE, {
+    coalesce(1, y = "x")
+  })
 })