From 73d6c4a31eeef68cf6bf7d1e8bfbadd18b6a5d7d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 1 Dec 2021 08:34:05 -0600 Subject: [PATCH] ARROW-13371 [R] binding for make_struct -> StructArray$create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR implements a binding for the `make_struct` compute function. This function was already being called in the translation for `case_when()` but didn't have a binding of its own. Because you can do this in dplyr too (create a nested data frame), I mapped `tibble()` and `data.frame()` since it can be tested using `compare_dplyr_binding()`. `StructArray$create()` is a nested call and is probably not that useful of a translation for dplyr users (I'd never think to use it). Basically, you can now do this: ``` r library(arrow, warn.conflicts = FALSE) library(dplyr, warn.conflicts = FALSE) df <- RecordBatch$create(a = 1, b = "two") df %>% mutate(df_col = tibble(a, b)) %>% collect() #> # A tibble: 1 × 3 #> a b df_col$a $b #> #> 1 1 two 1 two ``` Something I didn't do but could is add all the arguments of `tibble()` and `data.frame()` that aren't supported (e.g., `stringsAsFactors`). It seems like this is done for other translations but I just wanted to check! Closes #11690 from paleolimbot/r-make-struct Authored-by: Dewey Dunnington Signed-off-by: Jonathan Keane --- r/R/dplyr-functions.R | 54 ++++++++++ r/tests/testthat/test-dplyr-funcs-type.R | 125 +++++++++++++++++++++++ 2 files changed, 179 insertions(+) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index b1f8a65f93d3a..56e8810937a2a 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -250,6 +250,60 @@ nse_funcs$is_logical <- function(x, n = NULL) { nse_funcs$is.logical(x) } +# Create a data frame/tibble/struct column +nse_funcs$tibble <- function(..., .rows = NULL, .name_repair = NULL) { + if (!is.null(.rows)) arrow_not_supported(".rows") + if (!is.null(.name_repair)) arrow_not_supported(".name_repair") + + # use dots_list() because this is what tibble() uses to allow the + # useful shorthand of tibble(col1, col2) -> tibble(col1 = col1, col2 = col2) + # we have a stronger enforcement of unique names for arguments because + # it is difficult to replicate the .name_repair semantics and expanding of + # unnamed data frame arguments in the same way that the tibble() constructor + # does. + args <- rlang::dots_list(..., .named = TRUE, .homonyms = "error") + + build_expr( + "make_struct", + args = unname(args), + options = list(field_names = names(args)) + ) +} + +nse_funcs$data.frame <- function(..., row.names = NULL, + check.rows = NULL, check.names = TRUE, fix.empty.names = TRUE, + stringsAsFactors = FALSE) { + # we need a specific value of stringsAsFactors because the default was + # TRUE in R <= 3.6 + if (!identical(stringsAsFactors, FALSE)) { + arrow_not_supported("stringsAsFactors = TRUE") + } + + # ignore row.names and check.rows with a warning + if (!is.null(row.names)) arrow_not_supported("row.names") + if (!is.null(check.rows)) arrow_not_supported("check.rows") + + args <- rlang::dots_list(..., .named = fix.empty.names) + if (is.null(names(args))) { + names(args) <- rep("", length(args)) + } + + if (identical(check.names, TRUE)) { + if (identical(fix.empty.names, TRUE)) { + names(args) <- make.names(names(args), unique = TRUE) + } else { + name_emtpy <- names(args) == "" + names(args)[!name_emtpy] <- make.names(names(args)[!name_emtpy], unique = TRUE) + } + } + + build_expr( + "make_struct", + args = unname(args), + options = list(field_names = names(args)) + ) +} + # String functions nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) { if (allowNA) { diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 859dc14b99b00..31184477b379d 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -625,3 +625,128 @@ test_that("bad explicit type conversions with as.*()", { ) ) }) + +test_that("structs/nested data frames/tibbles can be created", { + df <- tibble(regular_col1 = 1L, regular_col2 = "a") + + compare_dplyr_binding( + .input %>% + transmute( + df_col = tibble( + regular_col1 = regular_col1, + regular_col2 = regular_col2 + ) + ) %>% + collect(), + df + ) + + # check auto column naming + compare_dplyr_binding( + .input %>% + transmute( + df_col = tibble(regular_col1, regular_col2) + ) %>% + collect(), + df + ) + + # ...and that other arguments are not supported + expect_warning( + record_batch(char_col = "a") %>% + mutate(df_col = tibble(char_col, .rows = 1L)), + ".rows not supported in Arrow" + ) + + expect_warning( + record_batch(char_col = "a") %>% + mutate(df_col = tibble(char_col, .name_repair = "universal")), + ".name_repair not supported in Arrow" + ) + + # check that data.frame is mapped too + # stringsAsFactors default is TRUE in R 3.6, which is still tested on CI + compare_dplyr_binding( + .input %>% + transmute( + df_col = data.frame(regular_col1, regular_col2, stringsAsFactors = FALSE) + ) %>% + collect() %>% + mutate(df_col = as.data.frame(df_col)), + df + ) + + # check with fix.empty.names = FALSE + compare_dplyr_binding( + .input %>% + transmute( + df_col = data.frame(regular_col1, fix.empty.names = FALSE) + ) %>% + collect() %>% + mutate(df_col = as.data.frame(df_col)), + df + ) + + # check with check.names = TRUE and FALSE + compare_dplyr_binding( + .input %>% + transmute( + df_col = data.frame(regular_col1, regular_col1, check.names = TRUE) + ) %>% + collect() %>% + mutate(df_col = as.data.frame(df_col)), + df + ) + + compare_dplyr_binding( + .input %>% + transmute( + df_col = data.frame(regular_col1, regular_col1, check.names = FALSE) + ) %>% + collect() %>% + mutate(df_col = as.data.frame(df_col)), + df + ) + + # ...and that other arguments are not supported + expect_warning( + record_batch(char_col = "a") %>% + mutate(df_col = data.frame(char_col, stringsAsFactors = TRUE)), + "stringsAsFactors = TRUE not supported in Arrow" + ) + + expect_warning( + record_batch(char_col = "a") %>% + mutate(df_col = data.frame(char_col, row.names = 1L)), + "row.names not supported in Arrow" + ) + + expect_warning( + record_batch(char_col = "a") %>% + mutate(df_col = data.frame(char_col, check.rows = TRUE)), + "check.rows not supported in Arrow" + ) +}) + +test_that("nested structs can be created from scalars and existing data frames", { + compare_dplyr_binding( + .input %>% + transmute( + df_col = tibble(b = 3) + ) %>% + collect(), + tibble(a = 1:2) + ) + + # technically this is handled by Scalar$create() since there is no + # call to data.frame or tibble() within a dplyr verb + existing_data_frame <- tibble(b = 3) + compare_dplyr_binding( + .input %>% + transmute( + df_col = existing_data_frame + ) %>% + collect(), + tibble(a = 1:2) + ) +})