Skip to content

Commit

Permalink
ARROW-13371 [R] binding for make_struct -> StructArray$create()
Browse files Browse the repository at this point in the history
This PR implements a binding for the `make_struct` compute function. This function was already being called in the translation for `case_when()` but didn't have a binding of its own. Because you can do this in dplyr too (create a nested data frame), I mapped `tibble()` and `data.frame()` since it can be tested using `compare_dplyr_binding()`. `StructArray$create()` is a nested call and is probably not that useful of a translation for dplyr users (I'd never think to use it).

Basically, you can now do this:

``` r
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

df <- RecordBatch$create(a = 1, b = "two")
df %>%
  mutate(df_col = tibble(a, b)) %>%
  collect()
#> # A tibble: 1 × 3
#>       a b     df_col$a $b
#>   <dbl> <chr>    <dbl> <chr>
#> 1     1 two          1 two
```

Something I didn't do but could is add all the arguments of `tibble()` and `data.frame()` that aren't supported (e.g., `stringsAsFactors`). It seems like this is done for other translations but I just wanted to check!

Closes apache#11690 from paleolimbot/r-make-struct

Authored-by: Dewey Dunnington <dewey@fishandwhistle.net>
Signed-off-by: Jonathan Keane <jkeane@gmail.com>
  • Loading branch information
paleolimbot authored and kou committed Dec 1, 2021
1 parent e0f50e7 commit 73d6c4a
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 0 deletions.
54 changes: 54 additions & 0 deletions r/R/dplyr-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,60 @@ nse_funcs$is_logical <- function(x, n = NULL) {
nse_funcs$is.logical(x)
}

# Create a data frame/tibble/struct column
nse_funcs$tibble <- function(..., .rows = NULL, .name_repair = NULL) {
if (!is.null(.rows)) arrow_not_supported(".rows")
if (!is.null(.name_repair)) arrow_not_supported(".name_repair")

# use dots_list() because this is what tibble() uses to allow the
# useful shorthand of tibble(col1, col2) -> tibble(col1 = col1, col2 = col2)
# we have a stronger enforcement of unique names for arguments because
# it is difficult to replicate the .name_repair semantics and expanding of
# unnamed data frame arguments in the same way that the tibble() constructor
# does.
args <- rlang::dots_list(..., .named = TRUE, .homonyms = "error")

build_expr(
"make_struct",
args = unname(args),
options = list(field_names = names(args))
)
}

nse_funcs$data.frame <- function(..., row.names = NULL,
check.rows = NULL, check.names = TRUE, fix.empty.names = TRUE,
stringsAsFactors = FALSE) {
# we need a specific value of stringsAsFactors because the default was
# TRUE in R <= 3.6
if (!identical(stringsAsFactors, FALSE)) {
arrow_not_supported("stringsAsFactors = TRUE")
}

# ignore row.names and check.rows with a warning
if (!is.null(row.names)) arrow_not_supported("row.names")
if (!is.null(check.rows)) arrow_not_supported("check.rows")

args <- rlang::dots_list(..., .named = fix.empty.names)
if (is.null(names(args))) {
names(args) <- rep("", length(args))
}

if (identical(check.names, TRUE)) {
if (identical(fix.empty.names, TRUE)) {
names(args) <- make.names(names(args), unique = TRUE)
} else {
name_emtpy <- names(args) == ""
names(args)[!name_emtpy] <- make.names(names(args)[!name_emtpy], unique = TRUE)
}
}

build_expr(
"make_struct",
args = unname(args),
options = list(field_names = names(args))
)
}

# String functions
nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) {
if (allowNA) {
Expand Down
125 changes: 125 additions & 0 deletions r/tests/testthat/test-dplyr-funcs-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -625,3 +625,128 @@ test_that("bad explicit type conversions with as.*()", {
)
)
})

test_that("structs/nested data frames/tibbles can be created", {
df <- tibble(regular_col1 = 1L, regular_col2 = "a")

compare_dplyr_binding(
.input %>%
transmute(
df_col = tibble(
regular_col1 = regular_col1,
regular_col2 = regular_col2
)
) %>%
collect(),
df
)

# check auto column naming
compare_dplyr_binding(
.input %>%
transmute(
df_col = tibble(regular_col1, regular_col2)
) %>%
collect(),
df
)

# ...and that other arguments are not supported
expect_warning(
record_batch(char_col = "a") %>%
mutate(df_col = tibble(char_col, .rows = 1L)),
".rows not supported in Arrow"
)

expect_warning(
record_batch(char_col = "a") %>%
mutate(df_col = tibble(char_col, .name_repair = "universal")),
".name_repair not supported in Arrow"
)

# check that data.frame is mapped too
# stringsAsFactors default is TRUE in R 3.6, which is still tested on CI
compare_dplyr_binding(
.input %>%
transmute(
df_col = data.frame(regular_col1, regular_col2, stringsAsFactors = FALSE)
) %>%
collect() %>%
mutate(df_col = as.data.frame(df_col)),
df
)

# check with fix.empty.names = FALSE
compare_dplyr_binding(
.input %>%
transmute(
df_col = data.frame(regular_col1, fix.empty.names = FALSE)
) %>%
collect() %>%
mutate(df_col = as.data.frame(df_col)),
df
)

# check with check.names = TRUE and FALSE
compare_dplyr_binding(
.input %>%
transmute(
df_col = data.frame(regular_col1, regular_col1, check.names = TRUE)
) %>%
collect() %>%
mutate(df_col = as.data.frame(df_col)),
df
)

compare_dplyr_binding(
.input %>%
transmute(
df_col = data.frame(regular_col1, regular_col1, check.names = FALSE)
) %>%
collect() %>%
mutate(df_col = as.data.frame(df_col)),
df
)

# ...and that other arguments are not supported
expect_warning(
record_batch(char_col = "a") %>%
mutate(df_col = data.frame(char_col, stringsAsFactors = TRUE)),
"stringsAsFactors = TRUE not supported in Arrow"
)

expect_warning(
record_batch(char_col = "a") %>%
mutate(df_col = data.frame(char_col, row.names = 1L)),
"row.names not supported in Arrow"
)

expect_warning(
record_batch(char_col = "a") %>%
mutate(df_col = data.frame(char_col, check.rows = TRUE)),
"check.rows not supported in Arrow"
)
})

test_that("nested structs can be created from scalars and existing data frames", {
compare_dplyr_binding(
.input %>%
transmute(
df_col = tibble(b = 3)
) %>%
collect(),
tibble(a = 1:2)
)

# technically this is handled by Scalar$create() since there is no
# call to data.frame or tibble() within a dplyr verb
existing_data_frame <- tibble(b = 3)
compare_dplyr_binding(
.input %>%
transmute(
df_col = existing_data_frame
) %>%
collect(),
tibble(a = 1:2)
)
})

0 comments on commit 73d6c4a

Please sign in to comment.