diff --git a/r/NAMESPACE b/r/NAMESPACE index ab45aa9985e..814868d8ade 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -327,6 +327,7 @@ importFrom(rlang,quos) importFrom(rlang,seq2) importFrom(rlang,set_names) importFrom(rlang,syms) +importFrom(rlang,trace_back) importFrom(rlang,warn) importFrom(stats,median) importFrom(stats,na.exclude) diff --git a/r/R/metadata.R b/r/R/metadata.R index 408c2214a31..505d0653b4a 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -50,6 +50,7 @@ }) } +#' @importFrom rlang trace_back apply_arrow_r_metadata <- function(x, r_metadata) { tryCatch({ columns_metadata <- r_metadata$columns @@ -60,9 +61,27 @@ apply_arrow_r_metadata <- function(x, r_metadata) { } } } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { - x <- map2(x, columns_metadata, function(.x, .y) { - apply_arrow_r_metadata(.x, .y) - }) + # If we have a list and "columns_metadata" this applies row-level metadata + # inside of a column in a dataframe. + + # However, if we are inside of a dplyr collection (including all datasets), + # we cannot apply this row-level metadata, since the order of the rows is + # not guaranteed to be the same, so don't even try, but warn what's going on + trace <- trace_back() + in_dplyr_collect <- any(map_lgl(trace$calls, function(x) { + grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]] + })) + if (in_dplyr_collect) { + warning( + "Row-level metadata is not compatible with this operation and has ", + "been ignored", + call. = FALSE + ) + } else { + x <- map2(x, columns_metadata, function(.x, .y) { + apply_arrow_r_metadata(.x, .y) + }) + } x } @@ -116,9 +135,23 @@ arrow_attributes <- function(x, only_top_level = FALSE) { columns <- NULL if (is.list(x) && !inherits(x, "POSIXlt")) { - # for list columns, we also keep attributes of each - # element in columns - columns <- map(x, arrow_attributes) + # However, if we are inside of a dplyr collection (including all datasets), + # we cannot apply this row-level metadata, since the order of the rows is + # not guaranteed to be the same, so don't even try, but warn what's going on + trace <- trace_back() + in_dataset_write <- any(map_lgl(trace$calls, function(x) { + grepl("write_dataset", x, fixed = TRUE)[[1]] + })) + if (in_dataset_write) { + warning( + "Row-level metadata is not compatible with datasets and will be discarded", + call. = FALSE + ) + } else { + # for list columns, we also keep attributes of each + # element in columns + columns <- map(x, arrow_attributes) + } if (all(map_lgl(columns, is.null))) { columns <- NULL } diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index 0abbfb6a13a..5f2dad841a1 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -67,3 +67,9 @@ test_that <- function(what, code) { r_only <- function(code) { withr::with_options(list(..skip.tests = FALSE), code) } + +make_temp_dir <- function() { + path <- tempfile() + dir.create(path) + normalizePath(path, winslash = "/") +} diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index a0b1bdae022..66493376e74 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -21,12 +21,6 @@ context("Dataset") library(dplyr) -make_temp_dir <- function() { - path <- tempfile() - dir.create(path) - normalizePath(path, winslash = "/") -} - dataset_dir <- make_temp_dir() hive_dir <- make_temp_dir() ipc_dir <- make_temp_dir() diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index afce1c2244c..de3542b1c60 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -205,3 +205,45 @@ test_that("metadata of list elements (ARROW-10386)", { expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") }) + + +test_that("metadata of list elements (ARROW-10386)", { + skip_if_not_available("dataset") + skip_if_not_available("parquet") + + library(dplyr) + + df <- tibble::tibble( + metadata = list( + structure(1, my_value_as_attr = 1), + structure(2, my_value_as_attr = 2), + structure(3, my_value_as_attr = 3), + structure(4, my_value_as_attr = 3)), + int = 1L:4L, + part = c(1, 3, 2, 1) + ) + + dst_dir <- make_temp_dir() + expect_warning( + write_dataset(df, dst_dir, partitioning = "part"), + "Row-level metadata is not compatible with datasets and will be discarded" + ) + + # but we need to write a dataset with row-level metadata to make sure when + # reading ones that have been written with them we warn appropriately + fake_func_name <- write_dataset + fake_func_name(df, dst_dir, partitioning = "part") + + ds <- open_dataset(dst_dir) + expect_warning( + df_from_ds <- collect(ds), + "Row-level metadata is not compatible with this operation and has been ignored" + ) + expect_equal(df_from_ds[c(1, 4, 3, 2), ], df, check.attributes = FALSE) + + # however there is *no* warning if we don't select the metadata column + expect_warning( + df_from_ds <- ds %>% select(int) %>% collect(), + NA + ) +})