Skip to content

Commit

Permalink
Update .proto files to the same version that Arrow is using (#181)
Browse files Browse the repository at this point in the history
* update to v0.6

* .proto file modifications needed to compile with nanopb + generate R wrapper

* with compiling nanopb-generated stuff!

* update the generated types file

* fix accidental renames from regex

* update the ReadRel to use updated syntax

* maybe try actions update with new duckdb release

* typo

* with working duckdb + substrait

* slightly simpler

* substrait_project -> substrait_select

* first stab at "project" the right way

* with everything except the Emit

* with at least the project tests passing

* don't include emit clause if it isn't needed

* fix output type

* simplify duckdb interfaces

* fix output type on functions, use of arguments in functions, skip arrow for now because of the Emit clause thing

* fix more failing tests

* clean CMD check issues

* disable arrow on CI until we actually use it to test things

* maybe fix CMD check action
  • Loading branch information
paleolimbot authored Sep 12, 2022
1 parent 4931e54 commit ed6f405
Show file tree
Hide file tree
Showing 88 changed files with 6,039 additions and 4,136 deletions.
91 changes: 36 additions & 55 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,10 @@ jobs:

steps:
- uses: actions/checkout@v3
with:
path: substrait

- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
Expand All @@ -57,57 +55,40 @@ jobs:
- name: Setup substrait dependencies
uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: rcmdcheck, local::., arrow=?ignore
working-directory: /home/runner/work/substrait-r/substrait-r/substrait

- name: Cache DuckDB with Substrait
uses: actions/cache@v3
with:
path: "~/.local/share/R-substrait/duckdb_lib"
key: ${{ runner.os }}-1

- name: Setup custom duckdb
run: |
if (!substrait::has_duckdb_with_substrait()) {
substrait::install_duckdb_with_substrait()
}
shell: Rscript {0}

- name: Checkout Arrow repo
uses: actions/checkout@v3
with:
repository: apache/arrow
path: arrow
# Last commit before Arrow updated to Substrait v 0.6.0
ref: '3d6240c1ee7802829d2ed209f4135906e9413915'

- name: Install Arrow with ARROW_SUBSTRAIT turned on
run: |
mkdir install_dir
cd /home/runner/work/substrait-r/substrait-r/arrow/cpp
mkdir build_dir
cd build_dir
cmake -DCMAKE_INSTALL_PREFIX=${ARROW_HOME} \
-DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_BUILD_TYPE=Debug -DARROW_COMPUTE=ON -DARROW_CSV=ON -DARROW_DATASET=OFF \
-DARROW_FILESYSTEM=ON -DARROW_JEMALLOC=OFF -DARROW_JSON=OFF -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=OFF \
-DARROW_WITH_ZLIB=OFF -DARROW_INSTALL_NAME_RPATH=OFF -DARROW_EXTRA_ERROR_CONTEXT=ON \
-DARROW_INSTALL_NAME_RPATH=OFF -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_SUBSTRAIT=ON ..
sudo make -j2 install
- name: Setup arrow dependencies
uses: r-lib/actions/setup-r-dependencies@v2
with:
working-directory: /home/runner/work/substrait-r/substrait-r/arrow/r/

- name: Install Arrow R package
run: |
cd /home/runner/work/substrait-r/substrait-r/arrow/r/
make clean
R CMD INSTALL .
extra-packages: rcmdcheck

# Not using arrow package until https://github.com/apache/arrow/pull/13914 merges
# - name: Checkout Arrow repo
# uses: actions/checkout@v3
# with:
# repository: apache/arrow
# path: arrow
#
# - name: Install Arrow with ARROW_SUBSTRAIT turned on
# run: |
# mkdir install_dir
# cd /home/runner/work/substrait-r/substrait-r/arrow/cpp
# mkdir build_dir
# cd build_dir
#
# cmake -DCMAKE_INSTALL_PREFIX=${ARROW_HOME} \
# -DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_BUILD_TYPE=Debug -DARROW_COMPUTE=ON -DARROW_CSV=ON -DARROW_DATASET=OFF \
# -DARROW_FILESYSTEM=ON -DARROW_JEMALLOC=OFF -DARROW_JSON=OFF -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=OFF \
# -DARROW_WITH_ZLIB=OFF -DARROW_INSTALL_NAME_RPATH=OFF -DARROW_EXTRA_ERROR_CONTEXT=ON \
# -DARROW_INSTALL_NAME_RPATH=OFF -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_SUBSTRAIT=ON ..
#
# sudo make -j2 install
#
# - name: Setup arrow dependencies
# uses: r-lib/actions/setup-r-dependencies@v2
# with:
# working-directory: /home/runner/work/substrait-r/substrait-r/arrow/r/
#
# - name: Install Arrow R package
# run: |
# cd /home/runner/work/substrait-r/substrait-r/arrow/r/
# make clean
# R CMD INSTALL .

- name: Run R CMD check
uses: r-lib/actions/check-r-package@v2
with:
working-directory: /home/runner/work/substrait-r/substrait-r/substrait
17 changes: 2 additions & 15 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,17 @@ jobs:
steps:
- uses: actions/checkout@v2

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true

- if: runner.os == 'Linux'
run: sudo apt-get install -y protobuf-compiler libprotobuf-dev libprotoc-dev

- uses: r-lib/actions/setup-r-dependencies@v1
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: covr

- name: Cache DuckDB with Substrait
uses: actions/cache@v3
with:
path: "~/.local/share/R-substrait/duckdb_lib"
key: ${{ runner.os }}-1

- name: Setup custom duckdb
run: |
if (!substrait::has_duckdb_with_substrait()) {
substrait::install_duckdb_with_substrait()
}
shell: Rscript {0}

- name: Test coverage
run: covr::codecov()
shell: Rscript {0}
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ Description: Provides an R interface to the 'Substrait' cross-language
License: Apache License (>= 2)
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.0
RoxygenNote: 7.2.1
Suggests:
arrow,
covr,
duckdb,
duckdb (>= 0.5.0),
DBI,
callr,
rappdirs,
Expand Down
3 changes: 1 addition & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,9 @@ export(as_substrait)
export(duckdb_from_substrait)
export(duckdb_get_substrait)
export(duckdb_substrait_compiler)
export(duckdb_with_substrait_lib_dir)
export(filter)
export(from_substrait)
export(has_duckdb_with_substrait)
export(install_duckdb_with_substrait)
export(select)
export(substrait)
export(substrait_aggregate)
Expand All @@ -84,6 +82,7 @@ export(substrait_i8)
export(substrait_interval_day)
export(substrait_interval_year)
export(substrait_project)
export(substrait_select)
export(substrait_sort)
export(substrait_sort_field)
export(substrait_string)
Expand Down
4 changes: 2 additions & 2 deletions R/aggregate-rel.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

#' Aggregate
#'
#' @inheritParams substrait_project
#' @inheritParams substrait_select
#' @param ...
#' - `substrait_aggregate()`: A named list of expressions to be evaluated in the context
#' of the aggregation.
Expand Down Expand Up @@ -38,7 +38,7 @@ substrait_aggregate <- function(.compiler, ...) {
)
)

# reset mask and schema here (probably should do this in substrait_project
# reset mask and schema here (probably should do this in substrait_select
# too)
types <- c(
lapply(
Expand Down
18 changes: 12 additions & 6 deletions R/compiler.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' will need to subclass the [SubstraitCompiler] and implement the `$evaluate()`
#' and/or `$resolve_function()` methods. Typically users will not interact
#' with R6 methods but will use the pipeable interface
#' (e.g. [substrait_project()]). The pipeable interface clones the compiler
#' (e.g. [substrait_select()]). The pipeable interface clones the compiler
#' before it is modified to minimize the user's interaction to R6 reference
#' semantics.
#'
Expand All @@ -24,6 +24,8 @@
#' @param template A `substrait.Expression.ScalarFunction`, a
#' `substrait.Expression.WindowFunction`, or a
#' `substrait.AggregateFunction`.
#' @param output_type An explicit output type to use or a function accepting
#' one type per `args`.
#'
#' @export
SubstraitCompiler <- R6::R6Class(
Expand Down Expand Up @@ -195,7 +197,7 @@ SubstraitCompiler <- R6::R6Class(
#'
#' @return A modified `template` with `function_reference`,
#' `args`, and `output_type` set.
resolve_function = function(name, args, template) {
resolve_function = function(name, args, template, output_type = NULL) {
# resolve arguments as Expressions if they haven't been already
# (generally they should be already but this will assert that)
args <- lapply(
Expand All @@ -211,12 +213,16 @@ SubstraitCompiler <- R6::R6Class(
# resolve the function identifier
id <- self$function_id(name, arg_types)

# maybe there's a way to know this later on but for now,
# leave an unspecified type
output_type <- substrait$Type$create()
if (is.null(output_type)) {
output_type <- substrait$Type$create()
} else if (is.function(output_type)) {
output_type <- do.call(output_type, arg_types)
}

template$function_reference <- id
template$args <- args
template$arguments <- lapply(args, function(arg) {
substrait$FunctionArgument$create(value = arg)
})
template$output_type <- output_type

template
Expand Down
43 changes: 1 addition & 42 deletions R/example_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,6 @@
#'
#' Data for use in examples and tests with multiple different data types
#'
#' Currently generated via:
#' example_data <- tibble::tibble(
#' int = c(-3212L, 2L, 3L, NA_integer_, 5L, 6L, 7L, 8L, 9L,
#' 3212L),
#' dbl = c(-999, -99, -9, 0,
#' 9, pi, 99, 10000, 10000, NA_real_),
#' dbl2 = c(-Inf, 5, 5, 5, 5, 5, 5, 5, 5, 5),
#' lgl = c(NA, TRUE, NA, TRUE, FALSE, FALSE, NA, TRUE, FALSE, TRUE),
#' false = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE),
#' chr = c("a", "b", "c", "d", "e", NA, "g", "h", "i", "j"),
#' verses = c("Por cada muro, un lamento", "En Jerusalén la dorada",
#' "Y mil vidas malgastadas", "Por cada mandamiento", "Yo soy polvo de tu viento",
#' "Y aunque sangro de tu herida", "Y cada piedra querida",
#' "Guarda mi amor más profundo", "No hay una piedra en el mundo",
#' "Que valga lo que una vida"),
#' padded_strings = c(" a ", " b ",
#' " c ", " d ", " e ", " f ", " g ",
#' " h ", " i ", " j "),
#' some_negative = c(-1, 2, -3, NA, -5, 6, -7, 8, -9, 10)#,
#' # https://github.com/voltrondata/substrait-r/issues/80
#' # dttm = lubridate::ymd_hms(c(
#' # "0000-01-01 00:00:00",
#' # "1919-05-29 13:08:55",
#' # "1955-06-20 04:10:42",
#' # "1973-06-30 11:38:41",
#' # "1987-03-29 12:49:47",
#' # "1991-06-11 19:07:01",
#' # NA_character_,
#' # "2017-08-21 18:26:40",
#' # "2017-08-21 18:26:40",
#' # "9999-12-31 23:59:59"
#' # ))
#' )
#'
#' @docType data
#'
#' @usage data(example_data)
#'
#' @keywords datasets
#'
#'
#' @examples
#' data(example_data)
#' example_data
"example_data"
2 changes: 1 addition & 1 deletion R/filter-rel.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#' Append a Substrait Project Relation
#'
#' @param ... Filter expressions
#' @inheritParams substrait_project
#' @inheritParams substrait_select
#'
#' @return A modified `.compiler`
#' @export
Expand Down
3 changes: 1 addition & 2 deletions R/pkg-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,7 @@ substrait_eval_arrow <- function(plan, tables, col_names) {
items = list(
substrait$ReadRel$LocalFiles$FileOrFiles$create(
uri_file = sprintf("file://%s", temp_parquet[i]),
format = substrait$ReadRel$LocalFiles$FileOrFiles$FileFormat$
FILE_FORMAT_PARQUET
parquet = substrait$ReadRel$LocalFiles$FileOrFiles$ParquetReadOptions$create()
)
)
)
Expand Down
20 changes: 10 additions & 10 deletions R/pkg-dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ select.SubstraitCompiler <- function(.data, ...) {
names(column_indices)
)

substrait_project(.data, !!!new_mask)
substrait_select(.data, !!!new_mask)
}

#' @rdname select.SubstraitCompiler
Expand All @@ -91,13 +91,13 @@ rename.SubstraitCompiler <- function(.data, ...) {
new_column_names
)

substrait_project(.data, !!!new_mask)
substrait_select(.data, !!!new_mask)
}

#' @rdname select.SubstraitCompiler
#' @importFrom dplyr rename_with
#' @export
rename_with.SubstraitCompiler <- function(.data, .fn, .cols = everything(), ...) {
rename_with.SubstraitCompiler <- function(.data, .fn, .cols = dplyr::everything(), ...) {
.fn <- rlang::as_function(.fn)
old_names <- dplyr::select(.data, {{ .cols }})$schema$names
dplyr::rename(.data, !!rlang::set_names(old_names, .fn(old_names)))
Expand All @@ -118,22 +118,22 @@ mutate.SubstraitCompiler <- function(.data, ...,
.keep <- match.arg(.keep)
mask <- .data$mask

out <- substrait_project(.data, !!!mask, ...)
out <- substrait_select(.data, !!!mask, ...)
if (.keep == "all") {
return(out)
}

# if only keeping a subset of columns, work out which and project again
cols <- names(mutate(simulate_data_frame(.data), ..., .keep = .keep))
substrait_project(out, !!!out$mask[cols], !!!rlang::syms(cols))
substrait_select(out, !!!out$mask[cols], !!!rlang::syms(cols))
}

#' @rdname select.SubstraitCompiler
#' @importFrom dplyr transmute
#' @export
transmute.SubstraitCompiler <- function(.data, ...) {
check_transmute_args(...)
substrait_project(.data, ...)
substrait_select(.data, ...)
}

#' @rdname select.SubstraitCompiler
Expand Down Expand Up @@ -284,7 +284,7 @@ relocate.SubstraitCompiler <- function(.data, ..., .before = NULL, .after = NULL
new_column_names[pos]
)

substrait_project(.data, !!!new_mask)
substrait_select(.data, !!!new_mask)
}

# translate desc() call to the equivalent
Expand Down Expand Up @@ -327,12 +327,12 @@ simulate_data_frame <- function(compiler) {

check_transmute_args <- function(..., .keep, .before, .after, error_call = rlang::caller_env()) {
if (!missing(.keep)) {
abort("The `.keep` argument is not supported.", call = error_call)
rlang::abort("The `.keep` argument is not supported.", call = error_call)
}
if (!missing(.before)) {
abort("The `.before` argument is not supported.", call = error_call)
rlang::abort("The `.before` argument is not supported.", call = error_call)
}
if (!missing(.after)) {
abort("The `.after` argument is not supported.", call = error_call)
rlang::abort("The `.after` argument is not supported.", call = error_call)
}
}
Loading

0 comments on commit ed6f405

Please sign in to comment.