From 9e1897f80bece946f31180f25a5e47c6622004f8 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 2 Jan 2019 15:27:17 +0100 Subject: [PATCH 1/4] styling etc ... --- r/DESCRIPTION | 1 + r/NAMESPACE | 1 + r/R/RcppExports.R | 4 +++ r/R/parquet.R | 27 ++++++++++++++++ r/README.Rmd | 2 +- r/README.md | 61 ++++++++++--------------------------- r/configure | 2 +- r/man/read_parquet.Rd | 14 +++++++++ r/src/RcppExports.cpp | 12 ++++++++ r/src/parquetfilereader.cpp | 38 +++++++++++++++++++++++ 10 files changed, 115 insertions(+), 47 deletions(-) create mode 100644 r/R/parquet.R create mode 100644 r/man/read_parquet.Rd create mode 100644 r/src/parquetfilereader.cpp diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 5f93c83f236eb..bfd4d1e892eb2 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -61,6 +61,7 @@ Collate: 'memory_pool.R' 'message.R' 'on_exit.R' + 'parquet.R' 'read_record_batch.R' 'read_table.R' 'reexports-bit64.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 65d60d846f4cb..d039e573f4f32 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -113,6 +113,7 @@ export(print.integer64) export(read_arrow) export(read_feather) export(read_message) +export(read_parquet) export(read_record_batch) export(read_schema) export(read_table) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 0310eab2027b9..db2ddae295dad 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -617,6 +617,10 @@ ipc___ReadMessage <- function(stream) { .Call(`_arrow_ipc___ReadMessage`, stream) } +read_parquet_file <- function(filename) { + .Call(`_arrow_read_parquet_file`, filename) +} + RecordBatch__num_columns <- function(x) { .Call(`_arrow_RecordBatch__num_columns`, x) } diff --git a/r/R/parquet.R b/r/R/parquet.R new file mode 100644 index 0000000000000..e15f59cf284e8 --- /dev/null +++ b/r/R/parquet.R @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read parquet file from disk +#' +#' @param files a vector of filenames +#' +#' @importFrom purrr map_dfr +#' +#' @export +read_parquet <- function(files) { + map_dfr(files, ~as_tibble(shared_ptr(`arrow::Table`, read_parquet_file(f)))) +} diff --git a/r/README.Rmd b/r/README.Rmd index 2c51d01c0f00f..9f0f39fef5352 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -25,7 +25,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake .. -DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` diff --git a/r/README.md b/r/README.md index 868fdff0a06e0..987d0c24a185b 100644 --- a/r/README.md +++ b/r/README.md @@ -14,7 +14,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake .. 
-DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` @@ -38,48 +38,19 @@ tf <- tempfile() #> # A tibble: 10 x 2 #> x y #> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -arrow::write_arrow(tib, tf) - -# read it back with pyarrow -pa <- import("pyarrow") -as_tibble(pa$open_file(tf)$read_pandas()) -#> # A tibble: 10 x 2 -#> x y -#> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -``` - -## Development - -### Code style - -We use Google C++ style in our C++ code. Check for style errors with - -``` -./lint.sh -``` - -You can fix the style issues with - +#> 1 1 0.0855 +#> 2 2 -1.68 +#> 3 3 -0.0294 +#> 4 4 -0.124 +#> 5 5 0.0675 +#> 6 6 1.64 +#> 7 7 1.54 +#> 8 8 -0.0209 +#> 9 9 -0.982 +#> 10 10 0.349 +# arrow::write_arrow(tib, tf) + +# # read it back with pyarrow +# pa <- import("pyarrow") +# as_tibble(pa$open_file(tf)$read_pandas()) ``` -./lint.sh --fix -``` \ No newline at end of file diff --git a/r/configure b/r/configure index 28f6a73ac7ef5..093f7cdcf6e2c 100755 --- a/r/configure +++ b/r/configure @@ -26,7 +26,7 @@ # R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib' # Library settings -PKG_CONFIG_NAME="arrow" +PKG_CONFIG_NAME="arrow parquet" PKG_DEB_NAME="arrow" PKG_RPM_NAME="arrow" PKG_CSW_NAME="arrow" diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd new file mode 100644 index 0000000000000..08e0862445d11 --- /dev/null +++ b/r/man/read_parquet.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\name{read_parquet} +\alias{read_parquet} +\title{Read parquet file from disk} +\usage{ +read_parquet(files) +} +\arguments{ +\item{files}{a vector of filenames} +} +\description{ +Read parquet file from disk +} diff --git 
a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index e5a784eb70c23..e8ad609cadcfc 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1721,6 +1721,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// read_parquet_file +std::shared_ptr read_parquet_file(std::string filename); +RcppExport SEXP _arrow_read_parquet_file(SEXP filenameSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type filename(filenameSEXP); + rcpp_result_gen = Rcpp::wrap(read_parquet_file(filename)); + return rcpp_result_gen; +END_RCPP +} // RecordBatch__num_columns int RecordBatch__num_columns(const std::shared_ptr& x); RcppExport SEXP _arrow_RecordBatch__num_columns(SEXP xSEXP) { @@ -2306,6 +2317,7 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, {"_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, {"_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, + {"_arrow_read_parquet_file", (DL_FUNC) &_arrow_read_parquet_file, 1}, {"_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, {"_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, {"_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, diff --git a/r/src/parquetfilereader.cpp b/r/src/parquetfilereader.cpp new file mode 100644 index 0000000000000..3924953e93da5 --- /dev/null +++ b/r/src/parquetfilereader.cpp @@ -0,0 +1,38 @@ +// // Licensed to the Apache Software Foundation (ASF) under one +// // or more contributor license agreements. See the NOTICE file +// // distributed with this work for additional information +// // regarding copyright ownership. 
The ASF licenses this file +// // to you under the Apache License, Version 2.0 (the +// // "License"); you may not use this file except in compliance +// // with the License. You may obtain a copy of the License at +// // +// // http://www.apache.org/licenses/LICENSE-2.0 +// // +// // Unless required by applicable law or agreed to in writing, +// // software distributed under the License is distributed on an +// // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// // KIND, either express or implied. See the License for the +// // specific language governing permissions and limitations +// // under the License. +// +// +#include +#include +#include +#include +#include + +// [[Rcpp::export]] +std::shared_ptr read_parquet_file(std::string filename) { + std::shared_ptr infile; + PARQUET_THROW_NOT_OK( + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool(), &infile)); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + + return table; +} From ff260c587e9870c8b1e78cc28a9dbb22f806a8f1 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 2 Jan 2019 15:29:32 +0100 Subject: [PATCH 2/4] header was too commented, renamed to parquet.cpp --- r/src/parquet.cpp | 37 ++++++++++++++++++++++++++++++++++++ r/src/parquetfilereader.cpp | 38 ------------------------------------- 2 files changed, 37 insertions(+), 38 deletions(-) create mode 100644 r/src/parquet.cpp delete mode 100644 r/src/parquetfilereader.cpp diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp new file mode 100644 index 0000000000000..859bd4826e7c2 --- /dev/null +++ b/r/src/parquet.cpp @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +// [[Rcpp::export]] +std::shared_ptr read_parquet_file(std::string filename) { + std::shared_ptr infile; + PARQUET_THROW_NOT_OK( + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool(), &infile)); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + + return table; +} diff --git a/r/src/parquetfilereader.cpp b/r/src/parquetfilereader.cpp deleted file mode 100644 index 3924953e93da5..0000000000000 --- a/r/src/parquetfilereader.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// // Licensed to the Apache Software Foundation (ASF) under one -// // or more contributor license agreements. See the NOTICE file -// // distributed with this work for additional information -// // regarding copyright ownership. The ASF licenses this file -// // to you under the Apache License, Version 2.0 (the -// // "License"); you may not use this file except in compliance -// // with the License. 
You may obtain a copy of the License at -// // -// // http://www.apache.org/licenses/LICENSE-2.0 -// // -// // Unless required by applicable law or agreed to in writing, -// // software distributed under the License is distributed on an -// // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// // KIND, either express or implied. See the License for the -// // specific language governing permissions and limitations -// // under the License. -// -// -#include -#include -#include -#include -#include - -// [[Rcpp::export]] -std::shared_ptr read_parquet_file(std::string filename) { - std::shared_ptr infile; - PARQUET_THROW_NOT_OK( - arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool(), &infile)); - - std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr table; - PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); - - return table; -} From e936b44009b812c8e1ebef19c352b2d4c6e40074 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 2 Jan 2019 15:45:23 +0100 Subject: [PATCH 3/4] need parquet on travis too --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 10300c9b6e287..57fb10ea388b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -307,6 +307,8 @@ matrix: language: r cache: packages latex: false + env: + - ARROW_TRAVIS_PARQUET=1 before_install: # Have to copy-paste this here because of how R's build steps work - eval `python $TRAVIS_BUILD_DIR/ci/detect-changes.py` From 7d6e64df29ea8a6ca73670eb119123ae04b168ec Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Thu, 3 Jan 2019 10:14:23 +0100 Subject: [PATCH 4/4] read_parquet() only reading one parquet file, and gains a `as_tibble` argument --- r/R/parquet.R | 14 ++++++++++---- r/man/read_parquet.Rd | 11 +++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index e15f59cf284e8..141da7bd04b2c 100644 --- 
a/r/R/parquet.R +++ b/r/R/parquet.R @@ -17,11 +17,17 @@ #' Read parquet file from disk #' -#' @param files a vector of filenames +#' @param file a file path +#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble. +#' @param ... currently ignored #' -#' @importFrom purrr map_dfr +#' @return a [arrow::Table][arrow__Table], or a data frame if `as_tibble` is `TRUE`. #' #' @export -read_parquet <- function(files) { - map_dfr(files, ~as_tibble(shared_ptr(`arrow::Table`, read_parquet_file(f)))) +read_parquet <- function(file, as_tibble = TRUE, ...) { + tab <- shared_ptr(`arrow::Table`, read_parquet_file(file)) + if (isTRUE(as_tibble)) { + tab <- as_tibble(tab) + } + tab } diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index 08e0862445d11..c29e18bca5baf 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -4,10 +4,17 @@ \alias{read_parquet} \title{Read parquet file from disk} \usage{ -read_parquet(files) +read_parquet(file, as_tibble = TRUE, ...) } \arguments{ -\item{files}{a vector of filenames} +\item{file}{a file path} + +\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.} + +\item{...}{currently ignored} +} +\value{ +a \link[=arrow__Table]{arrow::Table}, or a data frame if \code{as_tibble} is \code{TRUE}. } \description{ Read parquet file from disk