From 9e1897f80bece946f31180f25a5e47c6622004f8 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 2 Jan 2019 15:27:17 +0100 Subject: [PATCH] styling etc ... --- r/DESCRIPTION | 1 + r/NAMESPACE | 1 + r/R/RcppExports.R | 4 +++ r/R/parquet.R | 27 ++++++++++++++++ r/README.Rmd | 2 +- r/README.md | 61 ++++++++++--------------------------- r/configure | 2 +- r/man/read_parquet.Rd | 14 +++++++++ r/src/RcppExports.cpp | 12 ++++++++ r/src/parquetfilereader.cpp | 38 +++++++++++++++++++++++ 10 files changed, 115 insertions(+), 47 deletions(-) create mode 100644 r/R/parquet.R create mode 100644 r/man/read_parquet.Rd create mode 100644 r/src/parquetfilereader.cpp diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 5f93c83f236eb..bfd4d1e892eb2 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -61,6 +61,7 @@ Collate: 'memory_pool.R' 'message.R' 'on_exit.R' + 'parquet.R' 'read_record_batch.R' 'read_table.R' 'reexports-bit64.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 65d60d846f4cb..d039e573f4f32 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -113,6 +113,7 @@ export(print.integer64) export(read_arrow) export(read_feather) export(read_message) +export(read_parquet) export(read_record_batch) export(read_schema) export(read_table) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 0310eab2027b9..db2ddae295dad 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -617,6 +617,10 @@ ipc___ReadMessage <- function(stream) { .Call(`_arrow_ipc___ReadMessage`, stream) } +read_parquet_file <- function(filename) { + .Call(`_arrow_read_parquet_file`, filename) +} + RecordBatch__num_columns <- function(x) { .Call(`_arrow_RecordBatch__num_columns`, x) } diff --git a/r/R/parquet.R b/r/R/parquet.R new file mode 100644 index 0000000000000..e15f59cf284e8 --- /dev/null +++ b/r/R/parquet.R @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read parquet file from disk +#' +#' @param files a vector of filenames +#' +#' @importFrom purrr map_dfr +#' +#' @export +read_parquet <- function(files) { + map_dfr(files, ~as_tibble(shared_ptr(`arrow::Table`, read_parquet_file(.x)))) +} diff --git a/r/README.Rmd b/r/README.Rmd index 2c51d01c0f00f..9f0f39fef5352 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -25,7 +25,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake .. -DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` diff --git a/r/README.md b/r/README.md index 868fdff0a06e0..987d0c24a185b 100644 --- a/r/README.md +++ b/r/README.md @@ -14,7 +14,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake ..
-DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` @@ -38,48 +38,19 @@ tf <- tempfile() #> # A tibble: 10 x 2 #> x y #> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -arrow::write_arrow(tib, tf) - -# read it back with pyarrow -pa <- import("pyarrow") -as_tibble(pa$open_file(tf)$read_pandas()) -#> # A tibble: 10 x 2 -#> x y -#> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -``` - -## Development - -### Code style - -We use Google C++ style in our C++ code. Check for style errors with - -``` -./lint.sh -``` - -You can fix the style issues with - +#> 1 1 0.0855 +#> 2 2 -1.68 +#> 3 3 -0.0294 +#> 4 4 -0.124 +#> 5 5 0.0675 +#> 6 6 1.64 +#> 7 7 1.54 +#> 8 8 -0.0209 +#> 9 9 -0.982 +#> 10 10 0.349 +# arrow::write_arrow(tib, tf) + +# # read it back with pyarrow +# pa <- import("pyarrow") +# as_tibble(pa$open_file(tf)$read_pandas()) ``` -./lint.sh --fix -``` \ No newline at end of file diff --git a/r/configure b/r/configure index 28f6a73ac7ef5..093f7cdcf6e2c 100755 --- a/r/configure +++ b/r/configure @@ -26,7 +26,7 @@ # R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib' # Library settings -PKG_CONFIG_NAME="arrow" +PKG_CONFIG_NAME="arrow parquet" PKG_DEB_NAME="arrow" PKG_RPM_NAME="arrow" PKG_CSW_NAME="arrow" diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd new file mode 100644 index 0000000000000..08e0862445d11 --- /dev/null +++ b/r/man/read_parquet.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\name{read_parquet} +\alias{read_parquet} +\title{Read parquet file from disk} +\usage{ +read_parquet(files) +} +\arguments{ +\item{files}{a vector of filenames} +} +\description{ +Read parquet file from disk +} diff --git 
a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index e5a784eb70c23..e8ad609cadcfc 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1721,6 +1721,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// read_parquet_file +std::shared_ptr<arrow::Table> read_parquet_file(std::string filename); +RcppExport SEXP _arrow_read_parquet_file(SEXP filenameSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type filename(filenameSEXP); + rcpp_result_gen = Rcpp::wrap(read_parquet_file(filename)); + return rcpp_result_gen; +END_RCPP +} // RecordBatch__num_columns int RecordBatch__num_columns(const std::shared_ptr<arrow::RecordBatch>& x); RcppExport SEXP _arrow_RecordBatch__num_columns(SEXP xSEXP) { @@ -2306,6 +2317,7 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, {"_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, {"_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, + {"_arrow_read_parquet_file", (DL_FUNC) &_arrow_read_parquet_file, 1}, {"_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, {"_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, {"_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, diff --git a/r/src/parquetfilereader.cpp b/r/src/parquetfilereader.cpp new file mode 100644 index 0000000000000..3924953e93da5 --- /dev/null +++ b/r/src/parquetfilereader.cpp @@ -0,0 +1,38 @@ +// // Licensed to the Apache Software Foundation (ASF) under one +// // or more contributor license agreements. See the NOTICE file +// // distributed with this work for additional information +// // regarding copyright ownership.
The ASF licenses this file +// // to you under the Apache License, Version 2.0 (the +// // "License"); you may not use this file except in compliance +// // with the License. You may obtain a copy of the License at +// // +// // http://www.apache.org/licenses/LICENSE-2.0 +// // +// // Unless required by applicable law or agreed to in writing, +// // software distributed under the License is distributed on an +// // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// // KIND, either express or implied. See the License for the +// // specific language governing permissions and limitations +// // under the License. +// read_parquet_file(): opens `filename` via arrow::io::ReadableFile, reads it through a parquet::arrow::FileReader +// and returns the resulting arrow::Table; every non-OK status is handed to PARQUET_THROW_NOT_OK. +#include <arrow/api.h> +#include <arrow/io/api.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> +#include <parquet/exception.h> + +// [[Rcpp::export]] +std::shared_ptr<arrow::Table> read_parquet_file(std::string filename) { + std::shared_ptr<arrow::io::ReadableFile> infile; + PARQUET_THROW_NOT_OK( + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool(), &infile)); + + std::unique_ptr<parquet::arrow::FileReader> reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr<arrow::Table> table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + + return table; +}